webtools-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
webtools/core.py ADDED
@@ -0,0 +1,2596 @@
1
+ import sys,os,re,requests,random,subprocess,time,socket,shutil,json,zipfile,atexit,concurrent.futures,threading,qrcode,logging,queue,urllib3,base64,traceback,csv,io,mtranslate,hashlib
2
+ sys.dont_write_bytecode = True
3
+
4
+ # --- PACKAGE PATHS ---
5
+ PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__))
6
+ DATA_DIR = os.path.join(os.path.expanduser('~'), '.webtools')
7
+ os.makedirs(DATA_DIR, exist_ok=True)
8
+ try:
9
+ from colorama import init, Fore, Style
10
+ init(autoreset=True)
11
+ COLOR_SUPPORT = True
12
+ except ImportError:
13
+ COLOR_SUPPORT = False
14
+ import numpy as np
15
+ from bs4 import BeautifulSoup
16
+ from collections import Counter
17
+ from flask import Flask, render_template_string, send_from_directory, request, jsonify, send_file
18
+ from PIL import Image,ExifTags,ImageChops,ImageEnhance
19
+ from io import BytesIO
20
+
21
+ try:
22
+ from playwright.sync_api import sync_playwright
23
+ PLAYWRIGHT_AVAILABLE = True
24
+ except ImportError:
25
+ PLAYWRIGHT_AVAILABLE = False
26
+
27
+ # --- CLI AUTOCOMPLETE SETUP ---
28
+ try:
29
+ if os.name == 'nt':
30
+ try:
31
+ from pyreadline3 import Readline
32
+ readline = Readline()
33
+ except (ImportError, AttributeError):
34
+ import pyreadline3 as readline
35
+ else:
36
+ import readline
37
+
38
+ HISTORY_FILE = os.path.join(DATA_DIR, 'history')
39
+
40
+ def setup_autocomplete(commands):
41
+ def completer(text, state):
42
+ options = [i for i in commands if i.startswith(text)]
43
+ if state < len(options):
44
+ return options[state]
45
+ else:
46
+ return None
47
+
48
+ # Ensure the object has the required methods
49
+ if hasattr(readline, 'set_completer'):
50
+ readline.set_completer(completer)
51
+ if 'libedit' in (getattr(readline, '__doc__', '') or ''):
52
+ readline.parse_and_bind("bind ^I rl_complete")
53
+ else:
54
+ readline.parse_and_bind("tab: complete")
55
+ if os.name != 'nt' and hasattr(readline, 'parse_and_bind'):
56
+ readline.parse_and_bind("set show-all-if-ambiguous on")
57
+
58
+ if hasattr(readline, 'set_completer_delims'):
59
+ readline.set_completer_delims(' ')
60
+
61
+ # Load History
62
+ if os.path.exists(HISTORY_FILE):
63
+ try:
64
+ readline.read_history_file(HISTORY_FILE)
65
+ except: pass
66
+
67
+ # Save on exit
68
+ atexit.register(lambda: (readline.write_history_file(HISTORY_FILE) if AUTOCOMPLETE_AVAILABLE else None))
69
+
70
+ AUTOCOMPLETE_AVAILABLE = True
71
+ except Exception:
72
+ def setup_autocomplete(commands): pass
73
+ AUTOCOMPLETE_AVAILABLE = False
74
+
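A minimal usage sketch for the completer setup (the command names here are hypothetical); when no readline backend can be imported, setup_autocomplete is a harmless no-op:

commands = ['scrape', 'proxy', 'history', 'exit']  # hypothetical command list
setup_autocomplete(commands)
choice = input('> ')  # TAB now cycles through matching commands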
75
+ def print_gradient_text(text, start_rgb, end_rgb):
76
+ """Prints text with a vertical/horizontal color gradient using 24-bit ANSI"""
77
+ lines = text.splitlines()
78
+ if not lines: return
79
+
80
+ for i, line in enumerate(lines):
81
+ # Calculate ratio for this line
82
+ ratio = i / max(1, len(lines) - 1)
83
+
84
+ # Interpolate RGB
85
+ r = int(start_rgb[0] + (end_rgb[0] - start_rgb[0]) * ratio)
86
+ g = int(start_rgb[1] + (end_rgb[1] - start_rgb[1]) * ratio)
87
+ b = int(start_rgb[2] + (end_rgb[2] - start_rgb[2]) * ratio)
88
+
89
+ # Apply 24-bit color ANSI
90
+ print(f"\033[38;2;{r};{g};{b}m{line}\033[0m")
91
+
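A small usage sketch, with a made-up two-line banner fading from cyan to magenta:

banner = "WEB TOOLS\nCLI SCRAPER"  # hypothetical banner text
print_gradient_text(banner, (0, 255, 255), (255, 0, 255))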
92
+ # Suppress Flask's werkzeug request logs
93
+ log = logging.getLogger('werkzeug')
94
+ log.setLevel(logging.ERROR)
95
+
96
+ # Disable SSL warnings globally
97
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
98
+
99
+ # Set up output directories
100
+ os.makedirs('webfiles/scraped', exist_ok=True)
101
+ os.makedirs('webfiles/scraped/images', exist_ok=True)
102
+ os.makedirs('webfiles/scraped/videos', exist_ok=True)
103
+
104
+ # --- PERFORMANCE AUDITOR ---
105
+ class PerformanceTracker:
106
+ def __init__(self):
107
+ self.stats_file = os.path.join(DATA_DIR, 'performance_stats.json')
108
+ self.data = self.load_data()
109
+ self.current_report = {}
110
+ self.last_mark = 0
111
+ self.session_url = ""
112
+
113
+ def load_data(self):
114
+ if os.path.exists(self.stats_file):
115
+ try:
116
+ with open(self.stats_file, 'r') as f:
117
+ return json.load(f)
118
+ except: pass
119
+ return {'best': float('inf'), 'worst': 0, 'total_time': 0, 'count': 0}
120
+
121
+ def save_data(self):
122
+ try:
123
+ with open(self.stats_file, 'w') as f:
124
+ json.dump(self.data, f)
125
+ except: pass
126
+
127
+ def start_session(self, url):
128
+ self.current_report = {}
129
+ self.last_mark = time.perf_counter()
130
+ self.session_url = url
131
+
132
+ def record_phase(self, name):
133
+ now = time.perf_counter()
134
+ duration = now - self.last_mark
135
+ self.current_report[name] = self.current_report.get(name, 0) + duration
136
+ self.last_mark = now
137
+
138
+ def finish_and_print(self):
139
+ total = sum(self.current_report.values())
140
+ if total <= 0: return {}
141
+
142
+ self.data['count'] += 1
143
+ self.data['total_time'] += total
144
+ if total < self.data['best']: self.data['best'] = total
145
+ if total > self.data['worst']: self.data['worst'] = total
146
+ self.save_data()
147
+
148
+ avg = self.data['total_time'] / self.data['count']
149
+
150
+ return {
151
+ 'total': total,
152
+ 'phases': dict(self.current_report),
153
+ 'avg': avg,
154
+ 'best': self.data['best'],
155
+ 'worst': self.data['worst']
156
+ }
157
+
158
+ perf_tracker = PerformanceTracker()
159
+
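A minimal sketch of how the tracker is driven (the phase name is illustrative; the real calls appear inside execute_scrape_logic further down):

perf_tracker.start_session("https://example.com")
# ... fetch and parse work happens here ...
perf_tracker.record_phase("Fetch Content")
report = perf_tracker.finish_and_print()
print(report.get('total'), report.get('phases'))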
160
+ class MoonSpinner:
161
+ """Threaded moon-phase loading animation"""
162
+ def __init__(self, message="Processing"):
163
+ self.message = message
164
+ self.frames = ['🌑', '🌒', '🌓', '🌔', '🌕', '🌖', '🌗', '🌘']
165
+ self.stop_event = threading.Event()
166
+ self.thread = threading.Thread(target=self._animate, daemon=True)
167
+
168
+ def _animate(self):
169
+ while not self.stop_event.is_set():
170
+ for f in self.frames:
171
+ if self.stop_event.is_set(): break
172
+ sys.stdout.write(f'\r{self.message} {f} ')
173
+ sys.stdout.flush()
174
+ time.sleep(0.2)
175
+
176
+ def __enter__(self):
177
+ self.thread.start()
178
+ return self
179
+
180
+ def __exit__(self, exc_type, exc_val, exc_tb):
181
+ self.stop_event.set()
182
+ self.thread.join(timeout=1.0)
183
+ sys.stdout.write(f'\r{self.message} 🌕 Done! \n')
184
+ sys.stdout.flush()
185
+
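Usage sketch for the spinner context manager; the sleep stands in for real work:

with MoonSpinner("Downloading"):
    time.sleep(1)  # placeholder for actual work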
186
+ # --- PROXY AND UA MANAGER ---
187
+ class ProxyManager:
188
+ def __init__(self):
189
+ self.proxies = []
190
+ self.last_fetch = 0
191
+ self.fetch_interval = 300 # Refresh every 5 minutes
192
+ self.user_agents = [
193
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
194
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
195
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
196
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
197
+ 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Mobile/15E148 Safari/604.1'
198
+ ]
199
+
200
+ # Background validation logic
201
+ self.valid_proxies = queue.Queue()
202
+ self.lock = threading.Lock()
203
+ self.running = True
204
+
205
+ # Smart Learner (Scores setup)
206
+ self.scores_file = os.path.join(DATA_DIR, 'scores.json')
207
+ self.scores = self.load_scores()
208
+
209
+ # Start the background thread
210
+ threading.Thread(target=self._background_validator, daemon=True).start()
211
+
212
+ def load_scores(self):
213
+ if os.path.exists(self.scores_file):
214
+ try:
215
+ with open(self.scores_file, 'r') as f:
216
+ return json.load(f)
217
+ except:
218
+ pass
219
+ return {}
220
+
221
+ def save_scores(self):
222
+ try:
223
+ with open(self.scores_file, 'w') as f:
224
+ json.dump(self.scores, f, indent=2)
225
+ except Exception as e:
226
+ print(f"Failed to save scores: {e}")
227
+
228
+ def report_success(self, proxy, domain, task_type, success):
229
+ with self.lock:
230
+ # Composite key for the score (e.g. "youtube.com::video")
231
+ score_key = f"{domain}::{task_type}"
232
+
233
+ if score_key not in self.scores:
234
+ self.scores[score_key] = {}
235
+
236
+ if proxy not in self.scores[score_key]:
237
+ self.scores[score_key][proxy] = {'success': 0, 'fail': 0, 'score': 0.5}
238
+
239
+ stats = self.scores[score_key][proxy]
240
+ if success:
241
+ stats['success'] += 1
242
+ # Raise the score (stickiness factor)
243
+ stats['score'] = min(1.0, stats['score'] + 0.25)
244
+ else:
245
+ stats['fail'] += 1
246
+ # Apply a penalty
247
+ stats['score'] = max(0.0, stats['score'] - 0.2)
248
+
249
+ self.save_scores()
250
+
251
+ def get_smart_proxy(self, domain, task_type='general'):
252
+ # Epsilon-Greedy Strategy with Stickiness
253
+
254
+ score_key = f"{domain}::{task_type}"
255
+
256
+ if score_key in self.scores:
257
+ domain_scores = self.scores[score_key]
258
+ # Find best proxy
259
+ if domain_scores:
260
+ best_proxy = max(domain_scores, key=lambda p: domain_scores[p]['score'])
261
+ best_score = domain_scores[best_proxy]['score']
262
+
263
+ # STICKY LOGIC:
264
+ # If Score > 0.8 (Very Trusted): 95% Exploitation
265
+ # If Score > 0.5 (Trusted): 80% Exploitation
266
+ # Else: 50% Exploitation
267
+
268
+ exploitation_rate = 0.5
269
+ if best_score > 0.8: exploitation_rate = 0.95
270
+ elif best_score > 0.5: exploitation_rate = 0.80
271
+
272
+ if random.random() < exploitation_rate and best_score > 0.4:
273
+ print(f"🧠 Smart Learner: Sticky reuse of {best_proxy} for {score_key} (Score: {best_score:.2f})")
274
+ return best_proxy
275
+
276
+ # Fallback to general domain score if task-specific score missing
277
+ if task_type != 'general':
278
+ return self.get_smart_proxy(domain, 'general')
279
+
280
+ # Exploration fallback
281
+ return self.get_valid_proxy()
282
+
283
+ def fetch_proxies(self):
284
+ # Only fetch when the list is empty or stale
285
+ with self.lock:
286
+ if self.proxies and (time.time() - self.last_fetch < self.fetch_interval):
287
+ return
288
+
289
+ # Quiet mode if background
290
+ # print("Fetching new proxies...")
291
+ try:
292
+ # Optimized timing: timeout=3000 (3s)
293
+ url = "https://api.proxyscrape.com/v2/?request=getproxies&protocol=http&timeout=3000&country=all&ssl=all&anonymity=elite,anonymous"
294
+ resp = requests.get(url, timeout=5)
295
+ if resp.status_code == 200:
296
+ proxy_list = resp.text.strip().split('\r\n')
297
+ with self.lock:
298
+ self.proxies = [p for p in proxy_list if p]
299
+ self.last_fetch = time.time()
300
+ # print(f"Fetched {len(self.proxies)} high-quality proxies.")
301
+ except Exception as e:
302
+ print(f"Failed to fetch proxies: {e}")
303
+
304
+ def _background_validator(self):
305
+ while self.running:
306
+ # Keep the queue topped up (target: 20 valid proxies)
307
+ if self.valid_proxies.qsize() < 20:
308
+ self.fetch_proxies()
309
+
310
+ with self.lock:
311
+ if not self.proxies:
312
+ time.sleep(5)
313
+ continue
314
+ candidates = list(self.proxies)
315
+
316
+ # Pick a random proxy
317
+ proxy = random.choice(candidates)
318
+
319
+ if self._check_proxy(proxy):
320
+ self.valid_proxies.put(proxy)
321
+ # print(f"P: {proxy} ({self.valid_proxies.qsize()})")
322
+ else:
323
+ pass
324
+ else:
325
+ time.sleep(1) # Queue full
326
+
327
+ def _check_proxy(self, proxy):
328
+ try:
329
+ proxies = {'http': proxy, 'https': proxy}
330
+ # Quick reachability check
331
+ resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=3, verify=False)
332
+ return resp.status_code == 200
333
+ except:
334
+ return False
335
+
336
+ def get_valid_proxy(self):
337
+ # Return one immediately if available
338
+ try:
339
+ return self.valid_proxies.get_nowait()
340
+ except:
341
+ # If the queue is empty, fall back to a random unvalidated proxy
342
+ return self.get_random_proxy()
343
+
344
+ def get_random_proxy(self):
345
+ self.fetch_proxies()
346
+ if not self.proxies:
347
+ return None
348
+ return random.choice(self.proxies)
349
+
350
+ def get_random_ua(self):
351
+ return random.choice(self.user_agents)
352
+
353
+ proxy_manager = ProxyManager()
354
+ # ---------------------
355
+
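A rough sketch of the intended feedback loop (the domain, task type, and fetch helper are hypothetical). Each reported success nudges the proxy's score up by 0.25 (capped at 1.0) and each failure down by 0.2 (floored at 0.0), which is what drives the sticky exploitation rates above:

proxy = proxy_manager.get_smart_proxy("example.com", "image")
if proxy:
    ok = attempt_fetch(proxy)  # hypothetical helper returning True/False
    proxy_manager.report_success(proxy, "example.com", "image", ok)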
356
+ def get_free_port():
357
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
358
+ sock.bind(('', 0))
359
+ port = sock.getsockname()[1]
360
+ sock.close()
361
+ return port
362
+
363
+ PORT = get_free_port()
364
+ WEB_DIR = os.path.join(PACKAGE_DIR, 'web')
365
+ app = Flask(__name__)
366
+
367
+ # Updated HTML UI with video extraction
368
+ @app.route('/')
369
+ def index():
370
+ return send_from_directory(WEB_DIR, 'index.html')
371
+
372
+ @app.route('/style.css')
373
+ def serve_css():
374
+ return send_from_directory(WEB_DIR, 'style.css')
375
+
376
+ @app.route('/script.js')
377
+ def serve_js():
378
+ return send_from_directory(WEB_DIR, 'script.js')
379
+
380
+ @app.route('/favicon.png')
381
+ def serve_favicon():
382
+ return send_from_directory(WEB_DIR, 'Web_Tools.png')
383
+
384
+ @app.route('/download/<path:filename>')
387
+ def serve_scraped_file(filename):
388
+ return send_from_directory('webfiles/scraped', filename)
389
+
390
+ def scrape_with_playwright(url, proxy=None):
391
+ if not PLAYWRIGHT_AVAILABLE:
392
+ return None
393
+
394
+ try:
395
+ with sync_playwright() as p:
396
+ browser = p.chromium.launch(headless=True)
397
+
398
+ # Context with stealth settings
399
+ context = browser.new_context(
400
+ viewport={'width': 1920, 'height': 1080},
401
+ user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
402
+ )
403
+
404
+ page = context.new_page()
405
+
406
+ # 30s timeout for heavy sites
407
+ try:
408
+ page.goto(url, timeout=30000, wait_until='domcontentloaded')
409
+
410
+ # Wait briefly for dynamic content
411
+ time.sleep(5)
412
+
413
+ # Scroll down to trigger lazy loading
414
+ page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
415
+ time.sleep(2)
416
+
417
+ content = page.content()
418
+ return content
419
+ except Exception as e:
420
+ print(f"❌ Playwright navigation failed: {e}")
421
+ return None
422
+ finally:
423
+ browser.close()
424
+ except Exception as e:
425
+ print(f"❌ Playwright error: {e}")
426
+ return None
427
+
428
+
429
+ # --- OSINT HELPERS ---
430
+ def extract_emails(text):
431
+ """Regex se emails nikalo"""
432
+ # The regex:
433
+ # 1. Start with alnum/dots/dashes
434
+ # 2. @ symbol
435
+ # 3. Domain name (alnum/dashes)
436
+ # 4. TLD (2+ chars)
437
+ # Filter: drop matches longer than 50 chars and accidental image-filename matches
438
+ raw = re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
439
+ valid = []
440
+ for email in set(raw):
441
+ if len(email) > 50: continue
442
+ if email.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg')): continue
443
+ valid.append(email)
444
+ return valid
445
+
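A quick illustration of the filtering (the sample string is made up): the image-like match is dropped, the real address survives:

sample = "Contact sales@example.com or see hero@2x.png for details."
print(extract_emails(sample))  # ['sales@example.com']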
446
+ def extract_phones(text):
447
+ """Phone numbers nikalo (+91 aur intl formats par focus)"""
448
+ phones = set()
449
+
450
+ # 1. Strict Indian Mobile Numbers: +91 9876543210 / 9876543210 / 09876543210
451
+ # Matches: +91-9876543210, +91 98765 43210, 9876543210
452
+ # For Indian mobiles, look for a starting digit in [6-9]
453
+ indian_regex = r'(?:(?:\+|0{0,2})91(\s*[\-]\s*)?|[0]?)?([6-9]\d{3}[\s\-]?\d{6})'
454
+ for match in re.findall(indian_regex, text):
455
+ # match[1] is the number portion
456
+ full_num = match[1].replace(' ', '').replace('-', '')
457
+ if len(full_num) == 10:
458
+ phones.add("+91 " + full_num)
459
+
460
+ # 2. General international (fallback), e.g. +1 etc.
461
+ # Look for an explicit plus sign followed by digits
462
+ intl_regex = r'\+(?:9[976]\d|8[987530]\d|6[987]\d|5[90]\d|42\d|3[875]\d|2[98654321]\d|9[8543210]|8[6421]|6[6543210]|5[87654321]|4[987654310]|3[9643210]|2[70]|7|1)\W*\d\W*\d\W*\d\W*\d\W*\d\W*\d\W*\d\W*\d\W*(\d{1,2})?'
463
+ for p in re.findall(intl_regex, text):
464
+ # This match is fairly complex; keep it simple to avoid noise
465
+ pass
466
+
467
+ # 3. "Tel:" links ke liye simple grab
468
+ # (Regex ne handle kar liya hoga par safe rehne ke liye)
469
+
470
+ return list(phones)
471
+
472
+ def extract_locations(soup):
473
+ """Physical addresses nikalo"""
474
+ locations = set()
475
+
476
+ # 1. Schema.org Parsing
477
+ for item in soup.find_all(attrs={"itemtype": re.compile(r"schema.org/PostalAddress", re.I)}):
478
+ text = item.get_text(separator=', ').strip()
479
+ if len(text) > 10: locations.add(text)
480
+
481
+ # 2. Google Maps Embeds
482
+ for iframe in soup.find_all('iframe', src=True):
483
+ if 'maps.google' in iframe['src']:
484
+ # Try to extract the query
485
+ match = re.search(r'q=([^&]+)', iframe['src'])
486
+ if match:
487
+ import urllib.parse
488
+ addr = urllib.parse.unquote(match.group(1).replace('+', ' '))
489
+ locations.add(addr)
490
+
491
+ # 3. Heuristic Keywords (Footer/Contact)
492
+ # Limit the search to footer or contact sections
493
+ search_area = soup.find('footer') or soup.find(id='contact') or soup.body
494
+ if search_area:
495
+ text = search_area.get_text(separator=' ')
496
+ # "Address: ..." ke liye rough regex
497
+ matches = re.findall(r'(?:Address|Location|Office):\s*([a-zA-Z0-9,\.\-\s]{10,100})', text, re.I)
498
+ for m in matches:
499
+ locations.add(m.strip())
500
+
501
+ return list(locations)
502
+
503
+
504
+ def extract_social_media(soup):
505
+ """Social media profile links nikalo"""
506
+ social_domains = {
507
+ 'facebook.com': 'Facebook',
508
+ 'twitter.com': 'Twitter',
509
+ 'x.com': 'X (Twitter)',
510
+ 'instagram.com': 'Instagram',
511
+ 'linkedin.com': 'LinkedIn',
512
+ 'youtube.com': 'YouTube',
513
+ 'tiktok.com': 'TikTok',
514
+ 'pinterest.com': 'Pinterest',
515
+ 'github.com': 'GitHub',
516
+ 'gitlab.com': 'GitLab',
517
+ 'discord.gg': 'Discord',
518
+ 't.me': 'Telegram'
519
+ }
520
+ found = {}
521
+ for a in soup.find_all('a', href=True):
522
+ href = a['href'].lower()
523
+ for domain, name in social_domains.items():
524
+ if domain in href and name not in found:
525
+ # Basic check to avoid share links
526
+ if 'share' not in href and 'intent' not in href:
527
+ found[name] = a['href']
528
+
529
+ return [{'platform': k, 'url': v} for k, v in found.items()]
530
+
531
+ def detect_tech_stack(soup, response):
532
+ """Site par kaunsi technologies used hain woh check karo"""
533
+ stack = set()
534
+
535
+ # 1. Headers
536
+ if 'Server' in response.headers:
537
+ stack.add(f"Server: {response.headers['Server']}")
538
+ if 'X-Powered-By' in response.headers:
539
+ stack.add(f"Powered By: {response.headers['X-Powered-By']}")
540
+ if 'Via' in response.headers:
541
+ stack.add(f"Via: {response.headers['Via']}")
542
+
543
+ # 2. Meta Tags
544
+ generator = soup.find('meta', attrs={'name': 'generator'})
545
+ if generator and generator.get('content'):
546
+ stack.add(generator['content'])
547
+
548
+ # 3. Scripts / HTML Patterns
549
+ html_str = str(soup).lower()
550
+ if 'wp-content' in html_str: stack.add('WordPress')
551
+ if 'shopify' in html_str: stack.add('Shopify')
552
+ if 'wix.com' in html_str: stack.add('Wix')
553
+ if 'squarespace' in html_str: stack.add('Squarespace')
554
+ if 'react' in html_str or '_next' in html_str: stack.add('React/Next.js')
555
+ if 'vue' in html_str or 'nuxt' in html_str: stack.add('Vue.js')
556
+ if 'bootstrap' in html_str: stack.add('Bootstrap')
557
+ if 'tailwind' in html_str: stack.add('Tailwind CSS')
558
+ if 'jquery' in html_str: stack.add('jQuery')
559
+ if 'cloudflare' in html_str: stack.add('Cloudflare')
560
+ if 'google-analytics' in html_str: stack.add('Google Analytics')
561
+
562
+ return list(stack)
563
+
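A minimal end-to-end sketch (example.com is only a placeholder target; output depends entirely on the site):

resp = requests.get("https://example.com", timeout=10)
page = BeautifulSoup(resp.text, 'html.parser')
print(detect_tech_stack(page, resp))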
564
+ def analyze_ai_content(text):
565
+ """Text analyze karo (sentiment, summary, readability, aur keywords)"""
566
+ try:
567
+ from textblob import TextBlob
568
+ import re
569
+
570
+ # Clean the text
571
+ clean_text = re.sub(r'\s+', ' ', text).strip()
572
+ if not clean_text: return None
573
+
574
+ blob = TextBlob(clean_text)
575
+ sentiment = blob.sentiment
576
+
577
+ # 1. Summarization (Simple Frequency-based)
578
+ sentences = blob.sentences
579
+ if len(sentences) > 0:
580
+ # Simple summary: the first sentence + 2 interesting sentences
581
+ # A way to avoid heavy libraries like NLTK/spaCy
582
+ summary_sentences = [sentences[0].string]
583
+
584
+ # Pick the most diverse of the remaining sentences
585
+ remaining = sentences[1:]
586
+ remaining.sort(key=lambda s: len(s.noun_phrases), reverse=True)
587
+ for s in remaining[:2]:
588
+ summary_sentences.append(s.string)
589
+
590
+ summary = ' '.join(summary_sentences)
591
+ else:
592
+ summary = clean_text[:200] + "..."
593
+
594
+ # 2. Readability (Flesch Reading Ease)
595
+ # Formula: 206.835 - 1.015 * (total words / total sentences) - 84.6 * (total syllables / total words)
596
+ words = blob.words
597
+ num_sentences = len(sentences) or 1
598
+ num_words = len(words) or 1
599
+
600
+ # Syllable approximation (vowel groups)
601
+ def count_syllables(word):
602
+ word = word.lower()
603
+ count = len(re.findall(r'[aeiouy]+', word))
604
+ if word.endswith('e'): count -= 1
605
+ return max(1, count)
606
+
607
+ num_syllables = sum(count_syllables(w) for w in words)
608
+
609
+ flesch_score = 206.835 - 1.015 * (num_words / num_sentences) - 84.6 * (num_syllables / num_words)
610
+
611
+ readability_level = "Standard"
612
+ if flesch_score > 80: readability_level = "Very Easy (Kids)"
613
+ elif flesch_score > 60: readability_level = "Plain English"
614
+ elif flesch_score > 40: readability_level = "Difficult (College)"
615
+ else: readability_level = "Very Difficult (Academic)"
616
+
617
+ # 3. Enhanced Keywords
618
+ keywords = []
619
+ seen = set()
620
+ for phrase in blob.noun_phrases:
621
+ p = phrase.lower().strip()
622
+ # Filter out junk and short words
623
+ if len(p) > 4 and p not in seen and not re.match(r'^\d+$', p) and len(keywords) < 12:
624
+ keywords.append(phrase.title())
625
+ seen.add(p)
626
+
627
+ return {
628
+ 'sentiment': {
629
+ 'polarity': round(sentiment.polarity, 2),
630
+ 'subjectivity': round(sentiment.subjectivity, 2),
631
+ 'label': 'Positive' if sentiment.polarity > 0.1 else 'Negative' if sentiment.polarity < -0.1 else 'Neutral',
632
+ 'subjectivity_label': 'Opinionated' if sentiment.subjectivity > 0.5 else 'Objective'
633
+ },
634
+ 'summary': summary,
635
+ 'readability': {
636
+ 'score': round(flesch_score, 1),
637
+ 'level': readability_level
638
+ },
639
+ 'keywords': keywords
640
+ }
641
+ except Exception as e:
642
+ print(f"AI Analysis failed: {e}")
643
+ return None
644
+
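A worked instance of the readability formula used above, assuming a hypothetical passage of 2 sentences, 20 words and 26 approximated syllables:

words_per_sentence = 20 / 2   # 10.0
syllables_per_word = 26 / 20  # 1.3
flesch = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
print(round(flesch, 1))  # 86.7 -> falls in the "Very Easy" bucket (> 80)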
645
+ def check_broken_links(url, soup, headers):
646
+ """Broken links (404s) check karo (parallel mein)."""
647
+ broken_links = []
648
+ links_to_check = []
649
+
650
+ from urllib.parse import urlparse, urljoin
651
+ base_domain = urlparse(url).netloc
652
+
653
+ # Deduplicate links
654
+ seen_links = set()
655
+
656
+ for a in soup.find_all('a', href=True):
657
+ href = a['href']
658
+ full_url = urljoin(url, href)
659
+
660
+ # Skip junk or non-http links
661
+ if not full_url.startswith('http') or full_url in seen_links:
662
+ continue
663
+
664
+ seen_links.add(full_url)
665
+
666
+ is_internal = base_domain in full_url
667
+ link_text = a.get_text().strip()[:50] # Truncate the text
668
+
669
+ links_to_check.append({
670
+ 'url': full_url,
671
+ 'text': link_text or "No Text",
672
+ 'is_internal': is_internal
673
+ })
674
+
675
+ # Limit to 50 links to keep this check fast
676
+ links_to_check = links_to_check[:50]
677
+
678
+ def check_status(link_info):
679
+ try:
680
+ # Use a HEAD request for speed
681
+ r = requests.head(link_info['url'], headers=headers, timeout=5, allow_redirects=True, verify=False)
682
+ if r.status_code >= 400:
683
+ return {
684
+ 'url': link_info['url'],
685
+ 'text': link_info['text'],
686
+ 'status': r.status_code,
687
+ 'is_internal': link_info['is_internal']
688
+ }
689
+ except Exception:
690
+ # If HEAD fails, try GET (some servers block HEAD)
691
+ try:
692
+ r = requests.get(link_info['url'], headers=headers, stream=True, timeout=5, verify=False)
693
+ if r.status_code >= 400:
694
+ return {
695
+ 'url': link_info['url'],
696
+ 'text': link_info['text'],
697
+ 'status': r.status_code,
698
+ 'is_internal': link_info['is_internal']
699
+ }
700
+ except Exception:
701
+ return {
702
+ 'url': link_info['url'],
703
+ 'text': link_info['text'],
704
+ 'status': 0, # Connection Error
705
+ 'is_internal': link_info['is_internal']
706
+ }
707
+ return None
708
+
709
+ # Parallel Execution
710
+ with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
711
+ results = list(executor.map(check_status, links_to_check))
712
+
713
+ # Filter Nones
714
+ broken_links = [r for r in results if r]
715
+
716
+ return broken_links
717
+
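Rough usage sketch (the target URL is a placeholder):

page_url = "https://example.com"
hdrs = {'User-Agent': proxy_manager.get_random_ua()}
html = requests.get(page_url, headers=hdrs, timeout=10).text
for link in check_broken_links(page_url, BeautifulSoup(html, 'html.parser'), hdrs):
    print(link['status'], link['url'])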
718
+ def execute_scrape_logic(url, fetch_images=False, fetch_videos=False, crawl_depth=1, use_proxy=False, device='desktop'):
719
+ try:
720
+ if not url.startswith('http'):
721
+ url = 'https://' + url
722
+
723
+ # Start performance session
724
+ perf_tracker.start_session(url)
725
+
726
+ # Rotate the User-Agent
727
+ ua = proxy_manager.get_random_ua()
728
+ headers = {'User-Agent': ua}
729
+ print(f"Using UA: {ua}")
730
+
731
+ # Disable SSL warnings
732
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
733
+
734
+ perf_tracker.record_phase("Setup & Proxy")
735
+
736
+ try:
737
+ response = None
738
+ if use_proxy:
739
+ # Extract the domain for smart learning
740
+ from urllib.parse import urlparse
741
+ domain = urlparse(url).netloc
742
+
743
+ # Decide the task type
744
+ task_type = 'general'
745
+ if fetch_videos: task_type = 'video'
746
+ elif fetch_images: task_type = 'image'
747
+
748
+ # Smart loop: exploitation first, then exploration
749
+ print(f"Selecting best proxy for {domain} (Task: {task_type})...")
750
+
751
+ for attempt in range(3):
752
+ # Attempt 0: smart choice (best for this domain and task)
753
+ # Attempt 1+: Fallback (Random valid proxy)
754
+ if attempt == 0:
755
+ proxy = proxy_manager.get_smart_proxy(domain, task_type)
756
+ else:
757
+ proxy = proxy_manager.get_valid_proxy()
758
+
759
+ if not proxy:
760
+ break
761
+
762
+ print(f"Using proxy: {proxy} (Attempt {attempt+1})")
763
+ proxies = {'http': proxy, 'https': proxy}
764
+ try:
765
+ # verify=False to bypass SSL errors
766
+ response = requests.get(url, headers=headers, proxies=proxies, timeout=30, verify=False)
767
+ if response.status_code == 200:
768
+ # SUCCESS: train the model
769
+ print(f"✅ Proxy {proxy} worked for {domain} ({task_type}). Boosting score!")
770
+ proxy_manager.report_success(proxy, domain, task_type, True)
771
+ break
772
+ else:
773
+ # FAIL (non-200): train the model
774
+ print(f"❌ Proxy {proxy} failed for {domain} ({task_type}) (Status {response.status_code}). Penalizing.")
775
+ proxy_manager.report_success(proxy, domain, task_type, False)
776
+ except Exception as e:
777
+ # FAIL (exception): train the model
778
+ print(f"❌ Proxy {proxy} error for {domain} ({task_type}): {e}. Penalizing.")
779
+ proxy_manager.report_success(proxy, domain, task_type, False)
780
+ continue
781
+
782
+ if response is None or response.status_code != 200:
783
+ print("All proxies failed. Falling back to direct connection.")
784
+ response = requests.get(url, headers=headers, timeout=30)
785
+ else:
786
+ # Direct request (with SPA check)
787
+ if 'linkedin.com' in url or 'instagram.com' in url:
788
+ print("⚠️ SPA Pattern Detected (LinkedIn/Instagram). Skipping requests...")
789
+ response = None # Force Playwright
790
+ else:
791
+ response = requests.get(url, headers=headers, timeout=30)
792
+
793
+ # Check for a block or failure
794
+ if response and response.status_code in [403, 401, 406, 429]:
795
+ print(f"⚠️ Access Denied ({response.status_code}). Triggering Headless Browser...")
796
+ response = None # Force Playwright
797
+
798
+ if response is None:
799
+ raise Exception("SPA/Auth Wall ke liye Playwright force karo")
800
+
801
+ response.raise_for_status()
802
+ perf_tracker.record_phase("Fetch Content")
803
+
804
+ except Exception as e:
805
+ # If requests fails, use Playwright
806
+ print(f"Requests failed ({e}). Attempting Playwright Fallback...")
807
+ if PLAYWRIGHT_AVAILABLE:
808
+ pw_html = scrape_with_playwright(url)
809
+ if pw_html:
810
+ # Mock response object for compatibility
811
+ class MockResponse:
812
+ def __init__(self, text):
813
+ self.text = text
814
+ self.content = text.encode('utf-8')
815
+ self.status_code = 200
816
+ self.headers = {}
817
+ response = MockResponse(pw_html)
818
+ else:
819
+ return jsonify({'error': f'Request failed and Playwright fallback failed: {str(e)}'}), 400
820
+ else:
821
+ return jsonify({'error': f'Request failed: {str(e)}'}), 400
822
+
823
+ soup = BeautifulSoup(response.text, 'html.parser')
824
+ perf_tracker.record_phase("HTML Parsing")
825
+
826
+ # --- HONEYPOT DETECTOR (Security Scout) ---
827
+ security_report = {'level': 'LOW', 'threats': [], 'honeypots': 0}
828
+
829
+ # 1. Status Code Check
830
+ if response.status_code in [403, 406, 429, 503]:
831
+ security_report['threats'].append(f"Suspicious Status Code: {response.status_code}")
832
+ security_report['level'] = 'HIGH'
833
+
834
+ # 2. Keyword Analysis (Anti-Bot)
835
+ page_text_lower = response.text.lower()
836
+ threat_keywords = ['cloudflare', 'managed challenge', 'captcha', 'security check', 'access denied', 'waf']
837
+ found_threats = [kw for kw in threat_keywords if kw in page_text_lower]
838
+ if found_threats:
839
+ security_report['threats'].append(f"Anti-Bot Detected: {', '.join(found_threats)}")
840
+ if 'captcha' in found_threats or 'challenge' in found_threats:
841
+ security_report['level'] = 'HIGH'
842
+ elif security_report['level'] == 'LOW':
843
+ security_report['level'] = 'MEDIUM'
844
+
845
+ # 3. Honeypot Link Detection (CSS Traps)
846
+ # Find links that are visible to bots but not to humans
847
+ honeypot_links = 0
848
+ for a in soup.find_all('a', style=True):
849
+ style = a['style'].lower().replace(' ', '')
850
+ if 'display:none' in style or 'visibility:hidden' in style or 'opacity:0' in style:
851
+ honeypot_links += 1
852
+
853
+ if honeypot_links > 0:
854
+ security_report['honeypots'] = honeypot_links
855
+ security_report['threats'].append(f"Honeypot Traps: {honeypot_links} hidden links found")
856
+ if security_report['level'] == 'LOW': security_report['level'] = 'MEDIUM'
857
+ # ------------------------------------------
858
+
859
+ # Initialize containers
860
+ videos = []
861
+ video_count = 0
862
+ images = []
863
+ image_count = 0
864
+ seen_images = set()
865
+
866
+ # Helper validation
867
+ def cleaner_url_validator(url):
868
+ try:
869
+ # Must start with http, contain a dot, and have no spaces
870
+ if not url.startswith('http'): return False
871
+ if ' ' in url: return False
872
+ if '.' not in url.split('://')[1]: return False
873
+ return True
874
+ except:
875
+ return False
876
+
877
+ # Helper for Turbo-Fetch (parallel chunk download)
878
+ def download_file_turbo(url, filepath):
879
+ try:
880
+ # 1. Get the file size
881
+ head = requests.head(url, headers=headers, timeout=5, verify=False)
882
+ size = int(head.headers.get('content-length', 0))
883
+
884
+ # Only use Turbo for files larger than 2 MB
885
+ if size < 2 * 1024 * 1024:
886
+ return False
887
+
888
+ print(f"Turbo-Fetch active: {os.path.basename(filepath)} ({size/1024/1024:.1f} MB)")
889
+
890
+ # 2. Calculate the chunks (8 parts)
891
+ num_chunks = 8
892
+ chunk_size = size // num_chunks
893
+ chunks = []
894
+ for i in range(num_chunks):
895
+ start = i * chunk_size
896
+ end = start + chunk_size - 1 if i < num_chunks - 1 else size - 1
897
+ chunks.append((start, end, i))
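+ # Example: a 16 MB file yields eight 2 MB ranges; the last range always ends
+ # at size - 1 so any remainder from the integer division is not lost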
898
+
899
+ # 3. Parallel Download
900
+ file_data = bytearray(size)
901
+
902
+ def download_chunk(c):
903
+ start, end, idx = c
904
+ h = headers.copy()
905
+ h['Range'] = f'bytes={start}-{end}'
906
+ r = requests.get(url, headers=h, timeout=20, verify=False)
907
+ if r.status_code in [200, 206]:
908
+ # Write directly into the buffer
909
+ file_data[start:end+1] = r.content
910
+ return True
911
+ return False
912
+
913
+ with concurrent.futures.ThreadPoolExecutor(max_workers=8) as exc:
914
+ futures = [exc.submit(download_chunk, c) for c in chunks]
915
+ concurrent.futures.wait(futures)
916
+
917
+ # 4. Save to disk
918
+ with open(filepath, 'wb') as f:
919
+ f.write(file_data)
920
+
921
+ return True
922
+
923
+ except Exception as e:
924
+ print(f"Turbo failed: {e}")
925
+ return False
926
+
927
+ def process_video_download_task(task_item):
928
+ v_url, quality, title_hint = task_item
929
+
930
+ # Check for m3u8 (HLS) first
931
+ if v_url.split('?')[0].lower().endswith('.m3u8'):
932
+ return {
933
+ 'url': v_url,
934
+ 'original_url': v_url,
935
+ 'filename': 'Stream.m3u8',
936
+ 'external': True,
937
+ 'is_m3u8': True,
938
+ 'quality': quality or 'auto'
939
+ }
940
+
941
+ # RETRY LOOP (Max 3 attempts)
942
+ for attempt in range(3):
943
+ try:
944
+ # Use verify=False
945
+ # 15s timeout and stream=True for stability
946
+ vid_data = requests.get(v_url, headers=headers, timeout=15, stream=True, verify=False)
947
+
948
+ if vid_data.status_code == 200:
949
+ content_type = vid_data.headers.get('content-type', '').lower()
950
+
951
+ # Minimum size filter: skip videos smaller than 2 MB
952
+ content_length = vid_data.headers.get('content-length')
953
+ if content_length:
954
+ size_mb = int(content_length) / (1024 * 1024)
955
+ if size_mb < 2:
956
+ return None
957
+
958
+ if 'video' in content_type or 'octet-stream' in content_type or v_url.endswith(('.mp4', '.webm', '.mov')) or 'mpegurl' in content_type:
959
+ if 'mpegurl' in content_type or v_url.split('?')[0].lower().endswith('.m3u8'):
960
+ return {
961
+ 'url': v_url,
962
+ 'original_url': v_url,
963
+ 'filename': 'Stream.m3u8',
964
+ 'external': True,
965
+ 'is_m3u8': True
966
+ }
967
+ filename = os.path.basename(v_url.split('?')[0]) or 'video.mp4'
968
+ if not filename.endswith(('.mp4', '.webm', '.ogg', '.mov', '.avi', '.mkv')):
969
+ filename += '.mp4'
970
+
971
+ # Sanitize the title and generate a filename
972
+ import uuid
973
+ import re
974
+
975
+ if title_hint:
976
+ safe_title = re.sub(r'[^a-zA-Z0-9_\-\. ]', '', title_hint).strip().replace(' ', '_')[:50]
977
+ if safe_title:
978
+ base, ext = os.path.splitext(filename)
979
+ filename = f"{safe_title}_{uuid.uuid4().hex[:8]}{ext}"
980
+ else:
981
+ base, ext = os.path.splitext(filename)
982
+ if quality and quality != 'unknown':
983
+ filename = f"{base}_{quality}_{uuid.uuid4().hex[:8]}{ext}"
984
+ else:
985
+ filename = f"{base}_{uuid.uuid4().hex[:8]}{ext}"
986
+ else:
987
+ base, ext = os.path.splitext(filename)
988
+ if quality and quality != 'unknown':
989
+ filename = f"{base}_{quality}_{uuid.uuid4().hex[:8]}{ext}"
990
+ else:
991
+ filename = f"{base}_{uuid.uuid4().hex[:8]}{ext}"
992
+
993
+ filepath = f'webfiles/scraped/videos/{filename}'
994
+
995
+ # Try TURBO FETCH first
996
+ if not download_file_turbo(v_url, filepath):
997
+ # Fall back to the standard streamed download
998
+ with open(filepath, 'wb') as f:
999
+ for chunk in vid_data.iter_content(chunk_size=8192):
1000
+ f.write(chunk)
1001
+ return {
1002
+ 'url': f'/download/videos/{filename}',
1003
+ 'original_url': v_url,
1004
+ 'filename': filename,
1005
+ 'external': False,
1006
+ 'quality': quality
1007
+ }
1008
+ elif vid_data.status_code in [403, 404, 401]:
1009
+ # Do not retry on these errors
1010
+ return None
1011
+
1012
+ except Exception as ex:
1013
+ # Print the full error only on the last attempt
1014
+ if attempt == 2:
1015
+ err_str = str(ex)
1016
+ if 'NameResolutionError' in err_str:
1017
+ print(f"⚠️ DNS Error (Ad/Tracker ho sakta hai)")
1018
+ elif 'RemoteDisconnected' in err_str:
1019
+ print(f"⚠️ Connection Dropped: {v_url}")
1020
+ else:
1021
+ print(f"❌ Failed {v_url}: {ex}")
1022
+ else:
1023
+ time.sleep(1) # Wait briefly before retrying
1024
+ continue
1025
+ return None
1026
+
1027
1032
+
1033
+ # OpenCV Image Validation
1034
+ def validate_image_quality(image_bytes):
1035
+ try:
1036
+ import cv2
1037
+ import numpy as np
1038
+
1039
+ # Decode the image
1040
+ nparr = np.frombuffer(image_bytes, np.uint8)
1041
+ img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
1042
+
1043
+ if img is None: return False
1044
+
1045
+ # Check 1: Resolution (skip small icons/thumbnails)
1046
+ h, w, _ = img.shape
1047
+ # Increased strictness from 50 to 150
1048
+ if w < 150 or h < 150:
1049
+ return False
1050
+
1051
+ # Check 2: Variance (solid colors or flat images)
1052
+ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
1053
+ variance = cv2.Laplacian(gray, cv2.CV_64F).var()
1054
+ # Increased strictness from 50 to 100
1055
+ if variance < 100:
1056
+ return False # Too blurry or flat
1057
+
1058
+ # Check 3: Entropy (Information density)
1059
+ hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
1060
+ hist_norm = hist.ravel() / hist.sum()
1061
+ logs = np.log2(hist_norm + 0.0001)
1062
+ entropy = -np.sum(hist_norm * logs)
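+ # Shannon entropy of the grayscale histogram: 0 for a single flat colour, up to
+ # 8 bits for a maximally varied 8-bit image; below 5.0 is treated as low-detail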
1063
+
1064
+ # Increased strictness from 3.5 to 5.0
1065
+ if entropy < 5.0:
1066
+ return False # Too little information
1067
+
1068
+ return True
1069
+ except Exception as e:
1070
+ return True # Fail open to be safe
1071
+
1072
+ def is_valuable_media(url_path, element, media_type='image'):
1073
+ """Ads aur logos ke liye filtering logic."""
1074
+ try:
1075
+ # 1. URL Path Keywords
1076
+ lower_url = url_path.lower()
1077
+ ad_keywords = [
1078
+ 'ad', 'advert', 'banner', 'doubleclick', 'googleads',
1079
+ 'syndication', 'amazon-adsystem', 'wp-content/ads/',
1080
+ 'promoted', 'sponsored', 'pixel', 'tracking', 'taboola', 'outbrain'
1081
+ ]
1082
+ # Logos
1083
+ logo_keywords = ['logo', 'brand-logo', 'header-logo', 'footer-logo', 'favicon']
1084
+
1085
+ # Check for ads
1086
+ if any(k in lower_url for k in ad_keywords):
1087
+ return False
1088
+
1089
+ # Check for logos
1090
+ if any(k in lower_url for k in logo_keywords):
1091
+ return False
1092
+
1093
+ # 2. Metadata (Alt/Title)
1094
+ alt = element.get('alt', '').lower() if element.get('alt') else ""
1095
+ title = element.get('title', '').lower() if element.get('title') else ""
1096
+ metadata_text = f"{alt} {title}"
1097
+
1098
+ if any(k in metadata_text for k in ['ad ', 'ads ', 'advertisement', 'sponsored', 'logo', 'branding']):
1099
+ return False
1100
+
1101
+ # 3. CSS Metadata (Class/ID)
1102
+ classes = " ".join(element.get('class', [])) if isinstance(element.get('class'), list) else str(element.get('class', ''))
1103
+ id_val = str(element.get('id', ''))
1104
+
1105
+ # Check parents (simple BS4 walk)
1106
+ parent_context = ""
1107
+ parent = element.parent
1108
+ levels = 0
1109
+ while parent and levels < 3:
1110
+ p_classes = " ".join(parent.get('class', [])) if isinstance(parent.get('class'), list) else str(parent.get('class', ''))
1111
+ parent_context += p_classes + " " + str(parent.get('id', '')) + " "
1112
+ parent = parent.parent
1113
+ levels += 1
1114
+
1115
+ context_text = (classes + " " + id_val + " " + parent_context).lower()
1116
+ bad_contexts = [' ad ', ' ads ', 'banner', 'logo', 'brand', 'sponsored', 'advert', 'widget-area']
1117
+ if any(k in context_text for k in bad_contexts):
1118
+ # Check for header or sidebar (risky areas)
1119
+ if any(x in context_text for x in ['header', 'sidebar', 'footer', 'nav']):
1120
+ return False
1121
+
1122
+ # 4. Dimensions (images only)
1123
+ width = element.get('width')
1124
+ height = element.get('height')
1125
+ if width and height:
1126
+ try:
1127
+ w = int(width)
1128
+ h = int(height)
1129
+ if h > 0:
1130
+ ratio = w / h
1131
+ # Banners are usually very wide or very tall
1132
+ if (ratio > 4 or ratio < 0.25) and (w < 900 and h < 900):
1133
+ return False
1134
+ except:
1135
+ pass
1136
+
1137
+ return True
1138
+ except:
1139
+ return True # Fail open
1140
+
1141
+ # Helper for downloading images
1142
+ def process_image_download_task(img_src):
1143
+ try:
1144
+ img_url = requests.compat.urljoin(url, img_src)
1145
+ img_data = requests.get(img_url, headers=headers, timeout=3, verify=False)
1146
+ if img_data.status_code == 200:
1147
+ content = img_data.content
1148
+
1149
+ # RUN THE QUALITY CHECK
1150
+ if not validate_image_quality(content):
1151
+ return None
1152
+
1153
+ filename = os.path.basename(img_url.split('?')[0]) or 'image.jpg'
1154
+ if not filename.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg')):
1155
+ filename += '.jpg'
1156
+
1157
+ import uuid
1158
+ import hashlib
1159
+
1160
+ image_hash = hashlib.md5(content).hexdigest()
1161
+
1162
+ filename = f"{uuid.uuid4().hex[:8]}_{filename}"
1163
+
1164
+ filepath = f'webfiles/scraped/images/{filename}'
1165
+ with open(filepath, 'wb') as f:
1166
+ f.write(content)
1167
+ return (img_src, f'images/{filename}', f'/download/images/{filename}', image_hash, filepath)
1168
+ except:
1169
+ pass
1170
+ return None
1171
+
1172
+ # Extract CSS (inline <style> tags and linked stylesheets)
1173
+ css_content = []
1174
+ for style in soup.find_all('style'):
1175
+ css_content.append(style.string or '')
1176
+ style.extract()
1177
+
1178
+ for link in soup.find_all('link', rel='stylesheet'):
1179
+ href = link.get('href')
1180
+ if href:
1181
+ try:
1182
+ css_url = requests.compat.urljoin(url, href)
1183
+ css_resp = requests.get(css_url, headers=headers, timeout=5)
1184
+ css_content.append(f'/* From {href} */\n' + css_resp.text)
1185
+ except:
1186
+ pass
1187
+ link.extract()
1188
+
1189
+ # Extract JS
1190
+ js_content = []
1191
+ # Find all scripts (head and body)
1192
+ for script in soup.find_all('script'):
1193
+ # Filter out non-executable scripts (JSON-LD, etc.)
1194
+ script_type = script.get('type', '').lower()
1195
+ if script_type and script_type not in ['text/javascript', 'application/javascript', 'module']:
1196
+ script.extract()
1197
+ continue
1198
+
1199
+ if script.string and not script.get('src'):
1200
+ js_content.append(script.string)
1201
+ elif script.get('src'):
1202
+ src = script.get('src')
1203
+ try:
1204
+ js_url = requests.compat.urljoin(url, src)
1205
+ js_resp = requests.get(js_url, headers=headers, timeout=5)
1206
+ js_content.append(f'// From {src}\n' + js_resp.text)
1207
+ except:
1208
+ pass
1209
+ # Remove the original script tags so execution errors/404s don't occur
1210
+ script.extract()
1211
+
1212
+ # Collect image tasks
1213
+ image_tasks = []
1214
+ if fetch_images:
1215
+ os.makedirs('webfiles/scraped/images', exist_ok=True)
1216
+
1217
+ # Identify video posters so they can be excluded
1218
+ poster_blacklist = set()
1219
+ for video in soup.find_all('video'):
1220
+ poster = video.get('poster')
1221
+ if poster: poster_blacklist.add(poster)
1222
+
1223
+ for img in soup.find_all('img'):
1224
+ if len(image_tasks) >= 50: break # Limit to 50 images to prevent timeout
1225
+ src = img.get('src')
1226
+ if src and not src.startswith('data:') and not src.lower().endswith('.svg'):
1227
+ # Exclude video posters
1228
+ if src in poster_blacklist:
1229
+ continue
1230
+
1231
+ # VALUABLE MEDIA FILTER (Ads/Logos)
1232
+ if not is_valuable_media(src, img):
1233
+ continue
1234
+
1235
+ # Extension check to avoid iframes/HTML
1236
+ if any(src.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']):
1237
+ image_tasks.append(src)
1238
+
1239
+ # =========================================================
1240
+ # PERFORMANCE OPTIMIZATION: Images are processed FIRST
1241
+ # Images download faster than videos, so they're prioritized
1242
+ # to improve perceived performance and provide quicker feedback.
1243
+ # =========================================================
1244
+
1245
+ # Execute image downloads
1246
+ if fetch_images and image_tasks:
1247
+ seen_hashes = set()
1248
+ total_images = len(image_tasks)
1249
+ completed_images = 0
1250
+ with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
1251
+ future_to_img = {executor.submit(process_image_download_task, src): src for src in image_tasks}
1252
+ for future in concurrent.futures.as_completed(future_to_img):
1253
+ completed_images += 1
1254
+ progress = int((completed_images / total_images) * 100)
1255
+ frames = ['🌑','🌒','🌓','🌔','🌕','🌖','🌗','🌘']
1256
+ spinner = frames[completed_images % len(frames)]
1257
+ sys.stdout.write(f'\r{spinner} Images: {completed_images}/{total_images} ({progress}%) ')
1258
+ sys.stdout.flush()
1259
+ result = future.result()
1260
+ if result:
1261
+ orig_src, relative_path, download_path, img_hash, filepath = result
1262
+ if img_hash in seen_hashes:
1263
+ # Duplicate content, delete file
1264
+ try:
1265
+ os.remove(filepath)
1266
+ except:
1267
+ pass
1268
+ continue
1269
+ seen_hashes.add(img_hash)
1270
+ # Update soup
1271
+ for img in soup.find_all('img', src=orig_src):
1272
+ img['src'] = relative_path
1273
+ images.append(download_path)
1274
+ image_count += 1
1275
+ sys.stdout.write(f'\r🌕 Images Done! ✅ {image_count} accepted \n')
1276
+ sys.stdout.flush()
1277
+ # Collect video tasks
1278
+ video_tasks = []
1279
+ if fetch_videos:
1280
+ os.makedirs('webfiles/scraped/videos', exist_ok=True)
1281
+ # -------------------------------------------------------------------------
1282
+ # RESOURCE SNIFFER (Deep Scan)
1283
+ # Scans raw HTML/JS for hidden video links (mp4, m3u8, etc)
1284
+ # -------------------------------------------------------------------------
1285
+ # Pattern for finding links with video file extensions
1286
+ # Handles escaped slashes (common in JSON)
1287
+ # Captures: "https://example.com/video.mp4"
1288
+ sniffer_regex = r'(https?:\\?\/\\?\/[^"\'\s<>]+?\.(?:mp4|m3u8|webm|mov|mkv|ts|flv|wmv|3gp|f4v|mpg|mpeg|avi|m4v|ogg)(?:[^"\'\s<>]*)?)'
1289
+ matches = re.findall(sniffer_regex, response.text)
1290
+ sniffed_count = 0
1291
+ for match in matches:
1292
+ # Fix escaped slashes (e.g. from JSON: https:\/\/example.com)
1293
+ clean_url = match.replace('\\/', '/')
1294
+ # Basic validation
1295
+ if len(clean_url) > 200: continue # Likely garbage
1296
+ if not cleaner_url_validator(clean_url): continue
1297
+
1298
+ # AD FILTER for sniffed links
1299
+ ad_domains = ['doubleclick', 'adnxs', 'amazon-adsystem', 'googlesyndication', 'taboola', 'outbrain', 'ads-twitter', 'fb-ads']
1300
+ if any(ad in clean_url.lower() for ad in ad_domains):
1301
+ continue
1302
+ # Check the URL for quality clues
1303
+ quality = 'unknown'
1304
+ lower_url = clean_url.lower()
1305
+ if '1080' in lower_url: quality = '1080p'
1306
+ elif '720' in lower_url: quality = '720p'
1307
+ elif '480' in lower_url: quality = '480p'
1308
+ # Avoid duplicates
1309
+ is_duplicate = False
1312
+ for existing_url, _, _ in video_tasks:
1313
+ if existing_url == clean_url: is_duplicate = True; break
1314
+ for existing_vid in videos:
1315
+ if existing_vid['original_url'] == clean_url: is_duplicate = True; break
1316
+ if not is_duplicate:
1317
+ if any(x in clean_url for x in ['youtube', 'youtu.be', 'vimeo', 'dailymotion']):
1318
+ pass # Skip external for simple sniffer, usually handled by iframes
1319
+ else:
1320
+ video_tasks.append((clean_url, quality, f"sniffed_video_{sniffed_count}"))
1321
+ sniffed_count += 1
1322
+ # Find video tags (scan sources and qualities)
1323
+ for video in soup.find_all('video'):
1324
+ # VALUABLE MEDIA FILTER
1325
+ if not is_valuable_media(video.get('src', ''), video, 'video'):
1326
+ continue
1327
+
1328
+ # Check for source tags
1329
+ sources = video.find_all('source')
1330
+ found_src = False
1331
+
1332
+ if sources:
1333
+ for source in sources:
1334
+ src = source.get('src')
1335
+ if src:
1336
+ # Detect quality from attributes or text
1337
+ quality = 'unknown'
1338
+ s_text = (str(source) + src).lower()
1339
+ if '1080' in s_text: quality = '1080p'
1340
+ elif '720' in s_text: quality = '720p'
1341
+ elif '480' in s_text: quality = '480p'
1342
+ if src.startswith('http'):
1343
+ video_url = src
1344
+ else:
1345
+ video_url = requests.compat.urljoin(url, src)
1346
+ # Check external
1347
+ if any(x in video_url for x in ['youtube', 'youtu.be', 'vimeo', 'dailymotion']):
1348
+ videos.append({'url': video_url,'original_url': video_url,'filename': 'External Video','external': True})
1349
+ video_count += 1
1350
+ else:
1351
+ # Extract Title from Video Tag or Context
1352
+ video_title = video.get('title') or video.get('aria-label')
1353
+ if not video_title:
1354
+ # Try previous sibling header
1355
+ prev = video.find_previous(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
1356
+ if prev: video_title = prev.get_text().strip()
1357
+
1358
+ video_tasks.append((video_url, quality, video_title))
1359
+ found_src = True
1360
+
1361
+ # Fallback to direct src on video tag
1362
+ if not found_src:
1363
+ src = video.get('src')
1364
+ if src:
1365
+ if src.startswith('http'):
1366
+ video_url = src
1367
+ else:
1368
+ video_url = requests.compat.urljoin(url, src)
1369
+
1370
+ if any(x in video_url for x in ['youtube', 'youtu.be', 'vimeo', 'dailymotion']):
1371
+ videos.append({'url': video_url,'original_url': video_url,'filename': 'External Video','external': True})
1372
+ video_count += 1
1373
+ else:
1374
+ # Extract Title
1375
+ video_title = video.get('title') or video.get('aria-label')
1376
+ if not video_title:
1377
+ prev = video.find_previous(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
1378
+ if prev: video_title = prev.get_text().strip()
1379
+ video_tasks.append((video_url, 'unknown', video_title))
1380
+ # Find links (a tags) to video files
1381
+ for a in soup.find_all('a'):
1382
+ href = a.get('href')
1383
+ if href:
1384
+ # VALUABLE MEDIA FILTER
1385
+ if not is_valuable_media(href, a, 'video'):
1386
+ continue
1387
+ lower_href = href.lower()
1388
+ if lower_href.endswith(('.mp4', '.webm', '.ogg', '.mov', '.avi', '.mkv', '.m4v', '.m3u8', '.flv', '.wmv', '.3gp', '.f4v', '.mpg', '.mpeg', '.ts')) or \
1389
+ (('/video/' in lower_href or '/videos/' in lower_href) and '.' in list(filter(None, lower_href.split('/')))[-1]):
1390
+ # Detect quality from link text
1391
+ quality = 'unknown'
1392
+ a_text = a.get_text().lower()
1393
+ if '1080' in a_text: quality = '1080p'
1394
+ elif '720' in a_text: quality = '720p'
1395
+ elif '480' in a_text: quality = '480p'
1396
+ if href.startswith('http'):
1397
+ video_url = href
1398
+ else:
1399
+ video_url = requests.compat.urljoin(url, href)
1400
+ if any(x in video_url for x in ['youtube', 'youtu.be', 'vimeo', 'dailymotion']):
1401
+ videos.append({'url': video_url,'original_url': video_url,'filename': 'External Video','external': True})
1402
+ video_count += 1
1403
+ else:
1404
+ # Extract Title from Link Text or Attributes
1405
+ video_title = a.get('title') or a.get('aria-label') or a.get_text().strip()
1406
+ if not video_title:
1407
+ prev = a.find_previous(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
1408
+ if prev: video_title = prev.get_text().strip()
1409
+
1410
+ # Fallback to Page Title if single video or very few
1411
+ if not video_title:
1412
+ page_title = soup.title.string if soup.title else ""
1413
+ if page_title:
1414
+ # Clean up page title (remove site name usually at end)
1415
+ video_title = page_title.split('|')[0].split('-')[0].strip()
1416
+
1417
+
1418
+ video_tasks.append((video_url, quality, video_title))
1419
+
1420
+ # Find iframes with video embeds
1421
+ for iframe in soup.find_all('iframe'):
1422
+ src = iframe.get('src', '')
1423
+ # VALUABLE MEDIA FILTER
1424
+ if not is_valuable_media(src, iframe, 'video'):
1425
+ continue
1426
+ if any(x in src for x in ['youtube', 'youtu.be', 'vimeo', 'dailymotion']):
1427
+ if not any(v['original_url'] == src for v in videos):
1428
+ videos.append({'url': src,'original_url': src,'filename': f'Embed: {src.split("/")[2]}','external': True,'quality': 'unknown'})
1429
+ video_count += 1
1430
+ # Execute video downloads
1431
+ if fetch_videos and video_tasks:
1432
+ total_videos = len(video_tasks)
1433
+ completed_videos = 0
1434
+ with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
1435
+ future_to_vid = {executor.submit(process_video_download_task, item): item for item in video_tasks}
1436
+ for future in concurrent.futures.as_completed(future_to_vid):
1437
+ completed_videos += 1
1438
+ progress = int((completed_videos / total_videos) * 100)
1439
+ frames = ['🌑','🌒','🌓','🌔','🌕','🌖','🌗','🌘']
1440
+ spinner = frames[completed_videos % len(frames)]
1441
+ sys.stdout.write(f'\r{spinner} Videos: {completed_videos}/{total_videos} ({progress}%) ')
1442
+ sys.stdout.flush()
1443
+ result = future.result()
1444
+ if result:
1445
+ # Avoid duplicates in final list
1446
+ if not any(v['original_url'] == result['original_url'] for v in videos):
1447
+ videos.append(result)
1448
+ video_count += 1
1449
+ sys.stdout.write(f'\r🌕 Videos Done! ✅ {video_count} found \n')
1450
+ sys.stdout.flush()
1451
+ # Update the HTML
1452
+ head = soup.find('head')
1453
+ if head:
1454
+ for link in head.find_all('link', rel='stylesheet'):
1455
+ link.extract()
1456
+ css_link = soup.new_tag('link', rel='stylesheet', href='style.css')
1457
+ head.insert(0, css_link)
1458
+ body = soup.find('body')
1459
+ if body:
1460
+ js_script = soup.new_tag('script', src='script.js')
1461
+ body.append(js_script)
1462
+ # Save the files
1463
+ html_content = str(soup)
1464
+ with open('webfiles/scraped/index.html', 'w', encoding='utf-8') as f:
1465
+ f.write(html_content)
1466
+ with open('webfiles/scraped/style.css', 'w', encoding='utf-8') as f:
1467
+ f.write('\n\n'.join(css_content) or '/* No CSS found */')
1468
+ with open('webfiles/scraped/script.js', 'w', encoding='utf-8') as f:
1469
+ f.write('\n\n'.join(js_content) or '// No JS found')
1470
+ # Calculate stats
1471
+ def get_size(content):
1472
+ size = len(content.encode('utf-8'))
1473
+ if size < 1024:
1474
+ return f'{size} B'
1475
+ elif size < 1024*1024:
1476
+ return f'{size/1024:.1f} KB'
1477
+ else:
1478
+ return f'{size/(1024*1024):.1f} MB'
1479
+ # --- DESIGN INSPECTOR ---
1480
+ design_data = {'colors': [], 'fonts': []}
1481
+ full_css = '\n'.join(css_content) + '\n' + html_content
1482
+ # Extract colors (hex)
1483
+ hex_colors = re.findall(r'#(?:[0-9a-fA-F]{3}){1,2}\b', full_css)
1484
+ # Filter for unique and sort by frequency
1485
+ unique_colors = Counter(hex_colors).most_common(20) # Top 20
1486
+ design_data['colors'] = [c[0] for c in unique_colors]
1487
+
1488
+ # Extract fonts
1489
+ fonts = re.findall(r'font-family:\s*([^;]+)', full_css, re.IGNORECASE)
1490
+ unique_fonts = Counter([f.strip().strip("'").strip('"') for f in fonts]).most_common(10)
1491
+ design_data['fonts'] = [f[0] for f in unique_fonts]
1492
+
1493
+ # --- SEO ANALYSIS ---
1494
+ seo_data = {
1495
+ 'title': soup.title.string if soup.title else None,
1496
+ 'description': None,'keywords': None,
1497
+ 'headings': {'h1': 0, 'h2': 0, 'h3': 0},
1498
+ 'images_analysis': {'total': 0, 'missing_alt': 0},
1499
+ 'links_internal': 0,'links_external': 0,'score': 0
1500
+ }
1501
+
1502
+ # Check meta tags
1503
+ msg_desc = soup.find('meta', attrs={'name': 'description'}) or soup.find('meta', attrs={'property': 'og:description'})
1504
+ if msg_desc: seo_data['description'] = msg_desc.get('content')
1505
+ msg_keys = soup.find('meta', attrs={'name': 'keywords'})
1506
+ if msg_keys: seo_data['keywords'] = msg_keys.get('content')
1507
+ # Check headings
1508
+ seo_data['headings'] = {
1509
+ 'h1': len(soup.find_all('h1')),'h2': len(soup.find_all('h2')),'h3': len(soup.find_all('h3')),
1510
+ }
1511
+
1512
+ # Images
1513
+ imgs = soup.find_all('img')
1514
+ seo_data['images_analysis']['total'] = len(imgs)
1515
+ seo_data['images_analysis']['missing_alt'] = len([img for img in imgs if not img.get('alt')])
1516
+ # Links and auditor logic
1517
+ all_links = soup.find_all('a')
1518
+ domain = requests.compat.urlparse(url).netloc
1519
+ check_urls = set()
1520
+ for link in all_links:
1521
+ href = link.get('href', '')
1522
+ if not href or href.startswith('#') or href.startswith('javascript'): continue
1523
+ if domain in href or href.startswith('/'):
1524
+ seo_data['links_internal'] += 1
1525
+ else:
1526
+ seo_data['links_external'] += 1
1527
+ # Collect for auditing (later capped at 30 to avoid timeouts)
1528
+ if href.startswith('http'):
1529
+ check_urls.add(href)
1530
+ elif href.startswith('/'):
1531
+ check_urls.add(requests.compat.urljoin(url, href))
1532
+
1533
+ # Link Auditor (Threaded logic)
1534
+ broken_links = []
1535
+ def check_link(l_url):
1536
+ try:
1537
+ r = requests.head(l_url, headers=headers, timeout=3)
1538
+ if r.status_code >= 400:
1539
+ return {'url': l_url, 'status': r.status_code}
1540
+ except:
1541
+ return {'url': l_url, 'status': 'Failed'}
1542
+ return None
1543
+
1544
+ # Verify up to 30 unique links
1545
+ with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
1546
+ futures = [executor.submit(check_link, u) for u in list(check_urls)[:30]]
1547
+ for f in concurrent.futures.as_completed(futures):
1548
+ res = f.result()
1549
+ if res: broken_links.append(res)
1550
+ seo_data['broken_links'] = broken_links
1551
+ # --- DEEP CRAWL LOGIC ---
1552
+ site_structure = {'url': url, 'title': seo_data['title'], 'children': []}
1553
+ perf_tracker.record_phase("SEO & Parsing")
1554
+
1555
+ if crawl_depth > 1:
1556
+ # Simple recursive scraper logic
1557
+ def scrape_node(node_url, current_level):
1558
+ # Stop at max depth
1559
+ if current_level > crawl_depth: return None
1560
+ try:
1561
+ # Reuse headers/verify settings
1562
+ nr = requests.get(node_url, headers=headers, timeout=3, verify=False)
1563
+ if nr.status_code == 200:
1564
+ ns = BeautifulSoup(nr.text, 'html.parser')
1565
+ node_title = (ns.title.string or node_url).strip()[:50] # Limit title length
1566
+
1567
+ child_nodes = []
1568
+ # If max depth not reached yet, look for children
1569
+ if current_level < crawl_depth:
1570
+ n_links = []
1571
+ for na in ns.find_all('a', href=True):
1572
+ nh = na['href']
1573
+ if not nh or nh.startswith('#') or nh.startswith('javascript'): continue
1574
+ n_full = requests.compat.urljoin(node_url, nh)
1575
+ # Strict Internal Domain Check
1576
+ if domain in n_full:
1577
+ if n_full not in n_links and n_full != node_url:
1578
+ n_links.append(n_full)
1579
+
1580
+ # Recurse for top 3 links to keep it fast
1581
+ # We don't parallelize here to avoid spawning too many threads recursively
1582
+ for nl in list(set(n_links))[:3]:
1583
+ child = scrape_node(nl, current_level + 1)
1584
+ if child: child_nodes.append(child)
1585
+
1586
+ return {'url': node_url, 'title': node_title, 'children': child_nodes}
1587
+ except:
1588
+ pass
1589
+ return {'url': node_url, 'title': 'Unreachable', 'children': []}
1590
+
1591
+ # Filter start links (Limit 5)
1592
+ start_links = list(set([u for u in check_urls if domain in u]))[:5]
1593
+
1594
+ # Parallelize the first level
1595
+ with concurrent.futures.ThreadPoolExecutor(max_workers=3) as crawler:
1596
+ futures = {crawler.submit(scrape_node, u, 2): u for u in start_links}
1597
+ for f in concurrent.futures.as_completed(futures):
1598
+ res = f.result()
1599
+ if res: site_structure['children'].append(res)
1600
+ perf_tracker.record_phase("Deep Crawl")
1601
+ score = 0
1602
+ if seo_data['title']: score += 20
1603
+ if seo_data['description']: score += 20
1604
+ if seo_data['headings']['h1'] > 0: score += 20
1605
+ if url.startswith('https'): score += 10
1606
+
1607
+ if seo_data['images_analysis']['total'] > 0:
1608
+ ratio = 1 - (seo_data['images_analysis']['missing_alt'] / seo_data['images_analysis']['total'])
1609
+ score += int(30 * ratio)
1610
+ else:
1611
+ score += 30
1612
+
1613
+ seo_data['score'] = min(100, score)
1614
+
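+ # Scoring rubric used above (illustrative summary): title +20, meta description +20,
+ # at least one <h1> +20, HTTPS +10, plus up to +30 scaled by the share of <img>
+ # tags that carry alt text (the full +30 is granted when the page has no images).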
1615
+ result = {
1616
+ 'success': True,
1617
+ 'security': security_report,
1618
+ 'site_structure': site_structure,
1619
+ 'seo': seo_data,
1620
+ 'design': design_data,
1621
+ 'stats': {
1622
+ 'html': get_size(html_content),
1623
+ 'css': get_size('\n\n'.join(css_content)),
1624
+ 'js': get_size('\n\n'.join(js_content)),
1625
+ 'image_count': image_count,
1626
+ 'video_count': video_count
1627
+ },
1628
+ 'images': images,
1629
+ 'videos': videos,
1630
+ 'broken_links': check_broken_links(url, soup, headers),
1631
+ 'intel': {
1632
+ 'emails': extract_emails(response.text),
1633
+ 'phones': extract_phones(response.text),
1634
+ 'locations': extract_locations(soup),
1635
+ 'socials': extract_social_media(soup),
1636
+ 'tech_stack': detect_tech_stack(soup, response),
1637
+ 'ai_analysis': analyze_ai_content(soup.get_text(separator=' ', strip=True)[:50000]) # Limit to 50k chars for perf
1638
+ }
1639
+ }
1640
+
1641
+ perf_tracker.record_phase("AI & Intel")
1642
+ perf_data = perf_tracker.finish_and_print()
1643
+ result['performance'] = perf_data
1644
+
1645
+ return result
1646
+
1647
+ except Exception as e:
1648
+ print(f"Error in execute_scrape_logic: {e}")
1649
+ traceback.print_exc()
1650
+ return {'success': False, 'error': str(e)}
1651
+
1652
+ @app.route('/api/scrape', methods=['POST'])
1653
+ def api_scrape():
1654
+ data = request.get_json()
1655
+ url = data.get('url', '')
1656
+ fetch_images = data.get('fetch_images', False)
1657
+ fetch_videos = data.get('fetch_videos', False)
1658
+ crawl_depth = int(data.get('crawl_depth', 2))
1659
+ use_proxy = data.get('use_proxy', False)
1660
+ device = data.get('device', 'desktop')
1661
+
1662
+ result = execute_scrape_logic(url, fetch_images, fetch_videos, crawl_depth, use_proxy, device)
1663
+ if result.get('success'):
1664
+ return jsonify(result)
1665
+ else:
1666
+ return jsonify(result), 500
1667
+
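+ # Example request to the endpoint above (illustrative values):
+ #   POST /api/scrape  {"url": "https://example.com", "fetch_images": false,
+ #                      "fetch_videos": false, "crawl_depth": 2,
+ #                      "use_proxy": false, "device": "desktop"}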
1668
+ @app.route('/api/bulk', methods=['POST'])
1669
+ def api_bulk():
1670
+ try:
1671
+ data = request.get_json()
1672
+ urls = data.get('urls', [])
1673
+ fetch_images = data.get('fetch_images', False)
1674
+
1675
+ if not urls:
1676
+ return jsonify({'success': False, 'error': 'No URLs provided'})
1677
+
1678
+ timestamp = int(time.time())
1679
+ base_folder = f'webfiles/bulk/batch_{timestamp}'
1680
+ os.makedirs(base_folder, exist_ok=True)
1681
+ headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
1682
+ processed = 0
1683
+ for i, url in enumerate(urls):
1684
+ if not url.strip(): continue
1685
+ try:
1686
+ if not url.startswith('http'): url = 'https://' + url
1687
+
1688
+ # Create a subfolder
1689
+ domain = requests.compat.urlparse(url).netloc.replace(':', '_')
1690
+ site_folder = f'{base_folder}/{i+1}_{domain}'
1691
+ os.makedirs(site_folder, exist_ok=True)
1692
+ os.makedirs(f'{site_folder}/images', exist_ok=True)
1693
+
1694
+ # Fetch the page
1695
+ response = requests.get(url, headers=headers, timeout=10)
1696
+ soup = BeautifulSoup(response.text, 'html.parser')
1697
+
1698
+ # Save the HTML
1699
+ with open(f'{site_folder}/index.html', 'w', encoding='utf-8') as f:
1700
+ f.write(str(soup))
1701
+
1702
+ # CSS logic
1703
+ css_content = []
1704
+ for link in soup.find_all('link', rel='stylesheet'):
1705
+ href = link.get('href')
1706
+ if href:
1707
+ try:
1708
+ css_url = requests.compat.urljoin(url, href)
1709
+ css_content.append(requests.get(css_url, headers=headers, timeout=5).text)
1710
+ except: pass
1711
+ with open(f'{site_folder}/style.css', 'w', encoding='utf-8') as f:
1712
+ f.write('\n'.join(css_content))
1713
+
1714
+ # Download the images if requested
1715
+ if fetch_images:
1716
+ for img in soup.find_all('img'):
1717
+ src = img.get('src')
1718
+ if src and not src.startswith('data:') and not src.lower().endswith('.svg'):
1719
+ try:
1720
+ img_url = requests.compat.urljoin(url, src)
1721
+ fname = os.path.basename(img_url.split('?')[0]) or 'image.jpg'
1722
+ if not fname.endswith(('.jpg','.png','.jpeg','.webp')): fname += '.jpg'
1723
+
1724
+ r = requests.get(img_url, headers=headers, timeout=5)
1725
+ if r.status_code == 200:
1726
+ with open(f'{site_folder}/images/{fname}', 'wb') as f:
1727
+ f.write(r.content)
1728
+ except: pass
1729
+
1730
+ processed += 1
1731
+ except Exception as e:
1732
+ print(f"Failed to scrape {url}: {e}")
1733
+
1734
+ # Create the ZIP
1735
+ shutil.make_archive(base_folder, 'zip', base_folder)
1736
+ shutil.rmtree(base_folder) # Clean up the folder, keep only the ZIP
1737
+
1738
+ return jsonify({
1739
+ 'success': True,
1740
+ 'message': f'Successfully scraped {processed} sites.',
1741
+ 'download_url': f'/download/bulk/batch_{timestamp}.zip'
1742
+ })
1743
+
1744
+ except Exception as e:
1745
+ return jsonify({'success': False, 'error': str(e)}), 500
1746
+
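+ # Example request to /api/bulk above (illustrative values):
+ #   POST /api/bulk  {"urls": ["example.com", "example.org"], "fetch_images": true}
+ # The response includes a download_url pointing at the generated batch ZIP.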
1747
+ @app.route('/download/bulk/<path:filename>')
1748
+ def serve_bulk_file(filename):
1749
+ return send_from_directory('webfiles/bulk', filename)
1750
+
1751
+ @app.route('/api/save', methods=['POST'])
1752
+ def api_save():
1753
+ try:
1754
+ data = request.get_json()
1755
+ filename = data.get('filename')
1756
+ content = data.get('content')
1757
+
1758
+ if filename in ['index.html', 'style.css', 'script.js']:
1759
+ with open(f'webfiles/scraped/{filename}', 'w', encoding='utf-8') as f:
1760
+ f.write(content)
1761
+ return jsonify({'success': True})
1762
+ else:
1763
+ return jsonify({'success': False, 'error': 'Invalid filename'}), 400
1764
+ except Exception as e:
1765
+ return jsonify({'success': False, 'error': str(e)}), 500
1766
+
1767
+ @app.route('/api/download-zip')
1768
+ def download_zip():
1769
+ try:
1770
+ zip_path = os.path.join(DATA_DIR, 'scraped_files.zip')  # portable temp location (works on Windows too)
1771
+ with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
1772
+ for root, dirs, files in os.walk('webfiles/scraped'):
1773
+ for file in files:
1774
+ file_path = os.path.join(root, file)
1775
+ arcname = os.path.relpath(file_path, 'webfiles/scraped')
1776
+ zipf.write(file_path, arcname)
1777
+
1778
+ return send_file(zip_path, as_attachment=True, download_name='scraped_files.zip')
1779
+ except Exception as e:
1780
+ return jsonify({'error': str(e)}), 500
1781
+
1782
+ def clear_scraped_data():
1783
+ try:
1784
+ folder = 'webfiles/scraped'
1785
+ if os.path.exists(folder):
1786
+ shutil.rmtree(folder)
1787
+ os.makedirs('webfiles/scraped', exist_ok=True)
1788
+ os.makedirs('webfiles/scraped/images', exist_ok=True)
1789
+ os.makedirs('webfiles/scraped/videos', exist_ok=True)
1790
+ return True
1791
+ except Exception as e:
1792
+ print(f"Cleanup Error: {e}")
1793
+ return False
1794
+
1795
+ @app.route('/api/clear', methods=['POST'])
1796
+ def api_clear():
1797
+ if clear_scraped_data():
1798
+ return jsonify({'success': True})
1799
+ else:
1800
+ return jsonify({'success': False, 'error': 'Cleanup failed'}), 500
1801
+
1802
+ @app.route('/api/export', methods=['POST'])
1803
+ def api_export():
1804
+ try:
1805
+ req_data = request.get_json()
1806
+ data = req_data.get('data')
1807
+ export_format = req_data.get('format', 'csv').lower()
1808
+ filename = req_data.get('filename', 'export')
1809
+
1810
+ if not data:
1811
+ return jsonify({'error': 'No data provided'}), 400
1812
+
1813
+ # Create a buffer
1814
+ if export_format == 'csv':
1815
+ si = io.StringIO()
1816
+ # Check whether data is a simple list or a list of dicts
1817
+ if isinstance(data, list) and len(data) > 0:
1818
+ if isinstance(data[0], dict):
1819
+ # List of Dicts (e.g. Socials)
1820
+ keys = data[0].keys()
1821
+ writer = csv.DictWriter(si, fieldnames=keys)
1822
+ writer.writeheader()
1823
+ writer.writerows(data)
1824
+ else:
1825
+ # Simple List (e.g. Emails)
1826
+ writer = csv.writer(si)
1827
+ writer.writerow(['Value']) # Generic header
1828
+ for item in data:
1829
+ writer.writerow([item])
1830
+
1831
+ output = si.getvalue()
1832
+ mem = io.BytesIO()
1833
+ mem.write(output.encode('utf-8'))
1834
+ mem.seek(0)
1835
+
1836
+ return send_file(
1837
+ mem,
1838
+ mimetype='text/csv',
1839
+ as_attachment=True,
1840
+ download_name=f'{filename}.csv'
1841
+ )
1842
+
1843
+ elif export_format == 'json':
1844
+ mem = io.BytesIO()
1845
+ mem.write(json.dumps(data, indent=2).encode('utf-8'))
1846
+ mem.seek(0)
1847
+
1848
+ return send_file(
1849
+ mem,
1850
+ mimetype='application/json',
1851
+ as_attachment=True,
1852
+ download_name=f'{filename}.json'
1853
+ )
1854
+
1855
+ else:
1856
+ return jsonify({'error': 'Unsupported format'}), 400
1857
+
1858
+ except Exception as e:
1859
+ print(f"Export Error: {e}")
1860
+ return jsonify({'error': str(e)}), 500
1861
+
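+ # Example request to /api/export above (illustrative values):
+ #   POST /api/export  {"data": ["a@example.com", "b@example.com"],
+ #                      "format": "csv", "filename": "emails"}
+ # A plain list is written under a generic "Value" header and returned as emails.csv.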
1862
+ @app.route('/api/translate', methods=['POST'])
1863
+ def api_translate():
1864
+ try:
1865
+ data = request.get_json()
1866
+ text = data.get('text')
1867
+ target_lang = data.get('target', 'hi') # Default to Hindi
1868
+
1869
+ if not text:
1870
+ return jsonify({'error': 'No text provided'}), 400
1871
+
1872
+ translated = mtranslate.translate(text, target_lang)
1873
+ return jsonify({
1874
+ 'success': True,
1875
+ 'translated': translated
1876
+ })
1877
+ except Exception as e:
1878
+ print(f"Translation Error: {e}")
1879
+ return jsonify({'error': str(e)}), 500
1880
+
1881
+ def wait_for_server(port, timeout=10):
1882
+ start = time.time()
1883
+ while time.time() - start < timeout:
1884
+ try:
1885
+ import urllib.request
1886
+ urllib.request.urlopen(f'http://127.0.0.1:{port}/', timeout=1)
1887
+ return True
1888
+ except:
1889
+ time.sleep(0.5)
1890
+ return False
1891
+
1892
+
1893
+ # --- IMAGE ANALYSIS FEATURE ---
1894
+
1895
+ def get_decimal_from_dms(dms, ref):
1896
+ degrees = dms[0]
1897
+ minutes = dms[1]
1898
+ seconds = dms[2]
1899
+
1900
+ decimal = degrees + (minutes / 60.0) + (seconds / 3600.0)
1901
+ if ref in ['S', 'W']:
1902
+ decimal = -decimal
1903
+ return decimal
1904
+
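+ # Worked example of the DMS -> decimal conversion above (assumed EXIF values):
+ #   GPSLatitude=(40, 26, 46.8), GPSLatitudeRef='N'  ->  40 + 26/60 + 46.8/3600 = 40.4463
+ #   the same magnitude with ref 'S' or 'W' is returned negated.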
1905
+ def get_image_metadata(image):
1906
+ info = {
1907
+ "Format": image.format,
1908
+ "Mode": image.mode,
1909
+ "Size": f"{image.width} x {image.height}",
1910
+ "Width": image.width,
1911
+ "Height": image.height,
1912
+ "Info": image.info.get('comment', '')
1913
+ }
1914
+
1915
+ # EXIF data
1916
+ exif_data = {}
1917
+ gps_data = {}
1918
+
1919
+ try:
1920
+ exif = image._getexif()
1921
+ if exif:
1922
+ for tag, value in exif.items():
1923
+ decoded = ExifTags.TAGS.get(tag, tag)
1924
+ if decoded == "GPSInfo":
1925
+ gps_data = {}
1926
+ for t in value:
1927
+ sub_decoded = ExifTags.GPSTAGS.get(t, t)
1928
+ gps_data[sub_decoded] = value[t]
1929
+ else:
1930
+ # Filter out binary data
1931
+ if isinstance(value, bytes):
1932
+ try:
1933
+ value = value.decode()
1934
+ except:
1935
+ value = "<binary data>"
1936
+ exif_data[decoded] = str(value)
1937
+ except Exception as e:
1938
+ print(f"EXIF Error: {e}")
1939
+
1940
+ # Process GPS data
1941
+ location = None
1942
+ if gps_data:
1943
+ try:
1944
+ lat = get_decimal_from_dms(gps_data.get('GPSLatitude'), gps_data.get('GPSLatitudeRef'))
1945
+ lon = get_decimal_from_dms(gps_data.get('GPSLongitude'), gps_data.get('GPSLongitudeRef'))
1946
+ location = {'lat': lat, 'lon': lon, 'map_url': f"https://www.google.com/maps?q={lat},{lon}"}
1947
+ except Exception as e:
1948
+ print(f"GPS Parse Error: {e}")
1949
+
1950
+ return {
1951
+ "basic": info,
1952
+ "exif": exif_data,
1953
+ "gps": str(gps_data),
1954
+ "location": location
1955
+ }
1956
+
1957
+ def generate_ela(image, quality=90, scale=10):
1958
+ """
1959
+ Generates an ELA (Error Level Analysis) image.
1960
+ 1. Saves the original image at a specific quality (compression level).
1961
+ 2. Computes the difference between the original and the compressed copy.
1962
+ 3. Enhances the difference so it is easier to visualise.
1963
+ """
1964
+ try:
1965
+ # Convert if not RGB (e.g. RGBA, P)
1966
+ if image.mode != 'RGB':
1967
+ image = image.convert('RGB')
1968
+
1969
+ # Save a compressed copy in memory
1970
+ buffer = BytesIO()
1971
+ image.save(buffer, 'JPEG', quality=quality)
1972
+ buffer.seek(0)
1973
+ compressed_image = Image.open(buffer)
1974
+
1975
+ # Calculate the difference
1976
+ ela_image = ImageChops.difference(image, compressed_image)
1977
+
1978
+ # Boost brightness so the differences are visible
1979
+ ela_image = ImageEnhance.Brightness(ela_image).enhance(scale)
1980
+
1981
+ return ela_image
1982
+ except Exception as e:
1983
+ print(f"ELA Error: {e}")
1984
+ return None
1985
+
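+ # Usage sketch for generate_ela (illustrative path):
+ #   ela = generate_ela(Image.open('photo.jpg'))
+ #   if ela: ela.save('photo_ela.png')
+ # Regions that were edited or re-saved separately usually recompress differently,
+ # so they stand out brighter in the ELA output.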
1986
+ @app.route('/api/analyze/ela', methods=['POST'])
1987
+ def analyze_ela():
1988
+ try:
1989
+ if 'image' not in request.files:
1990
+ return jsonify({'error': 'No image file provided'}), 400
1991
+
1992
+ file = request.files['image']
1993
+ image = Image.open(file.stream)
1994
+
1995
+ ela_image = generate_ela(image)
1996
+
1997
+ if ela_image:
1998
+ # Convert the ELA result to base64
1999
+ buffered = BytesIO()
2000
+ ela_image.save(buffered, format="PNG") # PNG keeps the ELA detail
2001
+ img_str = base64.b64encode(buffered.getvalue()).decode()
2002
+ return jsonify({'success': True, 'ela_image': f"data:image/png;base64,{img_str}"})
2003
+ else:
2004
+ return jsonify({'error': 'Failed to generate ELA'}), 500
2005
+
2006
+ except Exception as e:
2007
+ return jsonify({'error': str(e)}), 500
2008
+
2009
+ def compute_ai_likelihood(image):
2010
+ """
2011
+ Checks the image for AI-generation artifacts (using FFT).
2012
+ Returns a likelihood score (0-100) and a label.
2013
+ """
2014
+ try:
2015
+ # Greyscale and resize (for consistent analysis)
2016
+ img_gray = image.convert('L').resize((512, 512))
2017
+ img_array = np.array(img_gray)
2018
+
2019
+ # FFT logic
2020
+ f = np.fft.fft2(img_array)
2021
+ fshift = np.fft.fftshift(f)
2022
+ magnitude_spectrum = 20 * np.log(np.abs(fshift) + 1e-10) # Log scale
2023
+
2024
+ # Heuristic: AI images often show high-frequency energy or grid artifacts.
2025
+ # We compute the variance of the high-frequency components.
2026
+
2027
+ # Build a high-pass mask
2028
+ rows, cols = img_array.shape
2029
+ crow, ccol = rows//2, cols//2
2030
+ mask_radius = 50
2031
+
2032
+ # Mask out the low frequencies (center)
2033
+ magnitude_high_freq = magnitude_spectrum.copy()
2034
+ magnitude_high_freq[crow-mask_radius:crow+mask_radius, ccol-mask_radius:ccol+mask_radius] = 0
2035
+
2036
+ # Compute stats on the high frequencies
2037
+ hf_mean = np.mean(magnitude_high_freq)
2038
+ hf_std = np.std(magnitude_high_freq)
2039
+
2040
+ # Simple heuristic mapping (tuned for demo purposes)
2041
+ # Real images tend to have lower HF variance unless heavily textured.
2042
+ # GANs often leave high-energy artifacts behind.
2043
+
2044
+ # Normalize the score (this is only an estimate)
2045
+ # Assume natural images sit around an std of 30-50; AI output can run higher.
2046
+ # Use a sigmoid-like mapping to absorb natural variation.
2047
+
2048
+ score = min(100, max(0, (hf_std - 40) * 2 + 50))
2049
+
2050
+ # Refinement: "checkerboard" artifacts are strong indicators and could be checked here;
2051
+ # that needs peak detection, but variance is a decent proxy.
2052
+
2053
+ label = "Likely Real"
2054
+ if score > 60:
2055
+ label = "Possible AI / Edited"
2056
+ if score > 80:
2057
+ label = "Likely AI Generated"
2058
+
2059
+ return {
2060
+ "score": round(score, 1),
2061
+ "label": label,
2062
+ "details": f"HF Variance: {round(hf_std, 2)}"
2063
+ }
2064
+ except Exception as e:
2065
+ print(f"AI Detection Error: {e}")
2066
+ return {"score": 0, "label": "Error", "details": str(e)}
2067
+
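+ # Illustration of the score mapping in compute_ai_likelihood (assumed hf_std values):
+ #   hf_std = 40 -> score 50  ("Likely Real")
+ #   hf_std = 50 -> score 70  ("Possible AI / Edited")
+ #   hf_std = 56 -> score 82  ("Likely AI Generated")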
2068
+ @app.route('/api/analyze/ai', methods=['POST'])
2069
+ def analyze_ai():
2070
+ try:
2071
+ if 'image' not in request.files:
2072
+ return jsonify({'error': 'No image file provided'}), 400
2073
+
2074
+ file = request.files['image']
2075
+ image = Image.open(file.stream)
2076
+
2077
+ result = compute_ai_likelihood(image)
2078
+
2079
+ return jsonify({'success': True, 'data': result})
2080
+
2081
+ except Exception as e:
2082
+ return jsonify({'error': str(e)}), 500
2083
+
2084
+ @app.route('/api/analyze-image', methods=['POST'])
2085
+ def analyze_image():
2086
+ try:
2087
+ image = None
2088
+ source_type = "upload"
2089
+
2090
+ # 1. Check for a file upload
2091
+ if 'file' in request.files:
2092
+ file = request.files['file']
2093
+ if file.filename == '':
2094
+ return jsonify({'error': 'No selected file'}), 400
2095
+ try:
2096
+ image = Image.open(file.stream)
2097
+ except Exception as e:
2098
+ return jsonify({'error': f'Invalid image file: {e}'}), 400
2099
+
2100
+ # 2. Check for a URL
2101
+ elif 'url' in request.form or (request.is_json and 'url' in request.get_json()):
2102
+ data = request.get_json() if request.is_json else request.form
2103
+ url = data.get('url')
2104
+ if not url:
2105
+ return jsonify({'error': 'No URL provided'}), 400
2106
+
2107
+ source_type = "url"
2108
+ try:
2109
+ headers = {'User-Agent': 'Mozilla/5.0'}
2110
+ resp = requests.get(url, headers=headers, stream=True, timeout=15, verify=False)
2111
+ resp.raise_for_status()
2112
+ image = Image.open(BytesIO(resp.content))
2113
+ except Exception as e:
2114
+ return jsonify({'error': f'Failed to fetch image from URL: {e}'}), 400
2115
+
2116
+ else:
2117
+ return jsonify({'error': 'No image provided (file or url)'}), 400
2118
+
2119
+ # Process the image
2120
+ metadata = get_image_metadata(image)
2121
+ ai_detection = compute_ai_likelihood(image)
2122
+
2123
+ return jsonify({
2124
+ 'success': True,
2125
+ 'source': source_type,
2126
+ 'data': metadata,
2127
+ 'ai_detection': ai_detection
2128
+ })
2129
+
2130
+ except Exception as e:
2131
+ return jsonify({'error': str(e)}), 500
2132
+
2133
+ def display_qr_image(url):
2134
+ """QR code generate aur ASCII format mein terminal pe dikhao"""
2135
+ qr = qrcode.QRCode(
2136
+ version=1,
2137
+ error_correction=qrcode.constants.ERROR_CORRECT_L,
2138
+ box_size=10,
2139
+ border=4,
2140
+ )
2141
+ qr.add_data(url)
2142
+ qr.make(fit=True)
2143
+
2144
+ # Print the ASCII QR code (invert=True renders better)
2145
+ qr.print_ascii(invert=True)
2146
+ print(f"\n🔗 {url}")
2147
+
2148
+ def start_cloudflare_tunnel(port):
2149
+ try:
2150
+ if os.name == 'nt':
2151
+ os.system("taskkill /F /IM cloudflared.exe >NUL 2>&1")
2152
+ else:
2153
+ os.system("pkill -f cloudflared 2>/dev/null")
2154
+ time.sleep(1)
2155
+
2156
+ # Choose the executable based on the OS
2157
+ cf_executable = os.path.join(DATA_DIR, 'cloudflared.exe') if os.name == 'nt' else os.path.join(DATA_DIR, 'cloudflared')
2158
+
2159
+ # Download if missing (Linux/Colab)
2160
+ if not os.path.exists(cf_executable) and os.name != 'nt':
2161
+ print("Downloading cloudflared...")
2162
+ subprocess.run(['wget', '-q', '-O', cf_executable, 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64'])
2163
+ subprocess.run(['chmod', '+x', cf_executable])
2164
+
2165
+ process = subprocess.Popen(
2166
+ [cf_executable, 'tunnel', '--protocol', 'http2', '--url', f'http://127.0.0.1:{port}'],
2167
+ stdout=subprocess.PIPE,
2168
+ stderr=subprocess.STDOUT,
2169
+ text=True,
2170
+ bufsize=1
2171
+ )
2172
+
2173
+ url_pattern = r'https://[a-z0-9-]+\.trycloudflare\.com'
2174
+ start_time = time.time()
2175
+
2176
+ while time.time() - start_time < 30:
2177
+ line = process.stdout.readline()
2178
+ if line:
2179
+ match = re.search(url_pattern, line)
2180
+ if match:
2181
+ url = match.group(0)
2182
+ time.sleep(2)
2183
+ return url, process
2184
+ return None, None
2185
+ except:
2186
+ return None, None
2187
+
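+ # The public URL is parsed from cloudflared's own log output; a successful match
+ # looks like https://<random-words>.trycloudflare.com (placeholder, not a real tunnel).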
2188
+ def print_cli_report(data):
2189
+ """Beautiful Colorized two-column report for CLI mode"""
2190
+ if not data.get('success'):
2191
+ print(f"\n{Fore.RED if COLOR_SUPPORT else ''}❌ Scrape Failed: {data.get('error')}")
2192
+ return
2193
+
2194
+ c_b = Fore.CYAN if COLOR_SUPPORT else ""
2195
+ c_g = Fore.GREEN if COLOR_SUPPORT else ""
2196
+ c_y = Fore.YELLOW if COLOR_SUPPORT else ""
2197
+ c_r = Fore.RED if COLOR_SUPPORT else ""
2198
+ c_m = Fore.MAGENTA if COLOR_SUPPORT else ""
2199
+ c_w = Fore.WHITE if COLOR_SUPPORT else ""
2200
+ R = Style.RESET_ALL if COLOR_SUPPORT else ""
2201
+
2202
+ W = 37 # Inner width per column
2203
+ SEP = "─"
2204
+
2205
+ def vlen(s):
2206
+ return len(re.sub(r'\033\[[0-9;]*m', '', s))
2207
+
2208
+ def pad(s, w):
2209
+ return s + ' ' * max(0, w - vlen(s))
2210
+
2211
+ # --- Build LEFT column: Security + SEO + Intel ---
2212
+ left = []
2213
+ sec = data['security']
2214
+ sec_icon = "🟢" if sec['level'] == 'LOW' else "🟡" if sec['level'] == 'MEDIUM' else "🔴"
2215
+ sec_c = c_g if sec['level'] == 'LOW' else c_y if sec['level'] == 'MEDIUM' else c_r
2216
+ left.append(f"SECURITY: {sec_icon} {sec_c}{sec['level']}{R}")
2217
+ for t in sec['threats'][:3]:
2218
+ left.append(f" - {t[:W-3]}")
2219
+ left.append(SEP)
2220
+
2221
+ seo = data['seo']
2222
+ bar = '█' * int(seo['score']/5)
2223
+ left.append(f"SEO: {c_g}{seo['score']}{R}/100 {bar}")
2224
+ left.append(f"H1:{seo['headings']['h1']} H2:{seo['headings']['h2']} H3:{seo['headings']['h3']}")
2225
+ left.append(f"Imgs: {seo['images_analysis']['total']} total, {seo['images_analysis']['missing_alt']} no-alt")
2226
+ left.append(SEP)
2227
+
2228
+ intel = data['intel']
2229
+ left.append(f"INTEL (OSINT):")
2230
+ left.append(f" 📧 Emails: {len(intel['emails'])}")
2231
+ for e in intel['emails'][:2]:
2232
+ left.append(f" {e[:W-4]}")
2233
+ left.append(f" 📱 Phones: {len(intel['phones'])}")
2234
+ for p in intel['phones'][:2]:
2235
+ left.append(f" {p[:W-4]}")
2236
+ left.append(f" 📍 Locations: {len(intel['locations'])}")
2237
+ left.append(f" 🛠️ {c_m}{', '.join(intel['tech_stack'][:3])[:W-4]}{R}")
2238
+
2239
+ # --- Build RIGHT column: AI + Performance ---
2240
+ right = []
2241
+ if intel.get('ai_analysis'):
2242
+ ai = intel['ai_analysis']
2243
+ right.append(f"AI ANALYSIS:")
2244
+ right.append(f" Sentiment: {c_m}{ai['sentiment']['label']}{R} ({ai['sentiment']['polarity']})")
2245
+ right.append(f" Readability: {ai['readability']['level']} ({ai['readability']['score']})")
2246
+ right.append(f" Keywords:")
2247
+ kw = ', '.join(ai['keywords'][:5])
2248
+ right.append(f" {kw[:W-2]}")
2249
+ right.append(SEP)
2250
+
2251
+ perf = data.get('performance', {})
2252
+ if perf and perf.get('total'):
2253
+ right.append(f"PERFORMANCE: {c_y}{perf['total']:.2f}s{R}")
2254
+ for phase, dur in perf['phases'].items():
2255
+ perc = (dur / perf['total']) * 100
2256
+ bl = int(perc / 5)
2257
+ b = "█" * bl
2258
+ right.append(f" {phase[:12]:<12} {c_g}{b:<10}{R} {perc:>3.0f}%")
2259
+ right.append(f" Avg:{perf['avg']:.1f}s Best:{perf['best']:.1f}s")
2260
+
2261
+ # Equalize rows
2262
+ mx = max(len(left), len(right))
2263
+ while len(left) < mx: left.append("")
2264
+ while len(right) < mx: right.append("")
2265
+
2266
+ # --- Render ---
2267
+ url_display = data['site_structure']['url'][:W*2]
2268
+ total_w = W * 2 + 5 # inner total
2269
+
2270
+ print(f"\n{c_b}╔{'═'*total_w}╗{R}")
2271
+ print(f"{c_b}║ {R}{pad('intelligence report: ' + url_display, total_w - 1)}{c_b}║{R}")
2272
+ print(f"{c_b}╠{'═'*W}═╦═{'═'*W}══╣{R}")
2273
+
2274
+ for i in range(mx):
2275
+ l, r = left[i], right[i]
2276
+ l_sep = (l == SEP)
2277
+ r_sep = (r == SEP)
2278
+
2279
+ if l_sep and r_sep:
2280
+ print(f"{c_b}╟{'─'*W}─╫─{'─'*W}──╢{R}")
2281
+ elif l_sep:
2282
+ print(f"{c_b}╟{'─'*W}─╫ {R}{pad(r, W+1)}{c_b}║{R}")
2283
+ elif r_sep:
2284
+ print(f"{c_b}║ {R}{pad(l, W)}{c_b}╟ {R}{' '*W} {c_b}║{R}")
2285
+ else:
2286
+ print(f"{c_b}║ {R}{pad(l, W)}{c_b}║ {R}{pad(r, W+1)}{c_b}║{R}")
2287
+
2288
+ print(f"{c_b}╚{'═'*W}═╩═{'═'*W}══╝{R}\n")
2289
+
2290
+ def is_valid_url(text):
2291
+ """Smart detection for URLs/Domains - Ultra Strict"""
2292
+ text = text.lower().strip()
2293
+ if not text or ' ' in text or len(text) < 4: return False
2294
+ if text.startswith('http'): return True
2295
+ if text.startswith(('/', '.', '@')): return False
2296
+
2297
+ # Must have a legitimate domain-like structure
2298
+ parts = text.split('.')
2299
+ if len(parts) >= 2:
2300
+ # Check TLD (2-12 chars, letters only)
2301
+ tld = parts[-1].split('/')[0]
2302
+ if tld.isalpha() and 2 <= len(tld) <= 12:
2303
+ # Common file types to exclude
2304
+ if tld in ['py', 'json', 'txt', 'md', 'exe', 'log', 'bat', 'sh', 'zip', 'rar']:
2305
+ return False
2306
+ # Ensure the domain part isn't empty or invalid
2307
+ domain_part = parts[-2]
2308
+ if domain_part and any(c.isalnum() for c in domain_part):
2309
+ return True
2310
+ return False
2311
+
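+ # Expected behaviour of is_valid_url (illustrative inputs):
+ #   is_valid_url('https://example.com')  -> True   (explicit scheme)
+ #   is_valid_url('example.com/page')     -> True   (domain-like with a real TLD)
+ #   is_valid_url('core.py')              -> False  (known file extension)
+ #   is_valid_url('not a url')            -> False  (contains whitespace)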
2312
+ def run_cli_mode(initial_url=None):
2313
+ """Interactive CLI prompting flow"""
2314
+ if initial_url and not is_valid_url(initial_url):
2315
+ print(f"\n{Fore.RED}⚠️ Invalid Link: {Fore.WHITE}{initial_url}")
2316
+ time.sleep(1.5)
2317
+ return
2318
+
2319
+ os.system('cls' if os.name == 'nt' else 'clear')
2320
+ banner = r"""██╗ ██╗███████╗██████╗ ████████╗ ██████╗ ██████╗ ██╗ ███████╗
2321
+ ██║ ██║██╔════╝██╔══██╗ ╚══██╔══╝██╔═══██╗██╔═══██╗██║ ██╔════╝
2322
+ ██║ █╗ ██║█████╗ ██████╔╝ ██║ ██║ ██║██║ ██║██║ ███████╗
2323
+ ██║███╗██║██╔══╝ ██╔══██╗ ██║ ██║ ██║██║ ██║██║ ╚════██║
2324
+ ╚███╔███╔╝███████╗██████╔╝ ██║ ╚██████╔╝╚██████╔╝███████╗███████║
2325
+ ╚══╝╚══╝ ╚══════╝╚═════╝ ╚═╝ ╚═════╝ ╚═════╝ ╚══════╝╚══════╝"""
2326
+ print_gradient_text(banner, (0, 255, 255), (255, 0, 255))
2327
+ print(f"{' ' * 45}{Fore.WHITE}{Style.DIM}Dev: Abhinav Adarsh{Style.RESET_ALL}")
2328
+ print(f"{Fore.CYAN}{Style.BRIGHT} ADVANCED CLI INTELLIGENCE MODE{Style.RESET_ALL}")
2329
+
2330
+ try:
2331
+ url = initial_url if initial_url else input(f"{Fore.LIGHTGREEN_EX if COLOR_SUPPORT else ''}link > {Style.RESET_ALL}").strip()
2332
+ if not url: return
2333
+
2334
+ # Double check validity if manual input
2335
+ if not initial_url and not is_valid_url(url):
2336
+ print(f"\n{Fore.RED}⚠️ Invalid Link: {Fore.WHITE}{url}")
2337
+ time.sleep(1.5)
2338
+ return
2339
+
2340
+ if AUTOCOMPLETE_AVAILABLE:
2341
+ if not url.startswith('/'):
2342
+ try: readline.write_history_file(HISTORY_FILE)
2343
+ except: pass
2344
+ else:
2345
+ try: readline.remove_history_item(readline.get_current_history_length() - 1)
2346
+ except: pass
2347
+
2348
+ print("\n⚙️ Scrape Options:")
2349
+ fetch_img = input(" - Fetch & Analyze Images? (y/N) > ").lower() == 'y'
2350
+ if AUTOCOMPLETE_AVAILABLE:
2351
+ try: readline.remove_history_item(readline.get_current_history_length() - 1)
2352
+ except: pass
2353
+ fetch_vid = input(" - Fetch & Deep-Scan Videos? (y/N) > ").lower() == 'y'
2354
+ if AUTOCOMPLETE_AVAILABLE:
2355
+ try: readline.remove_history_item(readline.get_current_history_length() - 1)
2356
+ except: pass
2357
+ depth = input(" - Crawl Depth (1-3) [Default 2] > ").strip()
2358
+ depth = int(depth) if depth.isdigit() else 2
2359
+ if AUTOCOMPLETE_AVAILABLE:
2360
+ try: readline.remove_history_item(readline.get_current_history_length() - 1)
2361
+ except: pass
2362
+ use_proxy = input(" - Use Intelligent Proxies? (y/N) > ").lower() == 'y'
2363
+ if AUTOCOMPLETE_AVAILABLE:
2364
+ try: readline.remove_history_item(readline.get_current_history_length() - 1)
2365
+ except: pass
2366
+
2367
+ with MoonSpinner("Scanning"):
2368
+ result = execute_scrape_logic(url, fetch_img, fetch_vid, depth, use_proxy)
2369
+ print_cli_report(result)
2370
+
2371
+ input("Press Enter to return to main menu and CLEAR session data...")
2372
+ clear_scraped_data()
2373
+ print("🧹 Session data cleared.")
2374
+ time.sleep(1)
2375
+ except KeyboardInterrupt:
2376
+ print("\n\n⚠️ Input interrupted. Returning to menu...")
2377
+ time.sleep(1)
2378
+
2379
+ def run_image_forensics_mode(initial_image=None):
2380
+ """CLI flow for image analysis"""
2381
+ os.system('cls' if os.name == 'nt' else 'clear')
2382
+ banner = r"""██╗███╗ ███╗ █████╗ ██████╗ ███████╗ ███████╗ ██████╗ ██████╗ ███████╗███╗ ██╗███████╗██╗ ██████╗███████╗
2383
+ ██║████╗ ████║██╔══██╗██╔════╝ ██╔════╝ ██╔════╝██╔═══██╗██╔══██╗██╔════╝████╗ ██║██╔════╝██║██╔════╝██╔════╝
2384
+ ██║██╔████╔██║███████║██║ ███╗█████╗ █████╗ ██║ ██║██████╔╝█████╗ ██╔██╗ ██║███████╗██║██║ ███████╗
2385
+ ██║██║╚██╔╝██║██╔══██║██║ ██║██╔══╝ ██╔══╝ ██║ ██║██╔══██╗██╔══╝ ██║╚██╗██║╚════██║██║██║ ╚════██║
2386
+ ██║██║ ╚═╝ ██║██║ ██║╚██████╔╝███████╗ ██║ ╚██████╔╝██║ ██║███████╗██║ ╚████║███████║██║╚██████╗███████║
2387
+ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝ ╚══════╝ ╚═╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝╚═╝ ╚═══╝╚══════╝╚═╝ ╚═════╝╚══════╝"""
2388
+ print_gradient_text(banner, (255, 100, 100), (100, 100, 255))
2389
+ print(f"{' ' * 45}{Fore.WHITE}{Style.DIM}Dev : Abhinav Adarsh{Style.RESET_ALL}\n")
2390
+
2391
+ try:
2392
+ user_input = initial_image if initial_image else input(f"{Fore.LIGHTGREEN_EX if COLOR_SUPPORT else ''}image link or local path > {Style.RESET_ALL}").strip()
2393
+ if not user_input: return
2394
+
2395
+ # Handle quotes (e.g. from copy-pasting path)
2396
+ user_input = user_input.strip('"\'')
2397
+
2398
+ image = None
2399
+ source_type = "local"
2400
+
2401
+ with MoonSpinner("Analyzing Image"):
2402
+ try:
2403
+ # Is it a URL?
2404
+ if user_input.startswith(('http://', 'https://')):
2405
+ source_type = "url"
2406
+ headers = {'User-Agent': proxy_manager.get_random_ua()}
2407
+ resp = requests.get(user_input, headers=headers, stream=True, timeout=15, verify=False)
2408
+ resp.raise_for_status()
2409
+ image = Image.open(BytesIO(resp.content))
2410
+ else:
2411
+ # Assume it's a local path
2412
+ if os.path.exists(user_input):
2413
+ image = Image.open(user_input)
2414
+ else:
2415
+ print(f"\n{Fore.RED}Error: File or URL not found: {Fore.WHITE}{user_input}")
2416
+ time.sleep(2)
2417
+ return
2418
+
2419
+ if not image:
2420
+ raise Exception("Failed to load image")
2421
+
2422
+ # Process
2423
+ metadata = get_image_metadata(image)
2424
+ ai_detection = compute_ai_likelihood(image)
2425
+
2426
+ print_image_forensics_report({
2427
+ 'source': source_type,
2428
+ 'path': user_input,
2429
+ 'metadata': metadata,
2430
+ 'ai': ai_detection
2431
+ })
2432
+
2433
+ except Exception as e:
2434
+ print(f"\n{Fore.RED}Analysis Failed: {Fore.WHITE}{e}")
2435
+ time.sleep(2)
2436
+ return
2437
+
2438
+ input("\nPress Enter to return to main menu...")
2439
+ except KeyboardInterrupt:
2440
+ pass
2441
+
2442
+ def print_image_forensics_report(data):
2443
+ """Beautiful terminal table for image analysis results"""
2444
+ c_b = Fore.CYAN if COLOR_SUPPORT else ""
2445
+ c_g = Fore.GREEN if COLOR_SUPPORT else ""
2446
+ c_y = Fore.YELLOW if COLOR_SUPPORT else ""
2447
+ c_r = Fore.RED if COLOR_SUPPORT else ""
2448
+ c_w = Fore.WHITE if COLOR_SUPPORT else ""
2449
+ R = Style.RESET_ALL if COLOR_SUPPORT else ""
2450
+
2451
+ meta = data['metadata']
2452
+ basic = meta['basic']
2453
+ ai = data['ai']
2454
+
2455
+ total_w = 80
2456
+ def vlen(s): return len(re.sub(r'\033\[[0-9;]*m', '', s))
2457
+ def pad(s, w): return s + ' ' * (w - vlen(s))
2458
+
2459
+ print(f"\n{c_b}╔{'═'*(total_w-2)}╗{R}")
2460
+ title = f" forensic report: {os.path.basename(data['path'])[:40]} "
2461
+ print(f"{c_b}║{R}{c_w}{Style.BRIGHT}{title.center(total_w-2)}{R}{c_b}║{R}")
2462
+ print(f"{c_b}╠{'═'*(total_w-2)}╣{R}")
2463
+
2464
+ # Row helper
2465
+ def print_row(key, val, color=c_w):
2466
+ k = f" {key}:"
2467
+ line = f"{c_y}{pad(k, 20)}{R} {color}{val}{R}"
2468
+ print(f"{c_b}║{R} {pad(line, total_w-4)} {c_b}║{R}")
2469
+
2470
+ # Basic Info
2471
+ print_row("Source Type", data['source'].upper(), c_g)
2472
+ print_row("Format", basic['Format'])
2473
+ print_row("Resolution", basic['Size'])
2474
+ print_row("Color Mode", basic['Mode'])
2475
+
2476
+ # AI Detection
2477
+ ai_score = ai['score']
2478
+ ai_color = c_r if ai_score > 70 else (c_y if ai_score > 40 else c_g)
2479
+ print_row("AI Likelihood", f"{ai_score}% ({ai['label']})", ai_color)
2480
+
2481
+ # GPS / Location
2482
+ if meta.get('location'):
2483
+ print_row("GPS Coordinates", f"{meta['location']['lat']}, {meta['location']['lon']}", c_m := (Fore.MAGENTA if COLOR_SUPPORT else ""))
2484
+ print_row("Map Link", meta['location']['map_url'], c_b)
2485
+
2486
+ # Crucial EXIF
2487
+ exif = meta['exif']
2488
+ important_tags = ['Make', 'Model', 'Software', 'DateTime', 'LensModel', 'ExposureTime', 'ISOSpeedRatings']
2489
+ found_exif = False
2490
+ for tag in important_tags:
2491
+ if tag in exif:
2492
+ if not found_exif:
2493
+ print(f"{c_b}╟{'─'*(total_w-2)}╢{R}")
2494
+ found_exif = True
2495
+ print_row(tag, exif[tag])
2496
+
2497
+ print(f"{c_b}╚{'═'*(total_w-2)}╝{R}\n")
2498
+
2499
+ def main_launcher():
2500
+ """Mode selection menu on startup"""
2501
+ menu_commands = ['/web', '/cli', '/image', '/help', '/clear', '/quit', '/history', '/w', '/c', '/i', '/h', '/q', '/hi', '--help']
2502
+ setup_autocomplete(menu_commands)
2503
+
2504
+ while True:
2505
+ try:
2506
+ os.system('cls' if os.name == 'nt' else 'clear')
2507
+ banner = r"""██╗ ██╗███████╗██████╗ ████████╗ ██████╗ ██████╗ ██╗ ███████╗
2508
+ ██║ ██║██╔════╝██╔══██╗ ╚══██╔══╝██╔═══██╗██╔═══██╗██║ ██╔════╝
2509
+ ██║ █╗ ██║█████╗ ██████╔╝ ██║ ██║ ██║██║ ██║██║ ███████╗
2510
+ ██║███╗██║██╔══╝ ██╔══██╗ ██║ ██║ ██║██║ ██║██║ ╚════██║
2511
+ ╚███╔███╔╝███████╗██████╔╝ ██║ ╚██████╔╝╚██████╔╝███████╗███████║
2512
+ ╚══╝╚══╝ ╚══════╝╚═════╝ ╚═╝ ╚═════╝ ╚═════╝ ╚══════╝╚══════╝"""
2513
+ print_gradient_text(banner, (0, 255, 255), (255, 0, 255))
2514
+ print(f"{' ' * 45}{Fore.WHITE}{Style.DIM}Dev: Abhinav Adarsh{Style.RESET_ALL}")
2515
+ print(f"{Fore.WHITE}Type {Fore.CYAN}/help{Fore.WHITE} or {Fore.CYAN}/h{Fore.WHITE} to see all commands.\n")
2516
+
2517
+ choice = input(f"{Fore.LIGHTGREEN_EX if COLOR_SUPPORT else ''}> {Style.RESET_ALL}").strip().lower()
2518
+ if AUTOCOMPLETE_AVAILABLE:
2519
+ try: readline.remove_history_item(readline.get_current_history_length() - 1)
2520
+ except: pass
2521
+
2522
+ if choice in ['/web', '/w']:
2523
+ start_web_server()
2524
+ elif choice in ['/cli', '/c']:
2525
+ run_cli_mode()
2526
+ elif choice in ['/image', '/i']:
2527
+ run_image_forensics_mode()
2528
+ elif choice in ['/help', '/h', '--help']:
2529
+ print(f"\n{Fore.CYAN if COLOR_SUPPORT else ''}Available Commands:")
2530
+ print(f" {Fore.CYAN}/web{Style.RESET_ALL} - Launches the web engine for browser-based auditing.")
2531
+ print(f" {Fore.CYAN}/cli{Style.RESET_ALL} - Runs a deep-scan intelligence report in the terminal.")
2532
+ print(f" {Fore.CYAN}/image{Style.RESET_ALL} - Local/Remote Image Forensics & AI detection (Alias: /i).")
2533
+ print(f" {Fore.CYAN}/clear{Style.RESET_ALL} - Purges the 'webfiles/scraped' directory and clears screen.")
2534
+ print(f" {Fore.CYAN}/history{Style.RESET_ALL} - Shows command history (Alias: /hi).")
2535
+ print(f" {Fore.CYAN}/help{Style.RESET_ALL} - Displays this help message (Alias: /h, --help).")
2536
+ print(f" {Fore.RED}/quit{Style.RESET_ALL} - Shuts down the application safely.")
2537
+ input("\nPress Enter to continue...")
2538
+ elif choice in ['/history', '/hi']:
2539
+ if AUTOCOMPLETE_AVAILABLE and os.path.exists(HISTORY_FILE):
2540
+ print(f"\n{Fore.CYAN}--- Command History ---{Style.RESET_ALL}")
2541
+ with open(HISTORY_FILE, 'r') as f:
2542
+ lines = f.readlines()
2543
+ for i, line in enumerate(lines[-20:]): # Show last 20
2544
+ print(f"{Fore.WHITE}{i+1}. {line.strip()}")
2545
+ else:
2546
+ print(f"\n{Fore.YELLOW}No history found.")
2547
+ input("\nPress Enter to continue...")
2548
+ elif choice in ['/clear', '/c']:
2549
+ clear_scraped_data()
2550
+ os.system('cls' if os.name == 'nt' else 'clear')
2551
+ print("Cache purged and screen cleared.")
2552
+ time.sleep(1)
2553
+ elif choice in ['/quit', '/q']:
2554
+ print(f"\n{Fore.YELLOW if COLOR_SUPPORT else ''}Goodbye!")
2555
+ sys.exit()
2556
+ elif is_valid_url(choice):
2557
+ run_cli_mode(choice)
2558
+ except KeyboardInterrupt:
2559
+ print("\n\nGoodbye!")
2560
+ sys.exit()
2561
+
2562
+ def start_web_server():
2563
+ """Original server startup logic"""
2564
+ public_url, tunnel_proc = None, None
2565
+ with MoonSpinner("Initializing Web Engine"):
2566
+ threading.Thread(target=lambda: app.run(host='0.0.0.0', port=PORT, debug=False, use_reloader=False, threaded=True), daemon=True).start()
2567
+
2568
+ if wait_for_server(PORT, timeout=10):
2569
+ public_url, tunnel_proc = start_cloudflare_tunnel(PORT)
2570
+ else:
2571
+ print("❌ Server failed to start")
2572
+ return
2573
+
2574
+ if public_url:
2575
+ os.system('cls' if os.name == 'nt' else 'clear')
2576
+ print(f"Scan this QR code :\n")
2577
+ display_qr_image(public_url)
2578
+ print(f"\nPress Ctrl+C to stop.")
2579
+
2580
+ try:
2581
+ if 'google.colab' in sys.modules:
2582
+ print("\n Running in Background.")
2583
+ while True: time.sleep(100)
2584
+ else:
2585
+ while True: time.sleep(1)
2586
+ except KeyboardInterrupt:
2587
+ print("\n Thank you for using ^_^ Web Tools")
2588
+ if tunnel_proc:
2589
+ try: tunnel_proc.terminate()
2590
+ except: pass
2591
+ else:
2592
+ print("❌ Failed to create tunnel")
2593
+
2594
+ # Start everything
2595
+ if __name__ == '__main__':
2596
+ main_launcher()