webtools-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webtools/__init__.py +4 -0
- webtools/__main__.py +5 -0
- webtools/cli.py +15 -0
- webtools/core.py +2596 -0
- webtools/web/Web_Tools.png +0 -0
- webtools/web/index.html +1102 -0
- webtools/web/script.js +1805 -0
- webtools/web/style.css +71 -0
- webtools_cli-1.0.0.dist-info/METADATA +110 -0
- webtools_cli-1.0.0.dist-info/RECORD +14 -0
- webtools_cli-1.0.0.dist-info/WHEEL +5 -0
- webtools_cli-1.0.0.dist-info/entry_points.txt +2 -0
- webtools_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
- webtools_cli-1.0.0.dist-info/top_level.txt +1 -0
webtools/core.py
ADDED
@@ -0,0 +1,2596 @@

import sys,os,re,requests,random,subprocess,time,socket,shutil,json,zipfile,atexit,concurrent.futures,threading,qrcode,logging,queue,urllib3,base64,traceback,csv,io,mtranslate,hashlib
sys.dont_write_bytecode = True

# --- PACKAGE PATHS ---
PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(os.path.expanduser('~'), '.webtools')
os.makedirs(DATA_DIR, exist_ok=True)
try:
    from colorama import init, Fore, Style
    init(autoreset=True)
    COLOR_SUPPORT = True
except ImportError:
    COLOR_SUPPORT = False
import numpy as np
from bs4 import BeautifulSoup
from collections import Counter
from flask import Flask, render_template_string, send_from_directory, request, jsonify, send_file
from PIL import Image,ExifTags,ImageChops,ImageEnhance
from io import BytesIO

try:
    from playwright.sync_api import sync_playwright
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False

# --- CLI AUTOCOMPLETE SETUP ---
try:
    if os.name == 'nt':
        try:
            from pyreadline3 import Readline
            readline = Readline()
        except (ImportError, AttributeError):
            import pyreadline3 as readline
    else:
        import readline

    HISTORY_FILE = os.path.join(DATA_DIR, 'history')

    def setup_autocomplete(commands):
        def completer(text, state):
            options = [i for i in commands if i.startswith(text)]
            if state < len(options):
                return options[state]
            else:
                return None

        # Ensure the object has the required methods
        if hasattr(readline, 'set_completer'):
            readline.set_completer(completer)
            if 'libedit' in (getattr(readline, '__doc__', '') or ''):
                readline.parse_and_bind("bind ^I rl_complete")
            else:
                readline.parse_and_bind("tab: complete")
            if os.name != 'nt' and hasattr(readline, 'parse_and_bind'):
                readline.parse_and_bind("set show-all-if-ambiguous on")

        if hasattr(readline, 'set_completer_delims'):
            readline.set_completer_delims(' ')

        # Load history
        if os.path.exists(HISTORY_FILE):
            try:
                readline.read_history_file(HISTORY_FILE)
            except: pass

        # Save on exit
        atexit.register(lambda: (readline.write_history_file(HISTORY_FILE) if AUTOCOMPLETE_AVAILABLE else None))

    AUTOCOMPLETE_AVAILABLE = True
except Exception:
    def setup_autocomplete(commands): pass
    AUTOCOMPLETE_AVAILABLE = False

def print_gradient_text(text, start_rgb, end_rgb):
    """Prints text with a vertical color gradient using 24-bit ANSI"""
    lines = text.splitlines()
    if not lines: return

    for i, line in enumerate(lines):
        # Calculate the ratio for this line
        ratio = i / max(1, len(lines) - 1)

        # Interpolate RGB
        r = int(start_rgb[0] + (end_rgb[0] - start_rgb[0]) * ratio)
        g = int(start_rgb[1] + (end_rgb[1] - start_rgb[1]) * ratio)
        b = int(start_rgb[2] + (end_rgb[2] - start_rgb[2]) * ratio)

        # Apply 24-bit color ANSI
        print(f"\033[38;2;{r};{g};{b}m{line}\033[0m")

# Suppress Flask's request logs
log = logging.getLogger('werkzeug')
log.setLevel(logging.ERROR)

# Disable SSL warnings globally
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Set up the output directories
os.makedirs('webfiles/scraped', exist_ok=True)
os.makedirs('webfiles/scraped/images', exist_ok=True)
os.makedirs('webfiles/scraped/videos', exist_ok=True)

# --- PERFORMANCE AUDITOR ---
class PerformanceTracker:
    def __init__(self):
        self.stats_file = os.path.join(DATA_DIR, 'performance_stats.json')
        self.data = self.load_data()
        self.current_report = {}
        self.last_mark = 0
        self.session_url = ""

    def load_data(self):
        if os.path.exists(self.stats_file):
            try:
                with open(self.stats_file, 'r') as f:
                    return json.load(f)
            except: pass
        return {'best': float('inf'), 'worst': 0, 'total_time': 0, 'count': 0}

    def save_data(self):
        try:
            with open(self.stats_file, 'w') as f:
                json.dump(self.data, f)
        except: pass

    def start_session(self, url):
        self.current_report = {}
        self.last_mark = time.perf_counter()
        self.session_url = url

    def record_phase(self, name):
        now = time.perf_counter()
        duration = now - self.last_mark
        self.current_report[name] = self.current_report.get(name, 0) + duration
        self.last_mark = now

    def finish_and_print(self):
        total = sum(self.current_report.values())
        if total <= 0: return {}

        self.data['count'] += 1
        self.data['total_time'] += total
        if total < self.data['best']: self.data['best'] = total
        if total > self.data['worst']: self.data['worst'] = total
        self.save_data()

        avg = self.data['total_time'] / self.data['count']

        return {
            'total': total,
            'phases': dict(self.current_report),
            'avg': avg,
            'best': self.data['best'],
            'worst': self.data['worst']
        }

perf_tracker = PerformanceTracker()
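
# Editor's usage sketch (not part of the original file) showing the intended
# phase-timing flow; the phase names and sleeps are illustrative stand-ins.
def _perf_tracker_demo():
    perf_tracker.start_session("https://example.com")
    time.sleep(0.1)                       # stand-in for fetching
    perf_tracker.record_phase("Fetch Content")
    time.sleep(0.05)                      # stand-in for parsing
    perf_tracker.record_phase("HTML Parsing")
    report = perf_tracker.finish_and_print()
    print(report['total'], report['phases'])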

class MoonSpinner:
    """Threaded moon-phase loading animation"""
    def __init__(self, message="Processing"):
        self.message = message
        self.frames = ['🌑', '🌒', '🌓', '🌔', '🌕', '🌖', '🌗', '🌘']
        self.stop_event = threading.Event()
        self.thread = threading.Thread(target=self._animate, daemon=True)

    def _animate(self):
        while not self.stop_event.is_set():
            for f in self.frames:
                if self.stop_event.is_set(): break
                sys.stdout.write(f'\r{self.message} {f} ')
                sys.stdout.flush()
                time.sleep(0.2)

    def __enter__(self):
        self.thread.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop_event.set()
        self.thread.join(timeout=1.0)
        sys.stdout.write(f'\r{self.message} 🌕 Done! \n')
        sys.stdout.flush()
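
# Editor's usage sketch (not part of the original file): the context manager
# starts the spinner thread on entry and stops it on exit.
def _spinner_demo():
    with MoonSpinner("Scraping"):
        time.sleep(1)  # stand-in for real work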

# --- PROXY AND UA MANAGER ---
class ProxyManager:
    def __init__(self):
        self.proxies = []
        self.last_fetch = 0
        self.fetch_interval = 300  # Refresh every 5 minutes
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 17_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Mobile/15E148 Safari/604.1'
        ]

        # Background validation state
        self.valid_proxies = queue.Queue()
        self.lock = threading.Lock()
        self.running = True

        # Smart Learner (score setup)
        self.scores_file = os.path.join(DATA_DIR, 'scores.json')
        self.scores = self.load_scores()

        # Start the background thread
        threading.Thread(target=self._background_validator, daemon=True).start()

    def load_scores(self):
        if os.path.exists(self.scores_file):
            try:
                with open(self.scores_file, 'r') as f:
                    return json.load(f)
            except:
                pass
        return {}

    def save_scores(self):
        try:
            with open(self.scores_file, 'w') as f:
                json.dump(self.scores, f, indent=2)
        except Exception as e:
            print(f"Failed to save scores: {e}")

    def report_success(self, proxy, domain, task_type, success):
        with self.lock:
            # Composite key for the score (e.g. "youtube.com::video")
            score_key = f"{domain}::{task_type}"

            if score_key not in self.scores:
                self.scores[score_key] = {}

            if proxy not in self.scores[score_key]:
                self.scores[score_key][proxy] = {'success': 0, 'fail': 0, 'score': 0.5}

            stats = self.scores[score_key][proxy]
            if success:
                stats['success'] += 1
                # Raise the score (stickiness factor)
                stats['score'] = min(1.0, stats['score'] + 0.25)
            else:
                stats['fail'] += 1
                # Apply a penalty
                stats['score'] = max(0.0, stats['score'] - 0.2)

            self.save_scores()

    def get_smart_proxy(self, domain, task_type='general'):
        # Epsilon-greedy strategy with stickiness

        score_key = f"{domain}::{task_type}"

        if score_key in self.scores:
            domain_scores = self.scores[score_key]
            # Find the best proxy
            if domain_scores:
                best_proxy = max(domain_scores, key=lambda p: domain_scores[p]['score'])
                best_score = domain_scores[best_proxy]['score']

                # STICKY LOGIC:
                # If score > 0.8 (very trusted): 95% exploitation
                # If score > 0.5 (trusted): 80% exploitation
                # Else: 50% exploitation

                exploitation_rate = 0.5
                if best_score > 0.8: exploitation_rate = 0.95
                elif best_score > 0.5: exploitation_rate = 0.80

                if random.random() < exploitation_rate and best_score > 0.4:
                    print(f"🧠 Smart Learner: Sticky reuse of {best_proxy} for {score_key} (Score: {best_score:.2f})")
                    return best_proxy

        # Fall back to the general domain score if the task-specific score is missing
        if task_type != 'general':
            return self.get_smart_proxy(domain, 'general')

        # Exploration fallback
        return self.get_valid_proxy()

    def fetch_proxies(self):
        # Only fetch when the list is empty or stale
        with self.lock:
            if self.proxies and (time.time() - self.last_fetch < self.fetch_interval):
                return

        # Quiet mode when running in the background
        # print("Fetching new proxies...")
        try:
            # Optimized timing: timeout=3000 (3s)
            url = "https://api.proxyscrape.com/v2/?request=getproxies&protocol=http&timeout=3000&country=all&ssl=all&anonymity=elite,anonymous"
            resp = requests.get(url, timeout=5)
            if resp.status_code == 200:
                proxy_list = resp.text.strip().split('\r\n')
                with self.lock:
                    self.proxies = [p for p in proxy_list if p]
                    self.last_fetch = time.time()
                    # print(f"Fetched {len(self.proxies)} high-quality proxies.")
        except Exception as e:
            print(f"Failed to fetch proxies: {e}")

    def _background_validator(self):
        while self.running:
            # Keep the queue topped up (target: 20 valid proxies)
            if self.valid_proxies.qsize() < 20:
                self.fetch_proxies()

                with self.lock:
                    if not self.proxies:
                        time.sleep(5)
                        continue
                    candidates = list(self.proxies)

                # Pick a random proxy
                proxy = random.choice(candidates)

                if self._check_proxy(proxy):
                    self.valid_proxies.put(proxy)
                    # print(f"P: {proxy} ({self.valid_proxies.qsize()})")
                else:
                    pass
            else:
                time.sleep(1)  # Queue is full

    def _check_proxy(self, proxy):
        try:
            proxies = {'http': proxy, 'https': proxy}
            # Quick liveness check
            resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=3, verify=False)
            return resp.status_code == 200
        except:
            return False

    def get_valid_proxy(self):
        # Return one immediately if available
        try:
            return self.valid_proxies.get_nowait()
        except:
            # If the queue is empty, use a random unvalidated proxy
            return self.get_random_proxy()

    def get_random_proxy(self):
        self.fetch_proxies()
        if not self.proxies:
            return None
        return random.choice(self.proxies)

    def get_random_ua(self):
        return random.choice(self.user_agents)

proxy_manager = ProxyManager()
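
# Editor's usage sketch (not part of the original file) of the score-feedback
# loop: ask for the best-known proxy for a domain/task pair, then report back
# whether it worked, which shifts the stored score by +0.25 or -0.2. The call
# to the private _check_proxy is purely for illustration.
def _proxy_feedback_demo():
    proxy = proxy_manager.get_smart_proxy("example.com", "image")
    if proxy:
        worked = proxy_manager._check_proxy(proxy)
        proxy_manager.report_success(proxy, "example.com", "image", worked)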
# ---------------------

def get_free_port():
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(('', 0))
    port = sock.getsockname()[1]
    sock.close()
    return port

PORT = get_free_port()
WEB_DIR = os.path.join(PACKAGE_DIR, 'web')
app = Flask(__name__)

# Updated HTML UI with video extraction
@app.route('/')
def index():
    return send_from_directory(WEB_DIR, 'index.html')

@app.route('/style.css')
def serve_css():
    return send_from_directory(WEB_DIR, 'style.css')

@app.route('/script.js')
def serve_js():
    return send_from_directory(WEB_DIR, 'script.js')

@app.route('/favicon.png')
def serve_favicon():
    return send_from_directory(WEB_DIR, 'Web_Tools.png')

@app.route('/download/<path:filename>')
def serve_scraped_file(filename):
    return send_from_directory('webfiles/scraped', filename)

def scrape_with_playwright(url, proxy=None):
    if not PLAYWRIGHT_AVAILABLE:
        return None

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)

            # Context with stealth settings
            context = browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
            )

            page = context.new_page()

            # 30s timeout for heavy sites
            try:
                page.goto(url, timeout=30000, wait_until='domcontentloaded')

                # Wait a little for dynamic content
                time.sleep(5)

                # Scroll down to trigger lazy loads
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                time.sleep(2)

                content = page.content()
                return content
            except Exception as e:
                print(f"❌ Playwright navigation failed: {e}")
                return None
            finally:
                browser.close()
    except Exception as e:
        print(f"❌ Playwright error: {e}")
        return None
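
# Editor's usage sketch (not part of the original file): returns the fully
# rendered HTML, or None when Playwright is missing or navigation fails.
def _playwright_demo():
    html = scrape_with_playwright("https://example.com")
    if html:
        print(BeautifulSoup(html, 'html.parser').title)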

# --- OSINT HELPERS ---
def extract_emails(text):
    """Extract emails with a regex"""
    # The regex:
    # 1. Start with alnum/dots/dashes
    # 2. @ symbol
    # 3. Domain name (alnum/dashes)
    # 4. TLD (2+ chars)
    # Filter: length < 50, drop accidental image-filename matches
    raw = re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
    valid = []
    for email in set(raw):
        if len(email) > 50: continue
        if email.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg')): continue
        valid.append(email)
    return valid
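
# Editor's usage sketch (not part of the original file): "img@2x.png" matches
# the raw pattern but is dropped by the image-extension filter.
def _email_demo():
    sample = "Contact admin@example.com or see img@2x.png"
    print(extract_emails(sample))  # ['admin@example.com']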

def extract_phones(text):
    """Extract phone numbers (focused on +91 and international formats)"""
    phones = set()

    # 1. Strict Indian mobile numbers: +91 9876543210 / 9876543210 / 09876543210
    # Matches forms like +91-9876543210 and 9876543210
    # Indian mobiles start with a digit in [6-9]
    indian_regex = r'(?:(?:\+|0{0,2})91(\s*[\-]\s*)?|[0]?)?([6-9]\d{3}[\s\-]?\d{6})'
    for match in re.findall(indian_regex, text):
        # match[1] is the number part
        full_num = match[1].replace(' ', '').replace('-', '')
        if len(full_num) == 10:
            phones.add("+91 " + full_num)

    # 2. General international fallback, e.g. +1 numbers
    # Look for an explicit plus sign followed by digits
    intl_regex = r'\+(?:9[976]\d|8[987530]\d|6[987]\d|5[90]\d|42\d|3[875]\d|2[98654321]\d|9[8543210]|8[6421]|6[6543210]|5[87654321]|4[987654310]|3[9643210]|2[70]|7|1)\W*\d\W*\d\W*\d\W*\d\W*\d\W*\d\W*\d\W*\d\W*(\d{1,2})?'
    for p in re.findall(intl_regex, text):
        # This match is fairly complex; keep it a no-op to avoid noise
        pass

    # 3. Simple grab for "tel:" links
    # (The regex above should already handle these, noted here for safety)

    return list(phones)
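
# Editor's usage sketch (not part of the original file): both spellings
# normalize to the same "+91 ..." form, so the set holds a single entry.
def _phone_demo():
    sample = "Call +91-9876543210 or 09876543210"
    print(extract_phones(sample))  # ['+91 9876543210']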

def extract_locations(soup):
    """Extract physical addresses"""
    locations = set()

    # 1. Schema.org parsing
    for item in soup.find_all(attrs={"itemtype": re.compile(r"schema.org/PostalAddress", re.I)}):
        text = item.get_text(separator=', ').strip()
        if len(text) > 10: locations.add(text)

    # 2. Google Maps embeds
    for iframe in soup.find_all('iframe', src=True):
        if 'maps.google' in iframe['src']:
            # Try to pull out the query
            match = re.search(r'q=([^&]+)', iframe['src'])
            if match:
                import urllib.parse
                addr = urllib.parse.unquote(match.group(1).replace('+', ' '))
                locations.add(addr)

    # 3. Heuristic keywords (footer/contact)
    # Limit the search to footer or contact sections
    search_area = soup.find('footer') or soup.find(id='contact') or soup.body
    if search_area:
        text = search_area.get_text(separator=' ')
        # Rough regex for "Address: ..."
        matches = re.findall(r'(?:Address|Location|Office):\s*([a-zA-Z0-9,\.\-\s]{10,100})', text, re.I)
        for m in matches:
            locations.add(m.strip())

    return list(locations)
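
# Editor's usage sketch (not part of the original file), exercising the
# heuristic "Address:" branch on a minimal footer.
def _location_demo():
    html = "<html><body><footer>Address: 221B Baker Street, London</footer></body></html>"
    print(extract_locations(BeautifulSoup(html, 'html.parser')))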

def extract_social_media(soup):
    """Extract social media profile links"""
    social_domains = {
        'facebook.com': 'Facebook',
        'twitter.com': 'Twitter',
        'x.com': 'X (Twitter)',
        'instagram.com': 'Instagram',
        'linkedin.com': 'LinkedIn',
        'youtube.com': 'YouTube',
        'tiktok.com': 'TikTok',
        'pinterest.com': 'Pinterest',
        'github.com': 'GitHub',
        'gitlab.com': 'GitLab',
        'discord.gg': 'Discord',
        't.me': 'Telegram'
    }
    found = {}
    for a in soup.find_all('a', href=True):
        href = a['href'].lower()
        for domain, name in social_domains.items():
            if domain in href and name not in found:
                # Basic check to avoid share links
                if 'share' not in href and 'intent' not in href:
                    found[name] = a['href']

    return [{'platform': k, 'url': v} for k, v in found.items()]
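
# Editor's usage sketch (not part of the original file): share links are
# skipped, so only the profile link survives.
def _social_demo():
    html = ('<a href="https://github.com/example">GitHub</a>'
            '<a href="https://twitter.com/intent/tweet?text=hi">Share</a>')
    print(extract_social_media(BeautifulSoup(html, 'html.parser')))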

def detect_tech_stack(soup, response):
    """Check which technologies the site uses"""
    stack = set()

    # 1. Headers
    if 'Server' in response.headers:
        stack.add(f"Server: {response.headers['Server']}")
    if 'X-Powered-By' in response.headers:
        stack.add(f"Powered By: {response.headers['X-Powered-By']}")
    if 'Via' in response.headers:
        stack.add(f"Via: {response.headers['Via']}")

    # 2. Meta tags
    generator = soup.find('meta', attrs={'name': 'generator'})
    if generator and generator.get('content'):
        stack.add(generator['content'])

    # 3. Scripts / HTML patterns
    html_str = str(soup).lower()
    if 'wp-content' in html_str: stack.add('WordPress')
    if 'shopify' in html_str: stack.add('Shopify')
    if 'wix.com' in html_str: stack.add('Wix')
    if 'squarespace' in html_str: stack.add('Squarespace')
    if 'react' in html_str or '_next' in html_str: stack.add('React/Next.js')
    if 'vue' in html_str or 'nuxt' in html_str: stack.add('Vue.js')
    if 'bootstrap' in html_str: stack.add('Bootstrap')
    if 'tailwind' in html_str: stack.add('Tailwind CSS')
    if 'jquery' in html_str: stack.add('jQuery')
    if 'cloudflare' in html_str: stack.add('Cloudflare')
    if 'google-analytics' in html_str: stack.add('Google Analytics')

    return list(stack)
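
# Editor's usage sketch (not part of the original file): the function only
# needs .headers on the response, so a bare namespace stands in here.
def _tech_stack_demo():
    from types import SimpleNamespace
    html = '<html><head><meta name="generator" content="WordPress 6.4"></head></html>'
    fake_response = SimpleNamespace(headers={'Server': 'nginx'})
    print(detect_tech_stack(BeautifulSoup(html, 'html.parser'), fake_response))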

def analyze_ai_content(text):
    """Analyze text (sentiment, summary, readability, and keywords)"""
    try:
        from textblob import TextBlob
        import re

        # Clean the text
        clean_text = re.sub(r'\s+', ' ', text).strip()
        if not clean_text: return None

        blob = TextBlob(clean_text)
        sentiment = blob.sentiment

        # 1. Summarization (simple frequency-based)
        sentences = blob.sentences
        if len(sentences) > 0:
            # Simple summary: the first sentence + 2 interesting sentences
            # A way to avoid heavy libraries like NLTK/spaCy
            summary_sentences = [sentences[0].string]

            # Look for diverse sentences among the rest
            remaining = sentences[1:]
            remaining.sort(key=lambda s: len(s.noun_phrases), reverse=True)
            for s in remaining[:2]:
                summary_sentences.append(s.string)

            summary = ' '.join(summary_sentences)
        else:
            summary = clean_text[:200] + "..."

        # 2. Readability (Flesch Reading Ease)
        # Formula: 206.835 - 1.015 * (total words / total sentences) - 84.6 * (total syllables / total words)
        words = blob.words
        num_sentences = len(sentences) or 1
        num_words = len(words) or 1

        # Syllable approximation (vowel groups)
        def count_syllables(word):
            word = word.lower()
            count = len(re.findall(r'[aeiouy]+', word))
            if word.endswith('e'): count -= 1
            return max(1, count)

        num_syllables = sum(count_syllables(w) for w in words)

        flesch_score = 206.835 - 1.015 * (num_words / num_sentences) - 84.6 * (num_syllables / num_words)

        readability_level = "Standard"
        if flesch_score > 80: readability_level = "Very Easy (Kids)"
        elif flesch_score > 60: readability_level = "Plain English"
        elif flesch_score > 40: readability_level = "Difficult (College)"
        else: readability_level = "Very Difficult (Academic)"

        # 3. Enhanced keywords
        keywords = []
        seen = set()
        for phrase in blob.noun_phrases:
            p = phrase.lower().strip()
            # Filter out junk and short words
            if len(p) > 4 and p not in seen and not re.match(r'^\d+$', p) and len(keywords) < 12:
                keywords.append(phrase.title())
                seen.add(p)

        return {
            'sentiment': {
                'polarity': round(sentiment.polarity, 2),
                'subjectivity': round(sentiment.subjectivity, 2),
                'label': 'Positive' if sentiment.polarity > 0.1 else 'Negative' if sentiment.polarity < -0.1 else 'Neutral',
                'subjectivity_label': 'Opinionated' if sentiment.subjectivity > 0.5 else 'Objective'
            },
            'summary': summary,
            'readability': {
                'score': round(flesch_score, 1),
                'level': readability_level
            },
            'keywords': keywords
        }
    except Exception as e:
        print(f"AI Analysis failed: {e}")
        return None
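
# Editor's worked example (not part of the original file): for the text
# "The cat sat. It was happy." the approximation above gives 6 words,
# 2 sentences, and 7 vowel-group syllables ("happy" counts 2), so:
#   206.835 - 1.015 * (6 / 2) - 84.6 * (7 / 6) ≈ 105.1  →  "Very Easy (Kids)"
def _readability_demo():
    result = analyze_ai_content("The cat sat. It was happy.")
    if result:
        print(result['readability'])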

def check_broken_links(url, soup, headers):
    """Check for broken links (404s), in parallel."""
    broken_links = []
    links_to_check = []

    from urllib.parse import urlparse, urljoin
    base_domain = urlparse(url).netloc

    # Deduplicate the links
    seen_links = set()

    for a in soup.find_all('a', href=True):
        href = a['href']
        full_url = urljoin(url, href)

        # Skip junk or non-http links
        if not full_url.startswith('http') or full_url in seen_links:
            continue

        seen_links.add(full_url)

        is_internal = base_domain in full_url
        link_text = a.get_text().strip()[:50]  # Truncate the text

        links_to_check.append({
            'url': full_url,
            'text': link_text or "No Text",
            'is_internal': is_internal
        })

    # Limit to 50 links for performance in this demo
    links_to_check = links_to_check[:50]

    def check_status(link_info):
        try:
            # Use a HEAD request for speed
            r = requests.head(link_info['url'], headers=headers, timeout=5, allow_redirects=True, verify=False)
            if r.status_code >= 400:
                return {
                    'url': link_info['url'],
                    'text': link_info['text'],
                    'status': r.status_code,
                    'is_internal': link_info['is_internal']
                }
        except Exception:
            # If HEAD fails, try GET (some servers block HEAD)
            try:
                r = requests.get(link_info['url'], headers=headers, stream=True, timeout=5, verify=False)
                if r.status_code >= 400:
                    return {
                        'url': link_info['url'],
                        'text': link_info['text'],
                        'status': r.status_code,
                        'is_internal': link_info['is_internal']
                    }
            except Exception:
                return {
                    'url': link_info['url'],
                    'text': link_info['text'],
                    'status': 0,  # Connection error
                    'is_internal': link_info['is_internal']
                }
        return None

    # Parallel execution
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        results = list(executor.map(check_status, links_to_check))

    # Filter out the Nones
    broken_links = [r for r in results if r]

    return broken_links
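
# Editor's usage sketch (not part of the original file); example.com and the
# User-Agent header are placeholders.
def _broken_links_demo():
    page_url = "https://example.com"
    resp = requests.get(page_url, timeout=10)
    page_soup = BeautifulSoup(resp.text, 'html.parser')
    for link in check_broken_links(page_url, page_soup, {'User-Agent': 'Mozilla/5.0'}):
        print(link['status'], link['url'])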

def execute_scrape_logic(url, fetch_images=False, fetch_videos=False, crawl_depth=1, use_proxy=False, device='desktop'):
    try:
        if not url.startswith('http'):
            url = 'https://' + url

        # Start the performance session
        perf_tracker.start_session(url)

        # Rotate the UA
        ua = proxy_manager.get_random_ua()
        headers = {'User-Agent': ua}
        print(f"Using UA: {ua}")

        # Disable SSL warnings
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        perf_tracker.record_phase("Setup & Proxy")

        try:
            response = None
            if use_proxy:
                # Extract the domain for smart learning
                from urllib.parse import urlparse
                domain = urlparse(url).netloc

                # Decide the task type
                task_type = 'general'
                if fetch_videos: task_type = 'video'
                elif fetch_images: task_type = 'image'

                # Smart loop: exploitation first, then exploration
                print(f"Selecting best proxy for {domain} (Task: {task_type})...")

                for attempt in range(3):
                    # Attempt 0: smart choice (best for this domain and task)
                    # Attempt 1+: fallback (random valid proxy)
                    if attempt == 0:
                        proxy = proxy_manager.get_smart_proxy(domain, task_type)
                    else:
                        proxy = proxy_manager.get_valid_proxy()

                    if not proxy:
                        break

                    print(f"Using proxy: {proxy} (Attempt {attempt+1})")
                    proxies = {'http': proxy, 'https': proxy}
                    try:
                        # verify=False to bypass SSL errors
                        response = requests.get(url, headers=headers, proxies=proxies, timeout=30, verify=False)
                        if response.status_code == 200:
                            # SUCCESS: train the model
                            print(f"✅ Proxy {proxy} worked for {domain} ({task_type}). Boosting score!")
                            proxy_manager.report_success(proxy, domain, task_type, True)
                            break
                        else:
                            # FAIL (non-200): train the model
                            print(f"❌ Proxy {proxy} failed for {domain} ({task_type}) (Status {response.status_code}). Penalizing.")
                            proxy_manager.report_success(proxy, domain, task_type, False)
                    except Exception as e:
                        # FAIL (exception): train the model
                        print(f"❌ Proxy {proxy} error for {domain} ({task_type}): {e}. Penalizing.")
                        proxy_manager.report_success(proxy, domain, task_type, False)
                        continue

                if response is None or response.status_code != 200:
                    print("All proxies failed. Falling back to direct connection.")
                    response = requests.get(url, headers=headers, timeout=30)
            else:
                # Direct request (with an SPA check)
                if 'linkedin.com' in url or 'instagram.com' in url:
                    print("⚠️ SPA Pattern Detected (LinkedIn/Instagram). Skipping requests...")
                    response = None  # Force Playwright
                else:
                    response = requests.get(url, headers=headers, timeout=30)

            # Check for a block or failure
            if response and response.status_code in [403, 401, 406, 429]:
                print(f"⚠️ Access Denied ({response.status_code}). Triggering Headless Browser...")
                response = None  # Force Playwright

            if response is None:
                raise Exception("Forcing Playwright for SPA/auth wall")

            response.raise_for_status()
            perf_tracker.record_phase("Fetch Content")

        except Exception as e:
            # If requests fail, use Playwright
            print(f"Requests failed ({e}). Attempting Playwright Fallback...")
            if PLAYWRIGHT_AVAILABLE:
                pw_html = scrape_with_playwright(url)
                if pw_html:
                    # Mock response object for compatibility
                    class MockResponse:
                        def __init__(self, text):
                            self.text = text
                            self.content = text.encode('utf-8')
                            self.status_code = 200
                            self.headers = {}
                    response = MockResponse(pw_html)
                else:
                    return jsonify({'error': f'Request failed and Playwright fallback failed: {str(e)}'}), 400
            else:
                return jsonify({'error': f'Request failed: {str(e)}'}), 400

        soup = BeautifulSoup(response.text, 'html.parser')
        perf_tracker.record_phase("HTML Parsing")

        # --- HONEYPOT DETECTOR (Security Scout) ---
        security_report = {'level': 'LOW', 'threats': [], 'honeypots': 0}

        # 1. Status code check
        if response.status_code in [403, 406, 429, 503]:
            security_report['threats'].append(f"Suspicious Status Code: {response.status_code}")
            security_report['level'] = 'HIGH'

        # 2. Keyword analysis (anti-bot)
        page_text_lower = response.text.lower()
        threat_keywords = ['cloudflare', 'managed challenge', 'captcha', 'security check', 'access denied', 'waf']
        found_threats = [kw for kw in threat_keywords if kw in page_text_lower]
        if found_threats:
            security_report['threats'].append(f"Anti-Bot Detected: {', '.join(found_threats)}")
            if 'captcha' in found_threats or 'challenge' in found_threats:
                security_report['level'] = 'HIGH'
            elif security_report['level'] == 'LOW':
                security_report['level'] = 'MEDIUM'

        # 3. Honeypot link detection (CSS traps)
        # Find links that are visible to bots but not to humans
        honeypot_links = 0
        for a in soup.find_all('a', style=True):
            style = a['style'].lower().replace(' ', '')
            if 'display:none' in style or 'visibility:hidden' in style or 'opacity:0' in style:
                honeypot_links += 1

        if honeypot_links > 0:
            security_report['honeypots'] = honeypot_links
            security_report['threats'].append(f"Honeypot Traps: {honeypot_links} hidden links found")
            if security_report['level'] == 'LOW': security_report['level'] = 'MEDIUM'
        # ------------------------------------------

        # Initialize the containers
        videos = []
        video_count = 0
        images = []
        image_count = 0
        seen_images = set()

        # Helper validation
        def cleaner_url_validator(url):
            try:
                # Must start with http, contain a dot, and have no spaces
                if not url.startswith('http'): return False
                if ' ' in url: return False
                if '.' not in url.split('://')[1]: return False
                return True
            except:
                return False

        # Helper for Turbo-Fetch (parallel chunk download)
        def download_file_turbo(url, filepath):
            try:
                # 1. Get the size
                head = requests.head(url, headers=headers, timeout=5, verify=False)
                size = int(head.headers.get('content-length', 0))

                # Use Turbo only for files larger than 2MB
                if size < 2 * 1024 * 1024:
                    return False

                print(f"Turbo-Fetch active: {os.path.basename(filepath)} ({size/1024/1024:.1f} MB)")

                # 2. Calculate the chunks (8 parts)
                num_chunks = 8
                chunk_size = size // num_chunks
                chunks = []
                for i in range(num_chunks):
                    start = i * chunk_size
                    end = start + chunk_size - 1 if i < num_chunks - 1 else size - 1
                    chunks.append((start, end, i))

                # 3. Parallel download
                file_data = bytearray(size)

                def download_chunk(c):
                    start, end, idx = c
                    h = headers.copy()
                    h['Range'] = f'bytes={start}-{end}'
                    r = requests.get(url, headers=h, timeout=20, verify=False)
                    if r.status_code in [200, 206]:
                        # Write directly into the buffer
                        file_data[start:end+1] = r.content
                        return True
                    return False

                with concurrent.futures.ThreadPoolExecutor(max_workers=8) as exc:
                    futures = [exc.submit(download_chunk, c) for c in chunks]
                    concurrent.futures.wait(futures)

                # 4. Save to disk
                with open(filepath, 'wb') as f:
                    f.write(file_data)

                return True

            except Exception as e:
                print(f"Turbo failed: {e}")
                return False
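
        # Editor's worked example (not part of the original file): for a
        # 10,000,000-byte file, chunk_size = 10_000_000 // 8 = 1_250_000, so
        # the Range headers are bytes=0-1249999, bytes=1250000-2499999, ...,
        # and the last chunk runs to size-1: bytes=8750000-9999999. Each 206
        # response is written into file_data at its own offset, so the chunks
        # can complete in any order.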

        def process_video_download_task(task_item):
            v_url, quality, title_hint = task_item

            # Check for m3u8 (HLS) first
            if v_url.split('?')[0].lower().endswith('.m3u8'):
                return {
                    'url': v_url,
                    'original_url': v_url,
                    'filename': 'Stream.m3u8',
                    'external': True,
                    'is_m3u8': True,
                    'quality': quality or 'auto'
                }

            # RETRY LOOP (max 3 attempts)
            for attempt in range(3):
                try:
                    # verify=False; 15s timeout for stability, stream=True
                    vid_data = requests.get(v_url, headers=headers, timeout=15, stream=True, verify=False)

                    if vid_data.status_code == 200:
                        content_type = vid_data.headers.get('content-type', '').lower()

                        # Minimum size filter: skip videos smaller than 2MB
                        content_length = vid_data.headers.get('content-length')
                        if content_length:
                            size_mb = int(content_length) / (1024 * 1024)
                            if size_mb < 2:
                                return None

                        if 'video' in content_type or 'octet-stream' in content_type or v_url.endswith(('.mp4', '.webm', '.mov')) or 'mpegurl' in content_type:
                            if 'mpegurl' in content_type or v_url.split('?')[0].lower().endswith('.m3u8'):
                                return {
                                    'url': v_url,
                                    'original_url': v_url,
                                    'filename': 'Stream.m3u8',
                                    'external': True,
                                    'is_m3u8': True
                                }
                            filename = os.path.basename(v_url.split('?')[0]) or 'video.mp4'
                            if not filename.endswith(('.mp4', '.webm', '.ogg', '.mov', '.avi', '.mkv')):
                                filename += '.mp4'

                            # Sanitize the title and generate the filename
                            import uuid
                            import re

                            if title_hint:
                                safe_title = re.sub(r'[^a-zA-Z0-9_\-\. ]', '', title_hint).strip().replace(' ', '_')[:50]
                                if safe_title:
                                    base, ext = os.path.splitext(filename)
                                    filename = f"{safe_title}_{uuid.uuid4().hex[:8]}{ext}"
                                else:
                                    base, ext = os.path.splitext(filename)
                                    if quality and quality != 'unknown':
                                        filename = f"{base}_{quality}_{uuid.uuid4().hex[:8]}{ext}"
                                    else:
                                        filename = f"{base}_{uuid.uuid4().hex[:8]}{ext}"
                            else:
                                base, ext = os.path.splitext(filename)
                                if quality and quality != 'unknown':
                                    filename = f"{base}_{quality}_{uuid.uuid4().hex[:8]}{ext}"
                                else:
                                    filename = f"{base}_{uuid.uuid4().hex[:8]}{ext}"

                            filepath = f'webfiles/scraped/videos/{filename}'

                            # Try TURBO FETCH first
                            if not download_file_turbo(v_url, filepath):
                                # Fall back to a standard stream
                                with open(filepath, 'wb') as f:
                                    for chunk in vid_data.iter_content(chunk_size=8192):
                                        f.write(chunk)
                            return {
                                'url': f'/download/videos/{filename}',
                                'original_url': v_url,
                                'filename': filename,
                                'external': False,
                                'quality': quality
                            }
                    elif vid_data.status_code in [403, 404, 401]:
                        # Don't retry on these errors
                        return None

                except Exception as ex:
                    # Only print the full error on the last attempt
                    if attempt == 2:
                        err_str = str(ex)
                        if 'NameResolutionError' in err_str:
                            print(f"⚠️ DNS Error (possibly an ad/tracker)")
                        elif 'RemoteDisconnected' in err_str:
                            print(f"⚠️ Connection Dropped: {v_url}")
                        else:
                            print(f"❌ Failed {v_url}: {ex}")
                    else:
                        time.sleep(1)  # Wait a little before retrying
                    continue
            return None

        # OpenCV image validation
        def validate_image_quality(image_bytes):
            try:
                import cv2
                import numpy as np

                # Decode the image
                nparr = np.frombuffer(image_bytes, np.uint8)
                img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

                if img is None: return False

                # Check 1: Resolution (skip small icons/thumbnails)
                h, w, _ = img.shape
                # Increased strictness from 50 to 150
                if w < 150 or h < 150:
                    return False

                # Check 2: Variance (solid colors or flat images)
                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                variance = cv2.Laplacian(gray, cv2.CV_64F).var()
                # Increased strictness from 50 to 100
                if variance < 100:
                    return False  # Too blurry or flat

                # Check 3: Entropy (information density)
                hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
                hist_norm = hist.ravel() / hist.sum()
                logs = np.log2(hist_norm + 0.0001)
                entropy = -np.sum(hist_norm * logs)

                # Increased strictness from 3.5 to 5.0
                if entropy < 5.0:
                    return False  # Too little information

                return True
            except Exception as e:
                return True  # Fail open to be safe
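
        # Editor's worked example (not part of the original file): the entropy
        # above is Shannon entropy of the grayscale histogram,
        # H = -sum(p_i * log2(p_i)). A flat single-color image puts all mass
        # in one bin (H ≈ 0), while a uniform spread across all 256 bins gives
        # the maximum H = log2(256) = 8, so the 5.0 threshold demands a fairly
        # rich tonal range.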
|
|
1071
|
+
|
|
1072
|
+
def is_valuable_media(url_path, element, media_type='image'):
|
|
1073
|
+
"""Ads aur logos ke liye filtering logic."""
|
|
1074
|
+
try:
|
|
1075
|
+
# 1. URL Path Keywords
|
|
1076
|
+
lower_url = url_path.lower()
|
|
1077
|
+
ad_keywords = [
|
|
1078
|
+
'ad', 'advert', 'banner', 'doubleclick', 'googleads',
|
|
1079
|
+
'syndication', 'amazon-adsystem', 'wp-content/ads/',
|
|
1080
|
+
'promoted', 'sponsored', 'pixel', 'tracking', 'taboola', 'outbrain'
|
|
1081
|
+
]
|
|
1082
|
+
# Logos
|
|
1083
|
+
logo_keywords = ['logo', 'brand-logo', 'header-logo', 'footer-logo', 'favicon']
|
|
1084
|
+
|
|
1085
|
+
# Ads check karo
|
|
1086
|
+
if any(k in lower_url for k in ad_keywords):
|
|
1087
|
+
return False
|
|
1088
|
+
|
|
1089
|
+
# Logos check karo
|
|
1090
|
+
if any(k in lower_url for k in logo_keywords):
|
|
1091
|
+
return False
|
|
1092
|
+
|
|
1093
|
+
# 2. Metadata (Alt/Title)
|
|
1094
|
+
alt = element.get('alt', '').lower() if element.get('alt') else ""
|
|
1095
|
+
title = element.get('title', '').lower() if element.get('title') else ""
|
|
1096
|
+
metadata_text = f"{alt} {title}"
|
|
1097
|
+
|
|
1098
|
+
if any(k in metadata_text for k in ['ad ', 'ads ', 'advertisement', 'sponsored', 'logo', 'branding']):
|
|
1099
|
+
return False
|
|
1100
|
+
|
|
1101
|
+
# 3. CSS Metadata (Class/ID)
|
|
1102
|
+
classes = " ".join(element.get('class', [])) if isinstance(element.get('class'), list) else str(element.get('class', ''))
|
|
1103
|
+
id_val = str(element.get('id', ''))
|
|
1104
|
+
|
|
1105
|
+
# Parents check karo (simple BS4 method)
|
|
1106
|
+
parent_context = ""
|
|
1107
|
+
parent = element.parent
|
|
1108
|
+
levels = 0
|
|
1109
|
+
while parent and levels < 3:
|
|
1110
|
+
p_classes = " ".join(parent.get('class', [])) if isinstance(parent.get('class'), list) else str(parent.get('class', ''))
|
|
1111
|
+
parent_context += p_classes + " " + str(parent.get('id', '')) + " "
|
|
1112
|
+
parent = parent.parent
|
|
1113
|
+
levels += 1
|
|
1114
|
+
|
|
1115
|
+
context_text = (classes + " " + id_val + " " + parent_context).lower()
|
|
1116
|
+
bad_contexts = [' ad ', ' ads ', 'banner', 'logo', 'brand', 'sponsored', 'advert', 'widget-area']
|
|
1117
|
+
if any(k in context_text for k in bad_contexts):
|
|
1118
|
+
# Header ya sidebar check karo (risk areas hain)
|
|
1119
|
+
if any(x in context_text for x in ['header', 'sidebar', 'footer', 'nav']):
|
|
1120
|
+
return False
|
|
1121
|
+
|
|
1122
|
+
# 4. Dimensions (sirf image ke liye)
|
|
1123
|
+
width = element.get('width')
|
|
1124
|
+
height = element.get('height')
|
|
1125
|
+
if width and height:
|
|
1126
|
+
try:
|
|
1127
|
+
w = int(width)
|
|
1128
|
+
h = int(height)
|
|
1129
|
+
if h > 0:
|
|
1130
|
+
ratio = w / h
|
|
1131
|
+
# Banners aksar bahut wide ya tall hote hain
|
|
1132
|
+
if (ratio > 4 or ratio < 0.25) and (w < 900 and h < 900):
|
|
1133
|
+
return False
|
|
1134
|
+
except:
|
|
1135
|
+
pass
|
|
1136
|
+
|
|
1137
|
+
return True
|
|
1138
|
+
except:
|
|
1139
|
+
return True # Fail open
|
|
1140
|
+
|
|
1141
|
+
# Images download karne ke liye helper
|
|
1142
|
+
def process_image_download_task(img_src):
|
|
1143
|
+
try:
|
|
1144
|
+
img_url = requests.compat.urljoin(url, img_src)
|
|
1145
|
+
img_data = requests.get(img_url, headers=headers, timeout=3, verify=False)
|
|
1146
|
+
if img_data.status_code == 200:
|
|
1147
|
+
content = img_data.content
|
|
1148
|
+
|
|
1149
|
+
# QUALITY CHECK KARO
|
|
1150
|
+
if not validate_image_quality(content):
|
|
1151
|
+
return None
|
|
1152
|
+
|
|
1153
|
+
filename = os.path.basename(img_url.split('?')[0]) or 'image.jpg'
|
|
1154
|
+
if not filename.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg')):
|
|
1155
|
+
filename += '.jpg'
|
|
1156
|
+
|
|
1157
|
+
import uuid
|
|
1158
|
+
import hashlib
|
|
1159
|
+
|
|
1160
|
+
image_hash = hashlib.md5(content).hexdigest()
|
|
1161
|
+
|
|
1162
|
+
filename = f"{uuid.uuid4().hex[:8]}_{filename}"
|
|
1163
|
+
|
|
1164
|
+
filepath = f'webfiles/scraped/images/{filename}'
|
|
1165
|
+
with open(filepath, 'wb') as f:
|
|
1166
|
+
f.write(content)
|
|
1167
|
+
return (img_src, f'images/{filename}', f'/download/images/{filename}', image_hash, filepath)
|
|
1168
|
+
except:
|
|
1169
|
+
pass
|
|
1170
|
+
return None
|
|
1171
|
+
|
|
1172
|
+
# CSS nikalo
|
|
1173
|
+
css_content = []
|
|
1174
|
+
for style in soup.find_all('style'):
|
|
1175
|
+
css_content.append(style.string or '')
|
|
1176
|
+
style.extract()
|
|
1177
|
+
|
|
1178
|
+
for link in soup.find_all('link', rel='stylesheet'):
|
|
1179
|
+
href = link.get('href')
|
|
1180
|
+
if href:
|
|
1181
|
+
try:
|
|
1182
|
+
css_url = requests.compat.urljoin(url, href)
|
|
1183
|
+
css_resp = requests.get(css_url, headers=headers, timeout=5)
|
|
1184
|
+
css_content.append(f'/* From {href} */\n' + css_resp.text)
|
|
1185
|
+
except:
|
|
1186
|
+
pass
|
|
1187
|
+
link.extract()
|
|
1188
|
+
|
|
1189
|
+
# JS nikalo
|
|
1190
|
+
js_content = []
|
|
1191
|
+
# Find all scripts (head and body)
|
|
1192
|
+
for script in soup.find_all('script'):
|
|
1193
|
+
# Non-executable scripts (JSON-LD, etc) filter karo
|
|
1194
|
+
script_type = script.get('type', '').lower()
|
|
1195
|
+
if script_type and script_type not in ['text/javascript', 'application/javascript', 'module']:
|
|
1196
|
+
script.extract()
|
|
1197
|
+
continue
|
|
1198
|
+
|
|
1199
|
+
if script.string and not script.get('src'):
|
|
1200
|
+
js_content.append(script.string)
|
|
1201
|
+
elif script.get('src'):
|
|
1202
|
+
src = script.get('src')
|
|
1203
|
+
try:
|
|
1204
|
+
js_url = requests.compat.urljoin(url, src)
|
|
1205
|
+
js_resp = requests.get(js_url, headers=headers, timeout=5)
|
|
1206
|
+
js_content.append(f'// From {src}\n' + js_resp.text)
|
|
1207
|
+
except:
|
|
1208
|
+
pass
|
|
1209
|
+
# Original script tags hatao taaki execution errors/404s na aayein
|
|
1210
|
+
script.extract()
|
|
1211
|
+
|
|
1212
|
+
# Image Tasks collect karo
|
|
1213
|
+
image_tasks = []
|
|
1214
|
+
if fetch_images:
|
|
1215
|
+
os.makedirs('webfiles/scraped/images', exist_ok=True)
|
|
1216
|
+
|
|
1217
|
+
# Exclude karne ke liye Video Posters ID karo
|
|
1218
|
+
poster_blacklist = set()
|
|
1219
|
+
for video in soup.find_all('video'):
|
|
1220
|
+
poster = video.get('poster')
|
|
1221
|
+
if poster: poster_blacklist.add(poster)
|
|
1222
|
+
|
|
1223
|
+
for img in soup.find_all('img'):
|
|
1224
|
+
if len(image_tasks) >= 50: break # Limit to 50 images to prevent timeout
|
|
1225
|
+
src = img.get('src')
|
|
1226
|
+
if src and not src.startswith('data:') and not src.lower().endswith('.svg'):
|
|
1227
|
+
# Exclude video posters
|
|
1228
|
+
if src in poster_blacklist:
|
|
1229
|
+
continue
|
|
1230
|
+
|
|
1231
|
+
# VALUABLE MEDIA FILTER (Ads/Logos)
|
|
1232
|
+
if not is_valuable_media(src, img):
|
|
1233
|
+
continue
|
|
1234
|
+
|
|
1235
|
+
# Extension check taaki iframes/HTML avoid ho sakein
|
|
1236
|
+
if any(src.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff']):
|
|
1237
|
+
image_tasks.append(src)
|
|
1238
|
+
|
|
1239
|
+
# =========================================================
|
|
1240
|
+
# PERFORMANCE OPTIMIZATION: Images PEHLE process honge
|
|
1241
|
+
# Images download faster than videos, so they're prioritized
|
|
1242
|
+
# to improve perceived performance and provide quicker feedback.
|
|
1243
|
+
# =========================================================
|
|
1244
|
+
|
|
1245
|
+
# Image Downloads execute karo
|
|
1246
|
+
if fetch_images and image_tasks:
|
|
1247
|
+
seen_hashes = set()
|
|
1248
|
+
total_images = len(image_tasks)
|
|
1249
|
+
completed_images = 0
|
|
1250
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
|
|
1251
|
+
future_to_img = {executor.submit(process_image_download_task, src): src for src in image_tasks}
|
|
1252
|
+
for future in concurrent.futures.as_completed(future_to_img):
|
|
1253
|
+
completed_images += 1
|
|
1254
|
+
progress = int((completed_images / total_images) * 100)
|
|
1255
|
+
frames = ['🌑','🌒','🌓','🌔','🌕','🌖','🌗','🌘']
|
|
1256
|
+
spinner = frames[completed_images % len(frames)]
|
|
1257
|
+
sys.stdout.write(f'\r{spinner} Images: {completed_images}/{total_images} ({progress}%) ')
|
|
1258
|
+
sys.stdout.flush()
|
|
1259
|
+
result = future.result()
|
|
1260
|
+
if result:
|
|
1261
|
+
orig_src, relative_path, download_path, img_hash, filepath = result
|
|
1262
|
+
if img_hash in seen_hashes:
|
|
1263
|
+
# Duplicate content, delete file
|
|
1264
|
+
try:
|
|
1265
|
+
os.remove(filepath)
|
|
1266
|
+
except:
|
|
1267
|
+
pass
|
|
1268
|
+
continue
|
|
1269
|
+
seen_hashes.add(img_hash)
|
|
1270
|
+
# Update soup
|
|
1271
|
+
for img in soup.find_all('img', src=orig_src):
|
|
1272
|
+
img['src'] = relative_path
|
|
1273
|
+
images.append(download_path)
|
|
1274
|
+
image_count += 1
|
|
1275
|
+
sys.stdout.write(f'\r🌕 Images Done! ✅ {image_count} accepted \n')
|
|
1276
|
+
sys.stdout.flush()
|
|
1277
|
+
# Video Tasks collect karo
|
|
1278
|
+
video_tasks = []
|
|
1279
|
+
if fetch_videos:
|
|
1280
|
+
os.makedirs('webfiles/scraped/videos', exist_ok=True)
|
|
1281
|
+
# -------------------------------------------------------------------------
|
|
1282
|
+
# RESOURCE SNIFFER (Deep Scan)
|
|
1283
|
+
# Scans raw HTML/JS for hidden video links (mp4, m3u8, etc)
|
|
1284
|
+
# -------------------------------------------------------------------------
|
|
1285
|
+
# Video extensions wale links dhundhne ka pattern
|
|
1286
|
+
# Handles escaped slashes (common in JSON)
|
|
1287
|
+
# Captures: "https://example.com/video.mp4"
|
|
1288
|
+
sniffer_regex = r'(https?:\\?\/\\?\/[^"\'\s<>]+?\.(?:mp4|m3u8|webm|mov|mkv|ts|flv|wmv|3gp|f4v|mpg|mpeg|avi|m4v|ogg)(?:[^"\'\s<>]*)?)'
|
|
1289
|
+
matches = re.findall(sniffer_regex, response.text)
|
|
1290
|
+
sniffed_count = 0
|
|
1291
|
+
for match in matches:
|
|
1292
|
+
# Fix escaped slashes (e.g. from JSON: https:\/\/example.com)
|
|
1293
|
+
clean_url = match.replace('\\/', '/')
|
|
1294
|
+
# Basic validation
|
|
1295
|
+
if len(clean_url) > 200: continue # Likely garbage
|
|
1296
|
+
if not cleaner_url_validator(clean_url): continue
|
|
1297
|
+
|
|
1298
|
+
# Sniffed links ke liye AD FILTER
|
|
1299
|
+
ad_domains = ['doubleclick', 'adnxs', 'amazon-adsystem', 'googlesyndication', 'taboola', 'outbrain', 'ads-twitter', 'fb-ads']
|
|
1300
|
+
if any(ad in clean_url.lower() for ad in ad_domains):
|
|
1301
|
+
continue
|
|
1302
|
+
# URL mein quality clues check karo
|
|
1303
|
+
quality = 'unknown'
|
|
1304
|
+
lower_url = clean_url.lower()
|
|
1305
|
+
if '1080' in lower_url: quality = '1080p'
|
|
1306
|
+
elif '720' in lower_url: quality = '720p'
|
|
1307
|
+
elif '480' in lower_url: quality = '480p'
|
|
1308
|
+
# Avoid duplicates
|
|
1309
|
+
is_duplicate = False
|
|
1310
|
+
# Avoid duplicates
|
|
1311
|
+
is_duplicate = False
|
|
1312
|
+
for existing_url, _, _ in video_tasks:
|
|
1313
|
+
if existing_url == clean_url: is_duplicate = True; break
|
|
1314
|
+
for existing_vid in videos:
|
|
1315
|
+
if existing_vid['original_url'] == clean_url: is_duplicate = True; break
|
|
1316
|
+
if not is_duplicate:
|
|
1317
|
+
if any(x in clean_url for x in ['youtube', 'youtu.be', 'vimeo', 'dailymotion']):
|
|
1318
|
+
pass # Skip external for simple sniffer, usually handled by iframes
|
|
1319
|
+
else:
|
|
1320
|
+
video_tasks.append((clean_url, quality, f"sniffed_video_{sniffed_count}"))
|
|
1321
|
+
sniffed_count += 1
|
|
1322
|
+
            # Find video tags (scan sources and qualities)
            for video in soup.find_all('video'):
                # VALUABLE MEDIA FILTER
                if not is_valuable_media(video.get('src', ''), video, 'video'):
                    continue

                # Check for source tags
                sources = video.find_all('source')
                found_src = False

                if sources:
                    for source in sources:
                        src = source.get('src')
                        if src:
                            # Detect quality from attributes or text
                            quality = 'unknown'
                            s_text = (str(source) + src).lower()
                            if '1080' in s_text: quality = '1080p'
                            elif '720' in s_text: quality = '720p'
                            elif '480' in s_text: quality = '480p'
                            if src.startswith('http'):
                                video_url = src
                            else:
                                video_url = requests.compat.urljoin(url, src)
                            # Check external
                            if any(x in video_url for x in ['youtube', 'youtu.be', 'vimeo', 'dailymotion']):
                                videos.append({'url': video_url,'original_url': video_url,'filename': 'External Video','external': True})
                                video_count += 1
                            else:
                                # Extract title from the video tag or its context
                                video_title = video.get('title') or video.get('aria-label')
                                if not video_title:
                                    # Try the previous sibling header
                                    prev = video.find_previous(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
                                    if prev: video_title = prev.get_text().strip()

                                video_tasks.append((video_url, quality, video_title))
                            found_src = True

                # Fallback to direct src on the video tag
                if not found_src:
                    src = video.get('src')
                    if src:
                        if src.startswith('http'):
                            video_url = src
                        else:
                            video_url = requests.compat.urljoin(url, src)

                        if any(x in video_url for x in ['youtube', 'youtu.be', 'vimeo', 'dailymotion']):
                            videos.append({'url': video_url,'original_url': video_url,'filename': 'External Video','external': True})
                            video_count += 1
                        else:
                            # Extract title
                            video_title = video.get('title') or video.get('aria-label')
                            if not video_title:
                                prev = video.find_previous(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
                                if prev: video_title = prev.get_text().strip()
                            video_tasks.append((video_url, 'unknown', video_title))
            # Find links (a tags) that point at video files
            for a in soup.find_all('a'):
                href = a.get('href')
                if href:
                    # VALUABLE MEDIA FILTER
                    if not is_valuable_media(href, a, 'video'):
                        continue
                    lower_href = href.lower()
                    if lower_href.endswith(('.mp4', '.webm', '.ogg', '.mov', '.avi', '.mkv', '.m4v', '.m3u8', '.flv', '.wmv', '.3gp', '.f4v', '.mpg', '.mpeg', '.ts')) or \
                       (('/video/' in lower_href or '/videos/' in lower_href) and '.' in list(filter(None, lower_href.split('/')))[-1]):
                        # Detect quality from the link text
                        quality = 'unknown'
                        a_text = a.get_text().lower()
                        if '1080' in a_text: quality = '1080p'
                        elif '720' in a_text: quality = '720p'
                        elif '480' in a_text: quality = '480p'
                        if href.startswith('http'):
                            video_url = href
                        else:
                            video_url = requests.compat.urljoin(url, href)
                        if any(x in video_url for x in ['youtube', 'youtu.be', 'vimeo', 'dailymotion']):
                            videos.append({'url': video_url,'original_url': video_url,'filename': 'External Video','external': True})
                            video_count += 1
                        else:
                            # Extract title from the link text or attributes
                            video_title = a.get('title') or a.get('aria-label') or a.get_text().strip()
                            if not video_title:
                                prev = a.find_previous(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
                                if prev: video_title = prev.get_text().strip()

                            # Fall back to the page title if the link has no usable text
                            if not video_title:
                                page_title = soup.title.string if soup.title else ""
                                if page_title:
                                    # Clean up the page title (the site name usually sits at the end)
                                    video_title = page_title.split('|')[0].split('-')[0].strip()

                            video_tasks.append((video_url, quality, video_title))

            # Find iframes with video embeds
            for iframe in soup.find_all('iframe'):
                src = iframe.get('src', '')
                # VALUABLE MEDIA FILTER
                if not is_valuable_media(src, iframe, 'video'):
                    continue
                if any(x in src for x in ['youtube', 'youtu.be', 'vimeo', 'dailymotion']):
                    if not any(v['original_url'] == src for v in videos):
                        videos.append({'url': src,'original_url': src,'filename': f'Embed: {src.split("/")[2]}','external': True,'quality': 'unknown'})
                        video_count += 1
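
All three scanners above resolve relative `src`/`href` values against the page URL with `requests.compat.urljoin`, which is an alias for `urllib.parse.urljoin`. A quick illustration with hypothetical paths:

from urllib.parse import urljoin  # requests.compat.urljoin points here

page = 'https://example.com/blog/post.html'
print(urljoin(page, 'media/clip.mp4'))              # https://example.com/blog/media/clip.mp4
print(urljoin(page, '/assets/clip.mp4'))            # https://example.com/assets/clip.mp4
print(urljoin(page, '//cdn.example.com/clip.mp4'))  # https://cdn.example.com/clip.mp4
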
        # Execute the video downloads
        if fetch_videos and video_tasks:
            total_videos = len(video_tasks)
            completed_videos = 0
            with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
                future_to_vid = {executor.submit(process_video_download_task, item): item for item in video_tasks}
                for future in concurrent.futures.as_completed(future_to_vid):
                    completed_videos += 1
                    progress = int((completed_videos / total_videos) * 100)
                    frames = ['🌑','🌒','🌓','🌔','🌕','🌖','🌗','🌘']
                    spinner = frames[completed_videos % len(frames)]
                    sys.stdout.write(f'\r{spinner} Videos: {completed_videos}/{total_videos} ({progress}%) ')
                    sys.stdout.flush()
                    result = future.result()
                    if result:
                        # Avoid duplicates in the final list
                        if not any(v['original_url'] == result['original_url'] for v in videos):
                            videos.append(result)
                            video_count += 1
            sys.stdout.write(f'\r🌕 Videos Done! ✅ {video_count} found \n')
            sys.stdout.flush()
        # Update the HTML
        head = soup.find('head')
        if head:
            for link in head.find_all('link', rel='stylesheet'):
                link.extract()
            css_link = soup.new_tag('link', rel='stylesheet', href='style.css')
            head.insert(0, css_link)
        body = soup.find('body')
        if body:
            js_script = soup.new_tag('script', src='script.js')
            body.append(js_script)
        # Save the files
        html_content = str(soup)
        with open('webfiles/scraped/index.html', 'w', encoding='utf-8') as f:
            f.write(html_content)
        with open('webfiles/scraped/style.css', 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(css_content) or '/* No CSS found */')
        with open('webfiles/scraped/script.js', 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(js_content) or '// No JS found')
        # Calculate stats
        def get_size(content):
            size = len(content.encode('utf-8'))
            if size < 1024:
                return f'{size} B'
            elif size < 1024*1024:
                return f'{size/1024:.1f} KB'
            else:
                return f'{size/(1024*1024):.1f} MB'
        # --- DESIGN INSPECTOR ---
        design_data = {'colors': [], 'fonts': []}
        full_css = '\n'.join(css_content) + '\n' + html_content
        # Extract hex colors
        hex_colors = re.findall(r'#(?:[0-9a-fA-F]{3}){1,2}\b', full_css)
        # Keep unique values, sorted by frequency
        unique_colors = Counter(hex_colors).most_common(20)  # Top 20
        design_data['colors'] = [c[0] for c in unique_colors]

        # Extract fonts
        fonts = re.findall(r'font-family:\s*([^;]+)', full_css, re.IGNORECASE)
        unique_fonts = Counter([f.strip().strip("'").strip('"') for f in fonts]).most_common(10)
        design_data['fonts'] = [f[0] for f in unique_fonts]
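
The two design-inspector patterns can be probed in isolation. A small sketch on a hypothetical stylesheet; note that the crude quote-stripping keeps whatever follows the first font name, which is also how the code above behaves:

import re
from collections import Counter

css = "body { color: #333; font-family: 'Inter', sans-serif; } a { color: #1a73e8; } b { color: #333; }"
colors = Counter(re.findall(r'#(?:[0-9a-fA-F]{3}){1,2}\b', css)).most_common()
fonts = [f.strip().strip("'").strip('"') for f in re.findall(r'font-family:\s*([^;]+)', css, re.IGNORECASE)]
print(colors)  # [('#333', 2), ('#1a73e8', 1)]
print(fonts)   # ["Inter', sans-serif"]  <- the trailing quote/fallback survives
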
        # --- SEO ANALYSIS ---
        seo_data = {
            'title': soup.title.string if soup.title else None,
            'description': None, 'keywords': None,
            'headings': {'h1': 0, 'h2': 0, 'h3': 0},
            'images_analysis': {'total': 0, 'missing_alt': 0},
            'links_internal': 0, 'links_external': 0, 'score': 0
        }

        # Check meta tags
        msg_desc = soup.find('meta', attrs={'name': 'description'}) or soup.find('meta', attrs={'property': 'og:description'})
        if msg_desc: seo_data['description'] = msg_desc.get('content')
        msg_keys = soup.find('meta', attrs={'name': 'keywords'})
        if msg_keys: seo_data['keywords'] = msg_keys.get('content')
        # Count headings
        seo_data['headings'] = {
            'h1': len(soup.find_all('h1')), 'h2': len(soup.find_all('h2')), 'h3': len(soup.find_all('h3')),
        }

        # Images
        imgs = soup.find_all('img')
        seo_data['images_analysis']['total'] = len(imgs)
        seo_data['images_analysis']['missing_alt'] = len([img for img in imgs if not img.get('alt')])
        # Links and auditor bookkeeping
        all_links = soup.find_all('a')
        domain = requests.compat.urlparse(url).netloc
        check_urls = set()
        for link in all_links:
            href = link.get('href', '')
            if not href or href.startswith('#') or href.startswith('javascript'): continue
            if domain in href or href.startswith('/'):
                seo_data['links_internal'] += 1
            else:
                seo_data['links_external'] += 1
            # Collect for auditing (capped to 30 below to avoid timeouts)
            if href.startswith('http'):
                check_urls.add(href)
            elif href.startswith('/'):
                check_urls.add(requests.compat.urljoin(url, href))
        # Link Auditor (threaded)
        broken_links = []
        def check_link(l_url):
            try:
                r = requests.head(l_url, headers=headers, timeout=3)
                if r.status_code >= 400:
                    return {'url': l_url, 'status': r.status_code}
            except:
                return {'url': l_url, 'status': 'Failed'}
            return None

        # Verify up to 30 unique links
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(check_link, u) for u in list(check_urls)[:30]]
            for f in concurrent.futures.as_completed(futures):
                res = f.result()
                if res: broken_links.append(res)
        seo_data['broken_links'] = broken_links
        # --- DEEP CRAWL LOGIC ---
        site_structure = {'url': url, 'title': seo_data['title'], 'children': []}
        perf_tracker.record_phase("SEO & Parsing")
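
The auditor relies on HTTP HEAD, so only headers travel over the wire and a dead link surfaces without downloading its body. A standalone sketch of the same pattern against hypothetical URLs, with one deliberate difference from the code above: allow_redirects=True, so a redirect chain is followed to its final status instead of being reported as the intermediate hop:

import concurrent.futures
import requests

def audit(urls, workers=10):
    def check(u):
        try:
            status = requests.head(u, timeout=3, allow_redirects=True).status_code
            return (u, status) if status >= 400 else None
        except requests.RequestException:
            return (u, 'Failed')
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        return [r for r in pool.map(check, urls) if r]

print(audit(['https://example.com/', 'https://example.com/definitely-missing']))
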
        if crawl_depth > 1:
            # Simple recursive scraper logic
            def scrape_node(node_url, current_level):
                # Stop at max depth
                if current_level > crawl_depth: return None
                try:
                    # Reuse headers/verify settings
                    nr = requests.get(node_url, headers=headers, timeout=3, verify=False)
                    if nr.status_code == 200:
                        ns = BeautifulSoup(nr.text, 'html.parser')
                        # Guard against pages without a usable <title>
                        node_title = (ns.title.string if ns.title and ns.title.string else node_url).strip()[:50]  # Limit title length

                        child_nodes = []
                        # If max depth has not been reached, look for children
                        if current_level < crawl_depth:
                            n_links = []
                            for na in ns.find_all('a', href=True):
                                nh = na['href']
                                if not nh or nh.startswith('#') or nh.startswith('javascript'): continue
                                n_full = requests.compat.urljoin(node_url, nh)
                                # Strict internal-domain check
                                if domain in n_full:
                                    if n_full not in n_links and n_full != node_url:
                                        n_links.append(n_full)

                            # Recurse for the top 3 links to keep it fast.
                            # No parallelism here, to avoid spawning threads recursively.
                            for nl in list(set(n_links))[:3]:
                                child = scrape_node(nl, current_level + 1)
                                if child: child_nodes.append(child)

                        return {'url': node_url, 'title': node_title, 'children': child_nodes}
                except:
                    pass
                return {'url': node_url, 'title': 'Unreachable', 'children': []}

            # Filter start links (limit 5)
            start_links = list(set([u for u in check_urls if domain in u]))[:5]

            # Parallelize only the first level
            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as crawler:
                futures = {crawler.submit(scrape_node, u, 2): u for u in start_links}
                for f in concurrent.futures.as_completed(futures):
                    res = f.result()
                    if res: site_structure['children'].append(res)
            perf_tracker.record_phase("Deep Crawl")
        score = 0
        if seo_data['title']: score += 20
        if seo_data['description']: score += 20
        if seo_data['headings']['h1'] > 0: score += 20
        if url.startswith('https'): score += 10

        if seo_data['images_analysis']['total'] > 0:
            ratio = 1 - (seo_data['images_analysis']['missing_alt'] / seo_data['images_analysis']['total'])
            score += int(30 * ratio)
        else:
            score += 30

        seo_data['score'] = min(100, score)
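
A worked pass through the rubric: title, description, an h1, and https are worth 20+20+20+10, and the remaining 30 points scale with the alt-text ratio. For a hypothetical page with 10 images of which 2 lack alt text:

# 20 + 20 + 20 + 10, plus int(30 * (1 - 2/10)) = 24 for alt coverage
print(20 + 20 + 20 + 10 + int(30 * (1 - 2 / 10)))  # 94
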
        result = {
            'success': True,
            'security': security_report,
            'site_structure': site_structure,
            'seo': seo_data,
            'design': design_data,
            'stats': {
                'html': get_size(html_content),
                'css': get_size('\n\n'.join(css_content)),
                'js': get_size('\n\n'.join(js_content)),
                'image_count': image_count,
                'video_count': video_count
            },
            'images': images,
            'videos': videos,
            'broken_links': check_broken_links(url, soup, headers),
            'intel': {
                'emails': extract_emails(response.text),
                'phones': extract_phones(response.text),
                'locations': extract_locations(soup),
                'socials': extract_social_media(soup),
                'tech_stack': detect_tech_stack(soup, response),
                'ai_analysis': analyze_ai_content(soup.get_text(separator=' ', strip=True)[:50000])  # Limit to 50k chars for perf
            }
        }

        perf_tracker.record_phase("AI & Intel")
        perf_data = perf_tracker.finish_and_print()
        result['performance'] = perf_data

        return result

    except Exception as e:
        print(f"Error in execute_scrape_logic: {e}")
        traceback.print_exc()
        return {'success': False, 'error': str(e)}
@app.route('/api/scrape', methods=['POST'])
def api_scrape():
    data = request.get_json()
    url = data.get('url', '')
    fetch_images = data.get('fetch_images', False)
    fetch_videos = data.get('fetch_videos', False)
    crawl_depth = int(data.get('crawl_depth', 2))
    use_proxy = data.get('use_proxy', False)
    device = data.get('device', 'desktop')

    result = execute_scrape_logic(url, fetch_images, fetch_videos, crawl_depth, use_proxy, device)
    if result.get('success'):
        return jsonify(result)
    else:
        return jsonify(result), 500
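
A minimal client sketch against a locally running instance; the port here (5000) is an assumption, substitute whatever PORT resolves to in this module:

import requests

payload = {'url': 'https://example.com', 'fetch_images': False, 'crawl_depth': 1}
r = requests.post('http://127.0.0.1:5000/api/scrape', json=payload, timeout=120)
report = r.json()
print(report['seo']['score'], report['stats'])
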
@app.route('/api/bulk', methods=['POST'])
def api_bulk():
    try:
        data = request.get_json()
        urls = data.get('urls', [])
        fetch_images = data.get('fetch_images', False)

        if not urls:
            return jsonify({'success': False, 'error': 'No URLs provided'})

        timestamp = int(time.time())
        base_folder = f'webfiles/bulk/batch_{timestamp}'
        os.makedirs(base_folder, exist_ok=True)
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        processed = 0
        for i, url in enumerate(urls):
            if not url.strip(): continue
            try:
                if not url.startswith('http'): url = 'https://' + url

                # Create a per-site subfolder
                domain = requests.compat.urlparse(url).netloc.replace(':', '_')
                site_folder = f'{base_folder}/{i+1}_{domain}'
                os.makedirs(site_folder, exist_ok=True)
                os.makedirs(f'{site_folder}/images', exist_ok=True)

                # Fetch the page
                response = requests.get(url, headers=headers, timeout=10)
                soup = BeautifulSoup(response.text, 'html.parser')

                # Save the HTML
                with open(f'{site_folder}/index.html', 'w', encoding='utf-8') as f:
                    f.write(str(soup))

                # CSS logic
                css_content = []
                for link in soup.find_all('link', rel='stylesheet'):
                    href = link.get('href')
                    if href:
                        try:
                            css_url = requests.compat.urljoin(url, href)
                            css_content.append(requests.get(css_url, headers=headers, timeout=5).text)
                        except: pass
                with open(f'{site_folder}/style.css', 'w', encoding='utf-8') as f:
                    f.write('\n'.join(css_content))

                # Download images if requested
                if fetch_images:
                    for img in soup.find_all('img'):
                        src = img.get('src')
                        if src and not src.startswith('data:') and not src.lower().endswith('.svg'):
                            try:
                                img_url = requests.compat.urljoin(url, src)
                                fname = os.path.basename(img_url.split('?')[0]) or 'image.jpg'
                                if not fname.endswith(('.jpg','.png','.jpeg','.webp')): fname += '.jpg'

                                r = requests.get(img_url, headers=headers, timeout=5)
                                if r.status_code == 200:
                                    with open(f'{site_folder}/images/{fname}', 'wb') as f:
                                        f.write(r.content)
                            except: pass

                processed += 1
            except Exception as e:
                print(f"Failed to scrape {url}: {e}")

        # Create the ZIP
        shutil.make_archive(base_folder, 'zip', base_folder)
        shutil.rmtree(base_folder)  # Clean up the folder, keep only the ZIP

        return jsonify({
            'success': True,
            'message': f'Successfully scraped {processed} sites.',
            'download_url': f'/download/bulk/batch_{timestamp}.zip'
        })

    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/download/bulk/<path:filename>')
def serve_bulk_file(filename):
    return send_from_directory('webfiles/bulk', filename)

@app.route('/api/save', methods=['POST'])
def api_save():
    try:
        data = request.get_json()
        filename = data.get('filename')
        content = data.get('content')

        if filename in ['index.html', 'style.css', 'script.js']:
            with open(f'webfiles/scraped/{filename}', 'w', encoding='utf-8') as f:
                f.write(content)
            return jsonify({'success': True})
        else:
            return jsonify({'success': False, 'error': 'Invalid filename'}), 400
    except Exception as e:
        return jsonify({'success': False, 'error': str(e)}), 500
@app.route('/api/download-zip')
def download_zip():
    try:
        zip_path = '/tmp/scraped_files.zip'
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk('webfiles/scraped'):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, 'webfiles/scraped')
                    zipf.write(file_path, arcname)

        return send_file(zip_path, as_attachment=True, download_name='scraped_files.zip')
    except Exception as e:
        return jsonify({'error': str(e)}), 500

def clear_scraped_data():
    try:
        folder = 'webfiles/scraped'
        if os.path.exists(folder):
            shutil.rmtree(folder)
        os.makedirs('webfiles/scraped', exist_ok=True)
        os.makedirs('webfiles/scraped/images', exist_ok=True)
        os.makedirs('webfiles/scraped/videos', exist_ok=True)
        return True
    except Exception as e:
        print(f"Cleanup Error: {e}")
        return False
@app.route('/api/clear', methods=['POST'])
def api_clear():
    if clear_scraped_data():
        return jsonify({'success': True})
    else:
        return jsonify({'success': False, 'error': 'Cleanup failed'}), 500
@app.route('/api/export', methods=['POST'])
def api_export():
    try:
        req_data = request.get_json()
        data = req_data.get('data')
        export_format = req_data.get('format', 'csv').lower()
        filename = req_data.get('filename', 'export')

        if not data:
            return jsonify({'error': 'No data provided'}), 400

        # Build an in-memory buffer for the chosen format
        if export_format == 'csv':
            si = io.StringIO()
            # Check whether data is a simple list or a list of dicts
            if isinstance(data, list) and len(data) > 0:
                if isinstance(data[0], dict):
                    # List of dicts (e.g. socials)
                    keys = data[0].keys()
                    writer = csv.DictWriter(si, fieldnames=keys)
                    writer.writeheader()
                    writer.writerows(data)
                else:
                    # Simple list (e.g. emails)
                    writer = csv.writer(si)
                    writer.writerow(['Value'])  # Generic header
                    for item in data:
                        writer.writerow([item])

            output = si.getvalue()
            mem = io.BytesIO()
            mem.write(output.encode('utf-8'))
            mem.seek(0)

            return send_file(
                mem,
                mimetype='text/csv',
                as_attachment=True,
                download_name=f'{filename}.csv'
            )

        elif export_format == 'json':
            mem = io.BytesIO()
            mem.write(json.dumps(data, indent=2).encode('utf-8'))
            mem.seek(0)

            return send_file(
                mem,
                mimetype='application/json',
                as_attachment=True,
                download_name=f'{filename}.json'
            )

        else:
            return jsonify({'error': 'Unsupported format'}), 400

    except Exception as e:
        print(f"Export Error: {e}")
        return jsonify({'error': str(e)}), 500
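
A hypothetical client call against a local instance (port 5000 assumed): a plain list comes back as a one-column CSV attachment under the generic 'Value' header, one item per row.

import requests

resp = requests.post('http://127.0.0.1:5000/api/export',
                     json={'data': ['a@example.com', 'b@example.com'],
                           'format': 'csv', 'filename': 'emails'})
print(resp.headers.get('Content-Disposition'))  # attachment; filename=emails.csv
print(resp.text)
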
@app.route('/api/translate', methods=['POST'])
def api_translate():
    try:
        data = request.get_json()
        text = data.get('text')
        target_lang = data.get('target', 'hi')  # Default to Hindi

        if not text:
            return jsonify({'error': 'No text provided'}), 400

        translated = mtranslate.translate(text, target_lang)
        return jsonify({
            'success': True,
            'translated': translated
        })
    except Exception as e:
        print(f"Translation Error: {e}")
        return jsonify({'error': str(e)}), 500
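
The endpoint is a thin wrapper over the mtranslate library; the same call works directly, given network access to Google Translate. 'hi' is the target language code (Hindi):

import mtranslate

print(mtranslate.translate('Hello world', 'hi'))
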
def wait_for_server(port, timeout=10):
    start = time.time()
    while time.time() - start < timeout:
        try:
            import urllib.request
            urllib.request.urlopen(f'http://127.0.0.1:{port}/', timeout=1)
            return True
        except:
            time.sleep(0.5)
    return False


# --- IMAGE ANALYSIS FEATURE ---
def get_decimal_from_dms(dms, ref):
    degrees = dms[0]
    minutes = dms[1]
    seconds = dms[2]

    decimal = degrees + (minutes / 60.0) + (seconds / 3600.0)
    if ref in ['S', 'W']:
        decimal = -decimal
    return decimal
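
A worked example of the degrees-minutes-seconds conversion, using a hypothetical point near Delhi: 28° 36' 50.4" N gives 28 + 36/60 + 50.4/3600 = 28.614, and an 'S' or 'W' reference flips the sign.

print(round(get_decimal_from_dms((28, 36, 50.4), 'N'), 3))  # 28.614
print(round(get_decimal_from_dms((77, 12, 32.4), 'E'), 3))  # 77.209
print(round(get_decimal_from_dms((28, 36, 50.4), 'S'), 3))  # -28.614
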
def get_image_metadata(image):
    info = {
        "Format": image.format,
        "Mode": image.mode,
        "Size": f"{image.width} x {image.height}",
        "Width": image.width,
        "Height": image.height,
        "Info": image.info.get('comment', '')
    }

    # EXIF data
    exif_data = {}
    gps_data = {}

    try:
        exif = image._getexif()
        if exif:
            for tag, value in exif.items():
                decoded = ExifTags.TAGS.get(tag, tag)
                if decoded == "GPSInfo":
                    gps_data = {}
                    for t in value:
                        sub_decoded = ExifTags.GPSTAGS.get(t, t)
                        gps_data[sub_decoded] = value[t]
                else:
                    # Filter out binary data
                    if isinstance(value, bytes):
                        try:
                            value = value.decode()
                        except:
                            value = "<binary data>"
                    exif_data[decoded] = str(value)
    except Exception as e:
        print(f"EXIF Error: {e}")

    # Process GPS data
    location = None
    if gps_data:
        try:
            lat = get_decimal_from_dms(gps_data.get('GPSLatitude'), gps_data.get('GPSLatitudeRef'))
            lon = get_decimal_from_dms(gps_data.get('GPSLongitude'), gps_data.get('GPSLongitudeRef'))
            location = {'lat': lat, 'lon': lon, 'map_url': f"https://www.google.com/maps?q={lat},{lon}"}
        except Exception as e:
            print(f"GPS Parse Error: {e}")

    return {
        "basic": info,
        "exif": exif_data,
        "gps": str(gps_data),
        "location": location
    }
def generate_ela(image, quality=90, scale=10):
    """
    Generates an ELA (Error Level Analysis) image.
    1. Re-saves the original image at a specific JPEG quality (compression).
    2. Takes the difference between the original and the compressed copy.
    3. Enhances the difference for visualisation.
    """
    try:
        # Convert if not RGB (e.g. RGBA, P)
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Save a compressed version in memory
        buffer = BytesIO()
        image.save(buffer, 'JPEG', quality=quality)
        buffer.seek(0)
        compressed_image = Image.open(buffer)

        # Compute the difference
        ela_image = ImageChops.difference(image, compressed_image)

        # Boost brightness so the differences are visible
        ela_image = ImageEnhance.Brightness(ela_image).enhance(scale)

        return ela_image
    except Exception as e:
        print(f"ELA Error: {e}")
        return None
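
A minimal local usage sketch (file names are hypothetical, and it assumes running where generate_ela is importable). Regions that were pasted in or re-edited tend to glow brighter in the output, because they compress differently from the surrounding pixels:

from PIL import Image

img = Image.open('suspect.jpg')
ela = generate_ela(img, quality=90, scale=10)
if ela:
    ela.save('suspect_ela.png')
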
@app.route('/api/analyze/ela', methods=['POST'])
def analyze_ela():
    try:
        if 'image' not in request.files:
            return jsonify({'error': 'No image file provided'}), 400

        file = request.files['image']
        image = Image.open(file.stream)

        ela_image = generate_ela(image)

        if ela_image:
            # Convert the ELA result to base64
            buffered = BytesIO()
            ela_image.save(buffered, format="PNG")  # PNG preserves the ELA detail
            img_str = base64.b64encode(buffered.getvalue()).decode()
            return jsonify({'success': True, 'ela_image': f"data:image/png;base64,{img_str}"})
        else:
            return jsonify({'error': 'Failed to generate ELA'}), 500

    except Exception as e:
        return jsonify({'error': str(e)}), 500
def compute_ai_likelihood(image):
    """
    Checks an image for AI-generation artifacts (using an FFT).
    Returns a likelihood score (0-100) and a label.
    """
    try:
        # Grayscale and resize (for consistent analysis)
        img_gray = image.convert('L').resize((512, 512))
        img_array = np.array(img_gray)

        # FFT logic
        f = np.fft.fft2(img_array)
        fshift = np.fft.fftshift(f)
        magnitude_spectrum = 20 * np.log(np.abs(fshift) + 1e-10)  # Log scale

        # Heuristic: AI images often carry high-frequency energy or grid artifacts,
        # so we look at the variance of the high-frequency components.

        # Build a high-pass mask
        rows, cols = img_array.shape
        crow, ccol = rows//2, cols//2
        mask_radius = 50

        # Zero out the low frequencies (center of the shifted spectrum)
        magnitude_high_freq = magnitude_spectrum.copy()
        magnitude_high_freq[crow-mask_radius:crow+mask_radius, ccol-mask_radius:ccol+mask_radius] = 0

        # Stats over the high frequencies
        hf_mean = np.mean(magnitude_high_freq)
        hf_std = np.std(magnitude_high_freq)

        # Simple heuristic mapping (tuned for demo purposes).
        # Real images tend to have low HF variance unless heavily textured;
        # GANs often leave high-energy artifacts behind.

        # Normalize to a score (this is an estimate).
        # Assume a natural-image std of roughly 30-50; AI output can run higher.
        # A sigmoid-like mapping would absorb natural variation better.

        score = min(100, max(0, (hf_std - 40) * 2 + 50))

        # Refinement idea: "checkerboard" artifacts are strong indicators.
        # That needs peak detection, but variance is a decent proxy.

        label = "Likely Real"
        if score > 60:
            label = "Possible AI / Edited"
        if score > 80:
            label = "Likely AI Generated"

        return {
            "score": round(score, 1),
            "label": label,
            "details": f"HF Variance: {round(hf_std, 2)}"
        }
    except Exception as e:
        print(f"AI Detection Error: {e}")
        return {"score": 0, "label": "Error", "details": str(e)}
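
A worked pass through the score mapping above: a measured hf_std of 55 gives (55 - 40) * 2 + 50 = 80, i.e. right at the "Likely AI Generated" threshold, and the min/max clamp keeps the result in 0-100.

for hf_std in (30, 40, 55, 70):
    print(hf_std, min(100, max(0, (hf_std - 40) * 2 + 50)))
# 30 -> 30, 40 -> 50, 55 -> 80, 70 -> 100 (clamped)
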
@app.route('/api/analyze/ai', methods=['POST'])
def analyze_ai():
    try:
        if 'image' not in request.files:
            return jsonify({'error': 'No image file provided'}), 400

        file = request.files['image']
        image = Image.open(file.stream)

        result = compute_ai_likelihood(image)

        return jsonify({'success': True, 'data': result})

    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/api/analyze-image', methods=['POST'])
def analyze_image():
    try:
        image = None
        source_type = "upload"

        # 1. Check for a file upload
        if 'file' in request.files:
            file = request.files['file']
            if file.filename == '':
                return jsonify({'error': 'No selected file'}), 400
            try:
                image = Image.open(file.stream)
            except Exception as e:
                return jsonify({'error': f'Invalid image file: {e}'}), 400

        # 2. Check for a URL
        elif 'url' in request.form or (request.is_json and 'url' in (request.get_json() or {})):
            data = request.get_json() if request.is_json else request.form
            url = data.get('url')
            if not url:
                return jsonify({'error': 'No URL provided'}), 400

            source_type = "url"
            try:
                headers = {'User-Agent': 'Mozilla/5.0'}
                resp = requests.get(url, headers=headers, stream=True, timeout=15, verify=False)
                resp.raise_for_status()
                image = Image.open(BytesIO(resp.content))
            except Exception as e:
                return jsonify({'error': f'Failed to fetch image from URL: {e}'}), 400

        else:
            return jsonify({'error': 'No image provided (file or url)'}), 400

        # Process the image
        metadata = get_image_metadata(image)
        ai_detection = compute_ai_likelihood(image)

        return jsonify({
            'success': True,
            'source': source_type,
            'data': metadata,
            'ai_detection': ai_detection
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500
def display_qr_image(url):
    """Generate a QR code and render it in the terminal as ASCII"""
    qr = qrcode.QRCode(
        version=1,
        error_correction=qrcode.constants.ERROR_CORRECT_L,
        box_size=10,
        border=4,
    )
    qr.add_data(url)
    qr.make(fit=True)

    # Print the ASCII QR code (invert=True renders better on most terminals)
    qr.print_ascii(invert=True)
    print(f"\n🔗 {url}")
def start_cloudflare_tunnel(port):
    try:
        if os.name == 'nt':
            os.system("taskkill /F /IM cloudflared.exe >NUL 2>&1")
        else:
            os.system("pkill -f cloudflared 2>/dev/null")
        time.sleep(1)

        # Choose the executable based on the OS
        cf_executable = os.path.join(DATA_DIR, 'cloudflared.exe') if os.name == 'nt' else os.path.join(DATA_DIR, 'cloudflared')

        # Download it if missing (Linux/Colab)
        if not os.path.exists(cf_executable) and os.name != 'nt':
            print("Downloading cloudflared...")
            subprocess.run(['wget', '-q', '-O', cf_executable, 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64'])
            subprocess.run(['chmod', '+x', cf_executable])

        process = subprocess.Popen(
            [cf_executable, 'tunnel', '--protocol', 'http2', '--url', f'http://127.0.0.1:{port}'],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1
        )

        url_pattern = r'https://[a-z0-9-]+\.trycloudflare\.com'
        start_time = time.time()

        while time.time() - start_time < 30:
            line = process.stdout.readline()
            if line:
                match = re.search(url_pattern, line)
                if match:
                    url = match.group(0)
                    time.sleep(2)
                    return url, process
        return None, None
    except:
        return None, None
def print_cli_report(data):
    """Beautiful colorized two-column report for CLI mode"""
    if not data.get('success'):
        print(f"\n{Fore.RED if COLOR_SUPPORT else ''}❌ Scrape Failed: {data.get('error')}")
        return

    c_b = Fore.CYAN if COLOR_SUPPORT else ""
    c_g = Fore.GREEN if COLOR_SUPPORT else ""
    c_y = Fore.YELLOW if COLOR_SUPPORT else ""
    c_r = Fore.RED if COLOR_SUPPORT else ""
    c_m = Fore.MAGENTA if COLOR_SUPPORT else ""
    c_w = Fore.WHITE if COLOR_SUPPORT else ""
    R = Style.RESET_ALL if COLOR_SUPPORT else ""

    W = 37  # Inner width per column
    SEP = "─"

    def vlen(s):
        # Visible length: ignore ANSI color escape sequences
        return len(re.sub(r'\033\[[0-9;]*m', '', s))

    def pad(s, w):
        return s + ' ' * max(0, w - vlen(s))

    # --- Build LEFT column: Security + SEO + Intel ---
    left = []
    sec = data['security']
    sec_icon = "🟢" if sec['level'] == 'LOW' else "🟡" if sec['level'] == 'MEDIUM' else "🔴"
    sec_c = c_g if sec['level'] == 'LOW' else c_y if sec['level'] == 'MEDIUM' else c_r
    left.append(f"SECURITY: {sec_icon} {sec_c}{sec['level']}{R}")
    for t in sec['threats'][:3]:
        left.append(f" - {t[:W-3]}")
    left.append(SEP)

    seo = data['seo']
    bar = '█' * int(seo['score']/5)
    left.append(f"SEO: {c_g}{seo['score']}{R}/100 {bar}")
    left.append(f"H1:{seo['headings']['h1']} H2:{seo['headings']['h2']} H3:{seo['headings']['h3']}")
    left.append(f"Imgs: {seo['images_analysis']['total']} total, {seo['images_analysis']['missing_alt']} no-alt")
    left.append(SEP)

    intel = data['intel']
    left.append("INTEL (OSINT):")
    left.append(f" 📧 Emails: {len(intel['emails'])}")
    for e in intel['emails'][:2]:
        left.append(f"   {e[:W-4]}")
    left.append(f" 📱 Phones: {len(intel['phones'])}")
    for p in intel['phones'][:2]:
        left.append(f"   {p[:W-4]}")
    left.append(f" 📍 Locations: {len(intel['locations'])}")
    left.append(f" 🛠️ {c_m}{', '.join(intel['tech_stack'][:3])[:W-4]}{R}")

    # --- Build RIGHT column: AI + Performance ---
    right = []
    if intel.get('ai_analysis'):
        ai = intel['ai_analysis']
        right.append("AI ANALYSIS:")
        right.append(f" Sentiment: {c_m}{ai['sentiment']['label']}{R} ({ai['sentiment']['polarity']})")
        right.append(f" Readability: {ai['readability']['level']} ({ai['readability']['score']})")
        right.append(" Keywords:")
        kw = ', '.join(ai['keywords'][:5])
        right.append(f"  {kw[:W-2]}")
        right.append(SEP)

    perf = data.get('performance', {})
    if perf and perf.get('total'):
        right.append(f"PERFORMANCE: {c_y}{perf['total']:.2f}s{R}")
        for phase, dur in perf['phases'].items():
            perc = (dur / perf['total']) * 100
            bl = int(perc / 5)
            b = "█" * bl
            right.append(f" {phase[:12]:<12} {c_g}{b:<10}{R} {perc:>3.0f}%")
        right.append(f" Avg:{perf['avg']:.1f}s Best:{perf['best']:.1f}s")

    # Equalize rows
    mx = max(len(left), len(right))
    while len(left) < mx: left.append("")
    while len(right) < mx: right.append("")

    # --- Render ---
    url_display = data['site_structure']['url'][:W*2]
    total_w = W * 2 + 5  # inner total

    print(f"\n{c_b}╔{'═'*total_w}╗{R}")
    print(f"{c_b}║ {R}{pad('intelligence report: ' + url_display, total_w - 1)}{c_b}║{R}")
    print(f"{c_b}╠{'═'*W}═╦═{'═'*W}══╣{R}")

    for i in range(mx):
        l, r = left[i], right[i]
        l_sep = (l == SEP)
        r_sep = (r == SEP)

        if l_sep and r_sep:
            print(f"{c_b}╟{'─'*W}─╫─{'─'*W}──╢{R}")
        elif l_sep:
            print(f"{c_b}╟{'─'*W}─╫ {R}{pad(r, W+1)}{c_b}║{R}")
        elif r_sep:
            print(f"{c_b}║ {R}{pad(l, W)}{c_b}╟ {R}{' '*W} {c_b}║{R}")
        else:
            print(f"{c_b}║ {R}{pad(l, W)}{c_b}║ {R}{pad(r, W+1)}{c_b}║{R}")

    print(f"{c_b}╚{'═'*W}═╩═{'═'*W}══╝{R}\n")
def is_valid_url(text):
    """Smart detection for URLs/domains - ultra strict"""
    text = text.lower().strip()
    if not text or ' ' in text or len(text) < 4: return False
    if text.startswith('http'): return True
    if text.startswith(('/', '.', '@')): return False

    # Must have a legitimate domain-like structure
    parts = text.split('.')
    if len(parts) >= 2:
        # Check TLD (2-12 chars, letters only)
        tld = parts[-1].split('/')[0]
        if tld.isalpha() and 2 <= len(tld) <= 12:
            # Common file types to exclude
            if tld in ['py', 'json', 'txt', 'md', 'exe', 'log', 'bat', 'sh', 'zip', 'rar']:
                return False
            # Ensure the domain part isn't empty or invalid
            domain_part = parts[-2]
            if domain_part and any(c.isalnum() for c in domain_part):
                return True
    return False
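
A few probes of the validator above, with the expected results:

for probe in ('example.com', 'https://x.io', 'script.py', '/etc/hosts', 'hello world'):
    print(probe, is_valid_url(probe))
# example.com True | https://x.io True | script.py False (excluded TLD)
# /etc/hosts False (leading slash) | hello world False (contains a space)
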
def run_cli_mode(initial_url=None):
    """Interactive CLI prompting flow"""
    if initial_url and not is_valid_url(initial_url):
        print(f"\n{Fore.RED}⚠️ Invalid Link: {Fore.WHITE}{initial_url}")
        time.sleep(1.5)
        return

    os.system('cls' if os.name == 'nt' else 'clear')
    banner = r"""██╗ ██╗███████╗██████╗ ████████╗ ██████╗ ██████╗ ██╗ ███████╗
██║ ██║██╔════╝██╔══██╗ ╚══██╔══╝██╔═══██╗██╔═══██╗██║ ██╔════╝
██║ █╗ ██║█████╗ ██████╔╝ ██║ ██║ ██║██║ ██║██║ ███████╗
██║███╗██║██╔══╝ ██╔══██╗ ██║ ██║ ██║██║ ██║██║ ╚════██║
╚███╔███╔╝███████╗██████╔╝ ██║ ╚██████╔╝╚██████╔╝███████╗███████║
╚══╝╚══╝ ╚══════╝╚═════╝ ╚═╝ ╚═════╝ ╚═════╝ ╚══════╝╚══════╝"""
    print_gradient_text(banner, (0, 255, 255), (255, 0, 255))
    print(f"{' ' * 45}{Fore.WHITE}{Style.DIM}Dev: Abhinav Adarsh{Style.RESET_ALL}")
    print(f"{Fore.CYAN}{Style.BRIGHT} ADVANCED CLI INTELLIGENCE MODE{Style.RESET_ALL}")

    try:
        url = initial_url if initial_url else input(f"{Fore.LIGHTGREEN_EX if COLOR_SUPPORT else ''}link > {Style.RESET_ALL}").strip()
        if not url: return

        # Double-check validity on manual input
        if not initial_url and not is_valid_url(url):
            print(f"\n{Fore.RED}⚠️ Invalid Link: {Fore.WHITE}{url}")
            time.sleep(1.5)
            return

        if AUTOCOMPLETE_AVAILABLE:
            if not url.startswith('/'):
                try: readline.write_history_file(HISTORY_FILE)
                except: pass
            else:
                try: readline.remove_history_item(readline.get_current_history_length() - 1)
                except: pass

        print("\n⚙️ Scrape Options:")
        fetch_img = input(" - Fetch & Analyze Images? (y/N) > ").lower() == 'y'
        if AUTOCOMPLETE_AVAILABLE:
            try: readline.remove_history_item(readline.get_current_history_length() - 1)
            except: pass
        fetch_vid = input(" - Fetch & Deep-Scan Videos? (y/N) > ").lower() == 'y'
        if AUTOCOMPLETE_AVAILABLE:
            try: readline.remove_history_item(readline.get_current_history_length() - 1)
            except: pass
        depth = input(" - Crawl Depth (1-3) [Default 2] > ").strip()
        depth = int(depth) if depth.isdigit() else 2
        if AUTOCOMPLETE_AVAILABLE:
            try: readline.remove_history_item(readline.get_current_history_length() - 1)
            except: pass
        use_proxy = input(" - Use Intelligent Proxies? (y/N) > ").lower() == 'y'
        if AUTOCOMPLETE_AVAILABLE:
            try: readline.remove_history_item(readline.get_current_history_length() - 1)
            except: pass

        with MoonSpinner("Scanning"):
            result = execute_scrape_logic(url, fetch_img, fetch_vid, depth, use_proxy)
            print_cli_report(result)

        input("Press Enter to return to main menu and CLEAR session data...")
        clear_scraped_data()
        print("🧹 Session data cleared.")
        time.sleep(1)
    except KeyboardInterrupt:
        print("\n\n⚠️ Input interrupted. Returning to menu...")
        time.sleep(1)
def run_image_forensics_mode(initial_image=None):
    """CLI flow for image analysis"""
    os.system('cls' if os.name == 'nt' else 'clear')
    banner = r"""██╗███╗ ███╗ █████╗ ██████╗ ███████╗ ███████╗ ██████╗ ██████╗ ███████╗███╗ ██╗███████╗██╗ ██████╗███████╗
██║████╗ ████║██╔══██╗██╔════╝ ██╔════╝ ██╔════╝██╔═══██╗██╔══██╗██╔════╝████╗ ██║██╔════╝██║██╔════╝██╔════╝
██║██╔████╔██║███████║██║ ███╗█████╗ █████╗ ██║ ██║██████╔╝█████╗ ██╔██╗ ██║███████╗██║██║ ███████╗
██║██║╚██╔╝██║██╔══██║██║ ██║██╔══╝ ██╔══╝ ██║ ██║██╔══██╗██╔══╝ ██║╚██╗██║╚════██║██║██║ ╚════██║
██║██║ ╚═╝ ██║██║ ██║╚██████╔╝███████╗ ██║ ╚██████╔╝██║ ██║███████╗██║ ╚████║███████║██║╚██████╗███████║
╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝ ╚══════╝ ╚═╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝╚═╝ ╚═══╝╚══════╝╚═╝ ╚═════╝╚══════╝"""
    print_gradient_text(banner, (255, 100, 100), (100, 100, 255))
    print(f"{' ' * 45}{Fore.WHITE}{Style.DIM}Dev : Abhinav Adarsh{Style.RESET_ALL}\n")

    try:
        user_input = initial_image if initial_image else input(f"{Fore.LIGHTGREEN_EX if COLOR_SUPPORT else ''}image link or local path > {Style.RESET_ALL}").strip()
        if not user_input: return

        # Handle quotes (e.g. from copy-pasting a path)
        user_input = user_input.strip('"\'')

        image = None
        source_type = "local"

        with MoonSpinner("Analyzing Image"):
            try:
                # Is it a URL?
                if user_input.startswith(('http://', 'https://')):
                    source_type = "url"
                    headers = {'User-Agent': proxy_manager.get_random_ua()}
                    resp = requests.get(user_input, headers=headers, stream=True, timeout=15, verify=False)
                    resp.raise_for_status()
                    image = Image.open(BytesIO(resp.content))
                else:
                    # Assume it's a local path
                    if os.path.exists(user_input):
                        image = Image.open(user_input)
                    else:
                        print(f"\n{Fore.RED}Error: File or URL not found: {Fore.WHITE}{user_input}")
                        time.sleep(2)
                        return

                if not image:
                    raise Exception("Failed to load image")

                # Process
                metadata = get_image_metadata(image)
                ai_detection = compute_ai_likelihood(image)

                print_image_forensics_report({
                    'source': source_type,
                    'path': user_input,
                    'metadata': metadata,
                    'ai': ai_detection
                })

            except Exception as e:
                print(f"\n{Fore.RED}Analysis Failed: {Fore.WHITE}{e}")
                time.sleep(2)
                return

        input("\nPress Enter to return to main menu...")
    except KeyboardInterrupt:
        pass
def print_image_forensics_report(data):
    """Beautiful terminal table for image analysis results"""
    c_b = Fore.CYAN if COLOR_SUPPORT else ""
    c_g = Fore.GREEN if COLOR_SUPPORT else ""
    c_y = Fore.YELLOW if COLOR_SUPPORT else ""
    c_r = Fore.RED if COLOR_SUPPORT else ""
    c_w = Fore.WHITE if COLOR_SUPPORT else ""
    R = Style.RESET_ALL if COLOR_SUPPORT else ""

    meta = data['metadata']
    basic = meta['basic']
    ai = data['ai']

    total_w = 80
    def vlen(s): return len(re.sub(r'\033\[[0-9;]*m', '', s))
    def pad(s, w): return s + ' ' * (w - vlen(s))

    print(f"\n{c_b}╔{'═'*(total_w-2)}╗{R}")
    title = f" forensic report: {os.path.basename(data['path'])[:40]} "
    print(f"{c_b}║{R}{c_w}{Style.BRIGHT}{title.center(total_w-2)}{R}{c_b}║{R}")
    print(f"{c_b}╠{'═'*(total_w-2)}╣{R}")

    # Row helper
    def print_row(key, val, color=c_w):
        k = f" {key}:"
        line = f"{c_y}{pad(k, 20)}{R} {color}{val}{R}"
        print(f"{c_b}║{R} {pad(line, total_w-4)} {c_b}║{R}")

    # Basic info
    print_row("Source Type", data['source'].upper(), c_g)
    print_row("Format", basic['Format'])
    print_row("Resolution", basic['Size'])
    print_row("Color Mode", basic['Mode'])

    # AI detection
    ai_score = ai['score']
    ai_color = c_r if ai_score > 70 else (c_y if ai_score > 40 else c_g)
    print_row("AI Likelihood", f"{ai_score}% ({ai['label']})", ai_color)

    # GPS / location
    if meta.get('location'):
        print_row("GPS Coordinates", f"{meta['location']['lat']}, {meta['location']['lon']}", c_m := (Fore.MAGENTA if COLOR_SUPPORT else ""))
        print_row("Map Link", meta['location']['map_url'], c_b)

    # Crucial EXIF
    exif = meta['exif']
    important_tags = ['Make', 'Model', 'Software', 'DateTime', 'LensModel', 'ExposureTime', 'ISOSpeedRatings']
    found_exif = False
    for tag in important_tags:
        if tag in exif:
            if not found_exif:
                print(f"{c_b}╟{'─'*(total_w-2)}╢{R}")
                found_exif = True
            print_row(tag, exif[tag])

    print(f"{c_b}╚{'═'*(total_w-2)}╝{R}\n")
def main_launcher():
    """Mode selection menu on startup"""
    menu_commands = ['/web', '/cli', '/image', '/help', '/clear', '/quit', '/history', '/w', '/c', '/i', '/h', '/q', '/hi', '--help']
    setup_autocomplete(menu_commands)

    while True:
        try:
            os.system('cls' if os.name == 'nt' else 'clear')
            banner = r"""██╗ ██╗███████╗██████╗ ████████╗ ██████╗ ██████╗ ██╗ ███████╗
██║ ██║██╔════╝██╔══██╗ ╚══██╔══╝██╔═══██╗██╔═══██╗██║ ██╔════╝
██║ █╗ ██║█████╗ ██████╔╝ ██║ ██║ ██║██║ ██║██║ ███████╗
██║███╗██║██╔══╝ ██╔══██╗ ██║ ██║ ██║██║ ██║██║ ╚════██║
╚███╔███╔╝███████╗██████╔╝ ██║ ╚██████╔╝╚██████╔╝███████╗███████║
╚══╝╚══╝ ╚══════╝╚═════╝ ╚═╝ ╚═════╝ ╚═════╝ ╚══════╝╚══════╝"""
            print_gradient_text(banner, (0, 255, 255), (255, 0, 255))
            print(f"{' ' * 45}{Fore.WHITE}{Style.DIM}Dev: Abhinav Adarsh{Style.RESET_ALL}")
            print(f"{Fore.WHITE}Type {Fore.CYAN}/help{Fore.WHITE} or {Fore.CYAN}/h{Fore.WHITE} to see all commands.\n")

            choice = input(f"{Fore.LIGHTGREEN_EX if COLOR_SUPPORT else ''}> {Style.RESET_ALL}").strip().lower()
            if AUTOCOMPLETE_AVAILABLE:
                try: readline.remove_history_item(readline.get_current_history_length() - 1)
                except: pass

            if choice in ['/web', '/w']:
                start_web_server()
            elif choice in ['/cli', '/c']:
                run_cli_mode()
            elif choice in ['/image', '/i']:
                run_image_forensics_mode()
            elif choice in ['/help', '/h', '--help']:
                print(f"\n{Fore.CYAN if COLOR_SUPPORT else ''}Available Commands:")
                print(f" {Fore.CYAN}/web{Style.RESET_ALL} - Launches the web engine for browser-based auditing.")
                print(f" {Fore.CYAN}/cli{Style.RESET_ALL} - Runs a deep-scan intelligence report in the terminal.")
                print(f" {Fore.CYAN}/image{Style.RESET_ALL} - Local/Remote Image Forensics & AI detection (Alias: /i).")
                print(f" {Fore.CYAN}/clear{Style.RESET_ALL} - Purges the 'webfiles/scraped' directory and clears screen.")
                print(f" {Fore.CYAN}/history{Style.RESET_ALL} - Shows command history (Alias: /hi).")
                print(f" {Fore.CYAN}/help{Style.RESET_ALL} - Displays this help message (Alias: /h, --help).")
                print(f" {Fore.RED}/quit{Style.RESET_ALL} - Shuts down the application safely.")
                input("\nPress Enter to continue...")
            elif choice in ['/history', '/hi']:
                if AUTOCOMPLETE_AVAILABLE and os.path.exists(HISTORY_FILE):
                    print(f"\n{Fore.CYAN}--- Command History ---{Style.RESET_ALL}")
                    with open(HISTORY_FILE, 'r') as f:
                        lines = f.readlines()
                    for i, line in enumerate(lines[-20:]):  # Show last 20
                        print(f"{Fore.WHITE}{i+1}. {line.strip()}")
                else:
                    print(f"\n{Fore.YELLOW}No history found.")
                input("\nPress Enter to continue...")
            elif choice in ['/clear']:  # '/c' is already claimed by the /cli branch above
                clear_scraped_data()
                os.system('cls' if os.name == 'nt' else 'clear')
                print("Cache purged and screen cleared.")
                time.sleep(1)
            elif choice in ['/quit', '/q']:
                print(f"\n{Fore.YELLOW if COLOR_SUPPORT else ''}Goodbye!")
                sys.exit()
            elif is_valid_url(choice):
                run_cli_mode(choice)
        except KeyboardInterrupt:
            print("\n\nGoodbye!")
            sys.exit()
def start_web_server():
    """Original server startup logic"""
    public_url, tunnel_proc = None, None
    with MoonSpinner("Initializing Web Engine"):
        threading.Thread(target=lambda: app.run(host='0.0.0.0', port=PORT, debug=False, use_reloader=False, threaded=True), daemon=True).start()

        if wait_for_server(PORT, timeout=10):
            public_url, tunnel_proc = start_cloudflare_tunnel(PORT)
        else:
            print("❌ Server failed to start")
            return

    if public_url:
        os.system('cls' if os.name == 'nt' else 'clear')
        print("Scan this QR code :\n")
        display_qr_image(public_url)
        print("\nPress Ctrl+C to stop.")

        try:
            if 'google.colab' in sys.modules:
                print("\n Running in Background.")
                while True: time.sleep(100)
            else:
                while True: time.sleep(1)
        except KeyboardInterrupt:
            print("\n Thank you for using ^_^ Web Tools")
            if tunnel_proc:
                try: tunnel_proc.terminate()
                except: pass
    else:
        print("❌ Failed to create tunnel")
# Start everything
if __name__ == '__main__':
    main_launcher()
|