waymore-7.6-py3-none-any.whl → waymore-8.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waymore/__init__.py +1 -1
- waymore/waymore.py +1829 -305
- {waymore-7.6.dist-info → waymore-8.0.dist-info}/METADATA +5 -2
- waymore-8.0.dist-info/RECORD +8 -0
- {waymore-7.6.dist-info → waymore-8.0.dist-info}/WHEEL +1 -1
- waymore-7.6.dist-info/RECORD +0 -8
- {waymore-7.6.dist-info → waymore-8.0.dist-info}/entry_points.txt +0 -0
- {waymore-7.6.dist-info → waymore-8.0.dist-info}/licenses/LICENSE +0 -0
- {waymore-7.6.dist-info → waymore-8.0.dist-info}/top_level.txt +0 -0
waymore/waymore.py
CHANGED
@@ -70,6 +70,7 @@ stopSourceAlienVault = False
 stopSourceURLScan = False
 stopSourceVirusTotal = False
 stopSourceIntelx = False
+stopSourceGhostArchive = False
 successCount = 0
 failureCount = 0
 fileCount = 0
@@ -79,6 +80,7 @@ totalPages = 0
 indexFile = None
 continueRespFile = None
 continueRespFileURLScan = None
+continueRespFileGhostArchive = None
 inputIsDomainANDPath = False
 inputIsSubDomain = False
 subs = "*."
@@ -102,6 +104,7 @@ checkAlienVault = 0
 checkURLScan = 0
 checkVirusTotal = 0
 checkIntelx = 0
+checkGhostArchive = 0
 argsInputHostname = ""
 responseOutputDirectory = ""
 urlscanRequestLinks = set()
@@ -112,11 +115,14 @@ linkCountAlienVault = 0
 linkCountURLScan = 0
 linkCountVirusTotal = 0
 linkCountIntelx = 0
+linkCountGhostArchive = 0
 linksFoundCommonCrawl = set()
 linksFoundAlienVault = set()
 linksFoundURLScan = set()
 linksFoundVirusTotal = set()
 linksFoundIntelx = set()
+linksFoundGhostArchive = set()
+ghostArchiveRequestLinks = set()

 # Thread lock for protecting shared state during concurrent operations
 links_lock = threading.Lock()
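The new GhostArchive collections above are shared across worker threads, so every update goes through `links_lock`, the same lock already guarding `linksFound`. A minimal sketch of that guarded-update pattern; the helper function and URLs here are hypothetical, not part of the diff:

```python
import threading

links_lock = threading.Lock()
linksFoundGhostArchive = set()

def addGhostArchiveLink(link):
    # Hold the shared lock so concurrent workers can't corrupt the set
    with links_lock:
        linksFoundGhostArchive.add(link)

# Duplicate adds are harmless set no-ops
addGhostArchiveLink("https://example.com/page1")
addGhostArchiveLink("https://example.com/page1")
print(len(linksFoundGhostArchive))  # -> 1
```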
@@ -124,6 +130,7 @@ links_lock = threading.Lock()
 # Shared state for link collection across all sources
 linksFound = set()
 linkMimes = set()
+extraWarcLinks = set()  # Track extra URLs found in WARC files for mode B

 # Source Provider URLs
 WAYBACK_URL = "https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest"
@@ -134,6 +141,8 @@ URLSCAN_DOM_URL = "https://urlscan.io/dom/"
 VIRUSTOTAL_URL = "https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}"
 # Paid endpoint first, free endpoint as fallback
 INTELX_BASES = ["https://2.intelx.io", "https://free.intelx.io"]
+GHOSTARCHIVE_URL = "https://ghostarchive.org/search?term={DOMAIN}&page="
+GHOSTARCHIVE_DOM_URL = "https://ghostarchive.org"

 intelx_tls = threading.local()

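`GHOSTARCHIVE_URL` is a paginated search endpoint: the code substitutes `{DOMAIN}` and appends a page number to the trailing `page=`. A sketch of how a request URL could be built from this constant; the example domain and the zero-based page numbering are assumptions, since this hunk doesn't show the pagination loop:

```python
from urllib.parse import quote

GHOSTARCHIVE_URL = "https://ghostarchive.org/search?term={DOMAIN}&page="

def buildGhostArchiveSearchUrl(domain, page):
    # Substitute the target domain and append the page number
    return GHOSTARCHIVE_URL.replace("{DOMAIN}", quote(domain)) + str(page)

print(buildGhostArchiveSearchUrl("example.com", 0))
# https://ghostarchive.org/search?term=example.com&page=0
```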
@@ -247,10 +256,10 @@ DEFAULT_LIMIT = 5000
 DEFAULT_TIMEOUT = 30

 # Exclusions used to exclude responses we will try to get from web.archive.org
-DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource"
+DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx,.avif"

 # MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
-DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/
+DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff,video/x-ms-asf,audio/x-ms-wma,audio/wma,application/x-mplayer2,image/avif"

 # Response code exclusions we will use to filter links and responses from web.archive.org through their API
 DEFAULT_FILTER_CODE = "404,301,302"
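These filter values are comma-separated lists. Judging by the WARC-parsing hunk later in this diff, URL filters are matched as substrings while MIME filters are compared exactly after normalisation; a sketch under that assumption, with abbreviated filter strings:

```python
DEFAULT_FILTER_URL = ".css,.jpg,.wmv,.avif"   # abbreviated for the example
DEFAULT_FILTER_MIME = "text/css,image/avif"   # abbreviated for the example

def isFilteredUrl(url, filterUrl=DEFAULT_FILTER_URL):
    # A URL is excluded if any filter fragment appears anywhere in it
    return any(f.strip().lower() in url.lower() for f in filterUrl.split(","))

def isFilteredMime(mime, filterMime=DEFAULT_FILTER_MIME):
    # MIME types are compared exactly, ignoring any "; charset=..." suffix
    return mime.split(";")[0].strip().lower() in [
        m.strip().lower() for m in filterMime.split(",")
    ]

print(isFilteredUrl("https://example.com/logo.avif"))  # True
print(isFilteredMime("image/avif; charset=binary"))    # True
```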
@@ -297,6 +306,298 @@ INLINE_JS_EXCLUDE = [
     ".json",
 ]

+# Binary file extensions that should be saved as raw bytes, not text
+BINARY_EXTENSIONS = frozenset(
+    [
+        ".zip",
+        ".gz",
+        ".gzip",
+        ".tar",
+        ".rar",
+        ".7z",
+        ".bz2",
+        ".xz",
+        ".pdf",
+        ".doc",
+        ".docx",
+        ".xls",
+        ".xlsx",
+        ".ppt",
+        ".pptx",
+        ".exe",
+        ".msi",
+        ".dll",
+        ".bin",
+        ".so",
+        ".dmg",
+        ".deb",
+        ".rpm",
+        ".png",
+        ".jpg",
+        ".jpeg",
+        ".gif",
+        ".bmp",
+        ".ico",
+        ".webp",
+        ".svg",
+        ".tiff",
+        ".tif",
+        ".mp3",
+        ".mp4",
+        ".wav",
+        ".avi",
+        ".mov",
+        ".mkv",
+        ".flv",
+        ".wmv",
+        ".webm",
+        ".ogg",
+        ".ttf",
+        ".otf",
+        ".woff",
+        ".woff2",
+        ".eot",
+        ".class",
+        ".jar",
+        ".war",
+        ".ear",
+        ".pyc",
+        ".pyo",
+        ".o",
+        ".a",
+        ".lib",
+        ".iso",
+        ".img",
+        ".sqlite",
+        ".db",
+        ".mdb",
+        ".swf",
+        ".fla",
+    ]
+)
+
+# Binary MIME types that should be saved as raw bytes, not text
+BINARY_MIME_TYPES = frozenset(
+    [
+        "application/zip",
+        "application/x-zip-compressed",
+        "application/x-gzip",
+        "application/gzip",
+        "application/x-tar",
+        "application/x-rar-compressed",
+        "application/x-7z-compressed",
+        "application/x-bzip2",
+        "application/x-xz",
+        "application/pdf",
+        "application/msword",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/vnd.ms-excel",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/vnd.ms-powerpoint",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        "application/x-msdownload",
+        "application/x-msi",
+        "application/x-dosexec",
+        "application/octet-stream",
+        "image/png",
+        "image/jpeg",
+        "image/gif",
+        "image/bmp",
+        "image/x-icon",
+        "image/webp",
+        "image/tiff",
+        "audio/mpeg",
+        "audio/wav",
+        "audio/ogg",
+        "audio/webm",
+        "video/mp4",
+        "video/avi",
+        "video/quicktime",
+        "video/x-msvideo",
+        "video/x-matroska",
+        "video/webm",
+        "video/ogg",
+        "font/ttf",
+        "font/otf",
+        "font/woff",
+        "font/woff2",
+        "application/x-font-ttf",
+        "application/x-font-otf",
+        "application/font-woff",
+        "application/font-woff2",
+        "application/java-archive",
+        "application/x-java-class",
+        "application/x-shockwave-flash",
+        "application/x-sqlite3",
+        "application/x-iso9660-image",
+    ]
+)
+
+
+def isBinaryContent(contentBytes, contentType, url=""):
+    """
+    Determine if content should be treated as binary based on actual content, Content-Type, and URL.
+
+    Priority (highest to lowest):
+    1. Content inspection - check for text signatures (most reliable)
+    2. Content-Type header
+    3. URL extension (least reliable - archive might have captured an HTML error page)
+
+    Args:
+        contentBytes: The raw response bytes (at least first 100 bytes)
+        contentType: The Content-Type header value
+        url: The URL (optional, used as fallback)
+
+    Returns True if content is binary and should be saved as raw bytes.
+    """
+    # STEP 1: Check actual content for text signatures (most reliable)
+    # If content starts with text markers, it's definitely NOT binary regardless of extension
+    try:
+        if contentBytes and len(contentBytes) > 0:
+            # Get first 100 bytes and strip leading whitespace/newlines for checking
+            preview = contentBytes[:100].lstrip()
+            previewLower = preview.lower()
+
+            # Common text file signatures
+            textSignatures = [
+                b"<!doctype",  # HTML doctype
+                b"<html",  # HTML tag
+                b"<head",  # HTML head
+                b"<body",  # HTML body
+                b"<?xml",  # XML declaration
+                b"<svg",  # SVG image (actually XML text)
+                b"{",  # JSON object
+                b"[",  # JSON array
+                b"/*",  # CSS/JS comment
+                b"//",  # JS comment
+                b"#!",  # Shebang (shell scripts)
+                b"var ",  # JavaScript
+                b"let ",  # JavaScript
+                b"const ",  # JavaScript
+                b"function",  # JavaScript
+                b"import ",  # JavaScript/Python
+                b"export ",  # JavaScript
+                b"class ",  # Various languages
+                b"def ",  # Python
+            ]
+
+            for sig in textSignatures:
+                if previewLower.startswith(sig):
+                    return False  # Definitely text, not binary
+
+            # Check for binary file magic bytes (file signatures)
+            binarySignatures = [
+                b"%PDF",  # PDF
+                b"PK\x03\x04",  # ZIP, DOCX, XLSX, etc.
+                b"PK\x05\x06",  # Empty ZIP
+                b"\x1f\x8b",  # GZIP
+                b"\x89PNG",  # PNG
+                b"\xff\xd8\xff",  # JPEG
+                b"GIF87a",  # GIF
+                b"GIF89a",  # GIF
+                b"BM",  # BMP (check at start)
+                b"RIFF",  # WAV, AVI, WebP
+                b"\x00\x00\x00",  # Various binary formats (MP4, etc.)
+                b"ID3",  # MP3 with ID3 tag
+                b"\xff\xfb",  # MP3
+                b"\xff\xfa",  # MP3
+                b"OggS",  # OGG
+                b"\x4d\x5a",  # EXE/DLL (MZ header)
+                b"\x7fELF",  # Linux ELF binary
+                b"\xca\xfe\xba\xbe",  # Java class file
+                b"\x30\x26\xb2\x75",  # ASF/WMV/WMA (first 4 bytes of ASF GUID)
+                b"FLV\x01",  # FLV (Flash Video)
+                b"ftyp",  # MP4/M4A/MOV (after 4 byte size prefix)
+                b"Rar!\x1a\x07",  # RAR archive
+                b"7z\xbc\xaf\x27\x1c",  # 7-Zip archive
+                b"\x1a\x45\xdf\xa3",  # WebM/MKV (EBML)
+                b"II\x2a\x00",  # TIFF (Intel byte order)
+                b"MM\x00\x2a",  # TIFF (Motorola byte order)
+                b"\x00\x00\x01\x00",  # ICO (Windows Icon)
+                b"\x00\x00\x02\x00",  # CUR (Windows Cursor)
+                b"wOFF",  # WOFF font
+                b"wOF2",  # WOFF2 font
+                b"FWS",  # SWF (uncompressed Flash)
+                b"CWS",  # SWF (zlib compressed Flash)
+                b"ZWS",  # SWF (LZMA compressed Flash)
+                b"\x00\x01\x00\x00",  # TrueType font
+                b"OTTO",  # OpenType font with CFF
+            ]
+
+            for sig in binarySignatures:
+                if preview.startswith(sig):
+                    return True  # Definitely binary
+    except Exception:
+        pass
+
+    # STEP 2: Check Content-Type header
+    try:
+        if contentType:
+            mimeType = contentType.lower().split(";")[0].strip()
+
+            # Explicit text types
+            textMimeTypes = [
+                "text/html",
+                "text/plain",
+                "text/css",
+                "text/javascript",
+                "text/xml",
+                "text/csv",
+                "text/markdown",
+                "application/json",
+                "application/javascript",
+                "application/xml",
+                "application/xhtml+xml",
+                "application/rss+xml",
+                "application/atom+xml",
+            ]
+            if mimeType in textMimeTypes or mimeType.startswith("text/"):
+                return False  # Text type
+
+            # Known binary types
+            if mimeType in BINARY_MIME_TYPES:
+                return True
+
+            # Generic binary prefixes
+            if (
+                mimeType.startswith("image/")
+                or mimeType.startswith("audio/")
+                or mimeType.startswith("video/")
+            ):
+                return True
+            if mimeType.startswith("application/") and mimeType not in textMimeTypes:
+                # application/* is often binary, but not always - be conservative
+                if "octet-stream" in mimeType or "binary" in mimeType:
+                    return True
+    except Exception:
+        pass
+
+    # STEP 3: Check URL extension as last resort
+    try:
+        if url:
+            # Extract actual URL from prefixed formats (Wayback/URLScan)
+            actualUrl = url
+            httpPos = url.find("http://")
+            httpsPos = url.find("https://")
+            if httpsPos >= 0:
+                actualUrl = url[httpsPos:]
+            elif httpPos >= 0:
+                actualUrl = url[httpPos:]
+
+            parsed = urlparse(actualUrl.strip())
+            path = parsed.path.lower()
+            if "." in path:
+                ext = "." + path.rsplit(".", 1)[-1]
+                if "?" in ext:
+                    ext = ext.split("?")[0]
+                if ext in BINARY_EXTENSIONS:
+                    return True
+    except Exception:
+        pass
+
+    # Default: treat as text (safer - text processing won't corrupt text)
+    return False
+

 # Get memory usage for
 def getMemory():
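Because content inspection outranks the URL extension, a `.zip` URL whose archived capture is actually an HTML error page is still treated as text. A usage sketch, assuming `isBinaryContent` and the frozensets above are in scope (the sample bytes are made up):

```python
# Content inspection wins over the URL extension:
html_bytes = b"<!DOCTYPE html><html><body>Not Found</body></html>"
print(isBinaryContent(html_bytes, "", "https://example.com/backup.zip"))  # False

# Real ZIP magic bytes are detected regardless of a misleading Content-Type:
zip_bytes = b"PK\x03\x04" + b"\x00" * 26
print(isBinaryContent(zip_bytes, "text/plain", "https://example.com/x"))  # True
```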
@@ -451,7 +752,7 @@ def handler(signal_received, frame):
     This function is called if Ctrl-C is called by the user
     An attempt will be made to try and clean up properly
     """
-    global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, current_response, current_session
+    global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, current_response, current_session

    if stopProgram is not None:
        stopProgramCount = stopProgramCount + 1
@@ -486,6 +787,7 @@ def handler(signal_received, frame):
         stopSourceURLScan = True
         stopSourceVirusTotal = True
         stopSourceIntelx = True
+        stopSourceGhostArchive = True
         # Try to close any active response or session to interrupt blocking network I/O
         try:
             if current_response is not None:
@@ -955,16 +1257,12 @@ def showOptions():
             )
         )

+        # Only show --source-ip if it's explicitly configured
         if SOURCE_IP:
             write(
                 colored("--source-ip: " + str(SOURCE_IP), "magenta")
                 + colored(" Outbound requests will bind to this IP.", "white")
             )
-        else:
-            write(
-                colored("--source-ip: default", "magenta")
-                + colored(" Outbound IP determined by OS routing table.", "white")
-            )

         write()

@@ -1465,11 +1763,15 @@ def printProgressBar(

 def filehash(text):
     """
-    Generate a hash value for the passed string. This is used for the file name of a downloaded archived response
+    Generate a hash value for the passed string or bytes. This is used for the file name of a downloaded archived response
     """
     hash = 0
     for ch in text:
-
+        # Handle both str (gives chars needing ord()) and bytes (gives ints directly)
+        if isinstance(ch, int):
+            hash = (hash * 281 ^ ch * 997) & 0xFFFFFFFFFFF
+        else:
+            hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
     return str(hash)


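The change makes `filehash` polymorphic: iterating a `str` yields one-character strings (which need `ord()`), while iterating `bytes` yields ints directly, and both feed the same rolling hash. A quick self-contained check of the branch behaviour:

```python
def filehash(text):
    hash = 0
    for ch in text:
        if isinstance(ch, int):  # bytes iteration yields ints
            hash = (hash * 281 ^ ch * 997) & 0xFFFFFFFFFFF
        else:                    # str iteration yields 1-char strings
            hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
    return str(hash)

# A string and its byte encoding hash identically for ASCII input
print(filehash("abc") == filehash(b"abc"))  # True
```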
@@ -1497,6 +1799,63 @@ def fixArchiveOrgUrl(url):
     return url


+def isLikelyBinaryUrl(url):
+    """
+    Check if a URL likely points to a binary file based on its extension.
+    This is used BEFORE making a request to decide if we need the raw/id_ version.
+    """
+    try:
+        # Extract actual URL from prefixed formats (Wayback timestamp/URLScan UUID)
+        actualUrl = url
+        httpPos = url.find("http://")
+        httpsPos = url.find("https://")
+        if httpsPos >= 0:
+            actualUrl = url[httpsPos:]
+        elif httpPos >= 0:
+            actualUrl = url[httpPos:]
+
+        parsed = urlparse(actualUrl.strip())
+        path = parsed.path.lower()
+        if "." in path:
+            ext = "." + path.rsplit(".", 1)[-1]
+            if "?" in ext:
+                ext = ext.split("?")[0]
+            if ext in BINARY_EXTENSIONS:
+                return True
+    except Exception:
+        pass
+    return False
+
+
+def addRawModifier(archiveUrl):
+    """
+    Add 'id_' modifier to Wayback Machine URL to get raw/original content.
+    This is essential for binary files to avoid Wayback modifications.
+
+    Example:
+        Input: https://web.archive.org/web/20090315210455/http://example.com/file.wmv
+        Output: https://web.archive.org/web/20090315210455id_/http://example.com/file.wmv
+    """
+    try:
+        # Find the timestamp in the URL (14 digits after /web/)
+        webPos = archiveUrl.find("/web/")
+        if webPos >= 0:
+            # Find where the timestamp ends (first / after /web/)
+            afterWeb = webPos + 5  # Position after "/web/"
+            slashAfterTimestamp = archiveUrl.find("/", afterWeb)
+            if slashAfterTimestamp > afterWeb:
+                # Insert id_ before the slash after timestamp
+                timestamp = archiveUrl[afterWeb:slashAfterTimestamp]
+                # Only add id_ if it's not already there
+                if not timestamp.endswith("id_"):
+                    return (
+                        archiveUrl[:slashAfterTimestamp] + "id_" + archiveUrl[slashAfterTimestamp:]
+                    )
+    except Exception:
+        pass
+    return archiveUrl
+
+
 # Add a link to the linksFound collection for archived responses (included timestamp preifx)
 def linksFoundResponseAdd(link):
     global linksFound, argsInput, argsInputHostname, links_lock
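The `id_` modifier asks the Wayback Machine for the original capture without its replay rewriting, which is what makes binary downloads byte-accurate. A quick demonstration of the two helpers above, assuming they and `BINARY_EXTENSIONS` are in scope (the URL is the docstring's own example):

```python
url = "https://web.archive.org/web/20090315210455/http://example.com/file.wmv"

print(isLikelyBinaryUrl(url))  # True (.wmv is in BINARY_EXTENSIONS)
print(addRawModifier(url))
# https://web.archive.org/web/20090315210455id_/http://example.com/file.wmv

# Idempotent: a URL whose timestamp already ends in id_ is returned unchanged
print(addRawModifier(addRawModifier(url)) == addRawModifier(url))  # True
```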
@@ -1581,6 +1940,12 @@ def processArchiveUrl(url):
     if stopProgram is None:

         archiveUrl = "https://web.archive.org/web/" + fixArchiveOrgUrl(url)
+
+        # For binary files, add id_ modifier to get raw/original content
+        # This prevents Wayback Machine from modifying the content
+        if isLikelyBinaryUrl(url):
+            archiveUrl = addRawModifier(archiveUrl)
+
         hashValue = ""

         # Get memory usage every 100 responses
@@ -1593,6 +1958,18 @@ def processArchiveUrl(url):
         # Make a request to the web archive
         try:
             try:
+                try:
+                    if verbose() and os.environ.get("USER") == "xnl":
+                        writerr(
+                            colored(
+                                "[ DBG ] Requesting file " + archiveUrl,
+                                "yellow",
+                                attrs=["dark"],
+                            )
+                        )
+                except Exception:
+                    pass
+
                 # Choose a random user agent string to use for any requests
                 userAgent = random.choice(USER_AGENT)

@@ -1604,146 +1981,175 @@ def processArchiveUrl(url):
                     headers={"User-Agent": userAgent},
                     allow_redirects=True,
                 )
-
+
+                # Get raw content bytes first
+                contentBytes = resp.content
+
                 try:
-                    contentType = resp.headers.get("Content-Type").split(";")[0].lower()
+                    contentType = resp.headers.get("Content-Type", "").split(";")[0].lower()
                 except Exception:
                     contentType = ""

+                # Determine if this is binary content based on actual content, Content-Type, and URL
+                isBinary = isBinaryContent(contentBytes, contentType, url)
+
+                if isBinary:
+                    # For binary files, use raw bytes as-is
+                    archiveContent = contentBytes
+                    archiveHtml = None  # Not used for binary files
+                else:
+                    # For text files, decode to string
+                    archiveHtml = contentBytes.decode("utf-8", errors="replace")
+                    archiveContent = None  # Not used for text files
+
                 # Only create a file if there is a response
-                if len(archiveHtml)
+                responseLength = len(archiveContent) if isBinary else len(archiveHtml)
+                if responseLength != 0:

+                    # For text files, check for custom 404 pages
                     # If the FILTER_CODE doesn't include 404, OR
                     # If the FILTER_CODE includes 404, and it doesn't seem to be a custom 404 page
-                    if
-
-
-
-
-
-
-                        archiveHtml = (
-                            "/* Original URL: " + archiveUrl + " */\n" + archiveHtml
+                    if (
+                        isBinary
+                        or "404" not in FILTER_CODE
+                        or (
+                            "404" in FILTER_CODE
+                            and not re.findall(
+                                REGEX_404, archiveHtml, re.DOTALL | re.IGNORECASE
                             )
-
-                        # Remove all web archive references in the response
-                        archiveHtml = re.sub(
-                            r'\<script type=\"text\/javascript" src=\"\/_static\/js\/bundle-playback\.js\?v=[A-Za-z0-9]*" charset="utf-8"><\/script>\n<script type="text\/javascript" src="\/_static\/js\/wombat\.js.*\<\!-- End Wayback Rewrite JS Include --\>',
-                            "",
-                            archiveHtml,
-                            1,
-                            flags=re.DOTALL | re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"\<script src=\"\/\/archive\.org.*\<\!-- End Wayback Rewrite JS Include --\>",
-                            "",
-                            archiveHtml,
-                            1,
-                            flags=re.DOTALL | re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"\<script\>window\.RufflePlayer[^\<]*\<\/script\>",
-                            "",
-                            archiveHtml,
-                            1,
-                            flags=re.DOTALL | re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"\<\!-- BEGIN WAYBACK TOOLBAR INSERT --\>.*\<\!-- END WAYBACK TOOLBAR INSERT --\>",
-                            "",
-                            archiveHtml,
-                            1,
-                            flags=re.DOTALL | re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"(}\n)?(\/\*|<!--\n)\s*FILE ARCHIVED ON.*108\(a\)\(3\)\)\.\n(\*\/|-->)",
-                            "",
-                            archiveHtml,
-                            1,
-                            flags=re.DOTALL | re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"var\s_____WB\$wombat\$assign\$function.*WB\$wombat\$assign\$function_____\(\"opener\"\);",
-                            "",
-                            archiveHtml,
-                            1,
-                            flags=re.DOTALL | re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"(\<\!--|\/\*)\nplayback timings.*(--\>|\*\/)",
-                            "",
-                            archiveHtml,
-                            1,
-                            flags=re.DOTALL | re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"((https:)?\/\/web\.archive\.org)?\/web\/[0-9]{14}([A-Za-z]{2}\_)?\/",
-                            "",
-                            archiveHtml,
-                            flags=re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"((https:)?\\\/\\\/web\.archive\.org)?\\\/web\\\/[0-9]{14}([A-Za-z]{2}\_)?\\\/",
-                            "",
-                            archiveHtml,
-                            flags=re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"((https:)?%2F%2Fweb\.archive\.org)?%2Fweb%2F[0-9]{14}([A-Za-z]{2}\_)?%2F",
-                            "",
-                            archiveHtml,
-                            flags=re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"((https:)?\\u002F\\u002Fweb\.archive\.org)?\\u002Fweb\\u002F[0-9]{14}([A-Za-z]{2}\_)?\\u002F",
-                            "",
-                            archiveHtml,
-                            flags=re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"\<script type=\"text\/javascript\">\s*__wm\.init\(\"https:\/\/web\.archive\.org\/web\"\);[^\<]*\<\/script\>",
-                            "",
-                            archiveHtml,
-                            flags=re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r'\<script type=\"text\/javascript\" src="https:\/\/web-static\.archive\.org[^\<]*\<\/script\>',
-                            "",
-                            archiveHtml,
-                            flags=re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"\<link rel=\"stylesheet\" type=\"text\/css\" href=\"https:\/\/web-static\.archive\.org[^\<]*\/\>",
-                            "",
-                            archiveHtml,
-                            flags=re.IGNORECASE,
-                        )
-                        archiveHtml = re.sub(
-                            r"\<\!-- End Wayback Rewrite JS Include --\>",
-                            "",
-                            archiveHtml,
-                            re.IGNORECASE,
                         )
+                    ):

-                        #
-                        if
-
-
+                        # For text files only: Add URL comment and clean up wayback references
+                        if not isBinary:
+                            # Add the URL as a comment at the start of the response
+                            if args.url_filename:
+                                archiveHtml = (
+                                    "/* Original URL: " + archiveUrl + " */\n" + archiveHtml
+                                )
+
+                            # Remove all web archive references in the response
+                            archiveHtml = re.sub(
+                                r'\<script type=\"text\/javascript" src=\"\/_static\/js\/bundle-playback\.js\?v=[A-Za-z0-9]*" charset="utf-8"\><\/script>\n<script type="text\/javascript" src="\/_static\/js\/wombat\.js.*\<\!-- End Wayback Rewrite JS Include --\>',
+                                "",
+                                archiveHtml,
+                                1,
+                                flags=re.DOTALL | re.IGNORECASE,
                             )
-
-
-                        "
+                            archiveHtml = re.sub(
+                                r"\<script src=\"\/\/archive\.org.*\<\!-- End Wayback Rewrite JS Include --\>",
+                                "",
+                                archiveHtml,
+                                1,
+                                flags=re.DOTALL | re.IGNORECASE,
                             )
-
-
-
+                            archiveHtml = re.sub(
+                                r"\<script\>window\.RufflePlayer[^\<]*\<\/script\>",
+                                "",
+                                archiveHtml,
+                                1,
+                                flags=re.DOTALL | re.IGNORECASE,
+                            )
+                            archiveHtml = re.sub(
+                                r"\<\!-- BEGIN WAYBACK TOOLBAR INSERT --\>.*\<\!-- END WAYBACK TOOLBAR INSERT --\>",
+                                "",
+                                archiveHtml,
+                                1,
+                                flags=re.DOTALL | re.IGNORECASE,
+                            )
+                            archiveHtml = re.sub(
+                                r"(}\n)?(\/\*|<\!--\n)\s*FILE ARCHIVED ON.*108\(a\)\(3\)\)\.\n(\*\/|--\>)",
+                                "",
+                                archiveHtml,
+                                1,
+                                flags=re.DOTALL | re.IGNORECASE,
+                            )
+                            archiveHtml = re.sub(
+                                r"var\s_____WB\$wombat\$assign\$function.*WB\$wombat\$assign\$function_____\(\"opener\"\);",
+                                "",
+                                archiveHtml,
+                                1,
+                                flags=re.DOTALL | re.IGNORECASE,
+                            )
+                            archiveHtml = re.sub(
+                                r"(\<\!--|\/\*)\nplayback timings.*(--\>|\*\/)",
+                                "",
+                                archiveHtml,
+                                1,
+                                flags=re.DOTALL | re.IGNORECASE,
+                            )
+                            archiveHtml = re.sub(
+                                r"((https:)?\/\/web\.archive\.org)?\/web\/[0-9]{14}([A-Za-z]{2}\_)?\/",
+                                "",
+                                archiveHtml,
+                                flags=re.IGNORECASE,
+                            )
+                            archiveHtml = re.sub(
+                                r"((https:)?\\\/\\\/web\.archive\.org)?\\\/web\\\/[0-9]{14}([A-Za-z]{2}\_)?\\\/",
+                                "",
+                                archiveHtml,
+                                flags=re.IGNORECASE,
+                            )
+                            archiveHtml = re.sub(
+                                r"((https:)?%2F%2Fweb\.archive\.org)?%2Fweb%2F[0-9]{14}([A-Za-z]{2}\_)?%2F",
+                                "",
+                                archiveHtml,
+                                flags=re.IGNORECASE,
+                            )
+                            archiveHtml = re.sub(
+                                r"((https:)?\\u002F\\u002Fweb\.archive\.org)?\\u002Fweb\\u002F[0-9]{14}([A-Za-z]{2}\_)?\\u002F",
+                                "",
+                                archiveHtml,
+                                flags=re.IGNORECASE,
+                            )
+                            archiveHtml = re.sub(
+                                r"\<script type=\"text\/javascript\"\>\s*__wm\.init\(\"https:\/\/web\.archive\.org\/web\"\);[^\<]*\<\/script\>",
+                                "",
+                                archiveHtml,
+                                flags=re.IGNORECASE,
+                            )
+                            archiveHtml = re.sub(
+                                r'\<script type=\"text\/javascript\" src="https:\/\/web-static\.archive\.org[^\<]*\<\/script\>',
+                                "",
+                                archiveHtml,
+                                flags=re.IGNORECASE,
+                            )
+                            archiveHtml = re.sub(
+                                r"\<link rel=\"stylesheet\" type=\"text\/css\" href=\"https:\/\/web-static\.archive\.org[^\<]*\/\>",
+                                "",
+                                archiveHtml,
+                                flags=re.IGNORECASE,
+                            )
+                            archiveHtml = re.sub(
+                                r"\<\!-- End Wayback Rewrite JS Include --\>",
+                                "",
+                                archiveHtml,
+                                re.IGNORECASE,
+                            )
+
+                            # If there is a specific Wayback error in the response, raise an exception
+                            if (
+                                archiveHtml.lower().find(
+                                    "wayback machine has not archived that url"
+                                )
+                                > 0
+                                or archiveHtml.lower().find(
+                                    "snapshot cannot be displayed due to an internal error"
+                                )
+                                > 0
+                            ):
+                                raise WayBackException

                         # Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
                         if args.url_filename:
                             fileName = url.replace("/", "-").replace(":", "")
                             fileName = fileName[0:254]
                         else:
-
+                            # For binary files, hash the raw bytes; for text, hash the text
+                            if isBinary:
+                                hashValue = filehash(archiveContent.hex())
+                            else:
+                                hashValue = filehash(archiveHtml)
                             fileName = hashValue

                         # Determine extension of file from the content-type using the mimetypes library
@@ -1785,11 +2191,15 @@ def processArchiveUrl(url):
                             extension = "css"
                         elif "pdf" in extension:
                             extension = "pdf"
+                        elif "zip" in extension:
+                            extension = "zip"
+                        elif "gzip" in extension or "x-gzip" in extension:
+                            extension = "gz"
                         elif "plain" == extension:
                             extension = "txt"

                         # If extension is still blank, set to html if the content ends with HTML tag, otherwise set to unknown
-                        if extension == "":
+                        if extension == "" and not isBinary:
                             if (
                                 archiveHtml.lower().strip().endswith("</html>")
                                 or archiveHtml.lower()
@@ -1800,6 +2210,8 @@ def processArchiveUrl(url):
                                 extension = "html"
                             else:
                                 extension = "unknown"
+                        elif extension == "" and isBinary:
+                            extension = "bin"

                         fileName = fileName + "." + extension

@@ -1816,10 +2228,14 @@ def processArchiveUrl(url):
                                 + f"{fileName}"
                             )

-                        # Write the file
+                        # Write the file - binary mode for binary files, text mode for text files
                         try:
-
-
+                            if isBinary:
+                                responseFile = open(filePath, "wb")
+                                responseFile.write(archiveContent)
+                            else:
+                                responseFile = open(filePath, "w", encoding="utf8")
+                                responseFile.write(archiveHtml)
                             responseFile.close()
                             fileCount = fileCount + 1
                         except Exception as e:
@@ -1852,9 +2268,10 @@ def processArchiveUrl(url):
                                 )
                             )

-                        # FOR DEBUGGING PURPOSES
+                        # FOR DEBUGGING PURPOSES (only for text files)
                         try:
-                            if os.environ.get("USER") == "xnl":
+                            if os.environ.get("USER") == "xnl" and not isBinary:
+
                                 debugText = ""
                                 if archiveHtml.lower().find("archive.org") > 0:
                                     debugText = "ARCHIVE.ORG"
@@ -1862,20 +2279,32 @@ def processArchiveUrl(url):
                                     debugText = "INTERNET ARCHIVE"
                                 elif archiveHtml.lower().find("wombat") > 0:
                                     debugText = "WOMBAT (JS)"
-                                if debugText != "":
+                                if verbose() and debugText != "":
                                     writerr(
                                         colored(
                                             getSPACER(
-                                                '"'
+                                                '[ DBG ] "'
                                                 + fileName
                                                 + '" CONTAINS '
                                                 + debugText
                                                 + " - CHECK ITS A VALID REFERENCE"
                                             ),
                                             "yellow",
+                                            attrs=["dark"],
                                         )
                                     )
-                        except Exception:
+                        except Exception as e:
+                            if verbose():
+                                writerr(
+                                    colored(
+                                        '[ DBG ] Error - Failed to output debug info for "'
+                                        + archiveUrl
+                                        + '": '
+                                        + str(e),
+                                        "red",
+                                        attrs=["dark"],
+                                    )
+                                )
                             pass

                         successCount = successCount + 1
@@ -2346,17 +2775,20 @@ def validateArgProviders(x):
     - urlscan
     - virustotal
     - intelx
+    - ghostarchive
     """
     invalid = False
     x = x.lower()
     providers = x.split(",")
     for provider in providers:
-        if not re.fullmatch(
+        if not re.fullmatch(
+            r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx|ghostarchive)", provider
+        ):
             invalid = True
             break
     if invalid:
         raise argparse.ArgumentTypeError(
-            "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx"
+            "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive"
         )
     return x

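`validateArgProviders` is written as an argparse `type=` callable, so an invalid provider aborts argument parsing with the message above. A minimal sketch of how it could be wired up, condensed from the hunk; the parser and option name here are illustrative:

```python
import argparse
import re

def validateArgProviders(x):
    x = x.lower()
    for provider in x.split(","):
        if not re.fullmatch(
            r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx|ghostarchive)", provider
        ):
            raise argparse.ArgumentTypeError(
                "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive"
            )
    return x

parser = argparse.ArgumentParser()
parser.add_argument("--providers", type=validateArgProviders, default="wayback")
print(parser.parse_args(["--providers", "wayback,ghostarchive"]).providers)
# wayback,ghostarchive
```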
@@ -2897,17 +3329,38 @@ def getURLScanDOM(originalUrl, domUrl):
                 resp = session.get(
                     domUrl, headers={"User-Agent": userAgent}, allow_redirects=True
                 )
-
+
+                # Get raw content bytes first
+                contentBytes = resp.content
+
+                # Get content type from response headers
+                try:
+                    contentType = resp.headers.get("Content-Type", "").split(";")[0].lower()
+                except Exception:
+                    contentType = ""
+
+                # Determine if this is binary content based on actual content, Content-Type, and URL
+                isBinary = isBinaryContent(contentBytes, contentType, originalUrl)
+
+                if isBinary:
+                    # For binary files, use raw bytes as-is
+                    archiveContent = contentBytes
+                    archiveHtml = None
+                else:
+                    # For text files, decode to string
+                    archiveHtml = contentBytes.decode("utf-8", errors="replace")
+                    archiveContent = None

                 # If there is a specific URLScan error in the response, raise an exception
-                if archiveHtml.lower().strip() == "not found!":
+                if not isBinary and archiveHtml.lower().strip() == "not found!":
                     raise WayBackException

                 # Only create a file if there is a response
-                if len(archiveHtml)
+                responseLength = len(archiveContent) if isBinary else len(archiveHtml)
+                if responseLength != 0:

-                    # Add the URL as a comment at the start of the response
-                    if args.url_filename:
+                    # Add the URL as a comment at the start of the response (text files only)
+                    if not isBinary and args.url_filename:
                         archiveHtml = "/* Original URL: " + originalUrl + " */\n" + archiveHtml

                     # Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
@@ -2915,7 +3368,11 @@ def getURLScanDOM(originalUrl, domUrl):
                         fileName = originalUrl.replace("/", "-").replace(":", "")
                         fileName = fileName[0:254]
                     else:
-
+                        # For binary files, hash the raw bytes; for text, hash the text
+                        if isBinary:
+                            hashValue = filehash(archiveContent.hex())
+                        else:
+                            hashValue = filehash(archiveHtml)
                         fileName = hashValue

                     # Determine extension of file from the content-type using the mimetypes library
@@ -2933,7 +3390,7 @@ def getURLScanDOM(originalUrl, domUrl):
                         pass

                     # If the extension is blank, numeric, longer than 4 characters or not alphanumeric - then set to html if the content ends with HTML tag, otherwise set to unknown
-                    if extension == "":
+                    if extension == "" and not isBinary:
                         if (
                             archiveHtml.lower().strip().endswith("</html>")
                             or archiveHtml.lower().strip().endswith("</body>")
@@ -2944,6 +3401,8 @@ def getURLScanDOM(originalUrl, domUrl):
                             extension = "html"
                         else:
                             extension = "unknown"
+                    elif extension == "" and isBinary:
+                        extension = "bin"

                     fileName = fileName + "." + extension

@@ -2960,10 +3419,14 @@ def getURLScanDOM(originalUrl, domUrl):
                             + f"{fileName}"
                         )

-                    # Write the file
+                    # Write the file - binary mode for binary files, text mode for text files
                     try:
-
-
+                        if isBinary:
+                            responseFile = open(filePath, "wb")
+                            responseFile.write(archiveContent)
+                        else:
+                            responseFile = open(filePath, "w", encoding="utf8")
+                            responseFile.write(archiveHtml)
                         responseFile.close()
                         fileCount = fileCount + 1
                     except Exception as e:
@@ -3083,98 +3546,614 @@ def getURLScanDOM(originalUrl, domUrl):
             writerr(colored("ERROR getURLScanDOM 1: " + str(e), "red"))


-def
-    # Handle different lengths of input
-    if len(date_str) == 4:  # YYYY
-        date_str += "0101"
-    elif len(date_str) == 6:  # YYYYMM
-        date_str += "01"
-
-    # Convert to YYYY-MM-DD format
-    try:
-        formatted_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
-        return formatted_date
-    except Exception:
-        return ""
-
-
-def getURLScanUrls():
+def getGhostArchiveWARC(originalUrl, domUrl):
     """
-    Get
+    Get the DOM for the passed GhostArchive link - parses WARC files containing multiple request/response pairs
     """
-    global
-
-    # Write the file of URL's for the passed domain/URL
+    global stopProgram, successCount, failureCount, fileCount, DEFAULT_OUTPUT_DIR, totalResponses, indexFile, argsInput, argsInputHostname, REGEX_404, linksFound, extraWarcLinks, links_lock
     try:
-
-        stopSourceURLScan = False
-        linksFoundURLScan = set()
-        totalUrls = 0
-        checkResponse = True
-
-        # Set the URL to just the hostname
-        url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
+        if stopProgram is None:

-
-
-        if args.from_date:
-            fromDate = format_date_for_urlscan(str(args.from_date)[:8])
-        else:
-            fromDate = "2016-01-01"  # The year URLScan started
-        if args.to_date:
-            toDate = format_date_for_urlscan(str(args.to_date)[:8])
-        else:
-            toDate = "now"
-        url = url.replace("{DATERANGE}", f"%20date:[{fromDate}%20TO%20{toDate}]")
-        else:
-            url = url.replace("{DATERANGE}", "")
+            # The WARC files are found by replacing /archive with /chimurai4 and using the .warc file extension
+            warcUrl = domUrl.replace("/archive", "/chimurai4") + ".warc"

-
-        if
-
-
-
-
-            )
-            + colored(url + "\n", "white")
-        )
-        else:
-            write(
-                colored(
-                    "URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
-                )
-                + colored(url + "\n", "white")
-            )
+            # Get memory usage every 100 responses
+            if (successCount + failureCount) % 100 == 0:
+                try:
+                    getMemory()
+                except Exception:
+                    pass

-
-
-
-
-
-
-
+            # Fetch content
+            try:
+                # Show progress bar
+                fillTest = (successCount + failureCount) % 2
+                fillChar = "o"
+                if fillTest == 0:
+                    fillChar = "O"
+                suffix = "Complete "

-
-
-
-
-
-
-
-        session = requests.Session()
-        session.mount("https://", HTTP_ADAPTER)
-        session.mount("http://", HTTP_ADAPTER)
-        # Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
-        resp = session.get(url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY})
-        requestsMade = requestsMade + 1
-    except Exception as e:
-        write(
-            colored(
-                "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
-                "red",
+                printProgressBar(
+                    successCount + failureCount,
+                    totalResponses,
+                    prefix="Processing " + str(totalResponses) + " WARC files:",
+                    suffix=suffix,
+                    length=getProgressBarLength(),
+                    fill=fillChar,
                 )
-
-
+
+                try:
+                    try:
+                        if verbose() and os.environ.get("USER") == "xnl":
+                            writerr(
+                                colored(
+                                    "[ DBG ] Requesting file " + warcUrl,
+                                    "yellow",
+                                    attrs=["dark"],
+                                )
+                            )
+                    except Exception:
+                        pass
+
+                    # Choose a random user agent string to use for any requests
+                    userAgent = random.choice(USER_AGENT)
+                    session = requests.Session()
+                    session.mount("https://", HTTP_ADAPTER)
+                    session.mount("http://", HTTP_ADAPTER)
+
+                    # Retry loop for 503 or maintenance responses
+                    maxRetries = 3
+                    warcBytes = b""
+                    for attempt in range(maxRetries):
+                        resp = session.get(
+                            warcUrl,
+                            headers={"User-Agent": userAgent},
+                            allow_redirects=True,
+                            timeout=args.timeout,
+                        )
+                        warcBytes = resp.content
+
+                        # Check if we need to retry (decode just for this check)
+                        try:
+                            warcTextCheck = warcBytes.decode("utf-8", errors="replace").lower()
+                        except Exception:
+                            warcTextCheck = ""
+                        if resp.status_code == 503 or "website under maintenance" in warcTextCheck:
+                            if attempt < maxRetries - 1:
+                                import time
+
+                                time.sleep(0.5)
+                                continue
+                        break
+
+                    # Parse the WARC file to extract multiple responses
+                    # WARC header lines are text, but response bodies may be binary
+                    # Split by line separator but keep bytes for body extraction
+                    lineBytes = warcBytes.split(b"\n")
+                    lines = [lb.decode("utf-8", errors="replace") for lb in lineBytes]
+
+                    # State machine to track parsing
+                    currentTargetUri = ""
+                    inResponse = False
+                    contentType = ""
+                    responsesFound = (
+                        []
+                    )  # List of (targetUri, contentType, responseBytes, httpStatusCode)
+
+                    i = 0
+                    skipCurrentResponse = False  # Initialize before loop
+                    pendingResponseType = (
+                        False  # Track if we saw WARC-Type: response and are waiting for Target-URI
+                    )
+                    responseStartIdx = -1  # Initialize before loop
+                    httpStatusCode = ""  # Initialize before loop
+                    while i < len(lines) and stopProgram is None and not stopSourceGhostArchive:
+                        line = lines[i]
+
+                        # When we see a new WARC record start, reset pending state
+                        if line.startswith("WARC/1.0"):
+                            # If we were in a response and collecting, save it before moving to new record
+                            if inResponse and responseStartIdx >= 0:
+                                responseBodyBytes = b"\n".join(lineBytes[responseStartIdx:i])
+                                responsesFound.append(
+                                    (
+                                        currentTargetUri,
+                                        contentType,
+                                        responseBodyBytes,
+                                        httpStatusCode if "httpStatusCode" in dir() else "",
+                                    )
+                                )
+                            inResponse = False
+                            responseStartIdx = -1
+                            contentType = ""
+                            httpStatusCode = ""
+                            pendingResponseType = False
+                            skipCurrentResponse = False
+
+                        # Look for WARC-Type: response - mark that we're in a response record header
+                        elif line.startswith("WARC-Type: response"):
+                            pendingResponseType = True
+                            inResponse = False  # Don't start capturing body yet
+                            responseStartIdx = -1
+                            contentType = ""
+
+                        # Look for WARC-Target-URI to get the request URL
+                        elif line.startswith("WARC-Target-URI:"):
+                            currentTargetUri = line.split(":", 1)[1].strip()
+                            skipCurrentResponse = False
+
+                            # Check: URL host must contain the input hostname
+                            if argsInputHostname:
+                                try:
+                                    parsed = urlparse(currentTargetUri)
+                                    host = parsed.netloc.lower()
+                                    if argsInputHostname.lower() not in host:
+                                        skipCurrentResponse = True
+                                except Exception:
+                                    skipCurrentResponse = True
+
+                            # Check: Filter by URL (FILTER_URL)
+                            if not skipCurrentResponse and FILTER_URL and currentTargetUri:
+                                filterUrls = [u.strip().lower() for u in FILTER_URL.split(",")]
+                                for filterUrl in filterUrls:
+                                    if filterUrl in currentTargetUri.lower():
+                                        skipCurrentResponse = True
+                                        break
+
+                            # If we were waiting for Target-URI after seeing WARC-Type: response, and it's valid, start response mode
+                            if pendingResponseType and not skipCurrentResponse:
+                                inResponse = True
+                                pendingResponseType = False
+
+                        # If we're in a response section (after seeing both WARC-Type: response and valid WARC-Target-URI)
+                        elif inResponse:
+                            # Check for HTTP start and capture status code
+                            if line.startswith("HTTP"):
+                                # Extract status code (e.g., "HTTP/1.1 200 OK" -> "200")
+                                try:
+                                    httpStatusCode = line.split()[1]
+                                except Exception:
+                                    httpStatusCode = ""
+
+                                # Early check: Filter by HTTP status code (FILTER_CODE)
+                                if FILTER_CODE and httpStatusCode:
+                                    filterCodes = [c.strip() for c in FILTER_CODE.split(",")]
+                                    if httpStatusCode in filterCodes:
+                                        inResponse = False
+                                        responseStartIdx = -1
+                                        i += 1
+                                        continue
+
+                                responseStartIdx = i  # Mark start of response
+                            elif responseStartIdx >= 0:
+                                # Capture Content-Type if present (case-insensitive check)
+                                if line.lower().startswith("content-type:"):
+                                    try:
+                                        contentType = (
+                                            line.split(":", 1)[1].strip().split(";")[0].lower()
+                                        )
+                                    except Exception:
+                                        pass
+
+                                    # Early check: Filter by MIME type (FILTER_MIME)
+                                    if FILTER_MIME and contentType:
+                                        filterMimes = [
+                                            m.strip().lower() for m in FILTER_MIME.split(",")
+                                        ]
+                                        if contentType in filterMimes:
+                                            inResponse = False
+                                            responseStartIdx = -1
+                                            i += 1
+                                            continue
+
+                        i += 1
+
+                    if stopProgram is not None:
+                        return
+
+                    # Don't forget the last response if file doesn't end with WARC/1.0
+                    if inResponse and responseStartIdx >= 0:
+                        responseBodyBytes = b"\n".join(lineBytes[responseStartIdx:])
+                        responsesFound.append(
+                            (
+                                currentTargetUri,
+                                contentType,
+                                responseBodyBytes,
+                                httpStatusCode if "httpStatusCode" in dir() else "",
+                            )
+                        )
+
+                    # Process each response found
+                    for targetUri, contentType, responseBytes, httpStatusCode in responsesFound:
+                        if stopProgram is not None:
+                            break
+
+                        if not responseBytes:
+                            continue
+
+                        # Split HTTP header from body in bytes (look for \r\n\r\n or \n\n separator)
+                        if b"\r\n\r\n" in responseBytes:
+                            bodyBytes = responseBytes.split(b"\r\n\r\n", 1)[1]
+                        elif b"\n\n" in responseBytes:
+                            bodyBytes = responseBytes.split(b"\n\n", 1)[1]
+                        else:
+                            bodyBytes = responseBytes
+
+                        # Skip empty bodies or "not found" responses
+                        if not bodyBytes or bodyBytes.lower().strip() == b"not found":
+                            continue
+
+                        # If -f / --filter-responses-only is passed, track all URLs immediately (before filtering)
+                        if args.mode == "B" and args.filter_responses_only and targetUri:
+                            with links_lock:
+                                if targetUri not in linksFound and targetUri not in extraWarcLinks:
+                                    extraWarcLinks.add(targetUri)
+
+                        # Use isBinaryContent to detect if this is binary content
+                        isBinary = isBinaryContent(bodyBytes, contentType, targetUri)
+
+                        if isBinary:
+                            # Binary file - save raw bytes
+                            archiveContent = bodyBytes
+                            archiveHtml = None
+                        else:
+                            # Text file - decode to string
+                            archiveHtml = bodyBytes.decode("utf-8", errors="replace")
+                            archiveContent = None
+
+                            # Collapse multiple blank lines into one
+                            archiveHtml = re.sub(r"\n{3,}", "\n\n", archiveHtml)
+
+                            # Skip if body is empty after processing
+                            if not archiveHtml.strip():
+                                continue
+
+                        if stopProgram is not None:
+                            break
+
+                        # Determine if this is HTML or JS based on content-type or URL
+                        isHtml = (
+                            contentType in ["text/html", "application/xhtml+xml"]
+                            or targetUri.lower().endswith(".html")
+                            or targetUri.lower().endswith(".htm")
+                        )
+                        isJs = contentType in [
+                            "text/javascript",
+                            "application/javascript",
+                            "application/x-javascript",
+                        ] or targetUri.lower().endswith(".js")
+
+                        # Add the URL as a comment at the start of the response (only for text files)
+                        if not isBinary and args.url_filename:
+                            if isHtml:
+                                archiveHtml = (
+                                    "<!-- Original URL: " + targetUri + " -->\n" + archiveHtml
+                                )
+                            elif isJs:
+                                archiveHtml = (
+                                    "/* Original URL: " + targetUri + " */\n" + archiveHtml
+                                )
+
+                        # Create file name based on url or hash value
+                        if args.url_filename:
+                            fileName = targetUri.replace("/", "-").replace(":", "")
+                            fileName = fileName[0:254]
+                            hashValue = ""
+                        else:
+                            # Hash the content to get the filename
+                            if isBinary:
+                                hashValue = filehash(archiveContent)
+                            else:
+                                hashValue = filehash(archiveHtml)
+                            fileName = hashValue
+
+                        # Determine extension of file from the content-type or URL
+                        extension = ""
+                        try:
+                            # Get path extension from URL
+                            if "://" in targetUri:
+                                targetUrl = "https://" + targetUri.split("://")[1]
+                                parsed = urlparse(targetUrl.strip())
+                                path = parsed.path
+                                extension = path[path.rindex(".") + 1 :]
+                                if "/" in extension:
+                                    extension = ""
+                                # If extension is over 6 characters, it's likely not a real extension (e.g. API endpoint ID)
+                                if len(extension) > 6:
+                                    extension = ""
+                        except Exception:
+                            pass
+
+                        # If extension is blank, determine from MIME type or content
+                        if extension == "":
+                            if isBinary:
+                                # Binary file extensions from MIME type
+                                if contentType:
+                                    if "image/png" in contentType:
+                                        extension = "png"
+                                    elif (
+                                        "image/jpeg" in contentType
+                                        or "image/jpg" in contentType
+                                    ):
+                                        extension = "jpg"
+                                    elif "image/gif" in contentType:
+                                        extension = "gif"
+                                    elif "image/webp" in contentType:
+                                        extension = "webp"
+                                    elif "application/pdf" in contentType:
+                                        extension = "pdf"
+                                    elif "application/zip" in contentType:
+                                        extension = "zip"
+                                    else:
+                                        extension = "bin"
+                                else:
+                                    extension = "bin"
+                            else:
+                                # Text file extensions
+                                if contentType and "javascript" in contentType.lower():
+                                    extension = "js"
+                                elif contentType and "html" in contentType.lower():
+                                    extension = "html"
+                                elif contentType and "json" in contentType.lower():
+                                    extension = "json"
+                                elif contentType and "text" in contentType.lower():
+                                    extension = "txt"
+                                elif archiveHtml and (
+                                    archiveHtml.lower().strip().endswith("</html>")
+                                    or archiveHtml.lower().strip().endswith("</body>")
+                                    or archiveHtml.lower().strip().startswith("<!doctype html")
+                                    or archiveHtml.lower().strip().startswith("<html")
+                                    or archiveHtml.lower().strip().startswith("<head")
+                                ):
+                                    extension = "html"
+                                else:
+                                    extension = "unknown"
+
+                        fileName = fileName + "." + extension
+
+                        # Determine file path
+                        if args.output_responses != "":
+                            filePath = args.output_responses + "/" + f"{fileName}"
+                        else:
+                            filePath = (
+                                DEFAULT_OUTPUT_DIR
+                                + "/results/"
+                                + str(argsInput).replace("/", "-")
+                                + "/"
+                                + f"{fileName}"
+                            )
+
+                        if stopProgram is not None:
+                            break
+
+                        # Write the file
+                        try:
+                            if isBinary:
+                                # Binary file - write as bytes
+                                responseFile = open(filePath, "wb")
3934
|
+
responseFile.write(archiveContent)
|
|
3935
|
+
else:
|
|
3936
|
+
# Text file - write as UTF-8
|
|
3937
|
+
responseFile = open(filePath, "w", encoding="utf8")
|
|
3938
|
+
responseFile.write(archiveHtml)
|
|
3939
|
+
responseFile.close()
|
|
3940
|
+
with links_lock:
|
|
3941
|
+
fileCount = fileCount + 1
|
|
3942
|
+
|
|
3943
|
+
# Track extra URLs found in WARC files for mode B (only when -f is not passed, since we track earlier if it is)
|
|
3944
|
+
if args.mode == "B" and not args.filter_responses_only and targetUri:
|
|
3945
|
+
with links_lock:
|
|
3946
|
+
if (
|
|
3947
|
+
targetUri not in linksFound
|
|
3948
|
+
and targetUri not in extraWarcLinks
|
|
3949
|
+
):
|
|
3950
|
+
extraWarcLinks.add(targetUri)
|
|
3951
|
+
except Exception as e:
|
|
3952
|
+
writerr(
|
|
3953
|
+
colored(
|
|
3954
|
+
"GhostArchive - [ ERR ] Failed to write file "
|
|
3955
|
+
+ filePath
|
|
3956
|
+
+ ": "
|
|
3957
|
+
+ str(e),
|
|
3958
|
+
"red",
|
|
3959
|
+
)
|
|
3960
|
+
)
|
|
3961
|
+
|
|
3962
|
+
# Write the hash value and URL to the index file
|
|
3963
|
+
if not args.url_filename and hashValue:
|
|
3964
|
+
try:
|
|
3965
|
+
timestamp = str(datetime.now())
|
|
3966
|
+
indexFile.write(
|
|
3967
|
+
hashValue
|
|
3968
|
+
+ ","
|
|
3969
|
+
+ domUrl
|
|
3970
|
+
+ "#"
|
|
3971
|
+
+ targetUri
|
|
3972
|
+
+ " ,"
|
|
3973
|
+
+ timestamp
|
|
3974
|
+
+ "\n"
|
|
3975
|
+
)
|
|
3976
|
+
indexFile.flush()
|
|
3977
|
+
except Exception as e:
|
|
3978
|
+
writerr(
|
|
3979
|
+
colored(
|
|
3980
|
+
'GhostArchive - [ ERR ] Failed to write to waymore_index.txt for "'
|
|
3981
|
+
+ warcUrl
|
|
3982
|
+
+ '": '
|
|
3983
|
+
+ str(e),
|
|
3984
|
+
"red",
|
|
3985
|
+
)
|
|
3986
|
+
)
|
|
3987
|
+
|
|
3988
|
+
successCount = successCount + 1
|
|
3989
|
+
|
|
3990
|
+
except WayBackException:
|
|
3991
|
+
failureCount = failureCount + 1
|
|
3992
|
+
|
|
3993
|
+
except Exception as e:
|
|
3994
|
+
failureCount = failureCount + 1
|
|
3995
|
+
if verbose():
|
|
3996
|
+
# Simplify common error messages
|
|
3997
|
+
if "connection broken" in str(e).lower():
|
|
3998
|
+
errorMsg = "Connection Broken"
|
|
3999
|
+
else:
|
|
4000
|
+
errorMsg = str(e)
|
|
4001
|
+
try:
|
|
4002
|
+
statusCode = (
|
|
4003
|
+
resp.status_code if "resp" in dir() and resp is not None else "ERR"
|
|
4004
|
+
)
|
|
4005
|
+
writerr(
|
|
4006
|
+
colored(
|
|
4007
|
+
"GhostArchive - [ "
|
|
4008
|
+
+ str(statusCode)
|
|
4009
|
+
+ ' ] Failed to get response for "'
|
|
4010
|
+
+ warcUrl
|
|
4011
|
+
+ '": '
|
|
4012
|
+
+ errorMsg,
|
|
4013
|
+
"red",
|
|
4014
|
+
)
|
|
4015
|
+
)
|
|
4016
|
+
except Exception:
|
|
4017
|
+
writerr(
|
|
4018
|
+
colored(
|
|
4019
|
+
'GhostArchive - [ ERR ] Failed to get response for "'
|
|
4020
|
+
+ warcUrl
|
|
4021
|
+
+ '": '
|
|
4022
|
+
+ errorMsg,
|
|
4023
|
+
"red",
|
|
4024
|
+
)
|
|
4025
|
+
)
|
|
4026
|
+
|
|
4027
|
+
# Show memory usage if -v option chosen, and check memory every 25 responses (or if its the last)
|
|
4028
|
+
if (successCount + failureCount) % 25 == 1 or (
|
|
4029
|
+
successCount + failureCount
|
|
4030
|
+
) == totalResponses:
|
|
4031
|
+
try:
|
|
4032
|
+
getMemory()
|
|
4033
|
+
if verbose():
|
|
4034
|
+
suffix = (
|
|
4035
|
+
"Complete (Mem Usage "
|
|
4036
|
+
+ humanReadableSize(currentMemUsage)
|
|
4037
|
+
+ ", Total Mem "
|
|
4038
|
+
+ str(currentMemPercent)
|
|
4039
|
+
+ "%) "
|
|
4040
|
+
)
|
|
4041
|
+
except Exception:
|
|
4042
|
+
if verbose():
|
|
4043
|
+
suffix = 'Complete (To show mem use, run "pip install psutil")'
|
|
4044
|
+
printProgressBar(
|
|
4045
|
+
successCount + failureCount,
|
|
4046
|
+
totalResponses,
|
|
4047
|
+
prefix="Processing " + str(totalResponses) + " WARC files:",
|
|
4048
|
+
suffix=suffix,
|
|
4049
|
+
length=getProgressBarLength(),
|
|
4050
|
+
fill=fillChar,
|
|
4051
|
+
)
|
|
4052
|
+
|
|
4053
|
+
except Exception as e:
|
|
4054
|
+
if verbose():
|
|
4055
|
+
writerr(
|
|
4056
|
+
colored(
|
|
4057
|
+
'GhostArchive - [ ERR ] Error for "' + domUrl + '": ' + str(e), "red"
|
|
4058
|
+
)
|
|
4059
|
+
)
|
|
4060
|
+
|
|
4061
|
+
except Exception as e:
|
|
4062
|
+
writerr(colored("ERROR getGhostArchiveWARC 1: " + str(e), "red"))
|
|
4063
|
+
|
|
4064
|
+
|
|
4065
|
+
def format_date_for_urlscan(date_str):
|
|
4066
|
+
# Handle different lengths of input
|
|
4067
|
+
if len(date_str) == 4: # YYYY
|
|
4068
|
+
date_str += "0101"
|
|
4069
|
+
elif len(date_str) == 6: # YYYYMM
|
|
4070
|
+
date_str += "01"
|
|
4071
|
+
|
|
4072
|
+
# Convert to YYYY-MM-DD format
|
|
4073
|
+
try:
|
|
4074
|
+
formatted_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
|
|
4075
|
+
return formatted_date
|
|
4076
|
+
except Exception:
|
|
4077
|
+
return ""
|
|
4078
|
+
|
|
4079
|
+
|
|
4080
|
+
def getURLScanUrls():
|
|
4081
|
+
"""
|
|
4082
|
+
Get URLs from the URLSCan API, urlscan.io
|
|
4083
|
+
"""
|
|
4084
|
+
global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
|
|
4085
|
+
|
|
4086
|
+
# Write the file of URL's for the passed domain/URL
|
|
4087
|
+
try:
|
|
4088
|
+
requestsMade = 0
|
|
4089
|
+
stopSourceURLScan = False
|
|
4090
|
+
linksFoundURLScan = set()
|
|
4091
|
+
totalUrls = 0
|
|
4092
|
+
checkResponse = True
|
|
4093
|
+
|
|
4094
|
+
# Set the URL to just the hostname
|
|
4095
|
+
url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
|
|
4096
|
+
|
|
4097
|
+
# If the --from-date or --to-date parameters were paassed then also add a date filter
|
|
4098
|
+
if args.from_date or args.to_date:
|
|
4099
|
+
if args.from_date:
|
|
4100
|
+
fromDate = format_date_for_urlscan(str(args.from_date)[:8])
|
|
4101
|
+
else:
|
|
4102
|
+
fromDate = "2016-01-01" # The year URLScan started
|
|
4103
|
+
if args.to_date:
|
|
4104
|
+
toDate = format_date_for_urlscan(str(args.to_date)[:8])
|
|
4105
|
+
else:
|
|
4106
|
+
toDate = "now"
|
|
4107
|
+
url = url.replace("{DATERANGE}", f"%20date:[{fromDate}%20TO%20{toDate}]")
|
|
4108
|
+
else:
|
|
4109
|
+
url = url.replace("{DATERANGE}", "")
|
|
4110
|
+
|
|
4111
|
+
if verbose():
|
|
4112
|
+
if args.mode == "R":
|
|
4113
|
+
write(
|
|
4114
|
+
colored(
|
|
4115
|
+
"URLScan - [ INFO ] The URLScan URL requested to get links for responses: ",
|
|
4116
|
+
"magenta",
|
|
4117
|
+
)
|
|
4118
|
+
+ colored(url + "\n", "white")
|
|
4119
|
+
)
|
|
4120
|
+
else:
|
|
4121
|
+
write(
|
|
4122
|
+
colored(
|
|
4123
|
+
"URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
|
|
4124
|
+
)
|
|
4125
|
+
+ colored(url + "\n", "white")
|
|
4126
|
+
)
|
|
4127
|
+
|
|
4128
|
+
if args.mode in ("U", "B") and not args.check_only:
|
|
4129
|
+
write(
|
|
4130
|
+
colored(
|
|
4131
|
+
"URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
|
|
4132
|
+
"cyan",
|
|
4133
|
+
)
|
|
4134
|
+
)
|
|
4135
|
+
|
|
4136
|
+
# Get the first page from urlscan.io
|
|
4137
|
+
try:
|
|
4138
|
+
# Choose a random user agent string to use for any requests
|
|
4139
|
+
# For other sources we would use `random.choice(USER_AGENT)` to asignn a random user-agent, but it seems
|
|
4140
|
+
# that there are a handful of those that ALWAYS return 429. Passing a specific one all the time seems to
|
|
4141
|
+
# be successful all the time
|
|
4142
|
+
userAgent = "waymore v" + __version__ + " by xnl-h4ck3r"
|
|
4143
|
+
session = requests.Session()
|
|
4144
|
+
session.mount("https://", HTTP_ADAPTER)
|
|
4145
|
+
session.mount("http://", HTTP_ADAPTER)
|
|
4146
|
+
# Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
|
|
4147
|
+
resp = session.get(url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY})
|
|
4148
|
+
requestsMade = requestsMade + 1
|
|
4149
|
+
except Exception as e:
|
|
4150
|
+
write(
|
|
4151
|
+
colored(
|
|
4152
|
+
"URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
|
|
4153
|
+
"red",
|
|
4154
|
+
)
|
|
4155
|
+
)
|
|
4156
|
+
return
|
|
3178
4157
|
|
|
3179
4158
|
# If the rate limit was reached then determine if to wait and then try again
|
|
3180
4159
|
if resp.status_code == 429:
|
|
@@ -3753,7 +4732,6 @@ def processWayBackPage(url):
|
|
|
3753
4732
|
pass
|
|
3754
4733
|
return
|
|
3755
4734
|
else:
|
|
3756
|
-
print("DEBUG: HERE END!") # DEBUG
|
|
3757
4735
|
pass
|
|
3758
4736
|
except Exception as e:
|
|
3759
4737
|
if verbose():
|
|
@@ -4935,80 +5913,373 @@ def processIntelxType(target, credits):
|
|
|
4935
5913
|
writerr(colored("ERROR processIntelxType 1: " + str(e), "red"))
|
|
4936
5914
|
|
|
4937
5915
|
|
|
4938
|
-
def getIntelxAccountInfo() -> str:
|
|
4939
|
-
"""
|
|
4940
|
-
Get the account info and return the number of Credits remaining from the /phonebook/search
|
|
4941
|
-
"""
|
|
4942
|
-
initIntelxTls()
|
|
4943
|
-
try:
|
|
4944
|
-
resp = chooseIntelxBase(INTELX_API_KEY)
|
|
4945
|
-
if resp is None or resp.status_code != 200:
|
|
4946
|
-
return "Unknown"
|
|
4947
|
-
jsonResp = json.loads(resp.text.strip())
|
|
4948
|
-
credits = str(
|
|
4949
|
-
jsonResp.get("paths", {}).get("/phonebook/search", {}).get("Credit", "Unknown")
|
|
4950
|
-
)
|
|
4951
|
-
credits_max = str(
|
|
4952
|
-
jsonResp.get("paths", {}).get("/phonebook/search", {}).get("CreditMax", "Unknown")
|
|
4953
|
-
)
|
|
4954
|
-
return credits + "/" + credits_max
|
|
4955
|
-
except Exception:
|
|
4956
|
-
return "Unknown"
|
|
5916
|
+
def getIntelxAccountInfo() -> str:
|
|
5917
|
+
"""
|
|
5918
|
+
Get the account info and return the number of Credits remaining from the /phonebook/search
|
|
5919
|
+
"""
|
|
5920
|
+
initIntelxTls()
|
|
5921
|
+
try:
|
|
5922
|
+
resp = chooseIntelxBase(INTELX_API_KEY)
|
|
5923
|
+
if resp is None or resp.status_code != 200:
|
|
5924
|
+
return "Unknown"
|
|
5925
|
+
jsonResp = json.loads(resp.text.strip())
|
|
5926
|
+
credits = str(
|
|
5927
|
+
jsonResp.get("paths", {}).get("/phonebook/search", {}).get("Credit", "Unknown")
|
|
5928
|
+
)
|
|
5929
|
+
credits_max = str(
|
|
5930
|
+
jsonResp.get("paths", {}).get("/phonebook/search", {}).get("CreditMax", "Unknown")
|
|
5931
|
+
)
|
|
5932
|
+
return credits + "/" + credits_max
|
|
5933
|
+
except Exception:
|
|
5934
|
+
return "Unknown"
|
|
5935
|
+
|
|
5936
|
+
|
|
5937
|
+
def getIntelxUrls():
|
|
5938
|
+
"""
|
|
5939
|
+
Get URLs from the Intelligence X Phonebook search
|
|
5940
|
+
"""
|
|
5941
|
+
global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSourceIntelx, argsInput, checkIntelx, argsInputHostname, intelxAPIIssue, linkCountIntelx, linksFoundIntelx
|
|
5942
|
+
|
|
5943
|
+
# Write the file of URL's for the passed domain/URL
|
|
5944
|
+
try:
|
|
5945
|
+
if args.check_only:
|
|
5946
|
+
write(
|
|
5947
|
+
colored("IntelX - [ INFO ] Get URLs from Intelligence X: ", "cyan")
|
|
5948
|
+
+ colored("minimum 4 requests", "white")
|
|
5949
|
+
)
|
|
5950
|
+
checkIntelx = 4
|
|
5951
|
+
return
|
|
5952
|
+
|
|
5953
|
+
stopSourceIntelx = False
|
|
5954
|
+
linksFoundIntelx = set()
|
|
5955
|
+
initIntelxTls()
|
|
5956
|
+
|
|
5957
|
+
credits = getIntelxAccountInfo()
|
|
5958
|
+
if verbose():
|
|
5959
|
+
write(
|
|
5960
|
+
colored(
|
|
5961
|
+
"IntelX - [ INFO ] The Intelligence X URL requested to get links (Credits: "
|
|
5962
|
+
+ credits
|
|
5963
|
+
+ "): ",
|
|
5964
|
+
"magenta",
|
|
5965
|
+
)
|
|
5966
|
+
+ colored(intelx_tls.INTELX_SEARCH_URL + "\n", "white")
|
|
5967
|
+
)
|
|
5968
|
+
|
|
5969
|
+
if not args.check_only:
|
|
5970
|
+
write(colored("IntelX - [ INFO ] Getting links from intelx.io API...", "cyan"))
|
|
5971
|
+
|
|
5972
|
+
# Get the domains from Intelligence X if the --no-subs wasn't passed
|
|
5973
|
+
if not args.no_subs:
|
|
5974
|
+
processIntelxType(1, credits)
|
|
5975
|
+
|
|
5976
|
+
# Get the URLs from Intelligence X
|
|
5977
|
+
if not intelxAPIIssue:
|
|
5978
|
+
processIntelxType(3, credits)
|
|
5979
|
+
|
|
5980
|
+
linkCountIntelx = len(linksFoundIntelx)
|
|
5981
|
+
write(
|
|
5982
|
+
colored("IntelX - [ INFO ] Links found on intelx.io: ", "cyan")
|
|
5983
|
+
+ colored(str(linkCountIntelx), "white")
|
|
5984
|
+
)
|
|
5985
|
+
linksFound.update(linksFoundIntelx)
|
|
5986
|
+
linksFoundIntelx.clear()
|
|
5987
|
+
|
|
5988
|
+
except Exception as e:
|
|
5989
|
+
writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
|
|
5990
|
+
|
|
5991
|
+
|
|
5992
|
+
def processGhostArchiveUrl(url, ghostArchiveID=""):
|
|
5993
|
+
"""
|
|
5994
|
+
Process a specific URL from ghostarchive.org to determine whether to save the link
|
|
5995
|
+
"""
|
|
5996
|
+
global argsInput, argsInputHostname, links_lock, linkCountGhostArchive, linksFoundGhostArchive
|
|
5997
|
+
|
|
5998
|
+
addLink = True
|
|
5999
|
+
|
|
6000
|
+
try:
|
|
6001
|
+
# Strip Wayback Machine prefix if present (e.g., https://web.archive.org/web/20230101120000_/https://example.com)
|
|
6002
|
+
waybackMatch = re.match(r"^https?://web\.archive\.org/[^/]+/[a-zA-Z0-9]+_/", url)
|
|
6003
|
+
if waybackMatch:
|
|
6004
|
+
url = url[waybackMatch.end() :]
|
|
6005
|
+
|
|
6006
|
+
# If the input has a / in it, then a URL was passed, so the link will only be added if the URL matches
|
|
6007
|
+
if "/" in url:
|
|
6008
|
+
if argsInput not in url:
|
|
6009
|
+
addLink = False
|
|
6010
|
+
|
|
6011
|
+
# If filters are required then test them
|
|
6012
|
+
if addLink and not args.filter_responses_only:
|
|
6013
|
+
|
|
6014
|
+
# If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
|
|
6015
|
+
if args.no_subs:
|
|
6016
|
+
match = re.search(
|
|
6017
|
+
r"^[A-za-z]*\:\/\/(www\.)?" + re.escape(argsInputHostname),
|
|
6018
|
+
url,
|
|
6019
|
+
flags=re.IGNORECASE,
|
|
6020
|
+
)
|
|
6021
|
+
if match is None:
|
|
6022
|
+
addLink = False
|
|
6023
|
+
|
|
6024
|
+
# If the user didn't requested -f / --filter-responses-only then check http code
|
|
6025
|
+
if addLink and not args.filter_responses_only:
|
|
6026
|
+
|
|
6027
|
+
# Check the URL exclusions
|
|
6028
|
+
if addLink:
|
|
6029
|
+
match = re.search(
|
|
6030
|
+
r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
|
|
6031
|
+
url,
|
|
6032
|
+
flags=re.IGNORECASE,
|
|
6033
|
+
)
|
|
6034
|
+
if match is not None:
|
|
6035
|
+
addLink = False
|
|
6036
|
+
|
|
6037
|
+
# Set keywords filter if -ko argument passed
|
|
6038
|
+
if addLink and args.keywords_only:
|
|
6039
|
+
if args.keywords_only == "#CONFIG":
|
|
6040
|
+
match = re.search(
|
|
6041
|
+
r"(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ")",
|
|
6042
|
+
url,
|
|
6043
|
+
flags=re.IGNORECASE,
|
|
6044
|
+
)
|
|
6045
|
+
else:
|
|
6046
|
+
match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
|
|
6047
|
+
if match is None:
|
|
6048
|
+
addLink = False
|
|
6049
|
+
|
|
6050
|
+
# Add link if it passed filters
|
|
6051
|
+
if addLink:
|
|
6052
|
+
# Just get the hostname of the url
|
|
6053
|
+
tldExtract = tldextract.extract(url)
|
|
6054
|
+
subDomain = tldExtract.subdomain
|
|
6055
|
+
if subDomain != "":
|
|
6056
|
+
subDomain = subDomain + "."
|
|
6057
|
+
domainOnly = subDomain + tldExtract.domain + "." + tldExtract.suffix
|
|
6058
|
+
|
|
6059
|
+
# GhostArchive might return URLs that aren't for the domain passed so we need to check for those and not process them
|
|
6060
|
+
# Check the URL
|
|
6061
|
+
match = re.search(
|
|
6062
|
+
r"(^|\.)" + re.escape(argsInputHostname) + "$",
|
|
6063
|
+
domainOnly,
|
|
6064
|
+
flags=re.IGNORECASE,
|
|
6065
|
+
)
|
|
6066
|
+
if match is not None:
|
|
6067
|
+
if args.mode in ("U", "B"):
|
|
6068
|
+
linksFoundAdd(url, linksFoundGhostArchive)
|
|
6069
|
+
# If Response mode is requested then add the DOM ID to try later, for the number of responses wanted
|
|
6070
|
+
if ghostArchiveID != "" and args.mode in ("R", "B"):
|
|
6071
|
+
if args.limit == 0 or len(ghostArchiveRequestLinks) < args.limit:
|
|
6072
|
+
with links_lock:
|
|
6073
|
+
ghostArchiveRequestLinks.add(
|
|
6074
|
+
(url, GHOSTARCHIVE_DOM_URL + ghostArchiveID)
|
|
6075
|
+
)
|
|
6076
|
+
|
|
6077
|
+
except Exception as e:
|
|
6078
|
+
writerr(colored("ERROR processGhostArchiveUrl 1: " + str(e), "red"))
|
|
6079
|
+
|
|
6080
|
+
|
|
6081
|
+
def getGhostArchiveUrls():
|
|
6082
|
+
"""
|
|
6083
|
+
Get URLs from GhostArchive (ghostarchive.org)
|
|
6084
|
+
This source doesn't have an API, so we crawl the HTML pages directly.
|
|
6085
|
+
"""
|
|
6086
|
+
global linksFound, path, subs, stopProgram, stopSourceGhostArchive, argsInput, checkGhostArchive, argsInputHostname, linkCountGhostArchive, linksFoundGhostArchive
|
|
6087
|
+
|
|
6088
|
+
try:
|
|
6089
|
+
stopSourceGhostArchive = False
|
|
6090
|
+
linksFoundGhostArchive = set()
|
|
6091
|
+
|
|
6092
|
+
# Build the base URL
|
|
6093
|
+
# If there is only one . in the hostname, we can guarantee that a subdoman wasn't passed, so we can prefix with . to the links quicker as it won't include other domains that end with the target domain,
|
|
6094
|
+
# Else, we need to get all and then confirm the actual host of the links later
|
|
6095
|
+
if argsInputHostname.count(".") == 1:
|
|
6096
|
+
baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", "." + quote(argsInput))
|
|
6097
|
+
else:
|
|
6098
|
+
baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", quote(argsInput))
|
|
6099
|
+
|
|
6100
|
+
if verbose():
|
|
6101
|
+
write(
|
|
6102
|
+
colored("GhostArchive - [ INFO ] The URL requested to get links: ", "magenta")
|
|
6103
|
+
+ colored(baseUrl + "0\n", "white")
|
|
6104
|
+
)
|
|
6105
|
+
|
|
6106
|
+
if not args.check_only and args.mode == "U":
|
|
6107
|
+
write(
|
|
6108
|
+
colored(
|
|
6109
|
+
"GhostArchive - [ INFO ] Getting links from ghostarchive.org (this can take a while for some domains)...",
|
|
6110
|
+
"cyan",
|
|
6111
|
+
)
|
|
6112
|
+
)
|
|
6113
|
+
|
|
6114
|
+
# Set up session with cookie
|
|
6115
|
+
session = requests.Session()
|
|
6116
|
+
if HTTP_ADAPTER is not None:
|
|
6117
|
+
session.mount("https://", HTTP_ADAPTER)
|
|
6118
|
+
session.mount("http://", HTTP_ADAPTER)
|
|
6119
|
+
|
|
6120
|
+
userAgent = random.choice(USER_AGENT)
|
|
6121
|
+
headers = {"User-Agent": userAgent}
|
|
6122
|
+
cookies = {"theme": "original"}
|
|
6123
|
+
|
|
6124
|
+
pageNum = 0
|
|
6125
|
+
|
|
6126
|
+
while stopProgram is None and not stopSourceGhostArchive:
|
|
6127
|
+
getMemory()
|
|
6128
|
+
|
|
6129
|
+
url = baseUrl + str(pageNum)
|
|
6130
|
+
|
|
6131
|
+
try:
|
|
6132
|
+
resp = session.get(url, headers=headers, cookies=cookies, timeout=DEFAULT_TIMEOUT)
|
|
6133
|
+
except Exception as e:
|
|
6134
|
+
writerr(
|
|
6135
|
+
colored(
|
|
6136
|
+
"GhostArchive - [ ERR ] Unable to get page " + str(pageNum) + ": " + str(e),
|
|
6137
|
+
"red",
|
|
6138
|
+
)
|
|
6139
|
+
)
|
|
6140
|
+
break
|
|
6141
|
+
|
|
6142
|
+
if resp.status_code == 429:
|
|
6143
|
+
writerr(
|
|
6144
|
+
colored(
|
|
6145
|
+
"GhostArchive - [ 429 ] Rate limit reached at page " + str(pageNum) + ".",
|
|
6146
|
+
"red",
|
|
6147
|
+
)
|
|
6148
|
+
)
|
|
6149
|
+
break
|
|
6150
|
+
|
|
6151
|
+
# Check for maintenance/end of results indicator
|
|
6152
|
+
if (
|
|
6153
|
+
resp.status_code == 503
|
|
6154
|
+
or "The site is under maintenance and will be back soon" in resp.text
|
|
6155
|
+
or "No archives for that site" in resp.text
|
|
6156
|
+
):
|
|
6157
|
+
if verbose():
|
|
6158
|
+
if pageNum == 0:
|
|
6159
|
+
if args.check_only:
|
|
6160
|
+
checkGhostArchive = 1
|
|
6161
|
+
write(
|
|
6162
|
+
colored(
|
|
6163
|
+
"GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
|
|
6164
|
+
)
|
|
6165
|
+
+ colored("1 request", "white")
|
|
6166
|
+
)
|
|
6167
|
+
else:
|
|
6168
|
+
write(
|
|
6169
|
+
colored(
|
|
6170
|
+
"GhostArchive - [ INFO ] No results found",
|
|
6171
|
+
"cyan",
|
|
6172
|
+
)
|
|
6173
|
+
)
|
|
6174
|
+
else:
|
|
6175
|
+
write(
|
|
6176
|
+
colored(
|
|
6177
|
+
"GhostArchive - [ INFO ] Retrieved all results from "
|
|
6178
|
+
+ str(pageNum)
|
|
6179
|
+
+ " pages",
|
|
6180
|
+
"cyan",
|
|
6181
|
+
)
|
|
6182
|
+
)
|
|
6183
|
+
break
|
|
6184
|
+
if resp.status_code != 200:
|
|
6185
|
+
writerr(
|
|
6186
|
+
colored(
|
|
6187
|
+
"GhostArchive - [ ERR ] [ "
|
|
6188
|
+
+ str(resp.status_code)
|
|
6189
|
+
+ " ] at page "
|
|
6190
|
+
+ str(pageNum),
|
|
6191
|
+
"red",
|
|
6192
|
+
)
|
|
6193
|
+
)
|
|
6194
|
+
break
|
|
6195
|
+
|
|
6196
|
+
# Check only mode - just count pages
|
|
6197
|
+
if args.check_only:
|
|
6198
|
+
# For check only, we check if there are results and try to get total count
|
|
6199
|
+
if pageNum == 0:
|
|
6200
|
+
# Check if there are any results on the first page
|
|
6201
|
+
if '<a href="/archive/' in resp.text:
|
|
6202
|
+
# Try to find "out of X" to determine total results/pages
|
|
6203
|
+
outOfMatch = re.search(r"out of (\d+)", resp.text)
|
|
6204
|
+
if outOfMatch:
|
|
6205
|
+
totalResults = int(outOfMatch.group(1))
|
|
6206
|
+
checkGhostArchive = totalResults
|
|
6207
|
+
write(
|
|
6208
|
+
colored(
|
|
6209
|
+
"GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
|
|
6210
|
+
)
|
|
6211
|
+
+ colored(f"{totalResults} requests (pagination required)", "white")
|
|
6212
|
+
)
|
|
6213
|
+
else:
|
|
6214
|
+
checkGhostArchive = 1
|
|
6215
|
+
write(
|
|
6216
|
+
colored(
|
|
6217
|
+
"GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
|
|
6218
|
+
)
|
|
6219
|
+
+ colored("unknown requests (pagination required)", "white")
|
|
6220
|
+
)
|
|
6221
|
+
else:
|
|
6222
|
+
checkGhostArchive = 1
|
|
6223
|
+
write(
|
|
6224
|
+
colored("GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan")
|
|
6225
|
+
+ colored("1 request (no results)", "white")
|
|
6226
|
+
)
|
|
6227
|
+
break
|
|
6228
|
+
|
|
6229
|
+
# Use regex to extract URLs from anchor tag text content
|
|
6230
|
+
# Pattern matches: <a href="/archive/ID">URL_HERE</a> - captures both href path and URL
|
|
6231
|
+
pattern = r'<a href="(/archive/[^"]*)">([^<]+)</a>'
|
|
6232
|
+
matches = re.findall(pattern, resp.text)
|
|
4957
6233
|
|
|
6234
|
+
# If no matches found, we've reached the end of results
|
|
6235
|
+
if not matches:
|
|
6236
|
+
if verbose():
|
|
6237
|
+
write(
|
|
6238
|
+
colored(
|
|
6239
|
+
"GhostArchive - [ INFO ] Retrieved all results from "
|
|
6240
|
+
+ str(pageNum + 1)
|
|
6241
|
+
+ " pages",
|
|
6242
|
+
"cyan",
|
|
6243
|
+
)
|
|
6244
|
+
)
|
|
6245
|
+
break
|
|
4958
6246
|
|
|
4959
|
-
|
|
4960
|
-
|
|
4961
|
-
|
|
4962
|
-
|
|
4963
|
-
global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSourceIntelx, argsInput, checkIntelx, argsInputHostname, intelxAPIIssue, linkCountIntelx, linksFoundIntelx
|
|
6247
|
+
for match in matches:
|
|
6248
|
+
ghostArchiveId = match[0] # e.g., "/archive/gkOOR"
|
|
6249
|
+
potentialUrl = match[1].strip()
|
|
6250
|
+
processGhostArchiveUrl(potentialUrl, ghostArchiveId)
|
|
4964
6251
|
|
|
4965
|
-
|
|
4966
|
-
|
|
4967
|
-
|
|
4968
|
-
|
|
4969
|
-
|
|
4970
|
-
|
|
4971
|
-
|
|
4972
|
-
|
|
4973
|
-
|
|
6252
|
+
# Check if there's a "Next Page" link - if not, we've reached the last page
|
|
6253
|
+
# GhostArchive resets to Page 1 when exceeding actual pages, so checking for Next Page is essential
|
|
6254
|
+
if "Next Page" not in resp.text and ">»</a>" not in resp.text:
|
|
6255
|
+
if verbose():
|
|
6256
|
+
write(
|
|
6257
|
+
colored(
|
|
6258
|
+
"GhostArchive - [ INFO ] Retrieved all results from "
|
|
6259
|
+
+ str(pageNum + 1)
|
|
6260
|
+
+ " pages",
|
|
6261
|
+
"cyan",
|
|
6262
|
+
)
|
|
6263
|
+
)
|
|
6264
|
+
break
|
|
4974
6265
|
|
|
4975
|
-
|
|
4976
|
-
linksFoundIntelx = set()
|
|
4977
|
-
initIntelxTls()
|
|
6266
|
+
pageNum += 1
|
|
4978
6267
|
|
|
4979
|
-
|
|
4980
|
-
|
|
6268
|
+
if not args.check_only:
|
|
6269
|
+
# Count links based on mode - in R mode, count response links; in U/B mode, count URL links
|
|
6270
|
+
if args.mode == "R":
|
|
6271
|
+
linkCountGhostArchive = len(ghostArchiveRequestLinks)
|
|
6272
|
+
else:
|
|
6273
|
+
linkCountGhostArchive = len(linksFoundGhostArchive)
|
|
4981
6274
|
write(
|
|
4982
|
-
colored(
|
|
4983
|
-
|
|
4984
|
-
+ credits
|
|
4985
|
-
+ "): ",
|
|
4986
|
-
"magenta",
|
|
4987
|
-
)
|
|
4988
|
-
+ colored(intelx_tls.INTELX_SEARCH_URL + "\n", "white")
|
|
6275
|
+
colored("GhostArchive - [ INFO ] Links found on ghostarchive.org: ", "cyan")
|
|
6276
|
+
+ colored(str(linkCountGhostArchive), "white")
|
|
4989
6277
|
)
|
|
4990
|
-
|
|
4991
|
-
|
|
4992
|
-
write(colored("IntelX - [ INFO ] Getting links from intelx.io API...", "cyan"))
|
|
4993
|
-
|
|
4994
|
-
# Get the domains from Intelligence X if the --no-subs wasn't passed
|
|
4995
|
-
if not args.no_subs:
|
|
4996
|
-
processIntelxType(1, credits)
|
|
4997
|
-
|
|
4998
|
-
# Get the URLs from Intelligence X
|
|
4999
|
-
if not intelxAPIIssue:
|
|
5000
|
-
processIntelxType(3, credits)
|
|
5001
|
-
|
|
5002
|
-
linkCountIntelx = len(linksFoundIntelx)
|
|
5003
|
-
write(
|
|
5004
|
-
colored("IntelX - [ INFO ] Links found on intelx.io: ", "cyan")
|
|
5005
|
-
+ colored(str(linkCountIntelx), "white")
|
|
5006
|
-
)
|
|
5007
|
-
linksFound.update(linksFoundIntelx)
|
|
5008
|
-
linksFoundIntelx.clear()
|
|
6278
|
+
linksFound.update(linksFoundGhostArchive)
|
|
6279
|
+
linksFoundGhostArchive.clear()
|
|
5009
6280
|
|
|
5010
6281
|
except Exception as e:
|
|
5011
|
-
writerr(colored("ERROR
|
|
6282
|
+
writerr(colored("ERROR getGhostArchiveUrls 1: " + str(e), "red"))
|
|
5012
6283
|
|
|
5013
6284
|
|
|
5014
6285
|
def processResponses():
|
|
@@ -5018,6 +6289,10 @@ def processResponses():
|
|
|
5018
6289
|
global stopProgram, totalFileCount
|
|
5019
6290
|
try:
|
|
5020
6291
|
|
|
6292
|
+
# Get responses from GhostArchive unless excluded
|
|
6293
|
+
if stopProgram is None and not args.xga:
|
|
6294
|
+
processResponsesGhostArchive()
|
|
6295
|
+
|
|
5021
6296
|
# Get responses from URLScan unless excluded
|
|
5022
6297
|
if stopProgram is None and not args.xus:
|
|
5023
6298
|
processResponsesURLScan()
|
|
@@ -5039,6 +6314,235 @@ def processResponses():
|
|
|
5039
6314
|
writerr(colored(getSPACER("ERROR processResponses 1: " + str(e)), "red"))
|
|
5040
6315
|
|
|
5041
6316
|
|
|
6317
|
+
def processResponsesGhostArchive():
|
|
6318
|
+
"""
|
|
6319
|
+
Get archived responses from GhostArchive (ghostarchive.org)
|
|
6320
|
+
"""
|
|
6321
|
+
global subs, path, indexFile, totalResponses, stopProgram, argsInput, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, ghostArchiveRequestLinks, failureCount, totalFileCount, checkGhostArchive
|
|
6322
|
+
try:
|
|
6323
|
+
fileCount = 0
|
|
6324
|
+
failureCount = 0
|
|
6325
|
+
if not args.check_only:
|
|
6326
|
+
# Create 'results' and domain directory if needed
|
|
6327
|
+
createDirs()
|
|
6328
|
+
|
|
6329
|
+
# Get the path of the files, depending on whether -oR / --output_responses was passed
|
|
6330
|
+
try:
|
|
6331
|
+
responsesPath = responseOutputDirectory + "responses.GhostArchive.tmp"
|
|
6332
|
+
indexPath = responseOutputDirectory + "waymore_index.txt"
|
|
6333
|
+
except Exception as e:
|
|
6334
|
+
if verbose():
|
|
6335
|
+
writerr(colored("ERROR processResponsesGhostArchive 4: " + str(e), "red"))
|
|
6336
|
+
|
|
6337
|
+
# Get URLs from GhostArchive if the DOM ID's haven't been retrieved yet
|
|
6338
|
+
if stopProgram is None and not args.check_only:
|
|
6339
|
+
if args.mode in ("R", "B"):
|
|
6340
|
+
write(
|
|
6341
|
+
colored(
|
|
6342
|
+
"GhostArchive - [ INFO ] Getting list of response links (this can take a while for some domains)...",
|
|
6343
|
+
"cyan",
|
|
6344
|
+
)
|
|
6345
|
+
)
|
|
6346
|
+
if args.mode == "R":
|
|
6347
|
+
getGhostArchiveUrls()
|
|
6348
|
+
|
|
6349
|
+
# Check if a responses.GhostArchive.tmp files exists
|
|
6350
|
+
if not args.check_only and os.path.exists(responsesPath):
|
|
6351
|
+
|
|
6352
|
+
# Load the links into the set
|
|
6353
|
+
with open(responsesPath, "rb") as fl:
|
|
6354
|
+
linkRequests = pickle.load(fl)
|
|
6355
|
+
|
|
6356
|
+
# Set start point
|
|
6357
|
+
successCount = 0
|
|
6358
|
+
|
|
6359
|
+
# Get the URLScan DOM links
|
|
6360
|
+
linkRequests = []
|
|
6361
|
+
for originalUrl, domUrl in ghostArchiveRequestLinks:
|
|
6362
|
+
linkRequests.append((originalUrl, domUrl))
|
|
6363
|
+
|
|
6364
|
+
# Write the links to a temp file
|
|
6365
|
+
if not args.check_only:
|
|
6366
|
+
with open(responsesPath, "wb") as f:
|
|
6367
|
+
pickle.dump(linkRequests, f)
|
|
6368
|
+
|
|
6369
|
+
# Get the total number of responses we will try to get and set the current file count to the success count
|
|
6370
|
+
totalResponses = len(linkRequests)
|
|
6371
|
+
checkGhostArchive = checkGhostArchive + totalResponses
|
|
6372
|
+
|
|
6373
|
+
# If there are no reponses to download, diaplay an error and exit
|
|
6374
|
+
if args.mode != "R" and totalResponses == 0:
|
|
6375
|
+
writerr(
|
|
6376
|
+
colored(
|
|
6377
|
+
getSPACER(
|
|
6378
|
+
"Failed to get responses from GhostArchive (ghostarchive.org) - check input and try again."
|
|
6379
|
+
),
|
|
6380
|
+
"red",
|
|
6381
|
+
)
|
|
6382
|
+
)
|
|
6383
|
+
return
|
|
6384
|
+
|
|
6385
|
+
fileCount = successCount
|
|
6386
|
+
|
|
6387
|
+
if args.check_only:
|
|
6388
|
+
writerr(
|
|
6389
|
+
colored("Downloading archived responses: ", "cyan")
|
|
6390
|
+
+ colored("UNKNOWN requests", "cyan")
|
|
6391
|
+
)
|
|
6392
|
+
writerr(
|
|
6393
|
+
colored(
|
|
6394
|
+
"\n-> Downloading the responses can vary depending on the target and the rate limiting on GhostArchive",
|
|
6395
|
+
"green",
|
|
6396
|
+
)
|
|
6397
|
+
)
|
|
6398
|
+
write("")
|
|
6399
|
+
else:
|
|
6400
|
+
# If the limit has been set over the default, give a warning that this could take a long time!
|
|
6401
|
+
if totalResponses - successCount > DEFAULT_LIMIT:
|
|
6402
|
+
if successCount > 0:
|
|
6403
|
+
writerr(
|
|
6404
|
+
colored(
|
|
6405
|
+
getSPACER(
|
|
6406
|
+
"WARNING: Downloading remaining "
|
|
6407
|
+
+ str(totalResponses - successCount)
|
|
6408
|
+
+ " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
|
|
6409
|
+
),
|
|
6410
|
+
"yellow",
|
|
6411
|
+
)
|
|
6412
|
+
)
|
|
6413
|
+
else:
|
|
6414
|
+
writerr(
|
|
6415
|
+
colored(
|
|
6416
|
+
getSPACER(
|
|
6417
|
+
"WARNING: Downloading "
|
|
6418
|
+
+ str(totalResponses)
|
|
6419
|
+
+ " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
|
|
6420
|
+
),
|
|
6421
|
+
"yellow",
|
|
6422
|
+
)
|
|
6423
|
+
)
|
|
6424
|
+
|
|
6425
|
+
# Open the index file if hash value is going to be used (not URL)
|
|
6426
|
+
if not args.url_filename:
|
|
6427
|
+
indexFile = open(indexPath, "a")
|
|
6428
|
+
|
|
6429
|
+
# Process the URLs from GhostArchive
|
|
6430
|
+
if stopProgram is None:
|
|
6431
|
+
p = mp.Pool(
|
|
6432
|
+
args.processes * 2
|
|
6433
|
+
) # Double the number of processes to speed up the download
|
|
6434
|
+
p.starmap(getGhostArchiveWARC, linkRequests[successCount:])
|
|
6435
|
+
p.close()
|
|
6436
|
+
p.join()
|
|
6437
|
+
|
|
6438
|
+
# Delete the tmp files now it has run successfully
|
|
6439
|
+
if stopProgram is None:
|
|
6440
|
+
try:
|
|
6441
|
+
os.remove(responsesPath)
|
|
6442
|
+
except Exception:
|
|
6443
|
+
pass
|
|
6444
|
+
|
|
6445
|
+
# Close the index file if hash value is going to be used (not URL)
|
|
6446
|
+
if not args.url_filename:
|
|
6447
|
+
indexFile.close()
|
|
6448
|
+
|
|
6449
|
+
if not args.check_only:
|
|
6450
|
+
try:
|
|
6451
|
+
if failureCount > 0:
|
|
6452
|
+
if verbose():
|
|
6453
|
+
write(
|
|
6454
|
+
colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
|
|
6455
|
+
+ colored(responseOutputDirectory, "white")
|
|
6456
|
+
+ colored(" for " + subs + argsInput + ": ", "cyan")
|
|
6457
|
+
+ colored(
|
|
6458
|
+
str(fileCount) + " 🤘",
|
|
6459
|
+
"white",
|
|
6460
|
+
)
|
|
6461
|
+
+ colored(" (" + str(failureCount) + " not found)\n", "red")
|
|
6462
|
+
)
|
|
6463
|
+
else:
|
|
6464
|
+
write(
|
|
6465
|
+
colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
|
|
6466
|
+
+ colored(responseOutputDirectory, "white")
|
|
6467
|
+
+ colored(" for " + subs + argsInput + ": ", "cyan")
|
|
6468
|
+
+ colored(str(fileCount) + " 🤘", "white")
|
|
6469
|
+
+ colored(" (" + str(failureCount) + " not found)\n", "red")
|
|
6470
|
+
)
|
|
6471
|
+
else:
|
|
6472
|
+
if verbose():
|
|
6473
|
+
write(
|
|
6474
|
+
colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
|
|
6475
|
+
+ colored(responseOutputDirectory, "white")
|
|
6476
|
+
+ colored(" for " + subs + argsInput + ": ", "cyan")
|
|
6477
|
+
+ colored(str(fileCount) + " 🤘\n", "white")
|
|
6478
|
+
)
|
|
6479
|
+
else:
|
|
6480
|
+
write(
|
|
6481
|
+
colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
|
|
6482
|
+
+ colored(responseOutputDirectory, "white")
|
|
6483
|
+
+ colored(" for " + subs + argsInput + ": ", "cyan")
|
|
6484
|
+
+ colored(str(fileCount) + " 🤘\n", "white")
|
|
6485
|
+
)
|
|
6486
|
+
except Exception as e:
|
|
6487
|
+
if verbose():
|
|
6488
|
+
writerr(colored("ERROR processResponsesGhostArchive 5: " + str(e), "red"))
|
|
6489
|
+
|
|
6490
|
+
# Append extra links from WARC files to URL output file (for mode B)
|
|
6491
|
+
try:
|
|
6492
|
+
if args.mode == "B" and len(extraWarcLinks) > 0:
|
|
6493
|
+
# Determine URL output file path (same logic as processURLOutput)
|
|
6494
|
+
if args.output_urls == "":
|
|
6495
|
+
if args.output_responses != "":
|
|
6496
|
+
urlFilePath = args.output_responses + "/waymore.txt"
|
|
6497
|
+
else:
|
|
6498
|
+
urlFilePath = (
|
|
6499
|
+
str(DEFAULT_OUTPUT_DIR)
|
|
6500
|
+
+ "/results/"
|
|
6501
|
+
+ str(argsInput).replace("/", "-")
|
|
6502
|
+
+ "/waymore.txt"
|
|
6503
|
+
)
|
|
6504
|
+
else:
|
|
6505
|
+
urlFilePath = args.output_urls
|
|
6506
|
+
|
|
6507
|
+
# Load existing URLs from file to avoid duplicates
|
|
6508
|
+
existingUrls = set()
|
|
6509
|
+
try:
|
|
6510
|
+
with open(urlFilePath) as f:
|
|
6511
|
+
for line in f:
|
|
6512
|
+
existingUrls.add(line.strip())
|
|
6513
|
+
except Exception:
|
|
6514
|
+
pass
|
|
6515
|
+
|
|
6516
|
+
# Append only new unique URLs
|
|
6517
|
+
newLinks = [
|
|
6518
|
+
url
|
|
6519
|
+
for url in extraWarcLinks
|
|
6520
|
+
if url not in existingUrls and url not in linksFound
|
|
6521
|
+
]
|
|
6522
|
+
if len(newLinks) > 0:
|
|
6523
|
+
with open(urlFilePath, "a") as f:
|
|
6524
|
+
for url in newLinks:
|
|
6525
|
+
f.write(url + "\n")
|
|
6526
|
+
|
|
6527
|
+
# Display message about extra links
|
|
6528
|
+
write(
|
|
6529
|
+
colored("GhostArchive - [ INFO ] ", "cyan")
|
|
6530
|
+
+ colored(str(len(newLinks)), "white")
|
|
6531
|
+
+ colored(" extra links found in WARC files added to file ", "cyan")
|
|
6532
|
+
+ colored(urlFilePath, "white")
|
|
6533
|
+
+ "\n"
|
|
6534
|
+
)
|
|
6535
|
+
except Exception as e:
|
|
6536
|
+
if verbose():
|
|
6537
|
+
writerr(colored("ERROR processResponsesGhostArchive 6: " + str(e), "red"))
|
|
6538
|
+
|
|
6539
|
+
totalFileCount = totalFileCount + fileCount
|
|
6540
|
+
except Exception as e:
|
|
6541
|
+
writerr(colored(getSPACER("ERROR processResponsesGhostArchive 1: " + str(e)), "red"))
|
|
6542
|
+
finally:
|
|
6543
|
+
linkRequests = None
|
|
6544
|
+
|
|
6545
|
+
|
|
5042
6546
|
def processResponsesURLScan():
|
|
5043
6547
|
"""
|
|
5044
6548
|
Get archived responses from URLScan (urlscan.io)
|
|
@@ -6254,6 +7758,12 @@ async def fetch_intelx_async():
|
|
|
6254
7758
|
await loop.run_in_executor(None, getIntelxUrls)
|
|
6255
7759
|
|
|
6256
7760
|
|
|
7761
|
+
async def fetch_ghostarchive_async():
|
|
7762
|
+
"""Async wrapper for getGhostArchiveUrls - runs in thread pool"""
|
|
7763
|
+
loop = asyncio.get_event_loop()
|
|
7764
|
+
await loop.run_in_executor(None, getGhostArchiveUrls)
|
|
7765
|
+
|
|
7766
|
+
|
|
6257
7767
|
async def fetch_all_sources_async():
|
|
6258
7768
|
"""
|
|
6259
7769
|
Orchestrator function to fetch from all enabled sources concurrently.
|
|
@@ -6276,6 +7786,8 @@ async def fetch_all_sources_async():
|
|
|
6276
7786
|
tasks.append(("VirusTotal", fetch_virustotal_async()))
|
|
6277
7787
|
if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
|
|
6278
7788
|
tasks.append(("Intelligence X", fetch_intelx_async()))
|
|
7789
|
+
if not args.xga and stopProgram is None:
|
|
7790
|
+
tasks.append(("GhostArchive", fetch_ghostarchive_async()))
|
|
6279
7791
|
|
|
6280
7792
|
if not tasks:
|
|
6281
7793
|
return
|
|
@@ -6301,7 +7813,7 @@ async def fetch_all_sources_async():
|
|
|
6301
7813
|
|
|
6302
7814
|
# Run waymore
|
|
6303
7815
|
def main():
|
|
6304
|
-
global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx
|
|
7816
|
+
global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, extraWarcLinks
|
|
6305
7817
|
|
|
6306
7818
|
# Tell Python to run the handler() function when SIGINT is received
|
|
6307
7819
|
signal(SIGINT, handler)
|
|
@@ -6457,13 +7969,19 @@ def main():
|
|
|
6457
7969
|
help="Exclude checks for links from intelx.io",
|
|
6458
7970
|
default=False,
|
|
6459
7971
|
)
|
|
7972
|
+
parser.add_argument(
|
|
7973
|
+
"-xga",
|
|
7974
|
+
action="store_true",
|
|
7975
|
+
help="Exclude checks for links from ghostarchive.org",
|
|
7976
|
+
default=False,
|
|
7977
|
+
)
|
|
6460
7978
|
parser.add_argument(
|
|
6461
7979
|
"--providers",
|
|
6462
7980
|
action="store",
|
|
6463
|
-
help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal and
|
|
7981
|
+
help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal,intelx and ghostarchive. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.",
|
|
6464
7982
|
default=[],
|
|
6465
7983
|
type=validateArgProviders,
|
|
6466
|
-
metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx}",
|
|
7984
|
+
metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive}",
|
|
6467
7985
|
)
|
|
6468
7986
|
parser.add_argument(
|
|
6469
7987
|
"-lcc",
|
|
@@ -6630,6 +8148,10 @@ def main():
|
|
|
6630
8148
|
args.xix = True
|
|
6631
8149
|
else:
|
|
6632
8150
|
args.xix = False
|
|
8151
|
+
if "ghostarchive" not in args.providers:
|
|
8152
|
+
args.xga = True
|
|
8153
|
+
else:
|
|
8154
|
+
args.xga = False
|
|
6633
8155
|
|
|
6634
8156
|
# If no input was given, raise an error
|
|
6635
8157
|
if sys.stdin.isatty():
|
|
@@ -6700,6 +8222,7 @@ def main():
|
|
|
6700
8222
|
# Reset global variables
|
|
6701
8223
|
linksFound = set()
|
|
6702
8224
|
linkMimes = set()
|
|
8225
|
+
extraWarcLinks = set()
|
|
6703
8226
|
successCount = 0
|
|
6704
8227
|
failureCount = 0
|
|
6705
8228
|
fileCount = 0
|
|
@@ -6714,6 +8237,7 @@ def main():
|
|
|
6714
8237
|
stopSourceURLScan = False
|
|
6715
8238
|
stopSourceVirusTotal = False
|
|
6716
8239
|
stopSourceIntelx = False
|
|
8240
|
+
stopSourceGhostArchive = False
|
|
6717
8241
|
|
|
6718
8242
|
# Get the config settings from the config.yml file
|
|
6719
8243
|
getConfig()
|