waymore 7.6__py3-none-any.whl → 8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/waymore.py CHANGED
@@ -70,6 +70,7 @@ stopSourceAlienVault = False
70
70
  stopSourceURLScan = False
71
71
  stopSourceVirusTotal = False
72
72
  stopSourceIntelx = False
73
+ stopSourceGhostArchive = False
73
74
  successCount = 0
74
75
  failureCount = 0
75
76
  fileCount = 0
@@ -79,6 +80,7 @@ totalPages = 0
79
80
  indexFile = None
80
81
  continueRespFile = None
81
82
  continueRespFileURLScan = None
83
+ continueRespFileGhostArchive = None
82
84
  inputIsDomainANDPath = False
83
85
  inputIsSubDomain = False
84
86
  subs = "*."
@@ -102,6 +104,7 @@ checkAlienVault = 0
102
104
  checkURLScan = 0
103
105
  checkVirusTotal = 0
104
106
  checkIntelx = 0
107
+ checkGhostArchive = 0
105
108
  argsInputHostname = ""
106
109
  responseOutputDirectory = ""
107
110
  urlscanRequestLinks = set()
@@ -112,11 +115,14 @@ linkCountAlienVault = 0
112
115
  linkCountURLScan = 0
113
116
  linkCountVirusTotal = 0
114
117
  linkCountIntelx = 0
118
+ linkCountGhostArchive = 0
115
119
  linksFoundCommonCrawl = set()
116
120
  linksFoundAlienVault = set()
117
121
  linksFoundURLScan = set()
118
122
  linksFoundVirusTotal = set()
119
123
  linksFoundIntelx = set()
124
+ linksFoundGhostArchive = set()
125
+ ghostArchiveRequestLinks = set()
120
126
 
121
127
  # Thread lock for protecting shared state during concurrent operations
122
128
  links_lock = threading.Lock()
@@ -124,6 +130,7 @@ links_lock = threading.Lock()
124
130
  # Shared state for link collection across all sources
125
131
  linksFound = set()
126
132
  linkMimes = set()
133
+ extraWarcLinks = set() # Track extra URLs found in WARC files for mode B
127
134
 
128
135
  # Source Provider URLs
129
136
  WAYBACK_URL = "https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest"
@@ -134,6 +141,8 @@ URLSCAN_DOM_URL = "https://urlscan.io/dom/"
134
141
  VIRUSTOTAL_URL = "https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}"
135
142
  # Paid endpoint first, free endpoint as fallback
136
143
  INTELX_BASES = ["https://2.intelx.io", "https://free.intelx.io"]
144
+ GHOSTARCHIVE_URL = "https://ghostarchive.org/search?term={DOMAIN}&page="
145
+ GHOSTARCHIVE_DOM_URL = "https://ghostarchive.org"
137
146
 
138
147
  intelx_tls = threading.local()
139
148
 
@@ -247,10 +256,10 @@ DEFAULT_LIMIT = 5000
247
256
  DEFAULT_TIMEOUT = 30
248
257
 
249
258
  # Exclusions used to exclude responses we will try to get from web.archive.org
250
- DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource"
259
+ DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx,.avif"
251
260
 
252
261
  # MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
253
- DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/pdf,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff"
262
+ DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff,video/x-ms-asf,audio/x-ms-wma,audio/wma,application/x-mplayer2,image/avif"
254
263
 
255
264
  # Response code exclusions we will use to filter links and responses from web.archive.org through their API
256
265
  DEFAULT_FILTER_CODE = "404,301,302"
@@ -297,6 +306,298 @@ INLINE_JS_EXCLUDE = [
297
306
  ".json",
298
307
  ]
299
308
 
309
+ # Binary file extensions that should be saved as raw bytes, not text
310
+ BINARY_EXTENSIONS = frozenset(
311
+ [
312
+ ".zip",
313
+ ".gz",
314
+ ".gzip",
315
+ ".tar",
316
+ ".rar",
317
+ ".7z",
318
+ ".bz2",
319
+ ".xz",
320
+ ".pdf",
321
+ ".doc",
322
+ ".docx",
323
+ ".xls",
324
+ ".xlsx",
325
+ ".ppt",
326
+ ".pptx",
327
+ ".exe",
328
+ ".msi",
329
+ ".dll",
330
+ ".bin",
331
+ ".so",
332
+ ".dmg",
333
+ ".deb",
334
+ ".rpm",
335
+ ".png",
336
+ ".jpg",
337
+ ".jpeg",
338
+ ".gif",
339
+ ".bmp",
340
+ ".ico",
341
+ ".webp",
342
+ ".svg",
343
+ ".tiff",
344
+ ".tif",
345
+ ".mp3",
346
+ ".mp4",
347
+ ".wav",
348
+ ".avi",
349
+ ".mov",
350
+ ".mkv",
351
+ ".flv",
352
+ ".wmv",
353
+ ".webm",
354
+ ".ogg",
355
+ ".ttf",
356
+ ".otf",
357
+ ".woff",
358
+ ".woff2",
359
+ ".eot",
360
+ ".class",
361
+ ".jar",
362
+ ".war",
363
+ ".ear",
364
+ ".pyc",
365
+ ".pyo",
366
+ ".o",
367
+ ".a",
368
+ ".lib",
369
+ ".iso",
370
+ ".img",
371
+ ".sqlite",
372
+ ".db",
373
+ ".mdb",
374
+ ".swf",
375
+ ".fla",
376
+ ]
377
+ )
378
+
379
+ # Binary MIME types that should be saved as raw bytes, not text
380
+ BINARY_MIME_TYPES = frozenset(
381
+ [
382
+ "application/zip",
383
+ "application/x-zip-compressed",
384
+ "application/x-gzip",
385
+ "application/gzip",
386
+ "application/x-tar",
387
+ "application/x-rar-compressed",
388
+ "application/x-7z-compressed",
389
+ "application/x-bzip2",
390
+ "application/x-xz",
391
+ "application/pdf",
392
+ "application/msword",
393
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
394
+ "application/vnd.ms-excel",
395
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
396
+ "application/vnd.ms-powerpoint",
397
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
398
+ "application/x-msdownload",
399
+ "application/x-msi",
400
+ "application/x-dosexec",
401
+ "application/octet-stream",
402
+ "image/png",
403
+ "image/jpeg",
404
+ "image/gif",
405
+ "image/bmp",
406
+ "image/x-icon",
407
+ "image/webp",
408
+ "image/tiff",
409
+ "audio/mpeg",
410
+ "audio/wav",
411
+ "audio/ogg",
412
+ "audio/webm",
413
+ "video/mp4",
414
+ "video/avi",
415
+ "video/quicktime",
416
+ "video/x-msvideo",
417
+ "video/x-matroska",
418
+ "video/webm",
419
+ "video/ogg",
420
+ "font/ttf",
421
+ "font/otf",
422
+ "font/woff",
423
+ "font/woff2",
424
+ "application/x-font-ttf",
425
+ "application/x-font-otf",
426
+ "application/font-woff",
427
+ "application/font-woff2",
428
+ "application/java-archive",
429
+ "application/x-java-class",
430
+ "application/x-shockwave-flash",
431
+ "application/x-sqlite3",
432
+ "application/x-iso9660-image",
433
+ ]
434
+ )
435
+
436
+
437
+ def isBinaryContent(contentBytes, contentType, url=""):
438
+ """
439
+ Determine if content should be treated as binary based on actual content, Content-Type, and URL.
440
+
441
+ Priority (highest to lowest):
442
+ 1. Content inspection - check for text signatures (most reliable)
443
+ 2. Content-Type header
444
+ 3. URL extension (least reliable - archive might have captured an HTML error page)
445
+
446
+ Args:
447
+ contentBytes: The raw response bytes (at least first 100 bytes)
448
+ contentType: The Content-Type header value
449
+ url: The URL (optional, used as fallback)
450
+
451
+ Returns True if content is binary and should be saved as raw bytes.
452
+ """
453
+ # STEP 1: Check actual content for text signatures (most reliable)
454
+ # If content starts with text markers, it's definitely NOT binary regardless of extension
455
+ try:
456
+ if contentBytes and len(contentBytes) > 0:
457
+ # Get first 100 bytes and strip leading whitespace/newlines for checking
458
+ preview = contentBytes[:100].lstrip()
459
+ previewLower = preview.lower()
460
+
461
+ # Common text file signatures
462
+ textSignatures = [
463
+ b"<!doctype", # HTML doctype
464
+ b"<html", # HTML tag
465
+ b"<head", # HTML head
466
+ b"<body", # HTML body
467
+ b"<?xml", # XML declaration
468
+ b"<svg", # SVG image (actually XML text)
469
+ b"{", # JSON object
470
+ b"[", # JSON array
471
+ b"/*", # CSS/JS comment
472
+ b"//", # JS comment
473
+ b"#!", # Shebang (shell scripts)
474
+ b"var ", # JavaScript
475
+ b"let ", # JavaScript
476
+ b"const ", # JavaScript
477
+ b"function", # JavaScript
478
+ b"import ", # JavaScript/Python
479
+ b"export ", # JavaScript
480
+ b"class ", # Various languages
481
+ b"def ", # Python
482
+ ]
483
+
484
+ for sig in textSignatures:
485
+ if previewLower.startswith(sig):
486
+ return False # Definitely text, not binary
487
+
488
+ # Check for binary file magic bytes (file signatures)
489
+ binarySignatures = [
490
+ b"%PDF", # PDF
491
+ b"PK\x03\x04", # ZIP, DOCX, XLSX, etc.
492
+ b"PK\x05\x06", # Empty ZIP
493
+ b"\x1f\x8b", # GZIP
494
+ b"\x89PNG", # PNG
495
+ b"\xff\xd8\xff", # JPEG
496
+ b"GIF87a", # GIF
497
+ b"GIF89a", # GIF
498
+ b"BM", # BMP (check at start)
499
+ b"RIFF", # WAV, AVI, WebP
500
+ b"\x00\x00\x00", # Various binary formats (MP4, etc.)
501
+ b"ID3", # MP3 with ID3 tag
502
+ b"\xff\xfb", # MP3
503
+ b"\xff\xfa", # MP3
504
+ b"OggS", # OGG
505
+ b"\x4d\x5a", # EXE/DLL (MZ header)
506
+ b"\x7fELF", # Linux ELF binary
507
+ b"\xca\xfe\xba\xbe", # Java class file
508
+ b"\x30\x26\xb2\x75", # ASF/WMV/WMA (first 4 bytes of ASF GUID)
509
+ b"FLV\x01", # FLV (Flash Video)
510
+ b"ftyp", # MP4/M4A/MOV (after 4 byte size prefix)
511
+ b"Rar!\x1a\x07", # RAR archive
512
+ b"7z\xbc\xaf\x27\x1c", # 7-Zip archive
513
+ b"\x1a\x45\xdf\xa3", # WebM/MKV (EBML)
514
+ b"II\x2a\x00", # TIFF (Intel byte order)
515
+ b"MM\x00\x2a", # TIFF (Motorola byte order)
516
+ b"\x00\x00\x01\x00", # ICO (Windows Icon)
517
+ b"\x00\x00\x02\x00", # CUR (Windows Cursor)
518
+ b"wOFF", # WOFF font
519
+ b"wOF2", # WOFF2 font
520
+ b"FWS", # SWF (uncompressed Flash)
521
+ b"CWS", # SWF (zlib compressed Flash)
522
+ b"ZWS", # SWF (LZMA compressed Flash)
523
+ b"\x00\x01\x00\x00", # TrueType font
524
+ b"OTTO", # OpenType font with CFF
525
+ ]
526
+
527
+ for sig in binarySignatures:
528
+ if preview.startswith(sig):
529
+ return True # Definitely binary
530
+ except Exception:
531
+ pass
532
+
533
+ # STEP 2: Check Content-Type header
534
+ try:
535
+ if contentType:
536
+ mimeType = contentType.lower().split(";")[0].strip()
537
+
538
+ # Explicit text types
539
+ textMimeTypes = [
540
+ "text/html",
541
+ "text/plain",
542
+ "text/css",
543
+ "text/javascript",
544
+ "text/xml",
545
+ "text/csv",
546
+ "text/markdown",
547
+ "application/json",
548
+ "application/javascript",
549
+ "application/xml",
550
+ "application/xhtml+xml",
551
+ "application/rss+xml",
552
+ "application/atom+xml",
553
+ ]
554
+ if mimeType in textMimeTypes or mimeType.startswith("text/"):
555
+ return False # Text type
556
+
557
+ # Known binary types
558
+ if mimeType in BINARY_MIME_TYPES:
559
+ return True
560
+
561
+ # Generic binary prefixes
562
+ if (
563
+ mimeType.startswith("image/")
564
+ or mimeType.startswith("audio/")
565
+ or mimeType.startswith("video/")
566
+ ):
567
+ return True
568
+ if mimeType.startswith("application/") and mimeType not in textMimeTypes:
569
+ # application/* is often binary, but not always - be conservative
570
+ if "octet-stream" in mimeType or "binary" in mimeType:
571
+ return True
572
+ except Exception:
573
+ pass
574
+
575
+ # STEP 3: Check URL extension as last resort
576
+ try:
577
+ if url:
578
+ # Extract actual URL from prefixed formats (Wayback/URLScan)
579
+ actualUrl = url
580
+ httpPos = url.find("http://")
581
+ httpsPos = url.find("https://")
582
+ if httpsPos >= 0:
583
+ actualUrl = url[httpsPos:]
584
+ elif httpPos >= 0:
585
+ actualUrl = url[httpPos:]
586
+
587
+ parsed = urlparse(actualUrl.strip())
588
+ path = parsed.path.lower()
589
+ if "." in path:
590
+ ext = "." + path.rsplit(".", 1)[-1]
591
+ if "?" in ext:
592
+ ext = ext.split("?")[0]
593
+ if ext in BINARY_EXTENSIONS:
594
+ return True
595
+ except Exception:
596
+ pass
597
+
598
+ # Default: treat as text (safer - text processing won't corrupt text)
599
+ return False
600
+
300
601
 
301
602
  # Get memory usage for
302
603
  def getMemory():
@@ -451,7 +752,7 @@ def handler(signal_received, frame):
451
752
  This function is called if Ctrl-C is called by the user
452
753
  An attempt will be made to try and clean up properly
453
754
  """
454
- global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, current_response, current_session
755
+ global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, current_response, current_session
455
756
 
456
757
  if stopProgram is not None:
457
758
  stopProgramCount = stopProgramCount + 1
@@ -486,6 +787,7 @@ def handler(signal_received, frame):
486
787
  stopSourceURLScan = True
487
788
  stopSourceVirusTotal = True
488
789
  stopSourceIntelx = True
790
+ stopSourceGhostArchive = True
489
791
  # Try to close any active response or session to interrupt blocking network I/O
490
792
  try:
491
793
  if current_response is not None:
@@ -955,16 +1257,12 @@ def showOptions():
955
1257
  )
956
1258
  )
957
1259
 
1260
+ # Only show --source-ip if it's explicitly configured
958
1261
  if SOURCE_IP:
959
1262
  write(
960
1263
  colored("--source-ip: " + str(SOURCE_IP), "magenta")
961
1264
  + colored(" Outbound requests will bind to this IP.", "white")
962
1265
  )
963
- else:
964
- write(
965
- colored("--source-ip: default", "magenta")
966
- + colored(" Outbound IP determined by OS routing table.", "white")
967
- )
968
1266
 
969
1267
  write()
970
1268
 
@@ -1465,11 +1763,15 @@ def printProgressBar(
1465
1763
 
1466
1764
  def filehash(text):
1467
1765
  """
1468
- Generate a hash value for the passed string. This is used for the file name of a downloaded archived response
1766
+ Generate a hash value for the passed string or bytes. This is used for the file name of a downloaded archived response
1469
1767
  """
1470
1768
  hash = 0
1471
1769
  for ch in text:
1472
- hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
1770
+ # Handle both str (gives chars needing ord()) and bytes (gives ints directly)
1771
+ if isinstance(ch, int):
1772
+ hash = (hash * 281 ^ ch * 997) & 0xFFFFFFFFFFF
1773
+ else:
1774
+ hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
1473
1775
  return str(hash)
1474
1776
 
1475
1777
 
@@ -1497,6 +1799,63 @@ def fixArchiveOrgUrl(url):
1497
1799
  return url
1498
1800
 
1499
1801
 
1802
+ def isLikelyBinaryUrl(url):
1803
+ """
1804
+ Check if a URL likely points to a binary file based on its extension.
1805
+ This is used BEFORE making a request to decide if we need the raw/id_ version.
1806
+ """
1807
+ try:
1808
+ # Extract actual URL from prefixed formats (Wayback timestamp/URLScan UUID)
1809
+ actualUrl = url
1810
+ httpPos = url.find("http://")
1811
+ httpsPos = url.find("https://")
1812
+ if httpsPos >= 0:
1813
+ actualUrl = url[httpsPos:]
1814
+ elif httpPos >= 0:
1815
+ actualUrl = url[httpPos:]
1816
+
1817
+ parsed = urlparse(actualUrl.strip())
1818
+ path = parsed.path.lower()
1819
+ if "." in path:
1820
+ ext = "." + path.rsplit(".", 1)[-1]
1821
+ if "?" in ext:
1822
+ ext = ext.split("?")[0]
1823
+ if ext in BINARY_EXTENSIONS:
1824
+ return True
1825
+ except Exception:
1826
+ pass
1827
+ return False
1828
+
1829
+
1830
+ def addRawModifier(archiveUrl):
1831
+ """
1832
+ Add 'id_' modifier to Wayback Machine URL to get raw/original content.
1833
+ This is essential for binary files to avoid Wayback modifications.
1834
+
1835
+ Example:
1836
+ Input: https://web.archive.org/web/20090315210455/http://example.com/file.wmv
1837
+ Output: https://web.archive.org/web/20090315210455id_/http://example.com/file.wmv
1838
+ """
1839
+ try:
1840
+ # Find the timestamp in the URL (14 digits after /web/)
1841
+ webPos = archiveUrl.find("/web/")
1842
+ if webPos >= 0:
1843
+ # Find where the timestamp ends (first / after /web/)
1844
+ afterWeb = webPos + 5 # Position after "/web/"
1845
+ slashAfterTimestamp = archiveUrl.find("/", afterWeb)
1846
+ if slashAfterTimestamp > afterWeb:
1847
+ # Insert id_ before the slash after timestamp
1848
+ timestamp = archiveUrl[afterWeb:slashAfterTimestamp]
1849
+ # Only add id_ if it's not already there
1850
+ if not timestamp.endswith("id_"):
1851
+ return (
1852
+ archiveUrl[:slashAfterTimestamp] + "id_" + archiveUrl[slashAfterTimestamp:]
1853
+ )
1854
+ except Exception:
1855
+ pass
1856
+ return archiveUrl
1857
+
1858
+
1500
1859
  # Add a link to the linksFound collection for archived responses (included timestamp preifx)
1501
1860
  def linksFoundResponseAdd(link):
1502
1861
  global linksFound, argsInput, argsInputHostname, links_lock
@@ -1581,6 +1940,12 @@ def processArchiveUrl(url):
1581
1940
  if stopProgram is None:
1582
1941
 
1583
1942
  archiveUrl = "https://web.archive.org/web/" + fixArchiveOrgUrl(url)
1943
+
1944
+ # For binary files, add id_ modifier to get raw/original content
1945
+ # This prevents Wayback Machine from modifying the content
1946
+ if isLikelyBinaryUrl(url):
1947
+ archiveUrl = addRawModifier(archiveUrl)
1948
+
1584
1949
  hashValue = ""
1585
1950
 
1586
1951
  # Get memory usage every 100 responses
@@ -1593,6 +1958,18 @@ def processArchiveUrl(url):
1593
1958
  # Make a request to the web archive
1594
1959
  try:
1595
1960
  try:
1961
+ try:
1962
+ if verbose() and os.environ.get("USER") == "xnl":
1963
+ writerr(
1964
+ colored(
1965
+ "[ DBG ] Requesting file " + archiveUrl,
1966
+ "yellow",
1967
+ attrs=["dark"],
1968
+ )
1969
+ )
1970
+ except Exception:
1971
+ pass
1972
+
1596
1973
  # Choose a random user agent string to use for any requests
1597
1974
  userAgent = random.choice(USER_AGENT)
1598
1975
 
@@ -1604,146 +1981,175 @@ def processArchiveUrl(url):
1604
1981
  headers={"User-Agent": userAgent},
1605
1982
  allow_redirects=True,
1606
1983
  )
1607
- archiveHtml = str(resp.text)
1984
+
1985
+ # Get raw content bytes first
1986
+ contentBytes = resp.content
1987
+
1608
1988
  try:
1609
- contentType = resp.headers.get("Content-Type").split(";")[0].lower()
1989
+ contentType = resp.headers.get("Content-Type", "").split(";")[0].lower()
1610
1990
  except Exception:
1611
1991
  contentType = ""
1612
1992
 
1993
+ # Determine if this is binary content based on actual content, Content-Type, and URL
1994
+ isBinary = isBinaryContent(contentBytes, contentType, url)
1995
+
1996
+ if isBinary:
1997
+ # For binary files, use raw bytes as-is
1998
+ archiveContent = contentBytes
1999
+ archiveHtml = None # Not used for binary files
2000
+ else:
2001
+ # For text files, decode to string
2002
+ archiveHtml = contentBytes.decode("utf-8", errors="replace")
2003
+ archiveContent = None # Not used for text files
2004
+
1613
2005
  # Only create a file if there is a response
1614
- if len(archiveHtml) != 0:
2006
+ responseLength = len(archiveContent) if isBinary else len(archiveHtml)
2007
+ if responseLength != 0:
1615
2008
 
2009
+ # For text files, check for custom 404 pages
1616
2010
  # If the FILTER_CODE doesn't include 404, OR
1617
2011
  # If the FILTER_CODE includes 404, and it doesn't seem to be a custom 404 page
1618
- if "404" not in FILTER_CODE or (
1619
- "404" in FILTER_CODE
1620
- and not re.findall(REGEX_404, archiveHtml, re.DOTALL | re.IGNORECASE)
1621
- ):
1622
-
1623
- # Add the URL as a comment at the start of the response
1624
- if args.url_filename:
1625
- archiveHtml = (
1626
- "/* Original URL: " + archiveUrl + " */\n" + archiveHtml
2012
+ if (
2013
+ isBinary
2014
+ or "404" not in FILTER_CODE
2015
+ or (
2016
+ "404" in FILTER_CODE
2017
+ and not re.findall(
2018
+ REGEX_404, archiveHtml, re.DOTALL | re.IGNORECASE
1627
2019
  )
1628
-
1629
- # Remove all web archive references in the response
1630
- archiveHtml = re.sub(
1631
- r'\<script type=\"text\/javascript" src=\"\/_static\/js\/bundle-playback\.js\?v=[A-Za-z0-9]*" charset="utf-8"><\/script>\n<script type="text\/javascript" src="\/_static\/js\/wombat\.js.*\<\!-- End Wayback Rewrite JS Include --\>',
1632
- "",
1633
- archiveHtml,
1634
- 1,
1635
- flags=re.DOTALL | re.IGNORECASE,
1636
- )
1637
- archiveHtml = re.sub(
1638
- r"\<script src=\"\/\/archive\.org.*\<\!-- End Wayback Rewrite JS Include --\>",
1639
- "",
1640
- archiveHtml,
1641
- 1,
1642
- flags=re.DOTALL | re.IGNORECASE,
1643
- )
1644
- archiveHtml = re.sub(
1645
- r"\<script\>window\.RufflePlayer[^\<]*\<\/script\>",
1646
- "",
1647
- archiveHtml,
1648
- 1,
1649
- flags=re.DOTALL | re.IGNORECASE,
1650
- )
1651
- archiveHtml = re.sub(
1652
- r"\<\!-- BEGIN WAYBACK TOOLBAR INSERT --\>.*\<\!-- END WAYBACK TOOLBAR INSERT --\>",
1653
- "",
1654
- archiveHtml,
1655
- 1,
1656
- flags=re.DOTALL | re.IGNORECASE,
1657
- )
1658
- archiveHtml = re.sub(
1659
- r"(}\n)?(\/\*|<!--\n)\s*FILE ARCHIVED ON.*108\(a\)\(3\)\)\.\n(\*\/|-->)",
1660
- "",
1661
- archiveHtml,
1662
- 1,
1663
- flags=re.DOTALL | re.IGNORECASE,
1664
- )
1665
- archiveHtml = re.sub(
1666
- r"var\s_____WB\$wombat\$assign\$function.*WB\$wombat\$assign\$function_____\(\"opener\"\);",
1667
- "",
1668
- archiveHtml,
1669
- 1,
1670
- flags=re.DOTALL | re.IGNORECASE,
1671
- )
1672
- archiveHtml = re.sub(
1673
- r"(\<\!--|\/\*)\nplayback timings.*(--\>|\*\/)",
1674
- "",
1675
- archiveHtml,
1676
- 1,
1677
- flags=re.DOTALL | re.IGNORECASE,
1678
- )
1679
- archiveHtml = re.sub(
1680
- r"((https:)?\/\/web\.archive\.org)?\/web\/[0-9]{14}([A-Za-z]{2}\_)?\/",
1681
- "",
1682
- archiveHtml,
1683
- flags=re.IGNORECASE,
1684
- )
1685
- archiveHtml = re.sub(
1686
- r"((https:)?\\\/\\\/web\.archive\.org)?\\\/web\\\/[0-9]{14}([A-Za-z]{2}\_)?\\\/",
1687
- "",
1688
- archiveHtml,
1689
- flags=re.IGNORECASE,
1690
- )
1691
- archiveHtml = re.sub(
1692
- r"((https:)?%2F%2Fweb\.archive\.org)?%2Fweb%2F[0-9]{14}([A-Za-z]{2}\_)?%2F",
1693
- "",
1694
- archiveHtml,
1695
- flags=re.IGNORECASE,
1696
- )
1697
- archiveHtml = re.sub(
1698
- r"((https:)?\\u002F\\u002Fweb\.archive\.org)?\\u002Fweb\\u002F[0-9]{14}([A-Za-z]{2}\_)?\\u002F",
1699
- "",
1700
- archiveHtml,
1701
- flags=re.IGNORECASE,
1702
- )
1703
- archiveHtml = re.sub(
1704
- r"\<script type=\"text\/javascript\">\s*__wm\.init\(\"https:\/\/web\.archive\.org\/web\"\);[^\<]*\<\/script\>",
1705
- "",
1706
- archiveHtml,
1707
- flags=re.IGNORECASE,
1708
- )
1709
- archiveHtml = re.sub(
1710
- r'\<script type=\"text\/javascript\" src="https:\/\/web-static\.archive\.org[^\<]*\<\/script\>',
1711
- "",
1712
- archiveHtml,
1713
- flags=re.IGNORECASE,
1714
- )
1715
- archiveHtml = re.sub(
1716
- r"\<link rel=\"stylesheet\" type=\"text\/css\" href=\"https:\/\/web-static\.archive\.org[^\<]*\/\>",
1717
- "",
1718
- archiveHtml,
1719
- flags=re.IGNORECASE,
1720
- )
1721
- archiveHtml = re.sub(
1722
- r"\<\!-- End Wayback Rewrite JS Include --\>",
1723
- "",
1724
- archiveHtml,
1725
- re.IGNORECASE,
1726
2020
  )
2021
+ ):
1727
2022
 
1728
- # If there is a specific Wayback error in the response, raise an exception
1729
- if (
1730
- archiveHtml.lower().find(
1731
- "wayback machine has not archived that url"
2023
+ # For text files only: Add URL comment and clean up wayback references
2024
+ if not isBinary:
2025
+ # Add the URL as a comment at the start of the response
2026
+ if args.url_filename:
2027
+ archiveHtml = (
2028
+ "/* Original URL: " + archiveUrl + " */\n" + archiveHtml
2029
+ )
2030
+
2031
+ # Remove all web archive references in the response
2032
+ archiveHtml = re.sub(
2033
+ r'\<script type=\"text\/javascript" src=\"\/_static\/js\/bundle-playback\.js\?v=[A-Za-z0-9]*" charset="utf-8"\><\/script>\n<script type="text\/javascript" src="\/_static\/js\/wombat\.js.*\<\!-- End Wayback Rewrite JS Include --\>',
2034
+ "",
2035
+ archiveHtml,
2036
+ 1,
2037
+ flags=re.DOTALL | re.IGNORECASE,
1732
2038
  )
1733
- > 0
1734
- or archiveHtml.lower().find(
1735
- "snapshot cannot be displayed due to an internal error"
2039
+ archiveHtml = re.sub(
2040
+ r"\<script src=\"\/\/archive\.org.*\<\!-- End Wayback Rewrite JS Include --\>",
2041
+ "",
2042
+ archiveHtml,
2043
+ 1,
2044
+ flags=re.DOTALL | re.IGNORECASE,
1736
2045
  )
1737
- > 0
1738
- ):
1739
- raise WayBackException
2046
+ archiveHtml = re.sub(
2047
+ r"\<script\>window\.RufflePlayer[^\<]*\<\/script\>",
2048
+ "",
2049
+ archiveHtml,
2050
+ 1,
2051
+ flags=re.DOTALL | re.IGNORECASE,
2052
+ )
2053
+ archiveHtml = re.sub(
2054
+ r"\<\!-- BEGIN WAYBACK TOOLBAR INSERT --\>.*\<\!-- END WAYBACK TOOLBAR INSERT --\>",
2055
+ "",
2056
+ archiveHtml,
2057
+ 1,
2058
+ flags=re.DOTALL | re.IGNORECASE,
2059
+ )
2060
+ archiveHtml = re.sub(
2061
+ r"(}\n)?(\/\*|<\!--\n)\s*FILE ARCHIVED ON.*108\(a\)\(3\)\)\.\n(\*\/|--\>)",
2062
+ "",
2063
+ archiveHtml,
2064
+ 1,
2065
+ flags=re.DOTALL | re.IGNORECASE,
2066
+ )
2067
+ archiveHtml = re.sub(
2068
+ r"var\s_____WB\$wombat\$assign\$function.*WB\$wombat\$assign\$function_____\(\"opener\"\);",
2069
+ "",
2070
+ archiveHtml,
2071
+ 1,
2072
+ flags=re.DOTALL | re.IGNORECASE,
2073
+ )
2074
+ archiveHtml = re.sub(
2075
+ r"(\<\!--|\/\*)\nplayback timings.*(--\>|\*\/)",
2076
+ "",
2077
+ archiveHtml,
2078
+ 1,
2079
+ flags=re.DOTALL | re.IGNORECASE,
2080
+ )
2081
+ archiveHtml = re.sub(
2082
+ r"((https:)?\/\/web\.archive\.org)?\/web\/[0-9]{14}([A-Za-z]{2}\_)?\/",
2083
+ "",
2084
+ archiveHtml,
2085
+ flags=re.IGNORECASE,
2086
+ )
2087
+ archiveHtml = re.sub(
2088
+ r"((https:)?\\\/\\\/web\.archive\.org)?\\\/web\\\/[0-9]{14}([A-Za-z]{2}\_)?\\\/",
2089
+ "",
2090
+ archiveHtml,
2091
+ flags=re.IGNORECASE,
2092
+ )
2093
+ archiveHtml = re.sub(
2094
+ r"((https:)?%2F%2Fweb\.archive\.org)?%2Fweb%2F[0-9]{14}([A-Za-z]{2}\_)?%2F",
2095
+ "",
2096
+ archiveHtml,
2097
+ flags=re.IGNORECASE,
2098
+ )
2099
+ archiveHtml = re.sub(
2100
+ r"((https:)?\\u002F\\u002Fweb\.archive\.org)?\\u002Fweb\\u002F[0-9]{14}([A-Za-z]{2}\_)?\\u002F",
2101
+ "",
2102
+ archiveHtml,
2103
+ flags=re.IGNORECASE,
2104
+ )
2105
+ archiveHtml = re.sub(
2106
+ r"\<script type=\"text\/javascript\"\>\s*__wm\.init\(\"https:\/\/web\.archive\.org\/web\"\);[^\<]*\<\/script\>",
2107
+ "",
2108
+ archiveHtml,
2109
+ flags=re.IGNORECASE,
2110
+ )
2111
+ archiveHtml = re.sub(
2112
+ r'\<script type=\"text\/javascript\" src="https:\/\/web-static\.archive\.org[^\<]*\<\/script\>',
2113
+ "",
2114
+ archiveHtml,
2115
+ flags=re.IGNORECASE,
2116
+ )
2117
+ archiveHtml = re.sub(
2118
+ r"\<link rel=\"stylesheet\" type=\"text\/css\" href=\"https:\/\/web-static\.archive\.org[^\<]*\/\>",
2119
+ "",
2120
+ archiveHtml,
2121
+ flags=re.IGNORECASE,
2122
+ )
2123
+ archiveHtml = re.sub(
2124
+ r"\<\!-- End Wayback Rewrite JS Include --\>",
2125
+ "",
2126
+ archiveHtml,
2127
+ re.IGNORECASE,
2128
+ )
2129
+
2130
+ # If there is a specific Wayback error in the response, raise an exception
2131
+ if (
2132
+ archiveHtml.lower().find(
2133
+ "wayback machine has not archived that url"
2134
+ )
2135
+ > 0
2136
+ or archiveHtml.lower().find(
2137
+ "snapshot cannot be displayed due to an internal error"
2138
+ )
2139
+ > 0
2140
+ ):
2141
+ raise WayBackException
1740
2142
 
1741
2143
  # Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
1742
2144
  if args.url_filename:
1743
2145
  fileName = url.replace("/", "-").replace(":", "")
1744
2146
  fileName = fileName[0:254]
1745
2147
  else:
1746
- hashValue = filehash(archiveHtml)
2148
+ # For binary files, hash the raw bytes; for text, hash the text
2149
+ if isBinary:
2150
+ hashValue = filehash(archiveContent.hex())
2151
+ else:
2152
+ hashValue = filehash(archiveHtml)
1747
2153
  fileName = hashValue
1748
2154
 
1749
2155
  # Determine extension of file from the content-type using the mimetypes library
@@ -1785,11 +2191,15 @@ def processArchiveUrl(url):
1785
2191
  extension = "css"
1786
2192
  elif "pdf" in extension:
1787
2193
  extension = "pdf"
2194
+ elif "zip" in extension:
2195
+ extension = "zip"
2196
+ elif "gzip" in extension or "x-gzip" in extension:
2197
+ extension = "gz"
1788
2198
  elif "plain" == extension:
1789
2199
  extension = "txt"
1790
2200
 
1791
2201
  # If extension is still blank, set to html if the content ends with HTML tag, otherwise set to unknown
1792
- if extension == "":
2202
+ if extension == "" and not isBinary:
1793
2203
  if (
1794
2204
  archiveHtml.lower().strip().endswith("</html>")
1795
2205
  or archiveHtml.lower()
@@ -1800,6 +2210,8 @@ def processArchiveUrl(url):
1800
2210
  extension = "html"
1801
2211
  else:
1802
2212
  extension = "unknown"
2213
+ elif extension == "" and isBinary:
2214
+ extension = "bin"
1803
2215
 
1804
2216
  fileName = fileName + "." + extension
1805
2217
 
@@ -1816,10 +2228,14 @@ def processArchiveUrl(url):
1816
2228
  + f"{fileName}"
1817
2229
  )
1818
2230
 
1819
- # Write the file
2231
+ # Write the file - binary mode for binary files, text mode for text files
1820
2232
  try:
1821
- responseFile = open(filePath, "w", encoding="utf8")
1822
- responseFile.write(archiveHtml)
2233
+ if isBinary:
2234
+ responseFile = open(filePath, "wb")
2235
+ responseFile.write(archiveContent)
2236
+ else:
2237
+ responseFile = open(filePath, "w", encoding="utf8")
2238
+ responseFile.write(archiveHtml)
1823
2239
  responseFile.close()
1824
2240
  fileCount = fileCount + 1
1825
2241
  except Exception as e:
@@ -1852,9 +2268,10 @@ def processArchiveUrl(url):
1852
2268
  )
1853
2269
  )
1854
2270
 
1855
- # FOR DEBUGGING PURPOSES
2271
+ # FOR DEBUGGING PURPOSES (only for text files)
1856
2272
  try:
1857
- if os.environ.get("USER") == "xnl":
2273
+ if os.environ.get("USER") == "xnl" and not isBinary:
2274
+
1858
2275
  debugText = ""
1859
2276
  if archiveHtml.lower().find("archive.org") > 0:
1860
2277
  debugText = "ARCHIVE.ORG"
@@ -1862,20 +2279,32 @@ def processArchiveUrl(url):
1862
2279
  debugText = "INTERNET ARCHIVE"
1863
2280
  elif archiveHtml.lower().find("wombat") > 0:
1864
2281
  debugText = "WOMBAT (JS)"
1865
- if debugText != "":
2282
+ if verbose() and debugText != "":
1866
2283
  writerr(
1867
2284
  colored(
1868
2285
  getSPACER(
1869
- '"'
2286
+ '[ DBG ] "'
1870
2287
  + fileName
1871
2288
  + '" CONTAINS '
1872
2289
  + debugText
1873
2290
  + " - CHECK ITS A VALID REFERENCE"
1874
2291
  ),
1875
2292
  "yellow",
2293
+ attrs=["dark"],
1876
2294
  )
1877
2295
  )
1878
- except Exception:
2296
+ except Exception as e:
2297
+ if verbose():
2298
+ writerr(
2299
+ colored(
2300
+ '[ DBG ] Error - Failed to output debug info for "'
2301
+ + archiveUrl
2302
+ + '": '
2303
+ + str(e),
2304
+ "red",
2305
+ attrs=["dark"],
2306
+ )
2307
+ )
1879
2308
  pass
1880
2309
 
1881
2310
  successCount = successCount + 1
@@ -2346,17 +2775,20 @@ def validateArgProviders(x):
2346
2775
  - urlscan
2347
2776
  - virustotal
2348
2777
  - intelx
2778
+ - ghostarchive
2349
2779
  """
2350
2780
  invalid = False
2351
2781
  x = x.lower()
2352
2782
  providers = x.split(",")
2353
2783
  for provider in providers:
2354
- if not re.fullmatch(r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx)", provider):
2784
+ if not re.fullmatch(
2785
+ r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx|ghostarchive)", provider
2786
+ ):
2355
2787
  invalid = True
2356
2788
  break
2357
2789
  if invalid:
2358
2790
  raise argparse.ArgumentTypeError(
2359
- "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx"
2791
+ "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive"
2360
2792
  )
2361
2793
  return x
2362
2794
 
@@ -2897,17 +3329,38 @@ def getURLScanDOM(originalUrl, domUrl):
2897
3329
  resp = session.get(
2898
3330
  domUrl, headers={"User-Agent": userAgent}, allow_redirects=True
2899
3331
  )
2900
- archiveHtml = str(resp.text)
3332
+
3333
+ # Get raw content bytes first
3334
+ contentBytes = resp.content
3335
+
3336
+ # Get content type from response headers
3337
+ try:
3338
+ contentType = resp.headers.get("Content-Type", "").split(";")[0].lower()
3339
+ except Exception:
3340
+ contentType = ""
3341
+
3342
+ # Determine if this is binary content based on actual content, Content-Type, and URL
3343
+ isBinary = isBinaryContent(contentBytes, contentType, originalUrl)
3344
+
3345
+ if isBinary:
3346
+ # For binary files, use raw bytes as-is
3347
+ archiveContent = contentBytes
3348
+ archiveHtml = None
3349
+ else:
3350
+ # For text files, decode to string
3351
+ archiveHtml = contentBytes.decode("utf-8", errors="replace")
3352
+ archiveContent = None
2901
3353
 
2902
3354
  # If there is a specific URLScan error in the response, raise an exception
2903
- if archiveHtml.lower().strip() == "not found!":
3355
+ if not isBinary and archiveHtml.lower().strip() == "not found!":
2904
3356
  raise WayBackException
2905
3357
 
2906
3358
  # Only create a file if there is a response
2907
- if len(archiveHtml) != 0:
3359
+ responseLength = len(archiveContent) if isBinary else len(archiveHtml)
3360
+ if responseLength != 0:
2908
3361
 
2909
- # Add the URL as a comment at the start of the response
2910
- if args.url_filename:
3362
+ # Add the URL as a comment at the start of the response (text files only)
3363
+ if not isBinary and args.url_filename:
2911
3364
  archiveHtml = "/* Original URL: " + originalUrl + " */\n" + archiveHtml
2912
3365
 
2913
3366
  # Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
@@ -2915,7 +3368,11 @@ def getURLScanDOM(originalUrl, domUrl):
2915
3368
  fileName = originalUrl.replace("/", "-").replace(":", "")
2916
3369
  fileName = fileName[0:254]
2917
3370
  else:
2918
- hashValue = filehash(archiveHtml)
3371
+ # For binary files, hash the raw bytes; for text, hash the text
3372
+ if isBinary:
3373
+ hashValue = filehash(archiveContent.hex())
3374
+ else:
3375
+ hashValue = filehash(archiveHtml)
2919
3376
  fileName = hashValue
2920
3377
 
2921
3378
  # Determine extension of file from the content-type using the mimetypes library
@@ -2933,7 +3390,7 @@ def getURLScanDOM(originalUrl, domUrl):
2933
3390
  pass
2934
3391
 
2935
3392
  # If the extension is blank, numeric, longer than 4 characters or not alphanumeric - then set to html if the content ends with HTML tag, otherwise set to unknown
2936
- if extension == "":
3393
+ if extension == "" and not isBinary:
2937
3394
  if (
2938
3395
  archiveHtml.lower().strip().endswith("</html>")
2939
3396
  or archiveHtml.lower().strip().endswith("</body>")
@@ -2944,6 +3401,8 @@ def getURLScanDOM(originalUrl, domUrl):
2944
3401
  extension = "html"
2945
3402
  else:
2946
3403
  extension = "unknown"
3404
+ elif extension == "" and isBinary:
3405
+ extension = "bin"
2947
3406
 
2948
3407
  fileName = fileName + "." + extension
2949
3408
 
@@ -2960,10 +3419,14 @@ def getURLScanDOM(originalUrl, domUrl):
2960
3419
  + f"{fileName}"
2961
3420
  )
2962
3421
 
2963
- # Write the file
3422
+ # Write the file - binary mode for binary files, text mode for text files
2964
3423
  try:
2965
- responseFile = open(filePath, "w", encoding="utf8")
2966
- responseFile.write(archiveHtml)
3424
+ if isBinary:
3425
+ responseFile = open(filePath, "wb")
3426
+ responseFile.write(archiveContent)
3427
+ else:
3428
+ responseFile = open(filePath, "w", encoding="utf8")
3429
+ responseFile.write(archiveHtml)
2967
3430
  responseFile.close()
2968
3431
  fileCount = fileCount + 1
2969
3432
  except Exception as e:
@@ -3083,98 +3546,614 @@ def getURLScanDOM(originalUrl, domUrl):
3083
3546
  writerr(colored("ERROR getURLScanDOM 1: " + str(e), "red"))
3084
3547
 
3085
3548
 
3086
- def format_date_for_urlscan(date_str):
3087
- # Handle different lengths of input
3088
- if len(date_str) == 4: # YYYY
3089
- date_str += "0101"
3090
- elif len(date_str) == 6: # YYYYMM
3091
- date_str += "01"
3092
-
3093
- # Convert to YYYY-MM-DD format
3094
- try:
3095
- formatted_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
3096
- return formatted_date
3097
- except Exception:
3098
- return ""
3099
-
3100
-
3101
- def getURLScanUrls():
3549
+ def getGhostArchiveWARC(originalUrl, domUrl):
3102
3550
  """
3103
- Get URLs from the URLSCan API, urlscan.io
3551
+ Get the DOM for the passed GhostArchive link - parses WARC files containing multiple request/response pairs
3104
3552
  """
3105
- global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
3106
-
3107
- # Write the file of URL's for the passed domain/URL
3553
+ global stopProgram, successCount, failureCount, fileCount, DEFAULT_OUTPUT_DIR, totalResponses, indexFile, argsInput, argsInputHostname, REGEX_404, linksFound, extraWarcLinks, links_lock
3108
3554
  try:
3109
- requestsMade = 0
3110
- stopSourceURLScan = False
3111
- linksFoundURLScan = set()
3112
- totalUrls = 0
3113
- checkResponse = True
3114
-
3115
- # Set the URL to just the hostname
3116
- url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
3555
+ if stopProgram is None:
3117
3556
 
3118
- # If the --from-date or --to-date parameters were paassed then also add a date filter
3119
- if args.from_date or args.to_date:
3120
- if args.from_date:
3121
- fromDate = format_date_for_urlscan(str(args.from_date)[:8])
3122
- else:
3123
- fromDate = "2016-01-01" # The year URLScan started
3124
- if args.to_date:
3125
- toDate = format_date_for_urlscan(str(args.to_date)[:8])
3126
- else:
3127
- toDate = "now"
3128
- url = url.replace("{DATERANGE}", f"%20date:[{fromDate}%20TO%20{toDate}]")
3129
- else:
3130
- url = url.replace("{DATERANGE}", "")
3557
+ # The WARC files are found by replacing /archive with /chimurai4 and using the .warc file extension
3558
+ warcUrl = domUrl.replace("/archive", "/chimurai4") + ".warc"
3131
3559
 
3132
- if verbose():
3133
- if args.mode == "R":
3134
- write(
3135
- colored(
3136
- "URLScan - [ INFO ] The URLScan URL requested to get links for responses: ",
3137
- "magenta",
3138
- )
3139
- + colored(url + "\n", "white")
3140
- )
3141
- else:
3142
- write(
3143
- colored(
3144
- "URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
3145
- )
3146
- + colored(url + "\n", "white")
3147
- )
3560
+ # Get memory usage every 100 responses
3561
+ if (successCount + failureCount) % 100 == 0:
3562
+ try:
3563
+ getMemory()
3564
+ except Exception:
3565
+ pass
3148
3566
 
3149
- if args.mode in ("U", "B") and not args.check_only:
3150
- write(
3151
- colored(
3152
- "URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
3153
- "cyan",
3154
- )
3155
- )
3567
+ # Fetch content
3568
+ try:
3569
+ # Show progress bar
3570
+ fillTest = (successCount + failureCount) % 2
3571
+ fillChar = "o"
3572
+ if fillTest == 0:
3573
+ fillChar = "O"
3574
+ suffix = "Complete "
3156
3575
 
3157
- # Get the first page from urlscan.io
3158
- try:
3159
- # Choose a random user agent string to use for any requests
3160
- # For other sources we would use `random.choice(USER_AGENT)` to asignn a random user-agent, but it seems
3161
- # that there are a handful of those that ALWAYS return 429. Passing a specific one all the time seems to
3162
- # be successful all the time
3163
- userAgent = "waymore v" + __version__ + " by xnl-h4ck3r"
3164
- session = requests.Session()
3165
- session.mount("https://", HTTP_ADAPTER)
3166
- session.mount("http://", HTTP_ADAPTER)
3167
- # Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
3168
- resp = session.get(url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY})
3169
- requestsMade = requestsMade + 1
3170
- except Exception as e:
3171
- write(
3172
- colored(
3173
- "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
3174
- "red",
3576
+ printProgressBar(
3577
+ successCount + failureCount,
3578
+ totalResponses,
3579
+ prefix="Processing " + str(totalResponses) + " WARC files:",
3580
+ suffix=suffix,
3581
+ length=getProgressBarLength(),
3582
+ fill=fillChar,
3175
3583
  )
3176
- )
3177
- return
3584
+
3585
+ try:
3586
+ try:
3587
+ if verbose() and os.environ.get("USER") == "xnl":
3588
+ writerr(
3589
+ colored(
3590
+ "[ DBG ] Requesting file " + warcUrl,
3591
+ "yellow",
3592
+ attrs=["dark"],
3593
+ )
3594
+ )
3595
+ except Exception:
3596
+ pass
3597
+
3598
+ # Choose a random user agent string to use for any requests
3599
+ userAgent = random.choice(USER_AGENT)
3600
+ session = requests.Session()
3601
+ session.mount("https://", HTTP_ADAPTER)
3602
+ session.mount("http://", HTTP_ADAPTER)
3603
+
3604
+ # Retry loop for 503 or maintenance responses
3605
+ maxRetries = 3
3606
+ warcBytes = b""
3607
+ for attempt in range(maxRetries):
3608
+ resp = session.get(
3609
+ warcUrl,
3610
+ headers={"User-Agent": userAgent},
3611
+ allow_redirects=True,
3612
+ timeout=args.timeout,
3613
+ )
3614
+ warcBytes = resp.content
3615
+
3616
+ # Check if we need to retry (decode just for this check)
3617
+ try:
3618
+ warcTextCheck = warcBytes.decode("utf-8", errors="replace").lower()
3619
+ except Exception:
3620
+ warcTextCheck = ""
3621
+ if resp.status_code == 503 or "website under maintenance" in warcTextCheck:
3622
+ if attempt < maxRetries - 1:
3623
+ import time
3624
+
3625
+ time.sleep(0.5)
3626
+ continue
3627
+ break
3628
+
3629
+ # Parse the WARC file to extract multiple responses
3630
+ # WARC header lines are text, but response bodies may be binary
3631
+ # Split by line separator but keep bytes for body extraction
3632
+ lineBytes = warcBytes.split(b"\n")
3633
+ lines = [lb.decode("utf-8", errors="replace") for lb in lineBytes]
3634
+
3635
+ # State machine to track parsing
3636
+ currentTargetUri = ""
3637
+ inResponse = False
3638
+ contentType = ""
3639
+ responsesFound = (
3640
+ []
3641
+ ) # List of (targetUri, contentType, responseBytes, httpStatusCode)
3642
+
3643
+ i = 0
3644
+ skipCurrentResponse = False # Initialize before loop
3645
+ pendingResponseType = (
3646
+ False # Track if we saw WARC-Type: response and are waiting for Target-URI
3647
+ )
3648
+ responseStartIdx = -1 # Initialize before loop
3649
+ httpStatusCode = "" # Initialize before loop
3650
+ while i < len(lines) and stopProgram is None and not stopSourceGhostArchive:
3651
+ line = lines[i]
3652
+
3653
+ # When we see a new WARC record start, reset pending state
3654
+ if line.startswith("WARC/1.0"):
3655
+ # If we were in a response and collecting, save it before moving to new record
3656
+ if inResponse and responseStartIdx >= 0:
3657
+ responseBodyBytes = b"\n".join(lineBytes[responseStartIdx:i])
3658
+ responsesFound.append(
3659
+ (
3660
+ currentTargetUri,
3661
+ contentType,
3662
+ responseBodyBytes,
3663
+ httpStatusCode if "httpStatusCode" in dir() else "",
3664
+ )
3665
+ )
3666
+ inResponse = False
3667
+ responseStartIdx = -1
3668
+ contentType = ""
3669
+ httpStatusCode = ""
3670
+ pendingResponseType = False
3671
+ skipCurrentResponse = False
3672
+
3673
+ # Look for WARC-Type: response - mark that we're in a response record header
3674
+ elif line.startswith("WARC-Type: response"):
3675
+ pendingResponseType = True
3676
+ inResponse = False # Don't start capturing body yet
3677
+ responseStartIdx = -1
3678
+ contentType = ""
3679
+
3680
+ # Look for WARC-Target-URI to get the request URL
3681
+ elif line.startswith("WARC-Target-URI:"):
3682
+ currentTargetUri = line.split(":", 1)[1].strip()
3683
+ skipCurrentResponse = False
3684
+
3685
+ # Check: URL host must contain the input hostname
3686
+ if argsInputHostname:
3687
+ try:
3688
+ parsed = urlparse(currentTargetUri)
3689
+ host = parsed.netloc.lower()
3690
+ if argsInputHostname.lower() not in host:
3691
+ skipCurrentResponse = True
3692
+ except Exception:
3693
+ skipCurrentResponse = True
3694
+
3695
+ # Check: Filter by URL (FILTER_URL)
3696
+ if not skipCurrentResponse and FILTER_URL and currentTargetUri:
3697
+ filterUrls = [u.strip().lower() for u in FILTER_URL.split(",")]
3698
+ for filterUrl in filterUrls:
3699
+ if filterUrl in currentTargetUri.lower():
3700
+ skipCurrentResponse = True
3701
+ break
3702
+
3703
+ # If we were waiting for Target-URI after seeing WARC-Type: response, and it's valid, start response mode
3704
+ if pendingResponseType and not skipCurrentResponse:
3705
+ inResponse = True
3706
+ pendingResponseType = False
3707
+
3708
+ # If we're in a response section (after seeing both WARC-Type: response and valid WARC-Target-URI)
3709
+ elif inResponse:
3710
+ # Check for HTTP start and capture status code
3711
+ if line.startswith("HTTP"):
3712
+ # Extract status code (e.g., "HTTP/1.1 200 OK" -> "200")
3713
+ try:
3714
+ httpStatusCode = line.split()[1]
3715
+ except Exception:
3716
+ httpStatusCode = ""
3717
+
3718
+ # Early check: Filter by HTTP status code (FILTER_CODE)
3719
+ if FILTER_CODE and httpStatusCode:
3720
+ filterCodes = [c.strip() for c in FILTER_CODE.split(",")]
3721
+ if httpStatusCode in filterCodes:
3722
+ inResponse = False
3723
+ responseStartIdx = -1
3724
+ i += 1
3725
+ continue
3726
+
3727
+ responseStartIdx = i # Mark start of response
3728
+ elif responseStartIdx >= 0:
3729
+ # Capture Content-Type if present (case-insensitive check)
3730
+ if line.lower().startswith("content-type:"):
3731
+ try:
3732
+ contentType = (
3733
+ line.split(":", 1)[1].strip().split(";")[0].lower()
3734
+ )
3735
+ except Exception:
3736
+ pass
3737
+
3738
+ # Early check: Filter by MIME type (FILTER_MIME)
3739
+ if FILTER_MIME and contentType:
3740
+ filterMimes = [
3741
+ m.strip().lower() for m in FILTER_MIME.split(",")
3742
+ ]
3743
+ if contentType in filterMimes:
3744
+ inResponse = False
3745
+ responseStartIdx = -1
3746
+ i += 1
3747
+ continue
3748
+
3749
+ i += 1
3750
+
3751
+ if stopProgram is not None:
3752
+ return
3753
+
3754
+ # Don't forget the last response if file doesn't end with WARC/1.0
3755
+ if inResponse and responseStartIdx >= 0:
3756
+ responseBodyBytes = b"\n".join(lineBytes[responseStartIdx:])
3757
+ responsesFound.append(
3758
+ (
3759
+ currentTargetUri,
3760
+ contentType,
3761
+ responseBodyBytes,
3762
+ httpStatusCode if "httpStatusCode" in dir() else "",
3763
+ )
3764
+ )
3765
+
3766
+ # Process each response found
3767
+ for targetUri, contentType, responseBytes, httpStatusCode in responsesFound:
3768
+ if stopProgram is not None:
3769
+ break
3770
+
3771
+ if not responseBytes:
3772
+ continue
3773
+
3774
+ # Split HTTP header from body in bytes (look for \r\n\r\n or \n\n separator)
3775
+ if b"\r\n\r\n" in responseBytes:
3776
+ bodyBytes = responseBytes.split(b"\r\n\r\n", 1)[1]
3777
+ elif b"\n\n" in responseBytes:
3778
+ bodyBytes = responseBytes.split(b"\n\n", 1)[1]
3779
+ else:
3780
+ bodyBytes = responseBytes
3781
+
3782
+ # Skip empty bodies or "not found" responses
3783
+ if not bodyBytes or bodyBytes.lower().strip() == b"not found":
3784
+ continue
3785
+
3786
+ # If -f / --filter-responses-only is passed, track all URLs immediately (before filtering)
3787
+ if args.mode == "B" and args.filter_responses_only and targetUri:
3788
+ with links_lock:
3789
+ if targetUri not in linksFound and targetUri not in extraWarcLinks:
3790
+ extraWarcLinks.add(targetUri)
3791
+
3792
+ # Use isBinaryContent to detect if this is binary content
3793
+ isBinary = isBinaryContent(bodyBytes, contentType, targetUri)
3794
+
3795
+ if isBinary:
3796
+ # Binary file - save raw bytes
3797
+ archiveContent = bodyBytes
3798
+ archiveHtml = None
3799
+ else:
3800
+ # Text file - decode to string
3801
+ archiveHtml = bodyBytes.decode("utf-8", errors="replace")
3802
+ archiveContent = None
3803
+
3804
+ # Collapse multiple blank lines into one
3805
+ archiveHtml = re.sub(r"\n{3,}", "\n\n", archiveHtml)
3806
+
3807
+ # Skip if body is empty after processing
3808
+ if not archiveHtml.strip():
3809
+ continue
3810
+
3811
+ if stopProgram is not None:
3812
+ break
3813
+
3814
+ # Determine if this is HTML or JS based on content-type or URL
3815
+ isHtml = (
3816
+ contentType in ["text/html", "application/xhtml+xml"]
3817
+ or targetUri.lower().endswith(".html")
3818
+ or targetUri.lower().endswith(".htm")
3819
+ )
3820
+ isJs = contentType in [
3821
+ "text/javascript",
3822
+ "application/javascript",
3823
+ "application/x-javascript",
3824
+ ] or targetUri.lower().endswith(".js")
3825
+
3826
+ # Add the URL as a comment at the start of the response (only for text files)
3827
+ if not isBinary and args.url_filename:
3828
+ if isHtml:
3829
+ archiveHtml = (
3830
+ "<!-- Original URL: " + targetUri + " -->\n" + archiveHtml
3831
+ )
3832
+ elif isJs:
3833
+ archiveHtml = (
3834
+ "/* Original URL: " + targetUri + " */\n" + archiveHtml
3835
+ )
3836
+
3837
+ # Create file name based on url or hash value
3838
+ if args.url_filename:
3839
+ fileName = targetUri.replace("/", "-").replace(":", "")
3840
+ fileName = fileName[0:254]
3841
+ hashValue = ""
3842
+ else:
3843
+ # Hash the content to get the filename
3844
+ if isBinary:
3845
+ hashValue = filehash(archiveContent)
3846
+ else:
3847
+ hashValue = filehash(archiveHtml)
3848
+ fileName = hashValue
3849
+
3850
+ # Determine extension of file from the content-type or URL
3851
+ extension = ""
3852
+ try:
3853
+ # Get path extension from URL
3854
+ if "://" in targetUri:
3855
+ targetUrl = "https://" + targetUri.split("://")[1]
3856
+ parsed = urlparse(targetUrl.strip())
3857
+ path = parsed.path
3858
+ extension = path[path.rindex(".") + 1 :]
3859
+ if "/" in extension:
3860
+ extension = ""
3861
+ # If extension is over 6 characters, it's likely not a real extension (e.g. API endpoint ID)
3862
+ if len(extension) > 6:
3863
+ extension = ""
3864
+ except Exception:
3865
+ pass
3866
+
3867
+ # If extension is blank, determine from MIME type or content
3868
+ if extension == "":
3869
+ if isBinary:
3870
+ # Binary file extensions from MIME type
3871
+ if contentType:
3872
+ if "image/png" in contentType:
3873
+ extension = "png"
3874
+ elif (
3875
+ "image/jpeg" in contentType
3876
+ or "image/jpg" in contentType
3877
+ ):
3878
+ extension = "jpg"
3879
+ elif "image/gif" in contentType:
3880
+ extension = "gif"
3881
+ elif "image/webp" in contentType:
3882
+ extension = "webp"
3883
+ elif "application/pdf" in contentType:
3884
+ extension = "pdf"
3885
+ elif "application/zip" in contentType:
3886
+ extension = "zip"
3887
+ else:
3888
+ extension = "bin"
3889
+ else:
3890
+ extension = "bin"
3891
+ else:
3892
+ # Text file extensions
3893
+ if contentType and "javascript" in contentType.lower():
3894
+ extension = "js"
3895
+ elif contentType and "html" in contentType.lower():
3896
+ extension = "html"
3897
+ elif contentType and "json" in contentType.lower():
3898
+ extension = "json"
3899
+ elif contentType and "text" in contentType.lower():
3900
+ extension = "txt"
3901
+ elif archiveHtml and (
3902
+ archiveHtml.lower().strip().endswith("</html>")
3903
+ or archiveHtml.lower().strip().endswith("</body>")
3904
+ or archiveHtml.lower().strip().startswith("<!doctype html")
3905
+ or archiveHtml.lower().strip().startswith("<html")
3906
+ or archiveHtml.lower().strip().startswith("<head")
3907
+ ):
3908
+ extension = "html"
3909
+ else:
3910
+ extension = "unknown"
3911
+
3912
+ fileName = fileName + "." + extension
3913
+
3914
+ # Determine file path
3915
+ if args.output_responses != "":
3916
+ filePath = args.output_responses + "/" + f"{fileName}"
3917
+ else:
3918
+ filePath = (
3919
+ DEFAULT_OUTPUT_DIR
3920
+ + "/results/"
3921
+ + str(argsInput).replace("/", "-")
3922
+ + "/"
3923
+ + f"{fileName}"
3924
+ )
3925
+
3926
+ if stopProgram is not None:
3927
+ break
3928
+
3929
+ # Write the file
3930
+ try:
3931
+ if isBinary:
3932
+ # Binary file - write as bytes
3933
+ responseFile = open(filePath, "wb")
3934
+ responseFile.write(archiveContent)
3935
+ else:
3936
+ # Text file - write as UTF-8
3937
+ responseFile = open(filePath, "w", encoding="utf8")
3938
+ responseFile.write(archiveHtml)
3939
+ responseFile.close()
3940
+ with links_lock:
3941
+ fileCount = fileCount + 1
3942
+
3943
+ # Track extra URLs found in WARC files for mode B (only when -f is not passed, since we track earlier if it is)
3944
+ if args.mode == "B" and not args.filter_responses_only and targetUri:
3945
+ with links_lock:
3946
+ if (
3947
+ targetUri not in linksFound
3948
+ and targetUri not in extraWarcLinks
3949
+ ):
3950
+ extraWarcLinks.add(targetUri)
3951
+ except Exception as e:
3952
+ writerr(
3953
+ colored(
3954
+ "GhostArchive - [ ERR ] Failed to write file "
3955
+ + filePath
3956
+ + ": "
3957
+ + str(e),
3958
+ "red",
3959
+ )
3960
+ )
3961
+
3962
+ # Write the hash value and URL to the index file
3963
+ if not args.url_filename and hashValue:
3964
+ try:
3965
+ timestamp = str(datetime.now())
3966
+ indexFile.write(
3967
+ hashValue
3968
+ + ","
3969
+ + domUrl
3970
+ + "#"
3971
+ + targetUri
3972
+ + " ,"
3973
+ + timestamp
3974
+ + "\n"
3975
+ )
3976
+ indexFile.flush()
3977
+ except Exception as e:
3978
+ writerr(
3979
+ colored(
3980
+ 'GhostArchive - [ ERR ] Failed to write to waymore_index.txt for "'
3981
+ + warcUrl
3982
+ + '": '
3983
+ + str(e),
3984
+ "red",
3985
+ )
3986
+ )
3987
+
3988
+ successCount = successCount + 1
3989
+
3990
+ except WayBackException:
3991
+ failureCount = failureCount + 1
3992
+
3993
+ except Exception as e:
3994
+ failureCount = failureCount + 1
3995
+ if verbose():
3996
+ # Simplify common error messages
3997
+ if "connection broken" in str(e).lower():
3998
+ errorMsg = "Connection Broken"
3999
+ else:
4000
+ errorMsg = str(e)
4001
+ try:
4002
+ statusCode = (
4003
+ resp.status_code if "resp" in dir() and resp is not None else "ERR"
4004
+ )
4005
+ writerr(
4006
+ colored(
4007
+ "GhostArchive - [ "
4008
+ + str(statusCode)
4009
+ + ' ] Failed to get response for "'
4010
+ + warcUrl
4011
+ + '": '
4012
+ + errorMsg,
4013
+ "red",
4014
+ )
4015
+ )
4016
+ except Exception:
4017
+ writerr(
4018
+ colored(
4019
+ 'GhostArchive - [ ERR ] Failed to get response for "'
4020
+ + warcUrl
4021
+ + '": '
4022
+ + errorMsg,
4023
+ "red",
4024
+ )
4025
+ )
4026
+
4027
+ # Show memory usage if -v option chosen, and check memory every 25 responses (or if its the last)
4028
+ if (successCount + failureCount) % 25 == 1 or (
4029
+ successCount + failureCount
4030
+ ) == totalResponses:
4031
+ try:
4032
+ getMemory()
4033
+ if verbose():
4034
+ suffix = (
4035
+ "Complete (Mem Usage "
4036
+ + humanReadableSize(currentMemUsage)
4037
+ + ", Total Mem "
4038
+ + str(currentMemPercent)
4039
+ + "%) "
4040
+ )
4041
+ except Exception:
4042
+ if verbose():
4043
+ suffix = 'Complete (To show mem use, run "pip install psutil")'
4044
+ printProgressBar(
4045
+ successCount + failureCount,
4046
+ totalResponses,
4047
+ prefix="Processing " + str(totalResponses) + " WARC files:",
4048
+ suffix=suffix,
4049
+ length=getProgressBarLength(),
4050
+ fill=fillChar,
4051
+ )
4052
+
4053
+ except Exception as e:
4054
+ if verbose():
4055
+ writerr(
4056
+ colored(
4057
+ 'GhostArchive - [ ERR ] Error for "' + domUrl + '": ' + str(e), "red"
4058
+ )
4059
+ )
4060
+
4061
+ except Exception as e:
4062
+ writerr(colored("ERROR getGhostArchiveWARC 1: " + str(e), "red"))
4063
+
4064
+
4065
+ def format_date_for_urlscan(date_str):
4066
+ # Handle different lengths of input
4067
+ if len(date_str) == 4: # YYYY
4068
+ date_str += "0101"
4069
+ elif len(date_str) == 6: # YYYYMM
4070
+ date_str += "01"
4071
+
4072
+ # Convert to YYYY-MM-DD format
4073
+ try:
4074
+ formatted_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
4075
+ return formatted_date
4076
+ except Exception:
4077
+ return ""
4078
+
4079
+
4080
+ def getURLScanUrls():
4081
+ """
4082
+ Get URLs from the URLSCan API, urlscan.io
4083
+ """
4084
+ global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
4085
+
4086
+ # Write the file of URL's for the passed domain/URL
4087
+ try:
4088
+ requestsMade = 0
4089
+ stopSourceURLScan = False
4090
+ linksFoundURLScan = set()
4091
+ totalUrls = 0
4092
+ checkResponse = True
4093
+
4094
+ # Set the URL to just the hostname
4095
+ url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
4096
+
4097
+ # If the --from-date or --to-date parameters were paassed then also add a date filter
4098
+ if args.from_date or args.to_date:
4099
+ if args.from_date:
4100
+ fromDate = format_date_for_urlscan(str(args.from_date)[:8])
4101
+ else:
4102
+ fromDate = "2016-01-01" # The year URLScan started
4103
+ if args.to_date:
4104
+ toDate = format_date_for_urlscan(str(args.to_date)[:8])
4105
+ else:
4106
+ toDate = "now"
4107
+ url = url.replace("{DATERANGE}", f"%20date:[{fromDate}%20TO%20{toDate}]")
4108
+ else:
4109
+ url = url.replace("{DATERANGE}", "")
4110
+
4111
+ if verbose():
4112
+ if args.mode == "R":
4113
+ write(
4114
+ colored(
4115
+ "URLScan - [ INFO ] The URLScan URL requested to get links for responses: ",
4116
+ "magenta",
4117
+ )
4118
+ + colored(url + "\n", "white")
4119
+ )
4120
+ else:
4121
+ write(
4122
+ colored(
4123
+ "URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
4124
+ )
4125
+ + colored(url + "\n", "white")
4126
+ )
4127
+
4128
+ if args.mode in ("U", "B") and not args.check_only:
4129
+ write(
4130
+ colored(
4131
+ "URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
4132
+ "cyan",
4133
+ )
4134
+ )
4135
+
4136
+ # Get the first page from urlscan.io
4137
+ try:
4138
+ # Choose a random user agent string to use for any requests
4139
+ # For other sources we would use `random.choice(USER_AGENT)` to asignn a random user-agent, but it seems
4140
+ # that there are a handful of those that ALWAYS return 429. Passing a specific one all the time seems to
4141
+ # be successful all the time
4142
+ userAgent = "waymore v" + __version__ + " by xnl-h4ck3r"
4143
+ session = requests.Session()
4144
+ session.mount("https://", HTTP_ADAPTER)
4145
+ session.mount("http://", HTTP_ADAPTER)
4146
+ # Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
4147
+ resp = session.get(url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY})
4148
+ requestsMade = requestsMade + 1
4149
+ except Exception as e:
4150
+ write(
4151
+ colored(
4152
+ "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
4153
+ "red",
4154
+ )
4155
+ )
4156
+ return
3178
4157
 
3179
4158
  # If the rate limit was reached then determine if to wait and then try again
3180
4159
  if resp.status_code == 429:
@@ -3753,7 +4732,6 @@ def processWayBackPage(url):
3753
4732
  pass
3754
4733
  return
3755
4734
  else:
3756
- print("DEBUG: HERE END!") # DEBUG
3757
4735
  pass
3758
4736
  except Exception as e:
3759
4737
  if verbose():
@@ -4935,80 +5913,373 @@ def processIntelxType(target, credits):
4935
5913
  writerr(colored("ERROR processIntelxType 1: " + str(e), "red"))
4936
5914
 
4937
5915
 
def getIntelxAccountInfo() -> str:
    """
    Get the account info and return the number of Credits remaining from the /phonebook/search
    """
    initIntelxTls()
    try:
        resp = chooseIntelxBase(INTELX_API_KEY)
        # Anything other than a 200 means the account info can't be determined
        if resp is None or resp.status_code != 200:
            return "Unknown"
        # Drill down to the /phonebook/search entry once, then read both counters from it
        searchInfo = json.loads(resp.text.strip()).get("paths", {}).get("/phonebook/search", {})
        remaining = str(searchInfo.get("Credit", "Unknown"))
        maximum = str(searchInfo.get("CreditMax", "Unknown"))
        return remaining + "/" + maximum
    except Exception:
        # Best effort only - any parse/network problem is reported as Unknown
        return "Unknown"
5935
+
5936
+
5937
def getIntelxUrls():
    """
    Get URLs from the Intelligence X Phonebook search
    """
    global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSourceIntelx, argsInput, checkIntelx, argsInputHostname, intelxAPIIssue, linkCountIntelx, linksFoundIntelx

    try:
        # In check-only mode just report the minimum number of requests and bail out
        if args.check_only:
            checkOnlyMsg = colored("IntelX - [ INFO ] Get URLs from Intelligence X: ", "cyan")
            write(checkOnlyMsg + colored("minimum 4 requests", "white"))
            checkIntelx = 4
            return

        # Reset per-run source state before talking to the API
        stopSourceIntelx = False
        linksFoundIntelx = set()
        initIntelxTls()

        credits = getIntelxAccountInfo()
        if verbose():
            creditLabel = colored(
                "IntelX - [ INFO ] The Intelligence X URL requested to get links (Credits: "
                + credits
                + "): ",
                "magenta",
            )
            write(creditLabel + colored(intelx_tls.INTELX_SEARCH_URL + "\n", "white"))

        if not args.check_only:
            write(colored("IntelX - [ INFO ] Getting links from intelx.io API...", "cyan"))

        # Type 1 (domains) is only requested when subdomains are wanted (--no-subs not passed)
        if not args.no_subs:
            processIntelxType(1, credits)

        # Type 3 (URLs) is skipped if an earlier call flagged an API problem
        if not intelxAPIIssue:
            processIntelxType(3, credits)

        # Fold the per-source results into the global link set and report the count
        linkCountIntelx = len(linksFoundIntelx)
        countMsg = colored("IntelX - [ INFO ] Links found on intelx.io: ", "cyan")
        write(countMsg + colored(str(linkCountIntelx), "white"))
        linksFound.update(linksFoundIntelx)
        linksFoundIntelx.clear()

    except Exception as e:
        writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
5990
+
5991
+
5992
def processGhostArchiveUrl(url, ghostArchiveID=""):
    """
    Process a specific URL from ghostarchive.org to determine whether to save the link.

    Args:
        url: Candidate URL extracted from a GhostArchive results page.
        ghostArchiveID: The "/archive/<id>" path of the snapshot; when supplied and
                        a response mode (R/B) is active, the DOM download link is
                        queued in ghostArchiveRequestLinks (bounded by args.limit).
    """
    global argsInput, argsInputHostname, links_lock, linkCountGhostArchive, linksFoundGhostArchive

    addLink = True

    try:
        # Strip Wayback Machine prefix if present (e.g., https://web.archive.org/web/20230101120000_/https://example.com)
        waybackMatch = re.match(r"^https?://web\.archive\.org/[^/]+/[a-zA-Z0-9]+_/", url)
        if waybackMatch:
            url = url[waybackMatch.end() :]

        # If the input has a / in it, then a URL was passed, so the link will only be added if the URL matches
        if "/" in url:
            if argsInput not in url:
                addLink = False

        # If filters are required then test them
        if addLink and not args.filter_responses_only:

            # If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
            if args.no_subs:
                # FIX: character class was [A-za-z], which in ASCII also matches
                # "[", "\", "]", "^", "_" and "`" - corrected to [A-Za-z]
                match = re.search(
                    r"^[A-Za-z]*\:\/\/(www\.)?" + re.escape(argsInputHostname),
                    url,
                    flags=re.IGNORECASE,
                )
                if match is None:
                    addLink = False

            # Check the URL exclusions (FILTER_URL is a comma separated list turned into alternatives)
            if addLink:
                match = re.search(
                    r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
                    url,
                    flags=re.IGNORECASE,
                )
                if match is not None:
                    addLink = False

            # Set keywords filter if -ko argument passed ("#CONFIG" means use the configured keyword list)
            if addLink and args.keywords_only:
                if args.keywords_only == "#CONFIG":
                    match = re.search(
                        r"(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ")",
                        url,
                        flags=re.IGNORECASE,
                    )
                else:
                    match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
                if match is None:
                    addLink = False

        # Add link if it passed filters
        if addLink:
            # Just get the hostname of the url
            tldExtract = tldextract.extract(url)
            subDomain = tldExtract.subdomain
            if subDomain != "":
                subDomain = subDomain + "."
            domainOnly = subDomain + tldExtract.domain + "." + tldExtract.suffix

            # GhostArchive might return URLs that aren't for the domain passed so we need to check for those and not process them
            # Check the URL host really is the target (or a subdomain of it)
            match = re.search(
                r"(^|\.)" + re.escape(argsInputHostname) + "$",
                domainOnly,
                flags=re.IGNORECASE,
            )
            if match is not None:
                if args.mode in ("U", "B"):
                    linksFoundAdd(url, linksFoundGhostArchive)
                # If Response mode is requested then add the DOM ID to try later, for the number of responses wanted
                if ghostArchiveID != "" and args.mode in ("R", "B"):
                    if args.limit == 0 or len(ghostArchiveRequestLinks) < args.limit:
                        with links_lock:
                            ghostArchiveRequestLinks.add(
                                (url, GHOSTARCHIVE_DOM_URL + ghostArchiveID)
                            )

    except Exception as e:
        writerr(colored("ERROR processGhostArchiveUrl 1: " + str(e), "red"))
6079
+
6080
+
6081
def getGhostArchiveUrls():
    """
    Get URLs from GhostArchive (ghostarchive.org)
    This source doesn't have an API, so we crawl the HTML pages directly.
    """
    global linksFound, path, subs, stopProgram, stopSourceGhostArchive, argsInput, checkGhostArchive, argsInputHostname, linkCountGhostArchive, linksFoundGhostArchive

    try:
        # Reset per-run source state
        stopSourceGhostArchive = False
        linksFoundGhostArchive = set()

        # Build the base URL.
        # If there is only one "." in the hostname we can guarantee that a subdomain wasn't passed,
        # so we can prefix the search with "." to only match links on the target domain.
        # Otherwise we get everything and confirm the actual host of each link later.
        if argsInputHostname.count(".") == 1:
            baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", "." + quote(argsInput))
        else:
            baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", quote(argsInput))

        if verbose():
            # The "0" appended is the first page number of the paginated results
            write(
                colored("GhostArchive - [ INFO ] The URL requested to get links: ", "magenta")
                + colored(baseUrl + "0\n", "white")
            )

        if not args.check_only and args.mode == "U":
            write(
                colored(
                    "GhostArchive - [ INFO ] Getting links from ghostarchive.org (this can take a while for some domains)...",
                    "cyan",
                )
            )

        # Set up session with cookie
        session = requests.Session()
        if HTTP_ADAPTER is not None:
            session.mount("https://", HTTP_ADAPTER)
            session.mount("http://", HTTP_ADAPTER)

        userAgent = random.choice(USER_AGENT)
        headers = {"User-Agent": userAgent}
        # NOTE(review): the "theme=original" cookie presumably pins the HTML layout that the
        # regexes below expect - confirm against the live site if parsing breaks
        cookies = {"theme": "original"}

        # Page counter for the paginated results (first page is 0)
        pageNum = 0

        # Crawl result pages until stopped, rate limited, or there are no more pages
        while stopProgram is None and not stopSourceGhostArchive:
            getMemory()

            url = baseUrl + str(pageNum)

            try:
                resp = session.get(url, headers=headers, cookies=cookies, timeout=DEFAULT_TIMEOUT)
            except Exception as e:
                writerr(
                    colored(
                        "GhostArchive - [ ERR ] Unable to get page " + str(pageNum) + ": " + str(e),
                        "red",
                    )
                )
                break

            if resp.status_code == 429:
                writerr(
                    colored(
                        "GhostArchive - [ 429 ] Rate limit reached at page " + str(pageNum) + ".",
                        "red",
                    )
                )
                break

            # Check for maintenance/end of results indicator
            if (
                resp.status_code == 503
                or "The site is under maintenance and will be back soon" in resp.text
                or "No archives for that site" in resp.text
            ):
                # NOTE(review): the checkGhostArchive assignment below only happens when
                # verbose() is true - confirm the check-only request count is meant to
                # depend on verbosity
                if verbose():
                    if pageNum == 0:
                        if args.check_only:
                            checkGhostArchive = 1
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
                                )
                                + colored("1 request", "white")
                            )
                        else:
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] No results found",
                                    "cyan",
                                )
                            )
                    else:
                        write(
                            colored(
                                "GhostArchive - [ INFO ] Retrieved all results from "
                                + str(pageNum)
                                + " pages",
                                "cyan",
                            )
                        )
                break
            if resp.status_code != 200:
                # Any other non-200 is treated as fatal for this source
                writerr(
                    colored(
                        "GhostArchive - [ ERR ] [ "
                        + str(resp.status_code)
                        + " ] at page "
                        + str(pageNum),
                        "red",
                    )
                )
                break

            # Check only mode - just count pages
            if args.check_only:
                # For check only, we check if there are results and try to get total count
                if pageNum == 0:
                    # Check if there are any results on the first page
                    if '<a href="/archive/' in resp.text:
                        # Try to find "out of X" to determine total results/pages
                        outOfMatch = re.search(r"out of (\d+)", resp.text)
                        if outOfMatch:
                            totalResults = int(outOfMatch.group(1))
                            checkGhostArchive = totalResults
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
                                )
                                + colored(f"{totalResults} requests (pagination required)", "white")
                            )
                        else:
                            checkGhostArchive = 1
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
                                )
                                + colored("unknown requests (pagination required)", "white")
                            )
                    else:
                        checkGhostArchive = 1
                        write(
                            colored("GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan")
                            + colored("1 request (no results)", "white")
                        )
                break

            # Use regex to extract URLs from anchor tag text content
            # Pattern matches: <a href="/archive/ID">URL_HERE</a> - captures both href path and URL
            pattern = r'<a href="(/archive/[^"]*)">([^<]+)</a>'
            matches = re.findall(pattern, resp.text)

            # If no matches found, we've reached the end of results
            if not matches:
                if verbose():
                    write(
                        colored(
                            "GhostArchive - [ INFO ] Retrieved all results from "
                            + str(pageNum + 1)
                            + " pages",
                            "cyan",
                        )
                    )
                break

            # Filter and record each candidate URL (and its snapshot ID for response mode)
            for match in matches:
                ghostArchiveId = match[0]  # e.g., "/archive/gkOOR"
                potentialUrl = match[1].strip()
                processGhostArchiveUrl(potentialUrl, ghostArchiveId)

            # Check if there's a "Next Page" link - if not, we've reached the last page
            # GhostArchive resets to Page 1 when exceeding actual pages, so checking for Next Page is essential
            if "Next Page" not in resp.text and ">»</a>" not in resp.text:
                if verbose():
                    write(
                        colored(
                            "GhostArchive - [ INFO ] Retrieved all results from "
                            + str(pageNum + 1)
                            + " pages",
                            "cyan",
                        )
                    )
                break

            pageNum += 1

        if not args.check_only:
            # Count links based on mode - in R mode, count response links; in U/B mode, count URL links
            if args.mode == "R":
                linkCountGhostArchive = len(ghostArchiveRequestLinks)
            else:
                linkCountGhostArchive = len(linksFoundGhostArchive)
        write(
            colored("GhostArchive - [ INFO ] Links found on ghostarchive.org: ", "cyan")
            + colored(str(linkCountGhostArchive), "white")
        )
        # Fold the per-source results into the global link set
        linksFound.update(linksFoundGhostArchive)
        linksFoundGhostArchive.clear()

    except Exception as e:
        writerr(colored("ERROR getGhostArchiveUrls 1: " + str(e), "red"))
5012
6283
 
5013
6284
 
5014
6285
  def processResponses():
@@ -5018,6 +6289,10 @@ def processResponses():
5018
6289
  global stopProgram, totalFileCount
5019
6290
  try:
5020
6291
 
6292
+ # Get responses from GhostArchive unless excluded
6293
+ if stopProgram is None and not args.xga:
6294
+ processResponsesGhostArchive()
6295
+
5021
6296
  # Get responses from URLScan unless excluded
5022
6297
  if stopProgram is None and not args.xus:
5023
6298
  processResponsesURLScan()
@@ -5039,6 +6314,235 @@ def processResponses():
5039
6314
  writerr(colored(getSPACER("ERROR processResponses 1: " + str(e)), "red"))
5040
6315
 
5041
6316
 
6317
def processResponsesGhostArchive():
    """
    Get archived responses from GhostArchive (ghostarchive.org).

    Builds the list of (original URL, DOM URL) pairs gathered by
    processGhostArchiveUrl, persists them to a temp file so an interrupted run
    can be resumed, downloads each response with a process pool, then - for
    mode B - appends any extra URLs found in the WARC files to the URL output.
    """
    global subs, path, indexFile, totalResponses, stopProgram, argsInput, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, ghostArchiveRequestLinks, failureCount, totalFileCount, checkGhostArchive
    # FIX: initialise before the try so the finally clause can't raise NameError
    # if an exception occurs before linkRequests is assigned
    linkRequests = None
    try:
        fileCount = 0
        failureCount = 0
        if not args.check_only:
            # Create 'results' and domain directory if needed
            createDirs()

        # Get the path of the files, depending on whether -oR / --output_responses was passed
        try:
            responsesPath = responseOutputDirectory + "responses.GhostArchive.tmp"
            indexPath = responseOutputDirectory + "waymore_index.txt"
        except Exception as e:
            if verbose():
                writerr(colored("ERROR processResponsesGhostArchive 4: " + str(e), "red"))

        # Get URLs from GhostArchive if the DOM ID's haven't been retrieved yet
        if stopProgram is None and not args.check_only:
            if args.mode in ("R", "B"):
                write(
                    colored(
                        "GhostArchive - [ INFO ] Getting list of response links (this can take a while for some domains)...",
                        "cyan",
                    )
                )
            if args.mode == "R":
                getGhostArchiveUrls()

        # Check if a responses.GhostArchive.tmp file exists from a previously interrupted run
        if not args.check_only and os.path.exists(responsesPath):
            # FIX: previously the list loaded here was unconditionally overwritten
            # below, silently discarding the saved resume data
            with open(responsesPath, "rb") as fl:
                linkRequests = pickle.load(fl)

            # Set start point
            successCount = 0

        if linkRequests is None:
            # Get the GhostArchive DOM links collected by processGhostArchiveUrl
            linkRequests = list(ghostArchiveRequestLinks)

            # Write the links to a temp file so an interrupted run can be resumed
            if not args.check_only:
                with open(responsesPath, "wb") as f:
                    pickle.dump(linkRequests, f)

        # Get the total number of responses we will try to get and set the current file count to the success count
        totalResponses = len(linkRequests)
        checkGhostArchive = checkGhostArchive + totalResponses

        # If there are no responses to download, display an error and exit
        if args.mode != "R" and totalResponses == 0:
            writerr(
                colored(
                    getSPACER(
                        "Failed to get responses from GhostArchive (ghostarchive.org) - check input and try again."
                    ),
                    "red",
                )
            )
            return

        fileCount = successCount

        if args.check_only:
            writerr(
                colored("Downloading archived responses: ", "cyan")
                + colored("UNKNOWN requests", "cyan")
            )
            writerr(
                colored(
                    "\n-> Downloading the responses can vary depending on the target and the rate limiting on GhostArchive",
                    "green",
                )
            )
            write("")
        else:
            # If the limit has been set over the default, give a warning that this could take a long time!
            if totalResponses - successCount > DEFAULT_LIMIT:
                if successCount > 0:
                    writerr(
                        colored(
                            getSPACER(
                                "WARNING: Downloading remaining "
                                + str(totalResponses - successCount)
                                + " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
                            ),
                            "yellow",
                        )
                    )
                else:
                    writerr(
                        colored(
                            getSPACER(
                                "WARNING: Downloading "
                                + str(totalResponses)
                                + " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
                            ),
                            "yellow",
                        )
                    )

            # Open the index file if hash value is going to be used (not URL)
            if not args.url_filename:
                indexFile = open(indexPath, "a")

            # Process the URLs from GhostArchive (get the GhostArchive DOM responses)
            if stopProgram is None:
                # Double the number of processes to speed up the download
                p = mp.Pool(args.processes * 2)
                p.starmap(getGhostArchiveWARC, linkRequests[successCount:])
                p.close()
                p.join()

            # Delete the tmp file now it has run successfully
            if stopProgram is None:
                try:
                    os.remove(responsesPath)
                except Exception:
                    pass

            # Close the index file if hash value is going to be used (not URL)
            # FIX: also check the file handle exists so check-only / url-filename
            # runs can't call close() on an unopened handle
            if not args.url_filename and indexFile is not None:
                indexFile.close()

        if not args.check_only:
            try:
                # Report where the responses were saved. The message is identical in
                # verbose and non-verbose mode, so build the common prefix once
                # (previously four duplicated write blocks).
                savedMsg = (
                    colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
                    + colored(responseOutputDirectory, "white")
                    + colored(" for " + subs + argsInput + ": ", "cyan")
                )
                if failureCount > 0:
                    write(
                        savedMsg
                        + colored(str(fileCount) + " 🤘", "white")
                        + colored(" (" + str(failureCount) + " not found)\n", "red")
                    )
                else:
                    write(savedMsg + colored(str(fileCount) + " 🤘\n", "white"))
            except Exception as e:
                if verbose():
                    writerr(colored("ERROR processResponsesGhostArchive 5: " + str(e), "red"))

        # Append extra links from WARC files to URL output file (for mode B)
        try:
            if args.mode == "B" and len(extraWarcLinks) > 0:
                # Determine URL output file path (same logic as processURLOutput)
                if args.output_urls == "":
                    if args.output_responses != "":
                        urlFilePath = args.output_responses + "/waymore.txt"
                    else:
                        urlFilePath = (
                            str(DEFAULT_OUTPUT_DIR)
                            + "/results/"
                            + str(argsInput).replace("/", "-")
                            + "/waymore.txt"
                        )
                else:
                    urlFilePath = args.output_urls

                # Load existing URLs from file to avoid duplicates
                existingUrls = set()
                try:
                    with open(urlFilePath) as f:
                        for line in f:
                            existingUrls.add(line.strip())
                except Exception:
                    # The file may not exist yet - nothing to de-duplicate against
                    pass

                # Append only new unique URLs
                newLinks = [
                    url
                    for url in extraWarcLinks
                    if url not in existingUrls and url not in linksFound
                ]
                if len(newLinks) > 0:
                    with open(urlFilePath, "a") as f:
                        for url in newLinks:
                            f.write(url + "\n")

                # Display message about extra links
                write(
                    colored("GhostArchive - [ INFO ] ", "cyan")
                    + colored(str(len(newLinks)), "white")
                    + colored(" extra links found in WARC files added to file ", "cyan")
                    + colored(urlFilePath, "white")
                    + "\n"
                )
        except Exception as e:
            if verbose():
                writerr(colored("ERROR processResponsesGhostArchive 6: " + str(e), "red"))

        totalFileCount = totalFileCount + fileCount
    except Exception as e:
        writerr(colored(getSPACER("ERROR processResponsesGhostArchive 1: " + str(e)), "red"))
    finally:
        # Release the (potentially large) list of link tuples
        linkRequests = None
6544
+
6545
+
5042
6546
  def processResponsesURLScan():
5043
6547
  """
5044
6548
  Get archived responses from URLScan (urlscan.io)
@@ -6254,6 +7758,12 @@ async def fetch_intelx_async():
6254
7758
  await loop.run_in_executor(None, getIntelxUrls)
6255
7759
 
6256
7760
 
7761
async def fetch_ghostarchive_async():
    """Async wrapper for getGhostArchiveUrls - runs in thread pool"""
    # Off-load the blocking crawl to the default executor so the event loop stays free
    await asyncio.get_event_loop().run_in_executor(None, getGhostArchiveUrls)
7765
+
7766
+
6257
7767
  async def fetch_all_sources_async():
6258
7768
  """
6259
7769
  Orchestrator function to fetch from all enabled sources concurrently.
@@ -6276,6 +7786,8 @@ async def fetch_all_sources_async():
6276
7786
  tasks.append(("VirusTotal", fetch_virustotal_async()))
6277
7787
  if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
6278
7788
  tasks.append(("Intelligence X", fetch_intelx_async()))
7789
+ if not args.xga and stopProgram is None:
7790
+ tasks.append(("GhostArchive", fetch_ghostarchive_async()))
6279
7791
 
6280
7792
  if not tasks:
6281
7793
  return
@@ -6301,7 +7813,7 @@ async def fetch_all_sources_async():
6301
7813
 
6302
7814
  # Run waymore
6303
7815
  def main():
6304
- global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx
7816
+ global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, extraWarcLinks
6305
7817
 
6306
7818
  # Tell Python to run the handler() function when SIGINT is received
6307
7819
  signal(SIGINT, handler)
@@ -6457,13 +7969,19 @@ def main():
6457
7969
  help="Exclude checks for links from intelx.io",
6458
7970
  default=False,
6459
7971
  )
7972
+ parser.add_argument(
7973
+ "-xga",
7974
+ action="store_true",
7975
+ help="Exclude checks for links from ghostarchive.org",
7976
+ default=False,
7977
+ )
6460
7978
  parser.add_argument(
6461
7979
  "--providers",
6462
7980
  action="store",
6463
- help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal and intelx. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.",
7981
+ help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal,intelx and ghostarchive. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.",
6464
7982
  default=[],
6465
7983
  type=validateArgProviders,
6466
- metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx}",
7984
+ metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive}",
6467
7985
  )
6468
7986
  parser.add_argument(
6469
7987
  "-lcc",
@@ -6630,6 +8148,10 @@ def main():
6630
8148
  args.xix = True
6631
8149
  else:
6632
8150
  args.xix = False
8151
+ if "ghostarchive" not in args.providers:
8152
+ args.xga = True
8153
+ else:
8154
+ args.xga = False
6633
8155
 
6634
8156
  # If no input was given, raise an error
6635
8157
  if sys.stdin.isatty():
@@ -6700,6 +8222,7 @@ def main():
6700
8222
  # Reset global variables
6701
8223
  linksFound = set()
6702
8224
  linkMimes = set()
8225
+ extraWarcLinks = set()
6703
8226
  successCount = 0
6704
8227
  failureCount = 0
6705
8228
  fileCount = 0
@@ -6714,6 +8237,7 @@ def main():
6714
8237
  stopSourceURLScan = False
6715
8238
  stopSourceVirusTotal = False
6716
8239
  stopSourceIntelx = False
8240
+ stopSourceGhostArchive = False
6717
8241
 
6718
8242
  # Get the config settings from the config.yml file
6719
8243
  getConfig()