waymore 7.7__py3-none-any.whl → 8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/waymore.py CHANGED
@@ -70,6 +70,7 @@ stopSourceAlienVault = False
70
70
  stopSourceURLScan = False
71
71
  stopSourceVirusTotal = False
72
72
  stopSourceIntelx = False
73
+ stopSourceGhostArchive = False
73
74
  successCount = 0
74
75
  failureCount = 0
75
76
  fileCount = 0
@@ -79,6 +80,7 @@ totalPages = 0
79
80
  indexFile = None
80
81
  continueRespFile = None
81
82
  continueRespFileURLScan = None
83
+ continueRespFileGhostArchive = None
82
84
  inputIsDomainANDPath = False
83
85
  inputIsSubDomain = False
84
86
  subs = "*."
@@ -102,6 +104,7 @@ checkAlienVault = 0
102
104
  checkURLScan = 0
103
105
  checkVirusTotal = 0
104
106
  checkIntelx = 0
107
+ checkGhostArchive = 0
105
108
  argsInputHostname = ""
106
109
  responseOutputDirectory = ""
107
110
  urlscanRequestLinks = set()
@@ -112,11 +115,14 @@ linkCountAlienVault = 0
112
115
  linkCountURLScan = 0
113
116
  linkCountVirusTotal = 0
114
117
  linkCountIntelx = 0
118
+ linkCountGhostArchive = 0
115
119
  linksFoundCommonCrawl = set()
116
120
  linksFoundAlienVault = set()
117
121
  linksFoundURLScan = set()
118
122
  linksFoundVirusTotal = set()
119
123
  linksFoundIntelx = set()
124
+ linksFoundGhostArchive = set()
125
+ ghostArchiveRequestLinks = set()
120
126
 
121
127
  # Thread lock for protecting shared state during concurrent operations
122
128
  links_lock = threading.Lock()
@@ -124,6 +130,7 @@ links_lock = threading.Lock()
124
130
  # Shared state for link collection across all sources
125
131
  linksFound = set()
126
132
  linkMimes = set()
133
+ extraWarcLinks = set() # Track extra URLs found in WARC files for mode B
127
134
 
128
135
  # Source Provider URLs
129
136
  WAYBACK_URL = "https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest"
@@ -134,6 +141,8 @@ URLSCAN_DOM_URL = "https://urlscan.io/dom/"
134
141
  VIRUSTOTAL_URL = "https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}"
135
142
  # Paid endpoint first, free endpoint as fallback
136
143
  INTELX_BASES = ["https://2.intelx.io", "https://free.intelx.io"]
144
+ GHOSTARCHIVE_URL = "https://ghostarchive.org/search?term={DOMAIN}&page="
145
+ GHOSTARCHIVE_DOM_URL = "https://ghostarchive.org"
137
146
 
138
147
  intelx_tls = threading.local()
139
148
 
@@ -247,10 +256,10 @@ DEFAULT_LIMIT = 5000
247
256
  DEFAULT_TIMEOUT = 30
248
257
 
249
258
  # Exclusions used to filter out responses we will try to get from web.archive.org
250
- DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx"
259
+ DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx,.avif"
251
260
 
252
261
  # MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
253
- DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/pdf,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff,video/x-ms-asf,audio/x-ms-wma,audio/wma,application/x-mplayer2"
262
+ DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff,video/x-ms-asf,audio/x-ms-wma,audio/wma,application/x-mplayer2,image/avif"
254
263
 
255
264
  # Response code exclusions we will use to filter links and responses from web.archive.org through their API
256
265
  DEFAULT_FILTER_CODE = "404,301,302"
@@ -743,7 +752,7 @@ def handler(signal_received, frame):
743
752
  This function is called when the user presses Ctrl-C
744
753
  An attempt will be made to try and clean up properly
745
754
  """
746
- global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, current_response, current_session
755
+ global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, current_response, current_session
747
756
 
748
757
  if stopProgram is not None:
749
758
  stopProgramCount = stopProgramCount + 1
@@ -778,6 +787,7 @@ def handler(signal_received, frame):
778
787
  stopSourceURLScan = True
779
788
  stopSourceVirusTotal = True
780
789
  stopSourceIntelx = True
790
+ stopSourceGhostArchive = True
781
791
  # Try to close any active response or session to interrupt blocking network I/O
782
792
  try:
783
793
  if current_response is not None:
@@ -1753,11 +1763,15 @@ def printProgressBar(
1753
1763
 
1754
1764
  def filehash(text):
1755
1765
  """
1756
- Generate a hash value for the passed string. This is used for the file name of a downloaded archived response
1766
+ Generate a hash value for the passed string or bytes. This is used for the file name of a downloaded archived response
1757
1767
  """
1758
1768
  hash = 0
1759
1769
  for ch in text:
1760
- hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
1770
+ # Handle both str (gives chars needing ord()) and bytes (gives ints directly)
1771
+ if isinstance(ch, int):
1772
+ hash = (hash * 281 ^ ch * 997) & 0xFFFFFFFFFFF
1773
+ else:
1774
+ hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
1761
1775
  return str(hash)
1762
1776
 
1763
1777
 
@@ -1945,7 +1959,7 @@ def processArchiveUrl(url):
1945
1959
  try:
1946
1960
  try:
1947
1961
  try:
1948
- if os.environ.get("USER") == "xnl":
1962
+ if verbose() and os.environ.get("USER") == "xnl":
1949
1963
  writerr(
1950
1964
  colored(
1951
1965
  "[ DBG ] Requesting file " + archiveUrl,
@@ -2265,7 +2279,7 @@ def processArchiveUrl(url):
2265
2279
  debugText = "INTERNET ARCHIVE"
2266
2280
  elif archiveHtml.lower().find("wombat") > 0:
2267
2281
  debugText = "WOMBAT (JS)"
2268
- if debugText != "":
2282
+ if verbose() and debugText != "":
2269
2283
  writerr(
2270
2284
  colored(
2271
2285
  getSPACER(
@@ -2280,16 +2294,17 @@ def processArchiveUrl(url):
2280
2294
  )
2281
2295
  )
2282
2296
  except Exception as e:
2283
- writerr(
2284
- colored(
2285
- '[ DBG ] Error - Failed to output debug info for "'
2286
- + archiveUrl
2287
- + '": '
2288
- + str(e),
2289
- "red",
2290
- attrs=["dark"],
2297
+ if verbose():
2298
+ writerr(
2299
+ colored(
2300
+ '[ DBG ] Error - Failed to output debug info for "'
2301
+ + archiveUrl
2302
+ + '": '
2303
+ + str(e),
2304
+ "red",
2305
+ attrs=["dark"],
2306
+ )
2291
2307
  )
2292
- )
2293
2308
  pass
2294
2309
 
2295
2310
  successCount = successCount + 1
@@ -2760,17 +2775,20 @@ def validateArgProviders(x):
2760
2775
  - urlscan
2761
2776
  - virustotal
2762
2777
  - intelx
2778
+ - ghostarchive
2763
2779
  """
2764
2780
  invalid = False
2765
2781
  x = x.lower()
2766
2782
  providers = x.split(",")
2767
2783
  for provider in providers:
2768
- if not re.fullmatch(r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx)", provider):
2784
+ if not re.fullmatch(
2785
+ r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx|ghostarchive)", provider
2786
+ ):
2769
2787
  invalid = True
2770
2788
  break
2771
2789
  if invalid:
2772
2790
  raise argparse.ArgumentTypeError(
2773
- "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx"
2791
+ "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive"
2774
2792
  )
2775
2793
  return x
2776
2794
 
@@ -3528,6 +3546,522 @@ def getURLScanDOM(originalUrl, domUrl):
3528
3546
  writerr(colored("ERROR getURLScanDOM 1: " + str(e), "red"))
3529
3547
 
3530
3548
 
3549
+ def getGhostArchiveWARC(originalUrl, domUrl):
3550
+ """
3551
+ Get the DOM for the passed GhostArchive link - parses WARC files containing multiple request/response pairs
3552
+ """
3553
+ global stopProgram, successCount, failureCount, fileCount, DEFAULT_OUTPUT_DIR, totalResponses, indexFile, argsInput, argsInputHostname, REGEX_404, linksFound, extraWarcLinks, links_lock
3554
+ try:
3555
+ if stopProgram is None:
3556
+
3557
+ # The WARC files are found by replacing /archive with /chimurai4 and using the .warc file extension
3558
+ warcUrl = domUrl.replace("/archive", "/chimurai4") + ".warc"
3559
+
3560
+ # Get memory usage every 100 responses
3561
+ if (successCount + failureCount) % 100 == 0:
3562
+ try:
3563
+ getMemory()
3564
+ except Exception:
3565
+ pass
3566
+
3567
+ # Fetch content
3568
+ try:
3569
+ # Show progress bar
3570
+ fillTest = (successCount + failureCount) % 2
3571
+ fillChar = "o"
3572
+ if fillTest == 0:
3573
+ fillChar = "O"
3574
+ suffix = "Complete "
3575
+
3576
+ printProgressBar(
3577
+ successCount + failureCount,
3578
+ totalResponses,
3579
+ prefix="Processing " + str(totalResponses) + " WARC files:",
3580
+ suffix=suffix,
3581
+ length=getProgressBarLength(),
3582
+ fill=fillChar,
3583
+ )
3584
+
3585
+ try:
3586
+ try:
3587
+ if verbose() and os.environ.get("USER") == "xnl":
3588
+ writerr(
3589
+ colored(
3590
+ "[ DBG ] Requesting file " + warcUrl,
3591
+ "yellow",
3592
+ attrs=["dark"],
3593
+ )
3594
+ )
3595
+ except Exception:
3596
+ pass
3597
+
3598
+ # Choose a random user agent string to use for any requests
3599
+ userAgent = random.choice(USER_AGENT)
3600
+ session = requests.Session()
3601
+ session.mount("https://", HTTP_ADAPTER)
3602
+ session.mount("http://", HTTP_ADAPTER)
3603
+
3604
+ # Retry loop for 503 or maintenance responses
3605
+ maxRetries = 3
3606
+ warcBytes = b""
3607
+ for attempt in range(maxRetries):
3608
+ resp = session.get(
3609
+ warcUrl,
3610
+ headers={"User-Agent": userAgent},
3611
+ allow_redirects=True,
3612
+ timeout=args.timeout,
3613
+ )
3614
+ warcBytes = resp.content
3615
+
3616
+ # Check if we need to retry (decode just for this check)
3617
+ try:
3618
+ warcTextCheck = warcBytes.decode("utf-8", errors="replace").lower()
3619
+ except Exception:
3620
+ warcTextCheck = ""
3621
+ if resp.status_code == 503 or "website under maintenance" in warcTextCheck:
3622
+ if attempt < maxRetries - 1:
3623
+ import time
3624
+
3625
+ time.sleep(0.5)
3626
+ continue
3627
+ break
3628
+
3629
+ # Parse the WARC file to extract multiple responses
3630
+ # WARC header lines are text, but response bodies may be binary
3631
+ # Split by line separator but keep bytes for body extraction
3632
+ lineBytes = warcBytes.split(b"\n")
3633
+ lines = [lb.decode("utf-8", errors="replace") for lb in lineBytes]
3634
+
3635
+ # State machine to track parsing
3636
+ currentTargetUri = ""
3637
+ inResponse = False
3638
+ contentType = ""
3639
+ responsesFound = (
3640
+ []
3641
+ ) # List of (targetUri, contentType, responseBytes, httpStatusCode)
3642
+
3643
+ i = 0
3644
+ skipCurrentResponse = False # Initialize before loop
3645
+ pendingResponseType = (
3646
+ False # Track if we saw WARC-Type: response and are waiting for Target-URI
3647
+ )
3648
+ responseStartIdx = -1 # Initialize before loop
3649
+ httpStatusCode = "" # Initialize before loop
3650
+ while i < len(lines) and stopProgram is None and not stopSourceGhostArchive:
3651
+ line = lines[i]
3652
+
3653
+ # When we see a new WARC record start, reset pending state
3654
+ if line.startswith("WARC/1.0"):
3655
+ # If we were in a response and collecting, save it before moving to new record
3656
+ if inResponse and responseStartIdx >= 0:
3657
+ responseBodyBytes = b"\n".join(lineBytes[responseStartIdx:i])
3658
+ responsesFound.append(
3659
+ (
3660
+ currentTargetUri,
3661
+ contentType,
3662
+ responseBodyBytes,
3663
+ httpStatusCode if "httpStatusCode" in dir() else "",
3664
+ )
3665
+ )
3666
+ inResponse = False
3667
+ responseStartIdx = -1
3668
+ contentType = ""
3669
+ httpStatusCode = ""
3670
+ pendingResponseType = False
3671
+ skipCurrentResponse = False
3672
+
3673
+ # Look for WARC-Type: response - mark that we're in a response record header
3674
+ elif line.startswith("WARC-Type: response"):
3675
+ pendingResponseType = True
3676
+ inResponse = False # Don't start capturing body yet
3677
+ responseStartIdx = -1
3678
+ contentType = ""
3679
+
3680
+ # Look for WARC-Target-URI to get the request URL
3681
+ elif line.startswith("WARC-Target-URI:"):
3682
+ currentTargetUri = line.split(":", 1)[1].strip()
3683
+ skipCurrentResponse = False
3684
+
3685
+ # Check: URL host must contain the input hostname
3686
+ if argsInputHostname:
3687
+ try:
3688
+ parsed = urlparse(currentTargetUri)
3689
+ host = parsed.netloc.lower()
3690
+ if argsInputHostname.lower() not in host:
3691
+ skipCurrentResponse = True
3692
+ except Exception:
3693
+ skipCurrentResponse = True
3694
+
3695
+ # Check: Filter by URL (FILTER_URL)
3696
+ if not skipCurrentResponse and FILTER_URL and currentTargetUri:
3697
+ filterUrls = [u.strip().lower() for u in FILTER_URL.split(",")]
3698
+ for filterUrl in filterUrls:
3699
+ if filterUrl in currentTargetUri.lower():
3700
+ skipCurrentResponse = True
3701
+ break
3702
+
3703
+ # If we were waiting for Target-URI after seeing WARC-Type: response, and it's valid, start response mode
3704
+ if pendingResponseType and not skipCurrentResponse:
3705
+ inResponse = True
3706
+ pendingResponseType = False
3707
+
3708
+ # If we're in a response section (after seeing both WARC-Type: response and valid WARC-Target-URI)
3709
+ elif inResponse:
3710
+ # Check for HTTP start and capture status code
3711
+ if line.startswith("HTTP"):
3712
+ # Extract status code (e.g., "HTTP/1.1 200 OK" -> "200")
3713
+ try:
3714
+ httpStatusCode = line.split()[1]
3715
+ except Exception:
3716
+ httpStatusCode = ""
3717
+
3718
+ # Early check: Filter by HTTP status code (FILTER_CODE)
3719
+ if FILTER_CODE and httpStatusCode:
3720
+ filterCodes = [c.strip() for c in FILTER_CODE.split(",")]
3721
+ if httpStatusCode in filterCodes:
3722
+ inResponse = False
3723
+ responseStartIdx = -1
3724
+ i += 1
3725
+ continue
3726
+
3727
+ responseStartIdx = i # Mark start of response
3728
+ elif responseStartIdx >= 0:
3729
+ # Capture Content-Type if present (case-insensitive check)
3730
+ if line.lower().startswith("content-type:"):
3731
+ try:
3732
+ contentType = (
3733
+ line.split(":", 1)[1].strip().split(";")[0].lower()
3734
+ )
3735
+ except Exception:
3736
+ pass
3737
+
3738
+ # Early check: Filter by MIME type (FILTER_MIME)
3739
+ if FILTER_MIME and contentType:
3740
+ filterMimes = [
3741
+ m.strip().lower() for m in FILTER_MIME.split(",")
3742
+ ]
3743
+ if contentType in filterMimes:
3744
+ inResponse = False
3745
+ responseStartIdx = -1
3746
+ i += 1
3747
+ continue
3748
+
3749
+ i += 1
3750
+
3751
+ if stopProgram is not None:
3752
+ return
3753
+
3754
+ # Don't forget the last response if file doesn't end with WARC/1.0
3755
+ if inResponse and responseStartIdx >= 0:
3756
+ responseBodyBytes = b"\n".join(lineBytes[responseStartIdx:])
3757
+ responsesFound.append(
3758
+ (
3759
+ currentTargetUri,
3760
+ contentType,
3761
+ responseBodyBytes,
3762
+ httpStatusCode if "httpStatusCode" in dir() else "",
3763
+ )
3764
+ )
3765
+
3766
+ # Process each response found
3767
+ for targetUri, contentType, responseBytes, httpStatusCode in responsesFound:
3768
+ if stopProgram is not None:
3769
+ break
3770
+
3771
+ if not responseBytes:
3772
+ continue
3773
+
3774
+ # Split HTTP header from body in bytes (look for \r\n\r\n or \n\n separator)
3775
+ if b"\r\n\r\n" in responseBytes:
3776
+ bodyBytes = responseBytes.split(b"\r\n\r\n", 1)[1]
3777
+ elif b"\n\n" in responseBytes:
3778
+ bodyBytes = responseBytes.split(b"\n\n", 1)[1]
3779
+ else:
3780
+ bodyBytes = responseBytes
3781
+
3782
+ # Skip empty bodies or "not found" responses
3783
+ if not bodyBytes or bodyBytes.lower().strip() == b"not found":
3784
+ continue
3785
+
3786
+ # If -f / --filter-responses-only is passed, track all URLs immediately (before filtering)
3787
+ if args.mode == "B" and args.filter_responses_only and targetUri:
3788
+ with links_lock:
3789
+ if targetUri not in linksFound and targetUri not in extraWarcLinks:
3790
+ extraWarcLinks.add(targetUri)
3791
+
3792
+ # Use isBinaryContent to detect if this is binary content
3793
+ isBinary = isBinaryContent(bodyBytes, contentType, targetUri)
3794
+
3795
+ if isBinary:
3796
+ # Binary file - save raw bytes
3797
+ archiveContent = bodyBytes
3798
+ archiveHtml = None
3799
+ else:
3800
+ # Text file - decode to string
3801
+ archiveHtml = bodyBytes.decode("utf-8", errors="replace")
3802
+ archiveContent = None
3803
+
3804
+ # Collapse multiple blank lines into one
3805
+ archiveHtml = re.sub(r"\n{3,}", "\n\n", archiveHtml)
3806
+
3807
+ # Skip if body is empty after processing
3808
+ if not archiveHtml.strip():
3809
+ continue
3810
+
3811
+ if stopProgram is not None:
3812
+ break
3813
+
3814
+ # Determine if this is HTML or JS based on content-type or URL
3815
+ isHtml = (
3816
+ contentType in ["text/html", "application/xhtml+xml"]
3817
+ or targetUri.lower().endswith(".html")
3818
+ or targetUri.lower().endswith(".htm")
3819
+ )
3820
+ isJs = contentType in [
3821
+ "text/javascript",
3822
+ "application/javascript",
3823
+ "application/x-javascript",
3824
+ ] or targetUri.lower().endswith(".js")
3825
+
3826
+ # Add the URL as a comment at the start of the response (only for text files)
3827
+ if not isBinary and args.url_filename:
3828
+ if isHtml:
3829
+ archiveHtml = (
3830
+ "<!-- Original URL: " + targetUri + " -->\n" + archiveHtml
3831
+ )
3832
+ elif isJs:
3833
+ archiveHtml = (
3834
+ "/* Original URL: " + targetUri + " */\n" + archiveHtml
3835
+ )
3836
+
3837
+ # Create file name based on url or hash value
3838
+ if args.url_filename:
3839
+ fileName = targetUri.replace("/", "-").replace(":", "")
3840
+ fileName = fileName[0:254]
3841
+ hashValue = ""
3842
+ else:
3843
+ # Hash the content to get the filename
3844
+ if isBinary:
3845
+ hashValue = filehash(archiveContent)
3846
+ else:
3847
+ hashValue = filehash(archiveHtml)
3848
+ fileName = hashValue
3849
+
3850
+ # Determine extension of file from the content-type or URL
3851
+ extension = ""
3852
+ try:
3853
+ # Get path extension from URL
3854
+ if "://" in targetUri:
3855
+ targetUrl = "https://" + targetUri.split("://")[1]
3856
+ parsed = urlparse(targetUrl.strip())
3857
+ path = parsed.path
3858
+ extension = path[path.rindex(".") + 1 :]
3859
+ if "/" in extension:
3860
+ extension = ""
3861
+ # If extension is over 6 characters, it's likely not a real extension (e.g. API endpoint ID)
3862
+ if len(extension) > 6:
3863
+ extension = ""
3864
+ except Exception:
3865
+ pass
3866
+
3867
+ # If extension is blank, determine from MIME type or content
3868
+ if extension == "":
3869
+ if isBinary:
3870
+ # Binary file extensions from MIME type
3871
+ if contentType:
3872
+ if "image/png" in contentType:
3873
+ extension = "png"
3874
+ elif (
3875
+ "image/jpeg" in contentType
3876
+ or "image/jpg" in contentType
3877
+ ):
3878
+ extension = "jpg"
3879
+ elif "image/gif" in contentType:
3880
+ extension = "gif"
3881
+ elif "image/webp" in contentType:
3882
+ extension = "webp"
3883
+ elif "application/pdf" in contentType:
3884
+ extension = "pdf"
3885
+ elif "application/zip" in contentType:
3886
+ extension = "zip"
3887
+ else:
3888
+ extension = "bin"
3889
+ else:
3890
+ extension = "bin"
3891
+ else:
3892
+ # Text file extensions
3893
+ if contentType and "javascript" in contentType.lower():
3894
+ extension = "js"
3895
+ elif contentType and "html" in contentType.lower():
3896
+ extension = "html"
3897
+ elif contentType and "json" in contentType.lower():
3898
+ extension = "json"
3899
+ elif contentType and "text" in contentType.lower():
3900
+ extension = "txt"
3901
+ elif archiveHtml and (
3902
+ archiveHtml.lower().strip().endswith("</html>")
3903
+ or archiveHtml.lower().strip().endswith("</body>")
3904
+ or archiveHtml.lower().strip().startswith("<!doctype html")
3905
+ or archiveHtml.lower().strip().startswith("<html")
3906
+ or archiveHtml.lower().strip().startswith("<head")
3907
+ ):
3908
+ extension = "html"
3909
+ else:
3910
+ extension = "unknown"
3911
+
3912
+ fileName = fileName + "." + extension
3913
+
3914
+ # Determine file path
3915
+ if args.output_responses != "":
3916
+ filePath = args.output_responses + "/" + f"{fileName}"
3917
+ else:
3918
+ filePath = (
3919
+ DEFAULT_OUTPUT_DIR
3920
+ + "/results/"
3921
+ + str(argsInput).replace("/", "-")
3922
+ + "/"
3923
+ + f"{fileName}"
3924
+ )
3925
+
3926
+ if stopProgram is not None:
3927
+ break
3928
+
3929
+ # Write the file
3930
+ try:
3931
+ if isBinary:
3932
+ # Binary file - write as bytes
3933
+ responseFile = open(filePath, "wb")
3934
+ responseFile.write(archiveContent)
3935
+ else:
3936
+ # Text file - write as UTF-8
3937
+ responseFile = open(filePath, "w", encoding="utf8")
3938
+ responseFile.write(archiveHtml)
3939
+ responseFile.close()
3940
+ with links_lock:
3941
+ fileCount = fileCount + 1
3942
+
3943
+ # Track extra URLs found in WARC files for mode B (only when -f is not passed, since we track earlier if it is)
3944
+ if args.mode == "B" and not args.filter_responses_only and targetUri:
3945
+ with links_lock:
3946
+ if (
3947
+ targetUri not in linksFound
3948
+ and targetUri not in extraWarcLinks
3949
+ ):
3950
+ extraWarcLinks.add(targetUri)
3951
+ except Exception as e:
3952
+ writerr(
3953
+ colored(
3954
+ "GhostArchive - [ ERR ] Failed to write file "
3955
+ + filePath
3956
+ + ": "
3957
+ + str(e),
3958
+ "red",
3959
+ )
3960
+ )
3961
+
3962
+ # Write the hash value and URL to the index file
3963
+ if not args.url_filename and hashValue:
3964
+ try:
3965
+ timestamp = str(datetime.now())
3966
+ indexFile.write(
3967
+ hashValue
3968
+ + ","
3969
+ + domUrl
3970
+ + "#"
3971
+ + targetUri
3972
+ + " ,"
3973
+ + timestamp
3974
+ + "\n"
3975
+ )
3976
+ indexFile.flush()
3977
+ except Exception as e:
3978
+ writerr(
3979
+ colored(
3980
+ 'GhostArchive - [ ERR ] Failed to write to waymore_index.txt for "'
3981
+ + warcUrl
3982
+ + '": '
3983
+ + str(e),
3984
+ "red",
3985
+ )
3986
+ )
3987
+
3988
+ successCount = successCount + 1
3989
+
3990
+ except WayBackException:
3991
+ failureCount = failureCount + 1
3992
+
3993
+ except Exception as e:
3994
+ failureCount = failureCount + 1
3995
+ if verbose():
3996
+ # Simplify common error messages
3997
+ if "connection broken" in str(e).lower():
3998
+ errorMsg = "Connection Broken"
3999
+ else:
4000
+ errorMsg = str(e)
4001
+ try:
4002
+ statusCode = (
4003
+ resp.status_code if "resp" in dir() and resp is not None else "ERR"
4004
+ )
4005
+ writerr(
4006
+ colored(
4007
+ "GhostArchive - [ "
4008
+ + str(statusCode)
4009
+ + ' ] Failed to get response for "'
4010
+ + warcUrl
4011
+ + '": '
4012
+ + errorMsg,
4013
+ "red",
4014
+ )
4015
+ )
4016
+ except Exception:
4017
+ writerr(
4018
+ colored(
4019
+ 'GhostArchive - [ ERR ] Failed to get response for "'
4020
+ + warcUrl
4021
+ + '": '
4022
+ + errorMsg,
4023
+ "red",
4024
+ )
4025
+ )
4026
+
4027
+ # Show memory usage if -v option chosen, and check memory every 25 responses (or if its the last)
4028
+ if (successCount + failureCount) % 25 == 1 or (
4029
+ successCount + failureCount
4030
+ ) == totalResponses:
4031
+ try:
4032
+ getMemory()
4033
+ if verbose():
4034
+ suffix = (
4035
+ "Complete (Mem Usage "
4036
+ + humanReadableSize(currentMemUsage)
4037
+ + ", Total Mem "
4038
+ + str(currentMemPercent)
4039
+ + "%) "
4040
+ )
4041
+ except Exception:
4042
+ if verbose():
4043
+ suffix = 'Complete (To show mem use, run "pip install psutil")'
4044
+ printProgressBar(
4045
+ successCount + failureCount,
4046
+ totalResponses,
4047
+ prefix="Processing " + str(totalResponses) + " WARC files:",
4048
+ suffix=suffix,
4049
+ length=getProgressBarLength(),
4050
+ fill=fillChar,
4051
+ )
4052
+
4053
+ except Exception as e:
4054
+ if verbose():
4055
+ writerr(
4056
+ colored(
4057
+ 'GhostArchive - [ ERR ] Error for "' + domUrl + '": ' + str(e), "red"
4058
+ )
4059
+ )
4060
+
4061
+ except Exception as e:
4062
+ writerr(colored("ERROR getGhostArchiveWARC 1: " + str(e), "red"))
4063
+
4064
+
3531
4065
  def format_date_for_urlscan(date_str):
3532
4066
  # Handle different lengths of input
3533
4067
  if len(date_str) == 4: # YYYY
@@ -4198,7 +4732,6 @@ def processWayBackPage(url):
4198
4732
  pass
4199
4733
  return
4200
4734
  else:
4201
- print("DEBUG: HERE END!") # DEBUG
4202
4735
  pass
4203
4736
  except Exception as e:
4204
4737
  if verbose():
@@ -5456,13 +5989,310 @@ def getIntelxUrls():
5456
5989
  writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
5457
5990
 
5458
5991
 
5459
- def processResponses():
5992
def processGhostArchiveUrl(url, ghostArchiveID=""):
    """
    Process a specific URL from ghostarchive.org to determine whether to save the link

    Applies the user's filters (-n/--no-subs, FILTER_URL exclusions, -ko keywords) and,
    if the link survives them and its hostname matches the target, records it in
    linksFoundGhostArchive (modes U/B) and/or queues its DOM page link in
    ghostArchiveRequestLinks for response download (modes R/B).

    Args:
        url (str): The archived URL as shown on a ghostarchive.org results page.
        ghostArchiveID (str): Archive path for the URL (e.g. "/archive/gkOOR"); when
            non-empty it is appended to GHOSTARCHIVE_DOM_URL to build the response link.
    """
    global argsInput, argsInputHostname, links_lock, linkCountGhostArchive, linksFoundGhostArchive

    addLink = True

    try:
        # Strip Wayback Machine prefix if present (e.g., https://web.archive.org/web/20230101120000_/https://example.com)
        waybackMatch = re.match(r"^https?://web\.archive\.org/[^/]+/[a-zA-Z0-9]+_/", url)
        if waybackMatch:
            url = url[waybackMatch.end() :]

        # If the input has a / in it, then a URL was passed, so the link will only be added if the URL matches
        # FIX: this previously tested '"/" in url', which is true for every absolute URL,
        # so EVERY link was forced through a case-sensitive substring match against the
        # input (wrongly dropping e.g. mixed-case hosts). The comment's intent - only
        # restrict when the INPUT itself contains a path - is what is implemented now.
        if "/" in argsInput:
            if argsInput not in url:
                addLink = False

        # If filters are required then test them
        if addLink and not args.filter_responses_only:

            # If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
            if args.no_subs:
                match = re.search(
                    r"^[A-za-z]*\:\/\/(www\.)?" + re.escape(argsInputHostname),
                    url,
                    flags=re.IGNORECASE,
                )
                if match is None:
                    addLink = False

            # If the user didn't requested -f / --filter-responses-only then check http code
            if addLink and not args.filter_responses_only:

                # Check the URL exclusions
                if addLink:
                    match = re.search(
                        r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
                        url,
                        flags=re.IGNORECASE,
                    )
                    if match is not None:
                        addLink = False

                # Set keywords filter if -ko argument passed
                if addLink and args.keywords_only:
                    if args.keywords_only == "#CONFIG":
                        match = re.search(
                            r"(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ")",
                            url,
                            flags=re.IGNORECASE,
                        )
                    else:
                        # NOTE: the user-supplied -ko value is deliberately treated as a regex
                        match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
                    if match is None:
                        addLink = False

        # Add link if it passed filters
        if addLink:
            # Just get the hostname of the url
            tldExtract = tldextract.extract(url)
            subDomain = tldExtract.subdomain
            if subDomain != "":
                subDomain = subDomain + "."
            domainOnly = subDomain + tldExtract.domain + "." + tldExtract.suffix

            # GhostArchive might return URLs that aren't for the domain passed so we need to check for those and not process them
            # Check the URL: the hostname must be the target, or a subdomain of it
            match = re.search(
                r"(^|\.)" + re.escape(argsInputHostname) + "$",
                domainOnly,
                flags=re.IGNORECASE,
            )
            if match is not None:
                if args.mode in ("U", "B"):
                    linksFoundAdd(url, linksFoundGhostArchive)
                # If Response mode is requested then add the DOM ID to try later, for the number of responses wanted
                if ghostArchiveID != "" and args.mode in ("R", "B"):
                    if args.limit == 0 or len(ghostArchiveRequestLinks) < args.limit:
                        with links_lock:
                            ghostArchiveRequestLinks.add(
                                (url, GHOSTARCHIVE_DOM_URL + ghostArchiveID)
                            )

    except Exception as e:
        writerr(colored("ERROR processGhostArchiveUrl 1: " + str(e), "red"))
6079
+
6080
+
6081
def getGhostArchiveUrls():
    """
    Get URLs from GhostArchive (ghostarchive.org)
    This source doesn't have an API, so we crawl the HTML pages directly.

    Paginates through the HTML search results (baseUrl + pageNum), feeding every
    '<a href="/archive/...">URL</a>' result to processGhostArchiveUrl(). The loop
    stops on: rate limiting (429), a maintenance/no-results page, a non-200 status,
    a page with no matches, or a page with no "Next Page" link. In --check-only
    mode only the first page is fetched to estimate the number of requests.
    """
    global linksFound, path, subs, stopProgram, stopSourceGhostArchive, argsInput, checkGhostArchive, argsInputHostname, linkCountGhostArchive, linksFoundGhostArchive

    try:
        stopSourceGhostArchive = False
        linksFoundGhostArchive = set()

        # Build the base URL
        # If there is only one . in the hostname, we can guarantee that a subdomain wasn't passed, so we can prefix with . to the links quicker as it won't include other domains that end with the target domain,
        # Else, we need to get all and then confirm the actual host of the links later
        if argsInputHostname.count(".") == 1:
            baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", "." + quote(argsInput))
        else:
            baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", quote(argsInput))

        if verbose():
            write(
                colored("GhostArchive - [ INFO ] The URL requested to get links: ", "magenta")
                + colored(baseUrl + "0\n", "white")
            )

        if not args.check_only and args.mode == "U":
            write(
                colored(
                    "GhostArchive - [ INFO ] Getting links from ghostarchive.org (this can take a while for some domains)...",
                    "cyan",
                )
            )

        # Set up session with cookie ("theme" cookie selects the original, parseable layout)
        session = requests.Session()
        if HTTP_ADAPTER is not None:
            session.mount("https://", HTTP_ADAPTER)
            session.mount("http://", HTTP_ADAPTER)

        userAgent = random.choice(USER_AGENT)
        headers = {"User-Agent": userAgent}
        cookies = {"theme": "original"}

        # Zero-based results page counter; appended directly to baseUrl
        pageNum = 0

        while stopProgram is None and not stopSourceGhostArchive:
            getMemory()

            url = baseUrl + str(pageNum)

            try:
                resp = session.get(url, headers=headers, cookies=cookies, timeout=DEFAULT_TIMEOUT)
            except Exception as e:
                writerr(
                    colored(
                        "GhostArchive - [ ERR ] Unable to get page " + str(pageNum) + ": " + str(e),
                        "red",
                    )
                )
                break

            # Rate limited - give up rather than hammer the site
            if resp.status_code == 429:
                writerr(
                    colored(
                        "GhostArchive - [ 429 ] Rate limit reached at page " + str(pageNum) + ".",
                        "red",
                    )
                )
                break

            # Check for maintenance/end of results indicator
            # NOTE(review): in --check-only mode, checkGhostArchive = 1 below is only set
            # when verbose() is on - confirm whether it should be set unconditionally.
            if (
                resp.status_code == 503
                or "The site is under maintenance and will be back soon" in resp.text
                or "No archives for that site" in resp.text
            ):
                if verbose():
                    if pageNum == 0:
                        if args.check_only:
                            checkGhostArchive = 1
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
                                )
                                + colored("1 request", "white")
                            )
                        else:
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] No results found",
                                    "cyan",
                                )
                            )
                    else:
                        write(
                            colored(
                                "GhostArchive - [ INFO ] Retrieved all results from "
                                + str(pageNum)
                                + " pages",
                                "cyan",
                            )
                        )
                break
            if resp.status_code != 200:
                writerr(
                    colored(
                        "GhostArchive - [ ERR ] [ "
                        + str(resp.status_code)
                        + " ] at page "
                        + str(pageNum),
                        "red",
                    )
                )
                break

            # Check only mode - just count pages; always breaks after inspecting page 0
            if args.check_only:
                # For check only, we check if there are results and try to get total count
                if pageNum == 0:
                    # Check if there are any results on the first page
                    if '<a href="/archive/' in resp.text:
                        # Try to find "out of X" to determine total results/pages
                        outOfMatch = re.search(r"out of (\d+)", resp.text)
                        if outOfMatch:
                            totalResults = int(outOfMatch.group(1))
                            checkGhostArchive = totalResults
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
                                )
                                + colored(f"{totalResults} requests (pagination required)", "white")
                            )
                        else:
                            checkGhostArchive = 1
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
                                )
                                + colored("unknown requests (pagination required)", "white")
                            )
                    else:
                        checkGhostArchive = 1
                        write(
                            colored("GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan")
                            + colored("1 request (no results)", "white")
                        )
                break

            # Use regex to extract URLs from anchor tag text content
            # Pattern matches: <a href="/archive/ID">URL_HERE</a> - captures both href path and URL
            pattern = r'<a href="(/archive/[^"]*)">([^<]+)</a>'
            matches = re.findall(pattern, resp.text)

            # If no matches found, we've reached the end of results
            if not matches:
                if verbose():
                    write(
                        colored(
                            "GhostArchive - [ INFO ] Retrieved all results from "
                            + str(pageNum + 1)
                            + " pages",
                            "cyan",
                        )
                    )
                break

            # Filter/record each result; processGhostArchiveUrl applies all user filters
            for match in matches:
                ghostArchiveId = match[0]  # e.g., "/archive/gkOOR"
                potentialUrl = match[1].strip()
                processGhostArchiveUrl(potentialUrl, ghostArchiveId)

            # Check if there's a "Next Page" link - if not, we've reached the last page
            # GhostArchive resets to Page 1 when exceeding actual pages, so checking for Next Page is essential
            if "Next Page" not in resp.text and ">»</a>" not in resp.text:
                if verbose():
                    write(
                        colored(
                            "GhostArchive - [ INFO ] Retrieved all results from "
                            + str(pageNum + 1)
                            + " pages",
                            "cyan",
                        )
                    )
                break

            pageNum += 1

        if not args.check_only:
            # Count links based on mode - in R mode, count response links; in U/B mode, count URL links
            if args.mode == "R":
                linkCountGhostArchive = len(ghostArchiveRequestLinks)
            else:
                linkCountGhostArchive = len(linksFoundGhostArchive)
            write(
                colored("GhostArchive - [ INFO ] Links found on ghostarchive.org: ", "cyan")
                + colored(str(linkCountGhostArchive), "white")
            )
            # Merge this source's links into the global set, then free the per-source set
            linksFound.update(linksFoundGhostArchive)
            linksFoundGhostArchive.clear()

    except Exception as e:
        writerr(colored("ERROR getGhostArchiveUrls 1: " + str(e), "red"))
6283
+
6284
+
6285
+ def processResponses():
5460
6286
  """
5461
6287
  Get archived responses from al sources
5462
6288
  """
5463
6289
  global stopProgram, totalFileCount
5464
6290
  try:
5465
6291
 
6292
+ # Get responses from GhostArchive unless excluded
6293
+ if stopProgram is None and not args.xga:
6294
+ processResponsesGhostArchive()
6295
+
5466
6296
  # Get responses from URLScan unless excluded
5467
6297
  if stopProgram is None and not args.xus:
5468
6298
  processResponsesURLScan()
@@ -5484,6 +6314,235 @@ def processResponses():
5484
6314
  writerr(colored(getSPACER("ERROR processResponses 1: " + str(e)), "red"))
5485
6315
 
5486
6316
 
6317
def processResponsesGhostArchive():
    """
    Get archived responses from GhostArchive (ghostarchive.org)

    In R mode this first gathers the DOM links itself (getGhostArchiveUrls); in B
    mode they were already collected. The (url, domUrl) pairs are pickled to a
    responses.GhostArchive.tmp file, downloaded in parallel via getGhostArchiveWARC,
    and in mode B any extra URLs discovered inside the WARC files (extraWarcLinks)
    are appended to the URL output file.
    """
    global subs, path, indexFile, totalResponses, stopProgram, argsInput, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, ghostArchiveRequestLinks, failureCount, totalFileCount, checkGhostArchive
    try:
        fileCount = 0
        failureCount = 0
        if not args.check_only:
            # Create 'results' and domain directory if needed
            createDirs()

        # Get the path of the files, depending on whether -oR / --output_responses was passed
        try:
            responsesPath = responseOutputDirectory + "responses.GhostArchive.tmp"
            indexPath = responseOutputDirectory + "waymore_index.txt"
        except Exception as e:
            if verbose():
                writerr(colored("ERROR processResponsesGhostArchive 4: " + str(e), "red"))

        # Get URLs from GhostArchive if the DOM ID's haven't been retrieved yet
        # (in B mode the URL-gathering phase has already populated ghostArchiveRequestLinks)
        if stopProgram is None and not args.check_only:
            if args.mode in ("R", "B"):
                write(
                    colored(
                        "GhostArchive - [ INFO ] Getting list of response links (this can take a while for some domains)...",
                        "cyan",
                    )
                )
            if args.mode == "R":
                getGhostArchiveUrls()

        # Check if a responses.GhostArchive.tmp files exists
        # NOTE(review): the linkRequests loaded from the pickle here is immediately
        # overwritten below, so resuming from the tmp file never takes effect - only
        # the successCount reset survives this branch. Confirm whether resume was intended.
        if not args.check_only and os.path.exists(responsesPath):

            # Load the links into the set
            with open(responsesPath, "rb") as fl:
                linkRequests = pickle.load(fl)

            # Set start point
            successCount = 0

        # Get the GhostArchive DOM links gathered by processGhostArchiveUrl
        linkRequests = []
        for originalUrl, domUrl in ghostArchiveRequestLinks:
            linkRequests.append((originalUrl, domUrl))

        # Write the links to a temp file
        if not args.check_only:
            with open(responsesPath, "wb") as f:
                pickle.dump(linkRequests, f)

        # Get the total number of responses we will try to get and set the current file count to the success count
        totalResponses = len(linkRequests)
        checkGhostArchive = checkGhostArchive + totalResponses

        # If there are no responses to download, display an error and exit
        if args.mode != "R" and totalResponses == 0:
            writerr(
                colored(
                    getSPACER(
                        "Failed to get responses from GhostArchive (ghostarchive.org) - check input and try again."
                    ),
                    "red",
                )
            )
            return

        fileCount = successCount

        if args.check_only:
            writerr(
                colored("Downloading archived responses: ", "cyan")
                + colored("UNKNOWN requests", "cyan")
            )
            writerr(
                colored(
                    "\n-> Downloading the responses can vary depending on the target and the rate limiting on GhostArchive",
                    "green",
                )
            )
            write("")
        else:
            # If the limit has been set over the default, give a warning that this could take a long time!
            if totalResponses - successCount > DEFAULT_LIMIT:
                if successCount > 0:
                    writerr(
                        colored(
                            getSPACER(
                                "WARNING: Downloading remaining "
                                + str(totalResponses - successCount)
                                + " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
                            ),
                            "yellow",
                        )
                    )
                else:
                    writerr(
                        colored(
                            getSPACER(
                                "WARNING: Downloading "
                                + str(totalResponses)
                                + " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
                            ),
                            "yellow",
                        )
                    )

            # Open the index file if hash value is going to be used (not URL)
            if not args.url_filename:
                indexFile = open(indexPath, "a")

            # Process the URLs from GhostArchive
            if stopProgram is None:
                p = mp.Pool(
                    args.processes * 2
                )  # Double the number of processes to speed up the download
                p.starmap(getGhostArchiveWARC, linkRequests[successCount:])
                p.close()
                p.join()

            # Delete the tmp files now it has run successfully
            if stopProgram is None:
                try:
                    os.remove(responsesPath)
                except Exception:
                    pass

            # Close the index file if hash value is going to be used (not URL)
            if not args.url_filename:
                indexFile.close()

        if not args.check_only:
            try:
                if failureCount > 0:
                    if verbose():
                        write(
                            colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
                            + colored(responseOutputDirectory, "white")
                            + colored(" for " + subs + argsInput + ": ", "cyan")
                            + colored(
                                str(fileCount) + " 🤘",
                                "white",
                            )
                            + colored(" (" + str(failureCount) + " not found)\n", "red")
                        )
                    else:
                        write(
                            colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
                            + colored(responseOutputDirectory, "white")
                            + colored(" for " + subs + argsInput + ": ", "cyan")
                            + colored(str(fileCount) + " 🤘", "white")
                            + colored(" (" + str(failureCount) + " not found)\n", "red")
                        )
                else:
                    if verbose():
                        write(
                            colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
                            + colored(responseOutputDirectory, "white")
                            + colored(" for " + subs + argsInput + ": ", "cyan")
                            + colored(str(fileCount) + " 🤘\n", "white")
                        )
                    else:
                        write(
                            colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
                            + colored(responseOutputDirectory, "white")
                            + colored(" for " + subs + argsInput + ": ", "cyan")
                            + colored(str(fileCount) + " 🤘\n", "white")
                        )
            except Exception as e:
                if verbose():
                    writerr(colored("ERROR processResponsesGhostArchive 5: " + str(e), "red"))

            # Append extra links from WARC files to URL output file (for mode B)
            try:
                if args.mode == "B" and len(extraWarcLinks) > 0:
                    # Determine URL output file path (same logic as processURLOutput)
                    if args.output_urls == "":
                        if args.output_responses != "":
                            urlFilePath = args.output_responses + "/waymore.txt"
                        else:
                            urlFilePath = (
                                str(DEFAULT_OUTPUT_DIR)
                                + "/results/"
                                + str(argsInput).replace("/", "-")
                                + "/waymore.txt"
                            )
                    else:
                        urlFilePath = args.output_urls

                    # Load existing URLs from file to avoid duplicates
                    existingUrls = set()
                    try:
                        with open(urlFilePath) as f:
                            for line in f:
                                existingUrls.add(line.strip())
                    except Exception:
                        pass

                    # Append only new unique URLs
                    newLinks = [
                        url
                        for url in extraWarcLinks
                        if url not in existingUrls and url not in linksFound
                    ]
                    if len(newLinks) > 0:
                        with open(urlFilePath, "a") as f:
                            for url in newLinks:
                                f.write(url + "\n")

                        # Display message about extra links
                        write(
                            colored("GhostArchive - [ INFO ] ", "cyan")
                            + colored(str(len(newLinks)), "white")
                            + colored(" extra links found in WARC files added to file ", "cyan")
                            + colored(urlFilePath, "white")
                            + "\n"
                        )
            except Exception as e:
                if verbose():
                    writerr(colored("ERROR processResponsesGhostArchive 6: " + str(e), "red"))

        totalFileCount = totalFileCount + fileCount
    except Exception as e:
        writerr(colored(getSPACER("ERROR processResponsesGhostArchive 1: " + str(e)), "red"))
    finally:
        # Drop the (possibly large) local list so its memory can be reclaimed
        linkRequests = None
6544
+
6545
+
5487
6546
  def processResponsesURLScan():
5488
6547
  """
5489
6548
  Get archived responses from URLScan (urlscan.io)
@@ -6699,6 +7758,12 @@ async def fetch_intelx_async():
6699
7758
  await loop.run_in_executor(None, getIntelxUrls)
6700
7759
 
6701
7760
 
7761
+ async def fetch_ghostarchive_async():
7762
+ """Async wrapper for getGhostArchiveUrls - runs in thread pool"""
7763
+ loop = asyncio.get_event_loop()
7764
+ await loop.run_in_executor(None, getGhostArchiveUrls)
7765
+
7766
+
6702
7767
  async def fetch_all_sources_async():
6703
7768
  """
6704
7769
  Orchestrator function to fetch from all enabled sources concurrently.
@@ -6721,6 +7786,8 @@ async def fetch_all_sources_async():
6721
7786
  tasks.append(("VirusTotal", fetch_virustotal_async()))
6722
7787
  if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
6723
7788
  tasks.append(("Intelligence X", fetch_intelx_async()))
7789
+ if not args.xga and stopProgram is None:
7790
+ tasks.append(("GhostArchive", fetch_ghostarchive_async()))
6724
7791
 
6725
7792
  if not tasks:
6726
7793
  return
@@ -6746,7 +7813,7 @@ async def fetch_all_sources_async():
6746
7813
 
6747
7814
  # Run waymore
6748
7815
  def main():
6749
- global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx
7816
+ global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, extraWarcLinks
6750
7817
 
6751
7818
  # Tell Python to run the handler() function when SIGINT is received
6752
7819
  signal(SIGINT, handler)
@@ -6902,13 +7969,19 @@ def main():
6902
7969
  help="Exclude checks for links from intelx.io",
6903
7970
  default=False,
6904
7971
  )
7972
+ parser.add_argument(
7973
+ "-xga",
7974
+ action="store_true",
7975
+ help="Exclude checks for links from ghostarchive.org",
7976
+ default=False,
7977
+ )
6905
7978
  parser.add_argument(
6906
7979
  "--providers",
6907
7980
  action="store",
6908
- help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal and intelx. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.",
7981
+ help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal,intelx and ghostarchive. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.",
6909
7982
  default=[],
6910
7983
  type=validateArgProviders,
6911
- metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx}",
7984
+ metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive}",
6912
7985
  )
6913
7986
  parser.add_argument(
6914
7987
  "-lcc",
@@ -7075,6 +8148,10 @@ def main():
7075
8148
  args.xix = True
7076
8149
  else:
7077
8150
  args.xix = False
8151
+ if "ghostarchive" not in args.providers:
8152
+ args.xga = True
8153
+ else:
8154
+ args.xga = False
7078
8155
 
7079
8156
  # If no input was given, raise an error
7080
8157
  if sys.stdin.isatty():
@@ -7145,6 +8222,7 @@ def main():
7145
8222
  # Reset global variables
7146
8223
  linksFound = set()
7147
8224
  linkMimes = set()
8225
+ extraWarcLinks = set()
7148
8226
  successCount = 0
7149
8227
  failureCount = 0
7150
8228
  fileCount = 0
@@ -7159,6 +8237,7 @@ def main():
7159
8237
  stopSourceURLScan = False
7160
8238
  stopSourceVirusTotal = False
7161
8239
  stopSourceIntelx = False
8240
+ stopSourceGhostArchive = False
7162
8241
 
7163
8242
  # Get the config settings from the config.yml file
7164
8243
  getConfig()