waymore 7.7__py3-none-any.whl → 8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/waymore.py CHANGED
@@ -70,6 +70,7 @@ stopSourceAlienVault = False
  stopSourceURLScan = False
  stopSourceVirusTotal = False
  stopSourceIntelx = False
+ stopSourceGhostArchive = False
  successCount = 0
  failureCount = 0
  fileCount = 0
@@ -79,6 +80,7 @@ totalPages = 0
  indexFile = None
  continueRespFile = None
  continueRespFileURLScan = None
+ continueRespFileGhostArchive = None
  inputIsDomainANDPath = False
  inputIsSubDomain = False
  subs = "*."
@@ -102,6 +104,7 @@ checkAlienVault = 0
  checkURLScan = 0
  checkVirusTotal = 0
  checkIntelx = 0
+ checkGhostArchive = 0
  argsInputHostname = ""
  responseOutputDirectory = ""
  urlscanRequestLinks = set()
@@ -112,11 +115,14 @@ linkCountAlienVault = 0
  linkCountURLScan = 0
  linkCountVirusTotal = 0
  linkCountIntelx = 0
+ linkCountGhostArchive = 0
  linksFoundCommonCrawl = set()
  linksFoundAlienVault = set()
  linksFoundURLScan = set()
  linksFoundVirusTotal = set()
  linksFoundIntelx = set()
+ linksFoundGhostArchive = set()
+ ghostArchiveRequestLinks = set()

  # Thread lock for protecting shared state during concurrent operations
  links_lock = threading.Lock()
@@ -124,6 +130,7 @@ links_lock = threading.Lock()
  # Shared state for link collection across all sources
  linksFound = set()
  linkMimes = set()
+ extraWarcLinks = set()  # Track extra URLs found in WARC files for mode B

  # Source Provider URLs
  WAYBACK_URL = "https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest"
@@ -134,6 +141,8 @@ URLSCAN_DOM_URL = "https://urlscan.io/dom/"
  VIRUSTOTAL_URL = "https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}"
  # Paid endpoint first, free endpoint as fallback
  INTELX_BASES = ["https://2.intelx.io", "https://free.intelx.io"]
+ GHOSTARCHIVE_URL = "https://ghostarchive.org/search?term={DOMAIN}&page="
+ GHOSTARCHIVE_DOM_URL = "https://ghostarchive.org"

  intelx_tls = threading.local()

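Note: the new `GHOSTARCHIVE_URL` template has no API behind it; `getGhostArchiveUrls()` (later in this diff) substitutes the quoted target into `{DOMAIN}` and appends a zero-based page number. A minimal sketch of how the template is expanded (the target domain here is illustrative, not from the diff):

```python
from urllib.parse import quote

GHOSTARCHIVE_URL = "https://ghostarchive.org/search?term={DOMAIN}&page="

domain = "example.com"  # illustrative target
url = GHOSTARCHIVE_URL.replace("{DOMAIN}", quote(domain)) + "0"
print(url)  # https://ghostarchive.org/search?term=example.com&page=0
```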
@@ -247,10 +256,10 @@ DEFAULT_LIMIT = 5000
  DEFAULT_TIMEOUT = 30

  # Exclusions used to exclude responses we will try to get from web.archive.org
- DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx"
+ DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx,.avif"

  # MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
- DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/pdf,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff,video/x-ms-asf,audio/x-ms-wma,audio/wma,application/x-mplayer2"
+ DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff,video/x-ms-asf,audio/x-ms-wma,audio/wma,application/x-mplayer2,image/avif"

  # Response code exclusions we will use to filter links and responses from web.archive.org through their API
  DEFAULT_FILTER_CODE = "404,301,302"
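Both filter constants are plain comma-separated strings; elsewhere in this diff (`processGhostArchiveUrl`) they are turned into a single regex alternation. A small sketch of that transformation, using a shortened filter list for readability:

```python
import re

FILTER_URL = ".css,.jpg,.avif"  # shortened stand-in for DEFAULT_FILTER_URL
pattern = r"(" + re.escape(FILTER_URL).replace(",", "|") + ")"

print(bool(re.search(pattern, "https://example.com/logo.avif", flags=re.IGNORECASE)))  # True
print(bool(re.search(pattern, "https://example.com/api/users", flags=re.IGNORECASE)))  # False
```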
@@ -743,7 +752,7 @@ def handler(signal_received, frame):
      This function is called if Ctrl-C is called by the user
      An attempt will be made to try and clean up properly
      """
-     global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, current_response, current_session
+     global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, current_response, current_session

      if stopProgram is not None:
          stopProgramCount = stopProgramCount + 1
@@ -778,6 +787,7 @@ def handler(signal_received, frame):
          stopSourceURLScan = True
          stopSourceVirusTotal = True
          stopSourceIntelx = True
+         stopSourceGhostArchive = True
          # Try to close any active response or session to interrupt blocking network I/O
          try:
              if current_response is not None:
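The stop flags set here only take effect because the handler is installed for SIGINT and each source loop polls its own `stopSource*` flag. A sketch of the registration, assuming the standard `signal` wiring waymore uses for Ctrl-C:

```python
import signal

# Assumed wiring: route Ctrl-C (SIGINT) to the handler above so each
# source loop can notice its stopSource* flag and wind down cleanly
signal.signal(signal.SIGINT, handler)
```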
@@ -1324,6 +1334,46 @@ def getConfig():
          configPath = Path(waymorePath / "config.yml")
      else:
          configPath = Path(args.config)
+
+     # If the config file doesn't exist, create the default one
+     if not os.path.isfile(configPath):
+         try:
+             # Make sure the directory exists
+             os.makedirs(os.path.dirname(configPath), exist_ok=True)
+             # Create the default config content using the DEFAULT_* constants
+             defaultConfig = f"""FILTER_CODE: {DEFAULT_FILTER_CODE}
+ FILTER_MIME: {DEFAULT_FILTER_MIME}
+ FILTER_URL: {DEFAULT_FILTER_URL}
+ FILTER_KEYWORDS: {DEFAULT_FILTER_KEYWORDS}
+ URLSCAN_API_KEY:
+ VIRUSTOTAL_API_KEY:
+ CONTINUE_RESPONSES_IF_PIPED: True
+ WEBHOOK_DISCORD: YOUR_WEBHOOK
+ TELEGRAM_BOT_TOKEN: YOUR_TOKEN
+ TELEGRAM_CHAT_ID: YOUR_CHAT_ID
+ DEFAULT_OUTPUT_DIR:
+ INTELX_API_KEY:
+ SOURCE_IP:
+ """
+             with open(configPath, "w", encoding="utf-8") as f:
+                 f.write(defaultConfig)
+             writerr(
+                 colored(
+                     'Config file not found - created default config at "'
+                     + str(configPath)
+                     + '"',
+                     "yellow",
+                 )
+             )
+         except Exception as e:
+             writerr(
+                 colored(
+                     "Config file not found, but failed to create default config file: "
+                     + str(e),
+                     "red",
+                 )
+             )
+
      config = yaml.safe_load(open(configPath))
      try:
          FILTER_URL = config.get("FILTER_URL")
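Since the default config is written as YAML built from the `DEFAULT_*` constants, it round-trips straight back through the `yaml.safe_load` call on the next line of `getConfig()`. A quick sketch of that round trip (the filter values are shortened stand-ins for the real constants):

```python
import yaml

# Shortened stand-ins for the DEFAULT_* constants used in the f-string above
defaultConfig = """FILTER_CODE: 404,301,302
FILTER_URL: .css,.jpg,.avif
URLSCAN_API_KEY:
"""
config = yaml.safe_load(defaultConfig)
print(config["FILTER_CODE"])      # 404,301,302 (a string, thanks to the commas)
print(config["URLSCAN_API_KEY"])  # None (blank YAML keys parse as null)
```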
@@ -1753,11 +1803,15 @@ def printProgressBar(

  def filehash(text):
      """
-     Generate a hash value for the passed string. This is used for the file name of a downloaded archived response
+     Generate a hash value for the passed string or bytes. This is used for the file name of a downloaded archived response
      """
      hash = 0
      for ch in text:
-         hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
+         # Handle both str (gives chars needing ord()) and bytes (gives ints directly)
+         if isinstance(ch, int):
+             hash = (hash * 281 ^ ch * 997) & 0xFFFFFFFFFFF
+         else:
+             hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
      return str(hash)

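The reason the two branches stay compatible: iterating a `bytes` object yields the same integers that `ord()` returns for the corresponding ASCII characters, so a text response hashes to the same filename whether it arrives as `str` or `bytes`. A self-contained sketch of the function as changed above:

```python
def filehash(text):
    # Same polynomial rolling hash as the diff above
    hash = 0
    for ch in text:
        if isinstance(ch, int):  # iterating bytes yields ints
            hash = (hash * 281 ^ ch * 997) & 0xFFFFFFFFFFF
        else:                    # iterating str yields 1-char strings
            hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
    return str(hash)

assert filehash("abc") == filehash(b"abc")  # ASCII str and bytes hash identically
```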
@@ -1945,7 +1999,7 @@ def processArchiveUrl(url):
      try:
          try:
              try:
-                 if os.environ.get("USER") == "xnl":
+                 if verbose() and os.environ.get("USER") == "xnl":
                      writerr(
                          colored(
                              "[ DBG ] Requesting file " + archiveUrl,
@@ -2265,7 +2319,7 @@ def processArchiveUrl(url):
                      debugText = "INTERNET ARCHIVE"
                  elif archiveHtml.lower().find("wombat") > 0:
                      debugText = "WOMBAT (JS)"
-                 if debugText != "":
+                 if verbose() and debugText != "":
                      writerr(
                          colored(
                              getSPACER(
@@ -2280,16 +2334,17 @@ def processArchiveUrl(url):
                          )
                      )
          except Exception as e:
-             writerr(
-                 colored(
-                     '[ DBG ] Error - Failed to output debug info for "'
-                     + archiveUrl
-                     + '": '
-                     + str(e),
-                     "red",
-                     attrs=["dark"],
+             if verbose():
+                 writerr(
+                     colored(
+                         '[ DBG ] Error - Failed to output debug info for "'
+                         + archiveUrl
+                         + '": '
+                         + str(e),
+                         "red",
+                         attrs=["dark"],
+                     )
                  )
-             )
              pass

          successCount = successCount + 1
@@ -2760,17 +2815,20 @@ def validateArgProviders(x):
      - urlscan
      - virustotal
      - intelx
+     - ghostarchive
      """
      invalid = False
      x = x.lower()
      providers = x.split(",")
      for provider in providers:
-         if not re.fullmatch(r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx)", provider):
+         if not re.fullmatch(
+             r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx|ghostarchive)", provider
+         ):
              invalid = True
              break
      if invalid:
          raise argparse.ArgumentTypeError(
-             "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx"
+             "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive"
          )
      return x

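As a usage note, this validator is designed to be plugged into argparse as a `type=` callback, so an invalid provider list is rejected at parse time. A minimal sketch under that assumption (the `-p/--providers` flag name is illustrative here, not confirmed by this diff):

```python
import argparse

parser = argparse.ArgumentParser()
# Assumed wiring: the validator runs on the raw string and raises
# argparse.ArgumentTypeError if any comma-separated provider is unknown
parser.add_argument("-p", "--providers", type=validateArgProviders,
                    default="wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive")

args = parser.parse_args(["-p", "wayback,ghostarchive"])  # accepted
# parser.parse_args(["-p", "ghost"]) would exit with the "Pass providers separated by a comma..." error
```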
@@ -3528,181 +3586,697 @@ def getURLScanDOM(originalUrl, domUrl):
          writerr(colored("ERROR getURLScanDOM 1: " + str(e), "red"))


- def format_date_for_urlscan(date_str):
-     # Handle different lengths of input
-     if len(date_str) == 4:  # YYYY
-         date_str += "0101"
-     elif len(date_str) == 6:  # YYYYMM
-         date_str += "01"
-
-     # Convert to YYYY-MM-DD format
-     try:
-         formatted_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
-         return formatted_date
-     except Exception:
-         return ""
-
-
- def getURLScanUrls():
+ def getGhostArchiveWARC(originalUrl, domUrl):
      """
-     Get URLs from the URLScan API, urlscan.io
+     Get the DOM for the passed GhostArchive link - parses WARC files containing multiple request/response pairs
      """
-     global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
-
-     # Write the file of URLs for the passed domain/URL
+     global stopProgram, successCount, failureCount, fileCount, DEFAULT_OUTPUT_DIR, totalResponses, indexFile, argsInput, argsInputHostname, REGEX_404, linksFound, extraWarcLinks, links_lock
      try:
-         requestsMade = 0
-         stopSourceURLScan = False
-         linksFoundURLScan = set()
-         totalUrls = 0
-         checkResponse = True
-
-         # Set the URL to just the hostname
-         url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
+         if stopProgram is None:

-         # If the --from-date or --to-date parameters were passed then also add a date filter
-         if args.from_date or args.to_date:
-             if args.from_date:
-                 fromDate = format_date_for_urlscan(str(args.from_date)[:8])
-             else:
-                 fromDate = "2016-01-01"  # The year URLScan started
-             if args.to_date:
-                 toDate = format_date_for_urlscan(str(args.to_date)[:8])
-             else:
-                 toDate = "now"
-             url = url.replace("{DATERANGE}", f"%20date:[{fromDate}%20TO%20{toDate}]")
-         else:
-             url = url.replace("{DATERANGE}", "")
+             # The WARC files are found by replacing /archive with /chimurai4 and using the .warc file extension
+             warcUrl = domUrl.replace("/archive", "/chimurai4") + ".warc"

-         if verbose():
-             if args.mode == "R":
-                 write(
-                     colored(
-                         "URLScan - [ INFO ] The URLScan URL requested to get links for responses: ",
-                         "magenta",
-                     )
-                     + colored(url + "\n", "white")
-                 )
-             else:
-                 write(
-                     colored(
-                         "URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
-                     )
-                     + colored(url + "\n", "white")
-                 )
+             # Get memory usage every 100 responses
+             if (successCount + failureCount) % 100 == 0:
+                 try:
+                     getMemory()
+                 except Exception:
+                     pass

-         if args.mode in ("U", "B") and not args.check_only:
-             write(
-                 colored(
-                     "URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
-                     "cyan",
-                 )
-             )
+             # Fetch content
+             try:
+                 # Show progress bar
+                 fillTest = (successCount + failureCount) % 2
+                 fillChar = "o"
+                 if fillTest == 0:
+                     fillChar = "O"
+                 suffix = "Complete "

-         # Get the first page from urlscan.io
-         try:
-             # Choose a random user agent string to use for any requests
-             # For other sources we would use `random.choice(USER_AGENT)` to assign a random user-agent, but it seems
-             # that there are a handful of those that ALWAYS return 429. Passing a specific one all the time seems to
-             # be successful all the time
-             userAgent = "waymore v" + __version__ + " by xnl-h4ck3r"
-             session = requests.Session()
-             session.mount("https://", HTTP_ADAPTER)
-             session.mount("http://", HTTP_ADAPTER)
-             # Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
-             resp = session.get(url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY})
-             requestsMade = requestsMade + 1
-         except Exception as e:
-             write(
-                 colored(
-                     "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
-                     "red",
+                 printProgressBar(
+                     successCount + failureCount,
+                     totalResponses,
+                     prefix="Processing " + str(totalResponses) + " WARC files:",
+                     suffix=suffix,
+                     length=getProgressBarLength(),
+                     fill=fillChar,
                  )
-             )
-             return

-         # If the rate limit was reached then determine if to wait and then try again
-         if resp.status_code == 429:
-             # Get the number of seconds the rate limit resets
-             match = re.search(r"Reset in (\d+) seconds", resp.text, flags=re.IGNORECASE)
-             if match is not None:
-                 seconds = int(match.group(1))
-                 if seconds <= args.urlscan_rate_limit_retry * 60:
-                     writerr(
-                         colored(
-                             "URLScan - [ 429 ] Rate limit reached, so waiting for another "
-                             + str(seconds)
-                             + " seconds before continuing...",
-                             "yellow",
-                         )
-                     )
-                     # Wait can be interrupted by SIGINT via interrupt_event
-                     interrupt_event.clear()
-                     if interrupt_event.wait(seconds + 1):
-                         # Interrupted by SIGINT
-                         return
+                 try:
                      try:
-                         resp = session.get(
-                             url,
-                             headers={
-                                 "User-Agent": userAgent,
-                                 "API-Key": URLSCAN_API_KEY,
-                             },
-                         )
-                         requestsMade = requestsMade + 1
-                     except Exception as e:
-                         write(
-                             colored(
-                                 "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
-                                 "red",
+                         if verbose() and os.environ.get("USER") == "xnl":
+                             writerr(
+                                 colored(
+                                     "[ DBG ] Requesting file " + warcUrl,
+                                     "yellow",
+                                     attrs=["dark"],
+                                 )
                              )
+                     except Exception:
+                         pass
+
+                     # Choose a random user agent string to use for any requests
+                     userAgent = random.choice(USER_AGENT)
+                     session = requests.Session()
+                     session.mount("https://", HTTP_ADAPTER)
+                     session.mount("http://", HTTP_ADAPTER)
+
+                     # Retry loop for 503 or maintenance responses
+                     maxRetries = 3
+                     warcBytes = b""
+                     for attempt in range(maxRetries):
+                         resp = session.get(
+                             warcUrl,
+                             headers={"User-Agent": userAgent},
+                             allow_redirects=True,
+                             timeout=args.timeout,
                          )
+                         warcBytes = resp.content
+
+                         # Check if we need to retry (decode just for this check)
+                         try:
+                             warcTextCheck = warcBytes.decode("utf-8", errors="replace").lower()
+                         except Exception:
+                             warcTextCheck = ""
+                         if resp.status_code == 503 or "website under maintenance" in warcTextCheck:
+                             if attempt < maxRetries - 1:
+                                 import time
+
+                                 time.sleep(0.5)
+                                 continue
+                         break
+
+                     # Parse the WARC file to extract multiple responses
+                     # WARC header lines are text, but response bodies may be binary
+                     # Split by line separator but keep bytes for body extraction
+                     lineBytes = warcBytes.split(b"\n")
+                     lines = [lb.decode("utf-8", errors="replace") for lb in lineBytes]
+
+                     # State machine to track parsing
+                     currentTargetUri = ""
+                     inResponse = False
+                     contentType = ""
+                     responsesFound = (
+                         []
+                     )  # List of (targetUri, contentType, responseBytes, httpStatusCode)
+
+                     i = 0
+                     skipCurrentResponse = False  # Initialize before loop
+                     pendingResponseType = (
+                         False  # Track if we saw WARC-Type: response and are waiting for Target-URI
+                     )
+                     responseStartIdx = -1  # Initialize before loop
+                     httpStatusCode = ""  # Initialize before loop
+                     while i < len(lines) and stopProgram is None and not stopSourceGhostArchive:
+                         line = lines[i]
+
+                         # When we see a new WARC record start, reset pending state
+                         if line.startswith("WARC/1.0"):
+                             # If we were in a response and collecting, save it before moving to new record
+                             if inResponse and responseStartIdx >= 0:
+                                 responseBodyBytes = b"\n".join(lineBytes[responseStartIdx:i])
+                                 responsesFound.append(
+                                     (
+                                         currentTargetUri,
+                                         contentType,
+                                         responseBodyBytes,
+                                         httpStatusCode if "httpStatusCode" in dir() else "",
+                                     )
+                                 )
+                             inResponse = False
+                             responseStartIdx = -1
+                             contentType = ""
+                             httpStatusCode = ""
+                             pendingResponseType = False
+                             skipCurrentResponse = False
+
+                         # Look for WARC-Type: response - mark that we're in a response record header
+                         elif line.startswith("WARC-Type: response"):
+                             pendingResponseType = True
+                             inResponse = False  # Don't start capturing body yet
+                             responseStartIdx = -1
+                             contentType = ""
+
+                         # Look for WARC-Target-URI to get the request URL
+                         elif line.startswith("WARC-Target-URI:"):
+                             currentTargetUri = line.split(":", 1)[1].strip()
+                             skipCurrentResponse = False
+
+                             # Check: URL host must contain the input hostname
+                             if argsInputHostname:
+                                 try:
+                                     parsed = urlparse(currentTargetUri)
+                                     host = parsed.netloc.lower()
+                                     if argsInputHostname.lower() not in host:
+                                         skipCurrentResponse = True
+                                 except Exception:
+                                     skipCurrentResponse = True
+
+                             # Check: Filter by URL (FILTER_URL)
+                             if not skipCurrentResponse and FILTER_URL and currentTargetUri:
+                                 filterUrls = [u.strip().lower() for u in FILTER_URL.split(",")]
+                                 for filterUrl in filterUrls:
+                                     if filterUrl in currentTargetUri.lower():
+                                         skipCurrentResponse = True
+                                         break
+
+                             # If we were waiting for Target-URI after seeing WARC-Type: response, and it's valid, start response mode
+                             if pendingResponseType and not skipCurrentResponse:
+                                 inResponse = True
+                                 pendingResponseType = False
+
+                         # If we're in a response section (after seeing both WARC-Type: response and valid WARC-Target-URI)
+                         elif inResponse:
+                             # Check for HTTP start and capture status code
+                             if line.startswith("HTTP"):
+                                 # Extract status code (e.g., "HTTP/1.1 200 OK" -> "200")
+                                 try:
+                                     httpStatusCode = line.split()[1]
+                                 except Exception:
+                                     httpStatusCode = ""
+
+                                 # Early check: Filter by HTTP status code (FILTER_CODE)
+                                 if FILTER_CODE and httpStatusCode:
+                                     filterCodes = [c.strip() for c in FILTER_CODE.split(",")]
+                                     if httpStatusCode in filterCodes:
+                                         inResponse = False
+                                         responseStartIdx = -1
+                                         i += 1
+                                         continue
+
+                                 responseStartIdx = i  # Mark start of response
+                             elif responseStartIdx >= 0:
+                                 # Capture Content-Type if present (case-insensitive check)
+                                 if line.lower().startswith("content-type:"):
+                                     try:
+                                         contentType = (
+                                             line.split(":", 1)[1].strip().split(";")[0].lower()
+                                         )
+                                     except Exception:
+                                         pass
+
+                                     # Early check: Filter by MIME type (FILTER_MIME)
+                                     if FILTER_MIME and contentType:
+                                         filterMimes = [
+                                             m.strip().lower() for m in FILTER_MIME.split(",")
+                                         ]
+                                         if contentType in filterMimes:
+                                             inResponse = False
+                                             responseStartIdx = -1
+                                             i += 1
+                                             continue
+
+                         i += 1
+
+                     if stopProgram is not None:
                          return

-         # If the rate limit was reached or if a 401 (which likely means the API key isn't valid), try without API key
-         if resp.status_code in (401, 429):
-             if URLSCAN_API_KEY != "":
-                 try:
-                     if resp.status_code == 429:
-                         writerr(
-                             colored(
-                                 "URLScan - [ 429 ] Rate limit reached so trying without API Key...",
-                                 "red",
-                             )
-                         )
-                     else:
-                         writerr(
-                             colored(
-                                 "URLScan - [ INF ] The API Key is invalid so trying without API Key...",
+                     # Don't forget the last response if file doesn't end with WARC/1.0
+                     if inResponse and responseStartIdx >= 0:
+                         responseBodyBytes = b"\n".join(lineBytes[responseStartIdx:])
+                         responsesFound.append(
+                             (
+                                 currentTargetUri,
+                                 contentType,
+                                 responseBodyBytes,
+                                 httpStatusCode if "httpStatusCode" in dir() else "",
                              )
                          )
-                     # Set key to blank for further requests
-                     URLSCAN_API_KEY = ""
-                     session_no_key = requests.Session()
-                     session_no_key.mount("https://", HTTP_ADAPTER)
-                     session_no_key.mount("http://", HTTP_ADAPTER)
-                     resp = session_no_key.get(url, headers={"User-Agent": userAgent})
-                 except Exception as e:
-                     writerr(
-                         colored(
-                             "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
-                             "red",
-                         )
-                     )
-                     checkResponse = False

-                 # If the rate limit was reached end now
-                 if resp.status_code == 429:
-                     writerr(
-                         colored(
-                             "URLScan - [ 429 ] Rate limit reached without API Key so unable to get links.",
+                     # Process each response found
+                     for targetUri, contentType, responseBytes, httpStatusCode in responsesFound:
+                         if stopProgram is not None:
+                             break
+
+                         if not responseBytes:
+                             continue
+
+                         # Split HTTP header from body in bytes (look for \r\n\r\n or \n\n separator)
+                         if b"\r\n\r\n" in responseBytes:
+                             bodyBytes = responseBytes.split(b"\r\n\r\n", 1)[1]
+                         elif b"\n\n" in responseBytes:
+                             bodyBytes = responseBytes.split(b"\n\n", 1)[1]
+                         else:
+                             bodyBytes = responseBytes
+
+                         # Skip empty bodies or "not found" responses
+                         if not bodyBytes or bodyBytes.lower().strip() == b"not found":
+                             continue
+
+                         # If -f / --filter-responses-only is passed, track all URLs immediately (before filtering)
+                         if args.mode == "B" and args.filter_responses_only and targetUri:
+                             with links_lock:
+                                 if targetUri not in linksFound and targetUri not in extraWarcLinks:
+                                     extraWarcLinks.add(targetUri)
+
+                         # Use isBinaryContent to detect if this is binary content
+                         isBinary = isBinaryContent(bodyBytes, contentType, targetUri)
+
+                         if isBinary:
+                             # Binary file - save raw bytes
+                             archiveContent = bodyBytes
+                             archiveHtml = None
+                         else:
+                             # Text file - decode to string
+                             archiveHtml = bodyBytes.decode("utf-8", errors="replace")
+                             archiveContent = None
+
+                             # Collapse multiple blank lines into one
+                             archiveHtml = re.sub(r"\n{3,}", "\n\n", archiveHtml)
+
+                             # Skip if body is empty after processing
+                             if not archiveHtml.strip():
+                                 continue
+
+                         if stopProgram is not None:
+                             break
+
+                         # Determine if this is HTML or JS based on content-type or URL
+                         isHtml = (
+                             contentType in ["text/html", "application/xhtml+xml"]
+                             or targetUri.lower().endswith(".html")
+                             or targetUri.lower().endswith(".htm")
                          )
-                     )
-                     checkResponse = False
-             else:
-                 writerr(
+                         isJs = contentType in [
+                             "text/javascript",
+                             "application/javascript",
+                             "application/x-javascript",
+                         ] or targetUri.lower().endswith(".js")
+
+                         # Add the URL as a comment at the start of the response (only for text files)
+                         if not isBinary and args.url_filename:
+                             if isHtml:
+                                 archiveHtml = (
+                                     "<!-- Original URL: " + targetUri + " -->\n" + archiveHtml
+                                 )
+                             elif isJs:
+                                 archiveHtml = (
+                                     "/* Original URL: " + targetUri + " */\n" + archiveHtml
+                                 )
+
+                         # Create file name based on url or hash value
+                         if args.url_filename:
+                             fileName = targetUri.replace("/", "-").replace(":", "")
+                             fileName = fileName[0:254]
+                             hashValue = ""
+                         else:
+                             # Hash the content to get the filename
+                             if isBinary:
+                                 hashValue = filehash(archiveContent)
+                             else:
+                                 hashValue = filehash(archiveHtml)
+                             fileName = hashValue
+
+                         # Determine extension of file from the content-type or URL
+                         extension = ""
+                         try:
+                             # Get path extension from URL
+                             if "://" in targetUri:
+                                 targetUrl = "https://" + targetUri.split("://")[1]
+                                 parsed = urlparse(targetUrl.strip())
+                                 path = parsed.path
+                                 extension = path[path.rindex(".") + 1 :]
+                                 if "/" in extension:
+                                     extension = ""
+                                 # If extension is over 6 characters, it's likely not a real extension (e.g. API endpoint ID)
+                                 if len(extension) > 6:
+                                     extension = ""
+                         except Exception:
+                             pass
+
+                         # If extension is blank, determine from MIME type or content
+                         if extension == "":
+                             if isBinary:
+                                 # Binary file extensions from MIME type
+                                 if contentType:
+                                     if "image/png" in contentType:
+                                         extension = "png"
+                                     elif (
+                                         "image/jpeg" in contentType
+                                         or "image/jpg" in contentType
+                                     ):
+                                         extension = "jpg"
+                                     elif "image/gif" in contentType:
+                                         extension = "gif"
+                                     elif "image/webp" in contentType:
+                                         extension = "webp"
+                                     elif "application/pdf" in contentType:
+                                         extension = "pdf"
+                                     elif "application/zip" in contentType:
+                                         extension = "zip"
+                                     else:
+                                         extension = "bin"
+                                 else:
+                                     extension = "bin"
+                             else:
+                                 # Text file extensions
+                                 if contentType and "javascript" in contentType.lower():
+                                     extension = "js"
+                                 elif contentType and "html" in contentType.lower():
+                                     extension = "html"
+                                 elif contentType and "json" in contentType.lower():
+                                     extension = "json"
+                                 elif contentType and "text" in contentType.lower():
+                                     extension = "txt"
+                                 elif archiveHtml and (
+                                     archiveHtml.lower().strip().endswith("</html>")
+                                     or archiveHtml.lower().strip().endswith("</body>")
+                                     or archiveHtml.lower().strip().startswith("<!doctype html")
+                                     or archiveHtml.lower().strip().startswith("<html")
+                                     or archiveHtml.lower().strip().startswith("<head")
+                                 ):
+                                     extension = "html"
+                                 else:
+                                     extension = "unknown"
+
+                         fileName = fileName + "." + extension
+
+                         # Determine file path
+                         if args.output_responses != "":
+                             filePath = args.output_responses + "/" + f"{fileName}"
+                         else:
+                             filePath = (
+                                 DEFAULT_OUTPUT_DIR
+                                 + "/results/"
+                                 + str(argsInput).replace("/", "-")
+                                 + "/"
+                                 + f"{fileName}"
+                             )
+
+                         if stopProgram is not None:
+                             break
+
+                         # Write the file
+                         try:
+                             if isBinary:
+                                 # Binary file - write as bytes
+                                 responseFile = open(filePath, "wb")
+                                 responseFile.write(archiveContent)
+                             else:
+                                 # Text file - write as UTF-8
+                                 responseFile = open(filePath, "w", encoding="utf8")
+                                 responseFile.write(archiveHtml)
+                             responseFile.close()
+                             with links_lock:
+                                 fileCount = fileCount + 1
+
+                             # Track extra URLs found in WARC files for mode B (only when -f is not passed, since we track earlier if it is)
+                             if args.mode == "B" and not args.filter_responses_only and targetUri:
+                                 with links_lock:
+                                     if (
+                                         targetUri not in linksFound
+                                         and targetUri not in extraWarcLinks
+                                     ):
+                                         extraWarcLinks.add(targetUri)
+                         except Exception as e:
+                             writerr(
+                                 colored(
+                                     "GhostArchive - [ ERR ] Failed to write file "
+                                     + filePath
+                                     + ": "
+                                     + str(e),
+                                     "red",
+                                 )
+                             )
+
+                         # Write the hash value and URL to the index file
+                         if not args.url_filename and hashValue:
+                             try:
+                                 timestamp = str(datetime.now())
+                                 indexFile.write(
+                                     hashValue
+                                     + ","
+                                     + domUrl
+                                     + "#"
+                                     + targetUri
+                                     + " ,"
+                                     + timestamp
+                                     + "\n"
+                                 )
+                                 indexFile.flush()
+                             except Exception as e:
+                                 writerr(
+                                     colored(
+                                         'GhostArchive - [ ERR ] Failed to write to waymore_index.txt for "'
+                                         + warcUrl
+                                         + '": '
+                                         + str(e),
+                                         "red",
+                                     )
+                                 )
+
+                     successCount = successCount + 1
+
+                 except WayBackException:
+                     failureCount = failureCount + 1
+
+                 except Exception as e:
+                     failureCount = failureCount + 1
+                     if verbose():
+                         # Simplify common error messages
+                         if "connection broken" in str(e).lower():
+                             errorMsg = "Connection Broken"
+                         else:
+                             errorMsg = str(e)
+                         try:
+                             statusCode = (
+                                 resp.status_code if "resp" in dir() and resp is not None else "ERR"
+                             )
+                             writerr(
+                                 colored(
+                                     "GhostArchive - [ "
+                                     + str(statusCode)
+                                     + ' ] Failed to get response for "'
+                                     + warcUrl
+                                     + '": '
+                                     + errorMsg,
+                                     "red",
+                                 )
+                             )
+                         except Exception:
+                             writerr(
+                                 colored(
+                                     'GhostArchive - [ ERR ] Failed to get response for "'
+                                     + warcUrl
+                                     + '": '
+                                     + errorMsg,
+                                     "red",
+                                 )
+                             )
+
+                 # Show memory usage if -v option chosen, and check memory every 25 responses (or if its the last)
+                 if (successCount + failureCount) % 25 == 1 or (
+                     successCount + failureCount
+                 ) == totalResponses:
+                     try:
+                         getMemory()
+                         if verbose():
+                             suffix = (
+                                 "Complete (Mem Usage "
+                                 + humanReadableSize(currentMemUsage)
+                                 + ", Total Mem "
+                                 + str(currentMemPercent)
+                                 + "%) "
+                             )
+                     except Exception:
+                         if verbose():
+                             suffix = 'Complete (To show mem use, run "pip install psutil")'
+                     printProgressBar(
+                         successCount + failureCount,
+                         totalResponses,
+                         prefix="Processing " + str(totalResponses) + " WARC files:",
+                         suffix=suffix,
+                         length=getProgressBarLength(),
+                         fill=fillChar,
+                     )
+
+             except Exception as e:
+                 if verbose():
+                     writerr(
+                         colored(
+                             'GhostArchive - [ ERR ] Error for "' + domUrl + '": ' + str(e), "red"
+                         )
+                     )
+
+     except Exception as e:
+         writerr(colored("ERROR getGhostArchiveWARC 1: " + str(e), "red"))
+
+
+ def format_date_for_urlscan(date_str):
+     # Handle different lengths of input
+     if len(date_str) == 4:  # YYYY
+         date_str += "0101"
+     elif len(date_str) == 6:  # YYYYMM
+         date_str += "01"
+
+     # Convert to YYYY-MM-DD format
+     try:
+         formatted_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
+         return formatted_date
+     except Exception:
+         return ""
+
+
+ def getURLScanUrls():
+     """
+     Get URLs from the URLScan API, urlscan.io
+     """
+     global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
+
+     # Write the file of URLs for the passed domain/URL
+     try:
+         requestsMade = 0
+         stopSourceURLScan = False
+         linksFoundURLScan = set()
+         totalUrls = 0
+         checkResponse = True
+
+         # Set the URL to just the hostname
+         url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
+
+         # If the --from-date or --to-date parameters were passed then also add a date filter
+         if args.from_date or args.to_date:
+             if args.from_date:
+                 fromDate = format_date_for_urlscan(str(args.from_date)[:8])
+             else:
+                 fromDate = "2016-01-01"  # The year URLScan started
+             if args.to_date:
+                 toDate = format_date_for_urlscan(str(args.to_date)[:8])
+             else:
+                 toDate = "now"
+             url = url.replace("{DATERANGE}", f"%20date:[{fromDate}%20TO%20{toDate}]")
+         else:
+             url = url.replace("{DATERANGE}", "")
+
+         if verbose():
+             if args.mode == "R":
+                 write(
+                     colored(
+                         "URLScan - [ INFO ] The URLScan URL requested to get links for responses: ",
+                         "magenta",
+                     )
+                     + colored(url + "\n", "white")
+                 )
+             else:
+                 write(
+                     colored(
+                         "URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
+                     )
+                     + colored(url + "\n", "white")
+                 )
+
+         if args.mode in ("U", "B") and not args.check_only:
+             write(
+                 colored(
+                     "URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
+                     "cyan",
+                 )
+             )
+
+         # Get the first page from urlscan.io
+         try:
+             # Choose a random user agent string to use for any requests
+             # For other sources we would use `random.choice(USER_AGENT)` to assign a random user-agent, but it seems
+             # that there are a handful of those that ALWAYS return 429. Passing a specific one all the time seems to
+             # be successful all the time
+             userAgent = "waymore v" + __version__ + " by xnl-h4ck3r"
+             session = requests.Session()
+             session.mount("https://", HTTP_ADAPTER)
+             session.mount("http://", HTTP_ADAPTER)
+             # Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
+             resp = session.get(url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY})
+             requestsMade = requestsMade + 1
+         except Exception as e:
+             write(
+                 colored(
+                     "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
+                     "red",
+                 )
+             )
+             return
+
+         # If the rate limit was reached then determine if to wait and then try again
+         if resp.status_code == 429:
+             # Get the number of seconds the rate limit resets
+             match = re.search(r"Reset in (\d+) seconds", resp.text, flags=re.IGNORECASE)
+             if match is not None:
+                 seconds = int(match.group(1))
+                 if seconds <= args.urlscan_rate_limit_retry * 60:
+                     writerr(
+                         colored(
+                             "URLScan - [ 429 ] Rate limit reached, so waiting for another "
+                             + str(seconds)
+                             + " seconds before continuing...",
+                             "yellow",
+                         )
+                     )
+                     # Wait can be interrupted by SIGINT via interrupt_event
+                     interrupt_event.clear()
+                     if interrupt_event.wait(seconds + 1):
+                         # Interrupted by SIGINT
+                         return
+                     try:
+                         resp = session.get(
+                             url,
+                             headers={
+                                 "User-Agent": userAgent,
+                                 "API-Key": URLSCAN_API_KEY,
+                             },
+                         )
+                         requestsMade = requestsMade + 1
+                     except Exception as e:
+                         write(
+                             colored(
+                                 "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
+                                 "red",
+                             )
+                         )
+                         return
+
+         # If the rate limit was reached or if a 401 (which likely means the API key isn't valid), try without API key
+         if resp.status_code in (401, 429):
+             if URLSCAN_API_KEY != "":
+                 try:
+                     if resp.status_code == 429:
+                         writerr(
+                             colored(
+                                 "URLScan - [ 429 ] Rate limit reached so trying without API Key...",
+                                 "red",
+                             )
+                         )
+                     else:
+                         writerr(
+                             colored(
+                                 "URLScan - [ INF ] The API Key is invalid so trying without API Key...",
+                                 "red",
+                             )
+                         )
+                     # Set key to blank for further requests
+                     URLSCAN_API_KEY = ""
+                     session_no_key = requests.Session()
+                     session_no_key.mount("https://", HTTP_ADAPTER)
+                     session_no_key.mount("http://", HTTP_ADAPTER)
+                     resp = session_no_key.get(url, headers={"User-Agent": userAgent})
+                 except Exception as e:
+                     writerr(
+                         colored(
+                             "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
+                             "red",
+                         )
+                     )
+                     checkResponse = False
+
+                 # If the rate limit was reached end now
+                 if resp.status_code == 429:
+                     writerr(
+                         colored(
+                             "URLScan - [ 429 ] Rate limit reached without API Key so unable to get links.",
+                             "red",
+                         )
+                     )
+                     checkResponse = False
+             else:
+                 writerr(
                      colored(
                          "URLScan - [ 429 ] Rate limit reached so unable to get links.",
                          "red",
@@ -4198,7 +4772,6 @@ def processWayBackPage(url):
                  pass
              return
          else:
-             print("DEBUG: HERE END!")  # DEBUG
              pass
      except Exception as e:
          if verbose():
@@ -5380,80 +5953,373 @@ def processIntelxType(target, credits):
          writerr(colored("ERROR processIntelxType 1: " + str(e), "red"))


- def getIntelxAccountInfo() -> str:
-     """
-     Get the account info and return the number of Credits remaining from the /phonebook/search
-     """
-     initIntelxTls()
-     try:
-         resp = chooseIntelxBase(INTELX_API_KEY)
-         if resp is None or resp.status_code != 200:
-             return "Unknown"
-         jsonResp = json.loads(resp.text.strip())
-         credits = str(
-             jsonResp.get("paths", {}).get("/phonebook/search", {}).get("Credit", "Unknown")
-         )
-         credits_max = str(
-             jsonResp.get("paths", {}).get("/phonebook/search", {}).get("CreditMax", "Unknown")
-         )
-         return credits + "/" + credits_max
-     except Exception:
-         return "Unknown"
+ def getIntelxAccountInfo() -> str:
+     """
+     Get the account info and return the number of Credits remaining from the /phonebook/search
+     """
+     initIntelxTls()
+     try:
+         resp = chooseIntelxBase(INTELX_API_KEY)
+         if resp is None or resp.status_code != 200:
+             return "Unknown"
+         jsonResp = json.loads(resp.text.strip())
+         credits = str(
+             jsonResp.get("paths", {}).get("/phonebook/search", {}).get("Credit", "Unknown")
+         )
+         credits_max = str(
+             jsonResp.get("paths", {}).get("/phonebook/search", {}).get("CreditMax", "Unknown")
+         )
+         return credits + "/" + credits_max
+     except Exception:
+         return "Unknown"
+
+
+ def getIntelxUrls():
+     """
+     Get URLs from the Intelligence X Phonebook search
+     """
+     global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSourceIntelx, argsInput, checkIntelx, argsInputHostname, intelxAPIIssue, linkCountIntelx, linksFoundIntelx
+
+     # Write the file of URLs for the passed domain/URL
+     try:
+         if args.check_only:
+             write(
+                 colored("IntelX - [ INFO ] Get URLs from Intelligence X: ", "cyan")
+                 + colored("minimum 4 requests", "white")
+             )
+             checkIntelx = 4
+             return
+
+         stopSourceIntelx = False
+         linksFoundIntelx = set()
+         initIntelxTls()
+
+         credits = getIntelxAccountInfo()
+         if verbose():
+             write(
+                 colored(
+                     "IntelX - [ INFO ] The Intelligence X URL requested to get links (Credits: "
+                     + credits
+                     + "): ",
+                     "magenta",
+                 )
+                 + colored(intelx_tls.INTELX_SEARCH_URL + "\n", "white")
+             )
+
+         if not args.check_only:
+             write(colored("IntelX - [ INFO ] Getting links from intelx.io API...", "cyan"))
+
+         # Get the domains from Intelligence X if the --no-subs wasn't passed
+         if not args.no_subs:
+             processIntelxType(1, credits)
+
+         # Get the URLs from Intelligence X
+         if not intelxAPIIssue:
+             processIntelxType(3, credits)
+
+         linkCountIntelx = len(linksFoundIntelx)
+         write(
+             colored("IntelX - [ INFO ] Links found on intelx.io: ", "cyan")
+             + colored(str(linkCountIntelx), "white")
+         )
+         linksFound.update(linksFoundIntelx)
+         linksFoundIntelx.clear()
+
+     except Exception as e:
+         writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
+
+
+ def processGhostArchiveUrl(url, ghostArchiveID=""):
+     """
+     Process a specific URL from ghostarchive.org to determine whether to save the link
+     """
+     global argsInput, argsInputHostname, links_lock, linkCountGhostArchive, linksFoundGhostArchive
+
+     addLink = True
+
+     try:
+         # Strip Wayback Machine prefix if present (e.g., https://web.archive.org/web/20230101120000_/https://example.com)
+         waybackMatch = re.match(r"^https?://web\.archive\.org/[^/]+/[a-zA-Z0-9]+_/", url)
+         if waybackMatch:
+             url = url[waybackMatch.end() :]
+
+         # If the input has a / in it, then a URL was passed, so the link will only be added if the URL matches
+         if "/" in url:
+             if argsInput not in url:
+                 addLink = False
+
+         # If filters are required then test them
+         if addLink and not args.filter_responses_only:
+
+             # If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
+             if args.no_subs:
+                 match = re.search(
+                     r"^[A-za-z]*\:\/\/(www\.)?" + re.escape(argsInputHostname),
+                     url,
+                     flags=re.IGNORECASE,
+                 )
+                 if match is None:
+                     addLink = False
+
+             # If the user didn't request -f / --filter-responses-only then check http code
+             if addLink and not args.filter_responses_only:
+
+                 # Check the URL exclusions
+                 if addLink:
+                     match = re.search(
+                         r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
+                         url,
+                         flags=re.IGNORECASE,
+                     )
+                     if match is not None:
+                         addLink = False
+
+                 # Set keywords filter if -ko argument passed
+                 if addLink and args.keywords_only:
+                     if args.keywords_only == "#CONFIG":
+                         match = re.search(
+                             r"(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ")",
+                             url,
+                             flags=re.IGNORECASE,
+                         )
+                     else:
+                         match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
+                     if match is None:
+                         addLink = False
+
+         # Add link if it passed filters
+         if addLink:
+             # Just get the hostname of the url
+             tldExtract = tldextract.extract(url)
+             subDomain = tldExtract.subdomain
+             if subDomain != "":
+                 subDomain = subDomain + "."
+             domainOnly = subDomain + tldExtract.domain + "." + tldExtract.suffix
+
+             # GhostArchive might return URLs that aren't for the domain passed so we need to check for those and not process them
+             # Check the URL
+             match = re.search(
+                 r"(^|\.)" + re.escape(argsInputHostname) + "$",
+                 domainOnly,
+                 flags=re.IGNORECASE,
+             )
+             if match is not None:
+                 if args.mode in ("U", "B"):
+                     linksFoundAdd(url, linksFoundGhostArchive)
+                 # If Response mode is requested then add the DOM ID to try later, for the number of responses wanted
+                 if ghostArchiveID != "" and args.mode in ("R", "B"):
+                     if args.limit == 0 or len(ghostArchiveRequestLinks) < args.limit:
+                         with links_lock:
+                             ghostArchiveRequestLinks.add(
+                                 (url, GHOSTARCHIVE_DOM_URL + ghostArchiveID)
+                             )
+
+     except Exception as e:
+         writerr(colored("ERROR processGhostArchiveUrl 1: " + str(e), "red"))
+
+
+ def getGhostArchiveUrls():
+     """
+     Get URLs from GhostArchive (ghostarchive.org)
+     This source doesn't have an API, so we crawl the HTML pages directly.
+     """
+     global linksFound, path, subs, stopProgram, stopSourceGhostArchive, argsInput, checkGhostArchive, argsInputHostname, linkCountGhostArchive, linksFoundGhostArchive
+
+     try:
+         stopSourceGhostArchive = False
+         linksFoundGhostArchive = set()
+
+         # Build the base URL
+         # If there is only one . in the hostname, we can guarantee that a subdomain wasn't passed, so we can prefix with . to get the links quicker, as it won't include other domains that end with the target domain.
+         # Else, we need to get all and then confirm the actual host of the links later
+         if argsInputHostname.count(".") == 1:
+             baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", "." + quote(argsInput))
+         else:
+             baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", quote(argsInput))
+
+         if verbose():
+             write(
+                 colored("GhostArchive - [ INFO ] The URL requested to get links: ", "magenta")
+                 + colored(baseUrl + "0\n", "white")
+             )
+
+         if not args.check_only and args.mode == "U":
+             write(
+                 colored(
+                     "GhostArchive - [ INFO ] Getting links from ghostarchive.org (this can take a while for some domains)...",
+                     "cyan",
+                 )
+             )
+
+         # Set up session with cookie
+         session = requests.Session()
+         if HTTP_ADAPTER is not None:
+             session.mount("https://", HTTP_ADAPTER)
+             session.mount("http://", HTTP_ADAPTER)
+
+         userAgent = random.choice(USER_AGENT)
+         headers = {"User-Agent": userAgent}
+         cookies = {"theme": "original"}
+
+         pageNum = 0
+
+         while stopProgram is None and not stopSourceGhostArchive:
+             getMemory()
+
+             url = baseUrl + str(pageNum)
+
+             try:
+                 resp = session.get(url, headers=headers, cookies=cookies, timeout=DEFAULT_TIMEOUT)
+             except Exception as e:
+                 writerr(
+                     colored(
+                         "GhostArchive - [ ERR ] Unable to get page " + str(pageNum) + ": " + str(e),
+                         "red",
+                     )
+                 )
+                 break
+
+             if resp.status_code == 429:
+                 writerr(
+                     colored(
+                         "GhostArchive - [ 429 ] Rate limit reached at page " + str(pageNum) + ".",
+                         "red",
+                     )
+                 )
+                 break
+
+             # Check for maintenance/end of results indicator
+             if (
+                 resp.status_code == 503
+                 or "The site is under maintenance and will be back soon" in resp.text
+                 or "No archives for that site" in resp.text
+             ):
+                 if verbose():
+                     if pageNum == 0:
+                         if args.check_only:
+                             checkGhostArchive = 1
+                             write(
+                                 colored(
+                                     "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
+                                 )
+                                 + colored("1 request", "white")
+                             )
+                         else:
+                             write(
+                                 colored(
+                                     "GhostArchive - [ INFO ] No results found",
+                                     "cyan",
+                                 )
+                             )
+                     else:
+                         write(
+                             colored(
+                                 "GhostArchive - [ INFO ] Retrieved all results from "
+                                 + str(pageNum)
+                                 + " pages",
+                                 "cyan",
+                             )
+                         )
+                 break
+             if resp.status_code != 200:
+                 writerr(
+                     colored(
+                         "GhostArchive - [ ERR ] [ "
+                         + str(resp.status_code)
+                         + " ] at page "
+                         + str(pageNum),
+                         "red",
+                     )
+                 )
+                 break
+
+             # Check only mode - just count pages
+             if args.check_only:
+                 # For check only, we check if there are results and try to get total count
+                 if pageNum == 0:
+                     # Check if there are any results on the first page
+                     if '<a href="/archive/' in resp.text:
+                         # Try to find "out of X" to determine total results/pages
+                         outOfMatch = re.search(r"out of (\d+)", resp.text)
+                         if outOfMatch:
+                             totalResults = int(outOfMatch.group(1))
+                             checkGhostArchive = totalResults
+                             write(
+                                 colored(
+                                     "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
+                                 )
+                                 + colored(f"{totalResults} requests (pagination required)", "white")
+                             )
+                         else:
+                             checkGhostArchive = 1
+                             write(
+                                 colored(
+                                     "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
+                                 )
+                                 + colored("unknown requests (pagination required)", "white")
+                             )
+                     else:
+                         checkGhostArchive = 1
+                         write(
+                             colored("GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan")
+                             + colored("1 request (no results)", "white")
+                         )
+                 break
+
+             # Use regex to extract URLs from anchor tag text content
+             # Pattern matches: <a href="/archive/ID">URL_HERE</a> - captures both href path and URL
+             pattern = r'<a href="(/archive/[^"]*)">([^<]+)</a>'
+             matches = re.findall(pattern, resp.text)

+             # If no matches found, we've reached the end of results
+             if not matches:
+                 if verbose():
+                     write(
+                         colored(
+                             "GhostArchive - [ INFO ] Retrieved all results from "
+                             + str(pageNum + 1)
+                             + " pages",
+                             "cyan",
+                         )
+                     )
+                 break

- def getIntelxUrls():
-     """
-     Get URLs from the Intelligence X Phonebook search
-     """
-     global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSourceIntelx, argsInput, checkIntelx, argsInputHostname, intelxAPIIssue, linkCountIntelx, linksFoundIntelx
+             for match in matches:
+                 ghostArchiveId = match[0]  # e.g., "/archive/gkOOR"
+                 potentialUrl = match[1].strip()
+                 processGhostArchiveUrl(potentialUrl, ghostArchiveId)

-     # Write the file of URLs for the passed domain/URL
-     try:
-         if args.check_only:
-             write(
-                 colored("IntelX - [ INFO ] Get URLs from Intelligence X: ", "cyan")
-                 + colored("minimum 4 requests", "white")
-             )
-             checkIntelx = 4
-             return
+             # Check if there's a "Next Page" link - if not, we've reached the last page
+             # GhostArchive resets to Page 1 when exceeding actual pages, so checking for Next Page is essential
+             if "Next Page" not in resp.text and ">»</a>" not in resp.text:
+                 if verbose():
+                     write(
+                         colored(
+                             "GhostArchive - [ INFO ] Retrieved all results from "
+                             + str(pageNum + 1)
+                             + " pages",
+                             "cyan",
+                         )
+                     )
+                 break

-         stopSourceIntelx = False
-         linksFoundIntelx = set()
-         initIntelxTls()
+             pageNum += 1

-         credits = getIntelxAccountInfo()
-         if verbose():
+         if not args.check_only:
+             # Count links based on mode - in R mode, count response links; in U/B mode, count URL links
+             if args.mode == "R":
+                 linkCountGhostArchive = len(ghostArchiveRequestLinks)
+             else:
+                 linkCountGhostArchive = len(linksFoundGhostArchive)
              write(
-                 colored(
-                     "IntelX - [ INFO ] The Intelligence X URL requested to get links (Credits: "
-                     + credits
-                     + "): ",
-                     "magenta",
-                 )
-                 + colored(intelx_tls.INTELX_SEARCH_URL + "\n", "white")
+                 colored("GhostArchive - [ INFO ] Links found on ghostarchive.org: ", "cyan")
+                 + colored(str(linkCountGhostArchive), "white")
              )
-
-         if not args.check_only:
-             write(colored("IntelX - [ INFO ] Getting links from intelx.io API...", "cyan"))
-
-         # Get the domains from Intelligence X if the --no-subs wasn't passed
-         if not args.no_subs:
-             processIntelxType(1, credits)
-
-         # Get the URLs from Intelligence X
-         if not intelxAPIIssue:
-             processIntelxType(3, credits)
-
-         linkCountIntelx = len(linksFoundIntelx)
-         write(
-             colored("IntelX - [ INFO ] Links found on intelx.io: ", "cyan")
-             + colored(str(linkCountIntelx), "white")
-         )
-         linksFound.update(linksFoundIntelx)
-         linksFoundIntelx.clear()
+             linksFound.update(linksFoundGhostArchive)
+             linksFoundGhostArchive.clear()

      except Exception as e:
-         writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
+         writerr(colored("ERROR getGhostArchiveUrls 1: " + str(e), "red"))


  def processResponses():
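The crawler above leans on a single regex to pull both the archive ID and the original URL out of each results page. A quick demonstration of that pattern against an illustrative page fragment (the `/archive/gkOOR` ID is the example given in the diff's own comment):

```python
import re

# The same pattern getGhostArchiveUrls() applies to each results page
pattern = r'<a href="(/archive/[^"]*)">([^<]+)</a>'

html = '<li><a href="/archive/gkOOR">https://example.com/login</a></li>'  # illustrative fragment
print(re.findall(pattern, html))
# [('/archive/gkOOR', 'https://example.com/login')]
```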
@@ -5463,6 +6329,10 @@ def processResponses():
      global stopProgram, totalFileCount
      try:

+         # Get responses from GhostArchive unless excluded
+         if stopProgram is None and not args.xga:
+             processResponsesGhostArchive()
+
          # Get responses from URLScan unless excluded
          if stopProgram is None and not args.xus:
              processResponsesURLScan()
@@ -5484,6 +6354,235 @@ def processResponses():
  writerr(colored(getSPACER("ERROR processResponses 1: " + str(e)), "red"))


+ def processResponsesGhostArchive():
+ """
+ Get archived responses from GhostArchive (ghostarchive.org)
+ """
+ global subs, path, indexFile, totalResponses, stopProgram, argsInput, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, ghostArchiveRequestLinks, failureCount, totalFileCount, checkGhostArchive
+ try:
+ fileCount = 0
+ failureCount = 0
+ if not args.check_only:
+ # Create 'results' and domain directory if needed
+ createDirs()
+
+ # Get the path of the files, depending on whether -oR / --output_responses was passed
+ try:
+ responsesPath = responseOutputDirectory + "responses.GhostArchive.tmp"
+ indexPath = responseOutputDirectory + "waymore_index.txt"
+ except Exception as e:
+ if verbose():
+ writerr(colored("ERROR processResponsesGhostArchive 4: " + str(e), "red"))
+
+ # Get URLs from GhostArchive if the DOM IDs haven't been retrieved yet
+ if stopProgram is None and not args.check_only:
+ if args.mode in ("R", "B"):
+ write(
+ colored(
+ "GhostArchive - [ INFO ] Getting list of response links (this can take a while for some domains)...",
+ "cyan",
+ )
+ )
+ if args.mode == "R":
+ getGhostArchiveUrls()
+
+ # Check if a responses.GhostArchive.tmp file exists
+ if not args.check_only and os.path.exists(responsesPath):
+
+ # Load the links into the set
+ with open(responsesPath, "rb") as fl:
+ linkRequests = pickle.load(fl)
+
+ # Set start point
+ successCount = 0
+
+ # Get the GhostArchive response links
+ linkRequests = []
+ for originalUrl, domUrl in ghostArchiveRequestLinks:
+ linkRequests.append((originalUrl, domUrl))
+
+ # Write the links to a temp file
+ if not args.check_only:
+ with open(responsesPath, "wb") as f:
+ pickle.dump(linkRequests, f)
+
+ # Get the total number of responses we will try to get and set the current file count to the success count
+ totalResponses = len(linkRequests)
+ checkGhostArchive = checkGhostArchive + totalResponses
+
+ # If there are no responses to download, display an error and exit
+ if args.mode != "R" and totalResponses == 0:
+ writerr(
+ colored(
+ getSPACER(
+ "Failed to get responses from GhostArchive (ghostarchive.org) - check input and try again."
+ ),
+ "red",
+ )
+ )
+ return
+
+ fileCount = successCount
+
+ if args.check_only:
+ writerr(
+ colored("Downloading archived responses: ", "cyan")
+ + colored("UNKNOWN requests", "cyan")
+ )
+ writerr(
+ colored(
+ "\n-> How long downloading the responses takes can vary depending on the target and the rate limiting on GhostArchive",
+ "green",
+ )
+ )
+ write("")
+ else:
+ # If the limit has been set over the default, give a warning that this could take a long time!
+ if totalResponses - successCount > DEFAULT_LIMIT:
+ if successCount > 0:
+ writerr(
+ colored(
+ getSPACER(
+ "WARNING: Downloading remaining "
+ + str(totalResponses - successCount)
+ + " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
+ ),
+ "yellow",
+ )
+ )
+ else:
+ writerr(
+ colored(
+ getSPACER(
+ "WARNING: Downloading "
+ + str(totalResponses)
+ + " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
+ ),
+ "yellow",
+ )
+ )
+
+ # Open the index file if hash value is going to be used (not URL)
+ if not args.url_filename:
+ indexFile = open(indexPath, "a")
+
+ # Process the URLs from GhostArchive
+ if stopProgram is None:
+ p = mp.Pool(
+ args.processes * 2
+ ) # Double the number of processes to speed up the download
+ p.starmap(getGhostArchiveWARC, linkRequests[successCount:])
+ p.close()
+ p.join()
+
+ # Delete the tmp file now that it has run successfully
+ if stopProgram is None:
+ try:
+ os.remove(responsesPath)
+ except Exception:
+ pass
+
+ # Close the index file if hash value is going to be used (not URL)
+ if not args.url_filename:
+ indexFile.close()
+
+ if not args.check_only:
+ try:
+ if failureCount > 0:
+ if verbose():
+ write(
+ colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
+ + colored(responseOutputDirectory, "white")
+ + colored(" for " + subs + argsInput + ": ", "cyan")
+ + colored(
+ str(fileCount) + " 🤘",
+ "white",
+ )
+ + colored(" (" + str(failureCount) + " not found)\n", "red")
+ )
+ else:
+ write(
+ colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
+ + colored(responseOutputDirectory, "white")
+ + colored(" for " + subs + argsInput + ": ", "cyan")
+ + colored(str(fileCount) + " 🤘", "white")
+ + colored(" (" + str(failureCount) + " not found)\n", "red")
+ )
+ else:
+ if verbose():
+ write(
+ colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
+ + colored(responseOutputDirectory, "white")
+ + colored(" for " + subs + argsInput + ": ", "cyan")
+ + colored(str(fileCount) + " 🤘\n", "white")
+ )
+ else:
+ write(
+ colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
+ + colored(responseOutputDirectory, "white")
+ + colored(" for " + subs + argsInput + ": ", "cyan")
+ + colored(str(fileCount) + " 🤘\n", "white")
+ )
+ except Exception as e:
+ if verbose():
+ writerr(colored("ERROR processResponsesGhostArchive 5: " + str(e), "red"))
+
+ # Append extra links from WARC files to URL output file (for mode B)
+ try:
+ if args.mode == "B" and len(extraWarcLinks) > 0:
+ # Determine URL output file path (same logic as processURLOutput)
+ if args.output_urls == "":
+ if args.output_responses != "":
+ urlFilePath = args.output_responses + "/waymore.txt"
+ else:
+ urlFilePath = (
+ str(DEFAULT_OUTPUT_DIR)
+ + "/results/"
+ + str(argsInput).replace("/", "-")
+ + "/waymore.txt"
+ )
+ else:
+ urlFilePath = args.output_urls
+
+ # Load existing URLs from file to avoid duplicates
+ existingUrls = set()
+ try:
+ with open(urlFilePath) as f:
+ for line in f:
+ existingUrls.add(line.strip())
+ except Exception:
+ pass
+
+ # Append only new unique URLs
+ newLinks = [
+ url
+ for url in extraWarcLinks
+ if url not in existingUrls and url not in linksFound
+ ]
+ if len(newLinks) > 0:
+ with open(urlFilePath, "a") as f:
+ for url in newLinks:
+ f.write(url + "\n")
+
+ # Display message about extra links
+ write(
+ colored("GhostArchive - [ INFO ] ", "cyan")
+ + colored(str(len(newLinks)), "white")
+ + colored(" extra links found in WARC files added to file ", "cyan")
+ + colored(urlFilePath, "white")
+ + "\n"
+ )
+ except Exception as e:
+ if verbose():
+ writerr(colored("ERROR processResponsesGhostArchive 6: " + str(e), "red"))
+
+ totalFileCount = totalFileCount + fileCount
+ except Exception as e:
+ writerr(colored(getSPACER("ERROR processResponsesGhostArchive 1: " + str(e)), "red"))
+ finally:
+ linkRequests = None
+
+
  def processResponsesURLScan():
  """
  Get archived responses from URLScan (urlscan.io)
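
One detail worth noting in the hunk above: processResponsesGhostArchive() checkpoints its work list to responses.GhostArchive.tmp with pickle before downloading, and only deletes the file after a clean run, so an interrupted session can pick the list back up. A self-contained sketch of that checkpoint pattern (loadOrBuildWorkList is a hypothetical name, and in the released code the list loaded from the tmp file is immediately rebuilt from ghostArchiveRequestLinks, so treat this as the apparent intent rather than the literal behaviour):

    import os
    import pickle

    responsesPath = "responses.GhostArchive.tmp"  # same filename the diff uses

    def loadOrBuildWorkList(buildLinks):
        # Resume from the checkpoint of a previous interrupted run...
        if os.path.exists(responsesPath):
            with open(responsesPath, "rb") as f:
                return pickle.load(f)
        # ...otherwise build the (originalUrl, domUrl) list and checkpoint it
        linkRequests = list(buildLinks())
        with open(responsesPath, "wb") as f:
            pickle.dump(linkRequests, f)
        return linkRequests

    work = loadOrBuildWorkList(lambda: [("https://example.com/", "/archive/abc")])
    # ... download each pair, e.g. mp.Pool().starmap(getGhostArchiveWARC, work) ...
    os.remove(responsesPath)  # only once every response has been fetched
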
@@ -6699,6 +7798,12 @@ async def fetch_intelx_async():
  await loop.run_in_executor(None, getIntelxUrls)


+ async def fetch_ghostarchive_async():
+ """Async wrapper for getGhostArchiveUrls - runs in thread pool"""
+ loop = asyncio.get_event_loop()
+ await loop.run_in_executor(None, getGhostArchiveUrls)
+
+
  async def fetch_all_sources_async():
  """
  Orchestrator function to fetch from all enabled sources concurrently.
@@ -6721,6 +7826,8 @@ async def fetch_all_sources_async():
  tasks.append(("VirusTotal", fetch_virustotal_async()))
  if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
  tasks.append(("Intelligence X", fetch_intelx_async()))
+ if not args.xga and stopProgram is None:
+ tasks.append(("GhostArchive", fetch_ghostarchive_async()))

  if not tasks:
  return
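
These two hunks wire GhostArchive into the concurrent fetch phase: each blocking fetcher gets a thin async wrapper that pushes it onto the default thread pool, and the orchestrator appends it to the task list unless -xga was passed. A minimal runnable sketch of the same model (how fetch_all_sources_async() finally awaits its task list is not shown in this hunk, so the asyncio.gather call below is an assumption):

    import asyncio

    def getGhostArchiveUrls():
        print("GhostArchive fetch (blocking HTTP work)")

    def getIntelxUrls():
        print("Intelligence X fetch (blocking HTTP work)")

    async def fetch_ghostarchive_async():
        loop = asyncio.get_event_loop()
        await loop.run_in_executor(None, getGhostArchiveUrls)

    async def fetch_intelx_async():
        loop = asyncio.get_event_loop()
        await loop.run_in_executor(None, getIntelxUrls)

    async def fetch_all_sources_async():
        # Both blocking fetchers run in worker threads at the same time
        await asyncio.gather(fetch_ghostarchive_async(), fetch_intelx_async())

    asyncio.run(fetch_all_sources_async())
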
@@ -6746,7 +7853,7 @@ async def fetch_all_sources_async():

  # Run waymore
  def main():
- global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx
+ global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, extraWarcLinks

  # Tell Python to run the handler() function when SIGINT is received
  signal(SIGINT, handler)
@@ -6902,13 +8009,19 @@ def main():
  help="Exclude checks for links from intelx.io",
  default=False,
  )
+ parser.add_argument(
+ "-xga",
+ action="store_true",
+ help="Exclude checks for links from ghostarchive.org",
+ default=False,
+ )
  parser.add_argument(
  "--providers",
  action="store",
- help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal and intelx. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.",
+ help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal,intelx and ghostarchive. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.",
  default=[],
  type=validateArgProviders,
- metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx}",
+ metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive}",
  )
  parser.add_argument(
  "-lcc",
@@ -7075,6 +8188,10 @@ def main():
  args.xix = True
  else:
  args.xix = False
+ if "ghostarchive" not in args.providers:
+ args.xga = True
+ else:
+ args.xga = False

  # If no input was given, raise an error
  if sys.stdin.isatty():
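
The hunk above completes the --providers override described in the help text: when a providers list is supplied, every exclude flag is recomputed from membership in that list, so any -x* flags passed earlier are reset. The rule for the new source, in isolation:

    providers = ["wayback", "ghostarchive"]  # e.g. --providers wayback,ghostarchive

    # Same recomputation the diff adds for GhostArchive:
    xga = "ghostarchive" not in providers
    print(xga)  # False - GhostArchive stays enabled even if -xga was passed
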
@@ -7145,6 +8262,7 @@ def main():
  # Reset global variables
  linksFound = set()
  linkMimes = set()
+ extraWarcLinks = set()
  successCount = 0
  failureCount = 0
  fileCount = 0
@@ -7159,6 +8277,7 @@ def main():
  stopSourceURLScan = False
  stopSourceVirusTotal = False
  stopSourceIntelx = False
+ stopSourceGhostArchive = False

  # Get the config settings from the config.yml file
  getConfig()