waymore 7.6__py3-none-any.whl → 8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/waymore.py CHANGED
@@ -70,6 +70,7 @@ stopSourceAlienVault = False
70
70
  stopSourceURLScan = False
71
71
  stopSourceVirusTotal = False
72
72
  stopSourceIntelx = False
73
+ stopSourceGhostArchive = False
73
74
  successCount = 0
74
75
  failureCount = 0
75
76
  fileCount = 0
@@ -79,6 +80,7 @@ totalPages = 0
79
80
  indexFile = None
80
81
  continueRespFile = None
81
82
  continueRespFileURLScan = None
83
+ continueRespFileGhostArchive = None
82
84
  inputIsDomainANDPath = False
83
85
  inputIsSubDomain = False
84
86
  subs = "*."
@@ -102,6 +104,7 @@ checkAlienVault = 0
102
104
  checkURLScan = 0
103
105
  checkVirusTotal = 0
104
106
  checkIntelx = 0
107
+ checkGhostArchive = 0
105
108
  argsInputHostname = ""
106
109
  responseOutputDirectory = ""
107
110
  urlscanRequestLinks = set()
@@ -112,11 +115,14 @@ linkCountAlienVault = 0
112
115
  linkCountURLScan = 0
113
116
  linkCountVirusTotal = 0
114
117
  linkCountIntelx = 0
118
+ linkCountGhostArchive = 0
115
119
  linksFoundCommonCrawl = set()
116
120
  linksFoundAlienVault = set()
117
121
  linksFoundURLScan = set()
118
122
  linksFoundVirusTotal = set()
119
123
  linksFoundIntelx = set()
124
+ linksFoundGhostArchive = set()
125
+ ghostArchiveRequestLinks = set()
120
126
 
121
127
  # Thread lock for protecting shared state during concurrent operations
122
128
  links_lock = threading.Lock()
@@ -124,6 +130,7 @@ links_lock = threading.Lock()
124
130
  # Shared state for link collection across all sources
125
131
  linksFound = set()
126
132
  linkMimes = set()
133
+ extraWarcLinks = set() # Track extra URLs found in WARC files for mode B
127
134
 
128
135
  # Source Provider URLs
129
136
  WAYBACK_URL = "https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest"
@@ -134,6 +141,8 @@ URLSCAN_DOM_URL = "https://urlscan.io/dom/"
134
141
  VIRUSTOTAL_URL = "https://www.virustotal.com/vtapi/v2/domain/report?apikey={APIKEY}&domain={DOMAIN}"
135
142
  # Paid endpoint first, free endpoint as fallback
136
143
  INTELX_BASES = ["https://2.intelx.io", "https://free.intelx.io"]
144
+ GHOSTARCHIVE_URL = "https://ghostarchive.org/search?term={DOMAIN}&page="
145
+ GHOSTARCHIVE_DOM_URL = "https://ghostarchive.org"
137
146
 
138
147
  intelx_tls = threading.local()
139
148
 
@@ -247,10 +256,10 @@ DEFAULT_LIMIT = 5000
247
256
  DEFAULT_TIMEOUT = 30
248
257
 
249
258
  # Exclusions used to exclude responses we will try to get from web.archive.org
250
- DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource"
259
+ DEFAULT_FILTER_URL = ".css,.jpg,.jpeg,.png,.svg,.img,.gif,.mp4,.flv,.ogv,.webm,.webp,.mov,.mp3,.m4a,.m4p,.scss,.tif,.tiff,.ttf,.otf,.woff,.woff2,.bmp,.ico,.eot,.htc,.rtf,.swf,.image,/image,/img,/css,/wp-json,/wp-content,/wp-includes,/theme,/audio,/captcha,/font,node_modules,/jquery,/bootstrap,/_incapsula_resource,.wmv,.wma,.asx,.avif"
251
260
 
252
261
  # MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API
253
- DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/pdf,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff"
262
+ DEFAULT_FILTER_MIME = "text/css,image/jpeg,image/jpg,image/png,image/svg+xml,image/gif,image/tiff,image/webp,image/bmp,image/vnd,image/x-icon,image/vnd.microsoft.icon,font/ttf,font/woff,font/woff2,font/x-woff2,font/x-woff,font/otf,audio/mpeg,audio/wav,audio/webm,audio/aac,audio/ogg,audio/wav,audio/webm,video/mp4,video/mpeg,video/webm,video/ogg,video/mp2t,video/webm,video/x-msvideo,video/x-flv,application/font-woff,application/font-woff2,application/x-font-woff,application/x-font-woff2,application/vnd.ms-fontobject,application/font-sfnt,application/vnd.android.package-archive,binary/octet-stream,application/octet-stream,application/x-font-ttf,application/x-font-otf,video/webm,video/3gpp,application/font-ttf,audio/mp3,audio/x-wav,image/pjpeg,audio/basic,application/font-otf,application/x-ms-application,application/x-msdownload,video/x-ms-wmv,image/x-png,video/quicktime,image/x-ms-bmp,font/opentype,application/x-font-opentype,application/x-woff,audio/aiff,video/x-ms-asf,audio/x-ms-wma,audio/wma,application/x-mplayer2,image/avif"
254
263
 
255
264
  # Response code exclusions we will use to filter links and responses from web.archive.org through their API
256
265
  DEFAULT_FILTER_CODE = "404,301,302"
@@ -297,6 +306,298 @@ INLINE_JS_EXCLUDE = [
297
306
  ".json",
298
307
  ]
299
308
 
309
+ # Binary file extensions that should be saved as raw bytes, not text
310
+ BINARY_EXTENSIONS = frozenset(
311
+ [
312
+ ".zip",
313
+ ".gz",
314
+ ".gzip",
315
+ ".tar",
316
+ ".rar",
317
+ ".7z",
318
+ ".bz2",
319
+ ".xz",
320
+ ".pdf",
321
+ ".doc",
322
+ ".docx",
323
+ ".xls",
324
+ ".xlsx",
325
+ ".ppt",
326
+ ".pptx",
327
+ ".exe",
328
+ ".msi",
329
+ ".dll",
330
+ ".bin",
331
+ ".so",
332
+ ".dmg",
333
+ ".deb",
334
+ ".rpm",
335
+ ".png",
336
+ ".jpg",
337
+ ".jpeg",
338
+ ".gif",
339
+ ".bmp",
340
+ ".ico",
341
+ ".webp",
342
+ ".svg",
343
+ ".tiff",
344
+ ".tif",
345
+ ".mp3",
346
+ ".mp4",
347
+ ".wav",
348
+ ".avi",
349
+ ".mov",
350
+ ".mkv",
351
+ ".flv",
352
+ ".wmv",
353
+ ".webm",
354
+ ".ogg",
355
+ ".ttf",
356
+ ".otf",
357
+ ".woff",
358
+ ".woff2",
359
+ ".eot",
360
+ ".class",
361
+ ".jar",
362
+ ".war",
363
+ ".ear",
364
+ ".pyc",
365
+ ".pyo",
366
+ ".o",
367
+ ".a",
368
+ ".lib",
369
+ ".iso",
370
+ ".img",
371
+ ".sqlite",
372
+ ".db",
373
+ ".mdb",
374
+ ".swf",
375
+ ".fla",
376
+ ]
377
+ )
378
+
379
+ # Binary MIME types that should be saved as raw bytes, not text
380
+ BINARY_MIME_TYPES = frozenset(
381
+ [
382
+ "application/zip",
383
+ "application/x-zip-compressed",
384
+ "application/x-gzip",
385
+ "application/gzip",
386
+ "application/x-tar",
387
+ "application/x-rar-compressed",
388
+ "application/x-7z-compressed",
389
+ "application/x-bzip2",
390
+ "application/x-xz",
391
+ "application/pdf",
392
+ "application/msword",
393
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
394
+ "application/vnd.ms-excel",
395
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
396
+ "application/vnd.ms-powerpoint",
397
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
398
+ "application/x-msdownload",
399
+ "application/x-msi",
400
+ "application/x-dosexec",
401
+ "application/octet-stream",
402
+ "image/png",
403
+ "image/jpeg",
404
+ "image/gif",
405
+ "image/bmp",
406
+ "image/x-icon",
407
+ "image/webp",
408
+ "image/tiff",
409
+ "audio/mpeg",
410
+ "audio/wav",
411
+ "audio/ogg",
412
+ "audio/webm",
413
+ "video/mp4",
414
+ "video/avi",
415
+ "video/quicktime",
416
+ "video/x-msvideo",
417
+ "video/x-matroska",
418
+ "video/webm",
419
+ "video/ogg",
420
+ "font/ttf",
421
+ "font/otf",
422
+ "font/woff",
423
+ "font/woff2",
424
+ "application/x-font-ttf",
425
+ "application/x-font-otf",
426
+ "application/font-woff",
427
+ "application/font-woff2",
428
+ "application/java-archive",
429
+ "application/x-java-class",
430
+ "application/x-shockwave-flash",
431
+ "application/x-sqlite3",
432
+ "application/x-iso9660-image",
433
+ ]
434
+ )
435
+
436
+
437
+ def isBinaryContent(contentBytes, contentType, url=""):
438
+ """
439
+ Determine if content should be treated as binary based on actual content, Content-Type, and URL.
440
+
441
+ Priority (highest to lowest):
442
+ 1. Content inspection - check for text signatures (most reliable)
443
+ 2. Content-Type header
444
+ 3. URL extension (least reliable - archive might have captured an HTML error page)
445
+
446
+ Args:
447
+ contentBytes: The raw response bytes (at least first 100 bytes)
448
+ contentType: The Content-Type header value
449
+ url: The URL (optional, used as fallback)
450
+
451
+ Returns True if content is binary and should be saved as raw bytes.
452
+ """
453
+ # STEP 1: Check actual content for text signatures (most reliable)
454
+ # If content starts with text markers, it's definitely NOT binary regardless of extension
455
+ try:
456
+ if contentBytes and len(contentBytes) > 0:
457
+ # Get first 100 bytes and strip leading whitespace/newlines for checking
458
+ preview = contentBytes[:100].lstrip()
459
+ previewLower = preview.lower()
460
+
461
+ # Common text file signatures
462
+ textSignatures = [
463
+ b"<!doctype", # HTML doctype
464
+ b"<html", # HTML tag
465
+ b"<head", # HTML head
466
+ b"<body", # HTML body
467
+ b"<?xml", # XML declaration
468
+ b"<svg", # SVG image (actually XML text)
469
+ b"{", # JSON object
470
+ b"[", # JSON array
471
+ b"/*", # CSS/JS comment
472
+ b"//", # JS comment
473
+ b"#!", # Shebang (shell scripts)
474
+ b"var ", # JavaScript
475
+ b"let ", # JavaScript
476
+ b"const ", # JavaScript
477
+ b"function", # JavaScript
478
+ b"import ", # JavaScript/Python
479
+ b"export ", # JavaScript
480
+ b"class ", # Various languages
481
+ b"def ", # Python
482
+ ]
483
+
484
+ for sig in textSignatures:
485
+ if previewLower.startswith(sig):
486
+ return False # Definitely text, not binary
487
+
488
+ # Check for binary file magic bytes (file signatures)
489
+ binarySignatures = [
490
+ b"%PDF", # PDF
491
+ b"PK\x03\x04", # ZIP, DOCX, XLSX, etc.
492
+ b"PK\x05\x06", # Empty ZIP
493
+ b"\x1f\x8b", # GZIP
494
+ b"\x89PNG", # PNG
495
+ b"\xff\xd8\xff", # JPEG
496
+ b"GIF87a", # GIF
497
+ b"GIF89a", # GIF
498
+ b"BM", # BMP (check at start)
499
+ b"RIFF", # WAV, AVI, WebP
500
+ b"\x00\x00\x00", # Various binary formats (MP4, etc.)
501
+ b"ID3", # MP3 with ID3 tag
502
+ b"\xff\xfb", # MP3
503
+ b"\xff\xfa", # MP3
504
+ b"OggS", # OGG
505
+ b"\x4d\x5a", # EXE/DLL (MZ header)
506
+ b"\x7fELF", # Linux ELF binary
507
+ b"\xca\xfe\xba\xbe", # Java class file
508
+ b"\x30\x26\xb2\x75", # ASF/WMV/WMA (first 4 bytes of ASF GUID)
509
+ b"FLV\x01", # FLV (Flash Video)
510
+ b"ftyp", # MP4/M4A/MOV (after 4 byte size prefix)
511
+ b"Rar!\x1a\x07", # RAR archive
512
+ b"7z\xbc\xaf\x27\x1c", # 7-Zip archive
513
+ b"\x1a\x45\xdf\xa3", # WebM/MKV (EBML)
514
+ b"II\x2a\x00", # TIFF (Intel byte order)
515
+ b"MM\x00\x2a", # TIFF (Motorola byte order)
516
+ b"\x00\x00\x01\x00", # ICO (Windows Icon)
517
+ b"\x00\x00\x02\x00", # CUR (Windows Cursor)
518
+ b"wOFF", # WOFF font
519
+ b"wOF2", # WOFF2 font
520
+ b"FWS", # SWF (uncompressed Flash)
521
+ b"CWS", # SWF (zlib compressed Flash)
522
+ b"ZWS", # SWF (LZMA compressed Flash)
523
+ b"\x00\x01\x00\x00", # TrueType font
524
+ b"OTTO", # OpenType font with CFF
525
+ ]
526
+
527
+ for sig in binarySignatures:
528
+ if preview.startswith(sig):
529
+ return True # Definitely binary
530
+ except Exception:
531
+ pass
532
+
533
+ # STEP 2: Check Content-Type header
534
+ try:
535
+ if contentType:
536
+ mimeType = contentType.lower().split(";")[0].strip()
537
+
538
+ # Explicit text types
539
+ textMimeTypes = [
540
+ "text/html",
541
+ "text/plain",
542
+ "text/css",
543
+ "text/javascript",
544
+ "text/xml",
545
+ "text/csv",
546
+ "text/markdown",
547
+ "application/json",
548
+ "application/javascript",
549
+ "application/xml",
550
+ "application/xhtml+xml",
551
+ "application/rss+xml",
552
+ "application/atom+xml",
553
+ ]
554
+ if mimeType in textMimeTypes or mimeType.startswith("text/"):
555
+ return False # Text type
556
+
557
+ # Known binary types
558
+ if mimeType in BINARY_MIME_TYPES:
559
+ return True
560
+
561
+ # Generic binary prefixes
562
+ if (
563
+ mimeType.startswith("image/")
564
+ or mimeType.startswith("audio/")
565
+ or mimeType.startswith("video/")
566
+ ):
567
+ return True
568
+ if mimeType.startswith("application/") and mimeType not in textMimeTypes:
569
+ # application/* is often binary, but not always - be conservative
570
+ if "octet-stream" in mimeType or "binary" in mimeType:
571
+ return True
572
+ except Exception:
573
+ pass
574
+
575
+ # STEP 3: Check URL extension as last resort
576
+ try:
577
+ if url:
578
+ # Extract actual URL from prefixed formats (Wayback/URLScan)
579
+ actualUrl = url
580
+ httpPos = url.find("http://")
581
+ httpsPos = url.find("https://")
582
+ if httpsPos >= 0:
583
+ actualUrl = url[httpsPos:]
584
+ elif httpPos >= 0:
585
+ actualUrl = url[httpPos:]
586
+
587
+ parsed = urlparse(actualUrl.strip())
588
+ path = parsed.path.lower()
589
+ if "." in path:
590
+ ext = "." + path.rsplit(".", 1)[-1]
591
+ if "?" in ext:
592
+ ext = ext.split("?")[0]
593
+ if ext in BINARY_EXTENSIONS:
594
+ return True
595
+ except Exception:
596
+ pass
597
+
598
+ # Default: treat as text (safer - text processing won't corrupt text)
599
+ return False
600
+
300
601
 
301
602
  # Get memory usage for
302
603
  def getMemory():
@@ -451,7 +752,7 @@ def handler(signal_received, frame):
451
752
  This function is called if Ctrl-C is called by the user
452
753
  An attempt will be made to try and clean up properly
453
754
  """
454
- global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, current_response, current_session
755
+ global stopSource, stopProgram, stopProgramCount, stopSourceWayback, stopSourceCommonCrawl, stopSourceAlienVault, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, current_response, current_session
455
756
 
456
757
  if stopProgram is not None:
457
758
  stopProgramCount = stopProgramCount + 1
@@ -486,6 +787,7 @@ def handler(signal_received, frame):
486
787
  stopSourceURLScan = True
487
788
  stopSourceVirusTotal = True
488
789
  stopSourceIntelx = True
790
+ stopSourceGhostArchive = True
489
791
  # Try to close any active response or session to interrupt blocking network I/O
490
792
  try:
491
793
  if current_response is not None:
@@ -955,16 +1257,12 @@ def showOptions():
955
1257
  )
956
1258
  )
957
1259
 
1260
+ # Only show --source-ip if it's explicitly configured
958
1261
  if SOURCE_IP:
959
1262
  write(
960
1263
  colored("--source-ip: " + str(SOURCE_IP), "magenta")
961
1264
  + colored(" Outbound requests will bind to this IP.", "white")
962
1265
  )
963
- else:
964
- write(
965
- colored("--source-ip: default", "magenta")
966
- + colored(" Outbound IP determined by OS routing table.", "white")
967
- )
968
1266
 
969
1267
  write()
970
1268
 
@@ -1465,11 +1763,15 @@ def printProgressBar(
1465
1763
 
1466
1764
  def filehash(text):
1467
1765
  """
1468
- Generate a hash value for the passed string. This is used for the file name of a downloaded archived response
1766
+ Generate a hash value for the passed string or bytes. This is used for the file name of a downloaded archived response
1469
1767
  """
1470
1768
  hash = 0
1471
1769
  for ch in text:
1472
- hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
1770
+ # Handle both str (gives chars needing ord()) and bytes (gives ints directly)
1771
+ if isinstance(ch, int):
1772
+ hash = (hash * 281 ^ ch * 997) & 0xFFFFFFFFFFF
1773
+ else:
1774
+ hash = (hash * 281 ^ ord(ch) * 997) & 0xFFFFFFFFFFF
1473
1775
  return str(hash)
1474
1776
 
1475
1777
 
@@ -1497,6 +1799,63 @@ def fixArchiveOrgUrl(url):
1497
1799
  return url
1498
1800
 
1499
1801
 
1802
+ def isLikelyBinaryUrl(url):
1803
+ """
1804
+ Check if a URL likely points to a binary file based on its extension.
1805
+ This is used BEFORE making a request to decide if we need the raw/id_ version.
1806
+ """
1807
+ try:
1808
+ # Extract actual URL from prefixed formats (Wayback timestamp/URLScan UUID)
1809
+ actualUrl = url
1810
+ httpPos = url.find("http://")
1811
+ httpsPos = url.find("https://")
1812
+ if httpsPos >= 0:
1813
+ actualUrl = url[httpsPos:]
1814
+ elif httpPos >= 0:
1815
+ actualUrl = url[httpPos:]
1816
+
1817
+ parsed = urlparse(actualUrl.strip())
1818
+ path = parsed.path.lower()
1819
+ if "." in path:
1820
+ ext = "." + path.rsplit(".", 1)[-1]
1821
+ if "?" in ext:
1822
+ ext = ext.split("?")[0]
1823
+ if ext in BINARY_EXTENSIONS:
1824
+ return True
1825
+ except Exception:
1826
+ pass
1827
+ return False
1828
+
1829
+
1830
+ def addRawModifier(archiveUrl):
1831
+ """
1832
+ Add 'id_' modifier to Wayback Machine URL to get raw/original content.
1833
+ This is essential for binary files to avoid Wayback modifications.
1834
+
1835
+ Example:
1836
+ Input: https://web.archive.org/web/20090315210455/http://example.com/file.wmv
1837
+ Output: https://web.archive.org/web/20090315210455id_/http://example.com/file.wmv
1838
+ """
1839
+ try:
1840
+ # Find the timestamp in the URL (14 digits after /web/)
1841
+ webPos = archiveUrl.find("/web/")
1842
+ if webPos >= 0:
1843
+ # Find where the timestamp ends (first / after /web/)
1844
+ afterWeb = webPos + 5 # Position after "/web/"
1845
+ slashAfterTimestamp = archiveUrl.find("/", afterWeb)
1846
+ if slashAfterTimestamp > afterWeb:
1847
+ # Insert id_ before the slash after timestamp
1848
+ timestamp = archiveUrl[afterWeb:slashAfterTimestamp]
1849
+ # Only add id_ if it's not already there
1850
+ if not timestamp.endswith("id_"):
1851
+ return (
1852
+ archiveUrl[:slashAfterTimestamp] + "id_" + archiveUrl[slashAfterTimestamp:]
1853
+ )
1854
+ except Exception:
1855
+ pass
1856
+ return archiveUrl
1857
+
1858
+
1500
1859
  # Add a link to the linksFound collection for archived responses (included timestamp preifx)
1501
1860
  def linksFoundResponseAdd(link):
1502
1861
  global linksFound, argsInput, argsInputHostname, links_lock
@@ -1581,6 +1940,12 @@ def processArchiveUrl(url):
1581
1940
  if stopProgram is None:
1582
1941
 
1583
1942
  archiveUrl = "https://web.archive.org/web/" + fixArchiveOrgUrl(url)
1943
+
1944
+ # For binary files, add id_ modifier to get raw/original content
1945
+ # This prevents Wayback Machine from modifying the content
1946
+ if isLikelyBinaryUrl(url):
1947
+ archiveUrl = addRawModifier(archiveUrl)
1948
+
1584
1949
  hashValue = ""
1585
1950
 
1586
1951
  # Get memory usage every 100 responses
@@ -1593,6 +1958,18 @@ def processArchiveUrl(url):
1593
1958
  # Make a request to the web archive
1594
1959
  try:
1595
1960
  try:
1961
+ try:
1962
+ if verbose() and os.environ.get("USER") == "xnl":
1963
+ writerr(
1964
+ colored(
1965
+ "[ DBG ] Requesting file " + archiveUrl,
1966
+ "yellow",
1967
+ attrs=["dark"],
1968
+ )
1969
+ )
1970
+ except Exception:
1971
+ pass
1972
+
1596
1973
  # Choose a random user agent string to use for any requests
1597
1974
  userAgent = random.choice(USER_AGENT)
1598
1975
 
@@ -1604,146 +1981,175 @@ def processArchiveUrl(url):
1604
1981
  headers={"User-Agent": userAgent},
1605
1982
  allow_redirects=True,
1606
1983
  )
1607
- archiveHtml = str(resp.text)
1984
+
1985
+ # Get raw content bytes first
1986
+ contentBytes = resp.content
1987
+
1608
1988
  try:
1609
- contentType = resp.headers.get("Content-Type").split(";")[0].lower()
1989
+ contentType = resp.headers.get("Content-Type", "").split(";")[0].lower()
1610
1990
  except Exception:
1611
1991
  contentType = ""
1612
1992
 
1993
+ # Determine if this is binary content based on actual content, Content-Type, and URL
1994
+ isBinary = isBinaryContent(contentBytes, contentType, url)
1995
+
1996
+ if isBinary:
1997
+ # For binary files, use raw bytes as-is
1998
+ archiveContent = contentBytes
1999
+ archiveHtml = None # Not used for binary files
2000
+ else:
2001
+ # For text files, decode to string
2002
+ archiveHtml = contentBytes.decode("utf-8", errors="replace")
2003
+ archiveContent = None # Not used for text files
2004
+
1613
2005
  # Only create a file if there is a response
1614
- if len(archiveHtml) != 0:
2006
+ responseLength = len(archiveContent) if isBinary else len(archiveHtml)
2007
+ if responseLength != 0:
1615
2008
 
2009
+ # For text files, check for custom 404 pages
1616
2010
  # If the FILTER_CODE doesn't include 404, OR
1617
2011
  # If the FILTER_CODE includes 404, and it doesn't seem to be a custom 404 page
1618
- if "404" not in FILTER_CODE or (
1619
- "404" in FILTER_CODE
1620
- and not re.findall(REGEX_404, archiveHtml, re.DOTALL | re.IGNORECASE)
1621
- ):
1622
-
1623
- # Add the URL as a comment at the start of the response
1624
- if args.url_filename:
1625
- archiveHtml = (
1626
- "/* Original URL: " + archiveUrl + " */\n" + archiveHtml
2012
+ if (
2013
+ isBinary
2014
+ or "404" not in FILTER_CODE
2015
+ or (
2016
+ "404" in FILTER_CODE
2017
+ and not re.findall(
2018
+ REGEX_404, archiveHtml, re.DOTALL | re.IGNORECASE
1627
2019
  )
1628
-
1629
- # Remove all web archive references in the response
1630
- archiveHtml = re.sub(
1631
- r'\<script type=\"text\/javascript" src=\"\/_static\/js\/bundle-playback\.js\?v=[A-Za-z0-9]*" charset="utf-8"><\/script>\n<script type="text\/javascript" src="\/_static\/js\/wombat\.js.*\<\!-- End Wayback Rewrite JS Include --\>',
1632
- "",
1633
- archiveHtml,
1634
- 1,
1635
- flags=re.DOTALL | re.IGNORECASE,
1636
- )
1637
- archiveHtml = re.sub(
1638
- r"\<script src=\"\/\/archive\.org.*\<\!-- End Wayback Rewrite JS Include --\>",
1639
- "",
1640
- archiveHtml,
1641
- 1,
1642
- flags=re.DOTALL | re.IGNORECASE,
1643
- )
1644
- archiveHtml = re.sub(
1645
- r"\<script\>window\.RufflePlayer[^\<]*\<\/script\>",
1646
- "",
1647
- archiveHtml,
1648
- 1,
1649
- flags=re.DOTALL | re.IGNORECASE,
1650
- )
1651
- archiveHtml = re.sub(
1652
- r"\<\!-- BEGIN WAYBACK TOOLBAR INSERT --\>.*\<\!-- END WAYBACK TOOLBAR INSERT --\>",
1653
- "",
1654
- archiveHtml,
1655
- 1,
1656
- flags=re.DOTALL | re.IGNORECASE,
1657
- )
1658
- archiveHtml = re.sub(
1659
- r"(}\n)?(\/\*|<!--\n)\s*FILE ARCHIVED ON.*108\(a\)\(3\)\)\.\n(\*\/|-->)",
1660
- "",
1661
- archiveHtml,
1662
- 1,
1663
- flags=re.DOTALL | re.IGNORECASE,
1664
- )
1665
- archiveHtml = re.sub(
1666
- r"var\s_____WB\$wombat\$assign\$function.*WB\$wombat\$assign\$function_____\(\"opener\"\);",
1667
- "",
1668
- archiveHtml,
1669
- 1,
1670
- flags=re.DOTALL | re.IGNORECASE,
1671
- )
1672
- archiveHtml = re.sub(
1673
- r"(\<\!--|\/\*)\nplayback timings.*(--\>|\*\/)",
1674
- "",
1675
- archiveHtml,
1676
- 1,
1677
- flags=re.DOTALL | re.IGNORECASE,
1678
- )
1679
- archiveHtml = re.sub(
1680
- r"((https:)?\/\/web\.archive\.org)?\/web\/[0-9]{14}([A-Za-z]{2}\_)?\/",
1681
- "",
1682
- archiveHtml,
1683
- flags=re.IGNORECASE,
1684
- )
1685
- archiveHtml = re.sub(
1686
- r"((https:)?\\\/\\\/web\.archive\.org)?\\\/web\\\/[0-9]{14}([A-Za-z]{2}\_)?\\\/",
1687
- "",
1688
- archiveHtml,
1689
- flags=re.IGNORECASE,
1690
- )
1691
- archiveHtml = re.sub(
1692
- r"((https:)?%2F%2Fweb\.archive\.org)?%2Fweb%2F[0-9]{14}([A-Za-z]{2}\_)?%2F",
1693
- "",
1694
- archiveHtml,
1695
- flags=re.IGNORECASE,
1696
- )
1697
- archiveHtml = re.sub(
1698
- r"((https:)?\\u002F\\u002Fweb\.archive\.org)?\\u002Fweb\\u002F[0-9]{14}([A-Za-z]{2}\_)?\\u002F",
1699
- "",
1700
- archiveHtml,
1701
- flags=re.IGNORECASE,
1702
- )
1703
- archiveHtml = re.sub(
1704
- r"\<script type=\"text\/javascript\">\s*__wm\.init\(\"https:\/\/web\.archive\.org\/web\"\);[^\<]*\<\/script\>",
1705
- "",
1706
- archiveHtml,
1707
- flags=re.IGNORECASE,
1708
- )
1709
- archiveHtml = re.sub(
1710
- r'\<script type=\"text\/javascript\" src="https:\/\/web-static\.archive\.org[^\<]*\<\/script\>',
1711
- "",
1712
- archiveHtml,
1713
- flags=re.IGNORECASE,
1714
- )
1715
- archiveHtml = re.sub(
1716
- r"\<link rel=\"stylesheet\" type=\"text\/css\" href=\"https:\/\/web-static\.archive\.org[^\<]*\/\>",
1717
- "",
1718
- archiveHtml,
1719
- flags=re.IGNORECASE,
1720
- )
1721
- archiveHtml = re.sub(
1722
- r"\<\!-- End Wayback Rewrite JS Include --\>",
1723
- "",
1724
- archiveHtml,
1725
- re.IGNORECASE,
1726
2020
  )
2021
+ ):
1727
2022
 
1728
- # If there is a specific Wayback error in the response, raise an exception
1729
- if (
1730
- archiveHtml.lower().find(
1731
- "wayback machine has not archived that url"
2023
+ # For text files only: Add URL comment and clean up wayback references
2024
+ if not isBinary:
2025
+ # Add the URL as a comment at the start of the response
2026
+ if args.url_filename:
2027
+ archiveHtml = (
2028
+ "/* Original URL: " + archiveUrl + " */\n" + archiveHtml
2029
+ )
2030
+
2031
+ # Remove all web archive references in the response
2032
+ archiveHtml = re.sub(
2033
+ r'\<script type=\"text\/javascript" src=\"\/_static\/js\/bundle-playback\.js\?v=[A-Za-z0-9]*" charset="utf-8"\><\/script>\n<script type="text\/javascript" src="\/_static\/js\/wombat\.js.*\<\!-- End Wayback Rewrite JS Include --\>',
2034
+ "",
2035
+ archiveHtml,
2036
+ 1,
2037
+ flags=re.DOTALL | re.IGNORECASE,
1732
2038
  )
1733
- > 0
1734
- or archiveHtml.lower().find(
1735
- "snapshot cannot be displayed due to an internal error"
2039
+ archiveHtml = re.sub(
2040
+ r"\<script src=\"\/\/archive\.org.*\<\!-- End Wayback Rewrite JS Include --\>",
2041
+ "",
2042
+ archiveHtml,
2043
+ 1,
2044
+ flags=re.DOTALL | re.IGNORECASE,
1736
2045
  )
1737
- > 0
1738
- ):
1739
- raise WayBackException
2046
+ archiveHtml = re.sub(
2047
+ r"\<script\>window\.RufflePlayer[^\<]*\<\/script\>",
2048
+ "",
2049
+ archiveHtml,
2050
+ 1,
2051
+ flags=re.DOTALL | re.IGNORECASE,
2052
+ )
2053
+ archiveHtml = re.sub(
2054
+ r"\<\!-- BEGIN WAYBACK TOOLBAR INSERT --\>.*\<\!-- END WAYBACK TOOLBAR INSERT --\>",
2055
+ "",
2056
+ archiveHtml,
2057
+ 1,
2058
+ flags=re.DOTALL | re.IGNORECASE,
2059
+ )
2060
+ archiveHtml = re.sub(
2061
+ r"(}\n)?(\/\*|<\!--\n)\s*FILE ARCHIVED ON.*108\(a\)\(3\)\)\.\n(\*\/|--\>)",
2062
+ "",
2063
+ archiveHtml,
2064
+ 1,
2065
+ flags=re.DOTALL | re.IGNORECASE,
2066
+ )
2067
+ archiveHtml = re.sub(
2068
+ r"var\s_____WB\$wombat\$assign\$function.*WB\$wombat\$assign\$function_____\(\"opener\"\);",
2069
+ "",
2070
+ archiveHtml,
2071
+ 1,
2072
+ flags=re.DOTALL | re.IGNORECASE,
2073
+ )
2074
+ archiveHtml = re.sub(
2075
+ r"(\<\!--|\/\*)\nplayback timings.*(--\>|\*\/)",
2076
+ "",
2077
+ archiveHtml,
2078
+ 1,
2079
+ flags=re.DOTALL | re.IGNORECASE,
2080
+ )
2081
+ archiveHtml = re.sub(
2082
+ r"((https:)?\/\/web\.archive\.org)?\/web\/[0-9]{14}([A-Za-z]{2}\_)?\/",
2083
+ "",
2084
+ archiveHtml,
2085
+ flags=re.IGNORECASE,
2086
+ )
2087
+ archiveHtml = re.sub(
2088
+ r"((https:)?\\\/\\\/web\.archive\.org)?\\\/web\\\/[0-9]{14}([A-Za-z]{2}\_)?\\\/",
2089
+ "",
2090
+ archiveHtml,
2091
+ flags=re.IGNORECASE,
2092
+ )
2093
+ archiveHtml = re.sub(
2094
+ r"((https:)?%2F%2Fweb\.archive\.org)?%2Fweb%2F[0-9]{14}([A-Za-z]{2}\_)?%2F",
2095
+ "",
2096
+ archiveHtml,
2097
+ flags=re.IGNORECASE,
2098
+ )
2099
+ archiveHtml = re.sub(
2100
+ r"((https:)?\\u002F\\u002Fweb\.archive\.org)?\\u002Fweb\\u002F[0-9]{14}([A-Za-z]{2}\_)?\\u002F",
2101
+ "",
2102
+ archiveHtml,
2103
+ flags=re.IGNORECASE,
2104
+ )
2105
+ archiveHtml = re.sub(
2106
+ r"\<script type=\"text\/javascript\"\>\s*__wm\.init\(\"https:\/\/web\.archive\.org\/web\"\);[^\<]*\<\/script\>",
2107
+ "",
2108
+ archiveHtml,
2109
+ flags=re.IGNORECASE,
2110
+ )
2111
+ archiveHtml = re.sub(
2112
+ r'\<script type=\"text\/javascript\" src="https:\/\/web-static\.archive\.org[^\<]*\<\/script\>',
2113
+ "",
2114
+ archiveHtml,
2115
+ flags=re.IGNORECASE,
2116
+ )
2117
+ archiveHtml = re.sub(
2118
+ r"\<link rel=\"stylesheet\" type=\"text\/css\" href=\"https:\/\/web-static\.archive\.org[^\<]*\/\>",
2119
+ "",
2120
+ archiveHtml,
2121
+ flags=re.IGNORECASE,
2122
+ )
2123
+ archiveHtml = re.sub(
2124
+ r"\<\!-- End Wayback Rewrite JS Include --\>",
2125
+ "",
2126
+ archiveHtml,
2127
+ re.IGNORECASE,
2128
+ )
2129
+
2130
+ # If there is a specific Wayback error in the response, raise an exception
2131
+ if (
2132
+ archiveHtml.lower().find(
2133
+ "wayback machine has not archived that url"
2134
+ )
2135
+ > 0
2136
+ or archiveHtml.lower().find(
2137
+ "snapshot cannot be displayed due to an internal error"
2138
+ )
2139
+ > 0
2140
+ ):
2141
+ raise WayBackException
1740
2142
 
1741
2143
  # Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
1742
2144
  if args.url_filename:
1743
2145
  fileName = url.replace("/", "-").replace(":", "")
1744
2146
  fileName = fileName[0:254]
1745
2147
  else:
1746
- hashValue = filehash(archiveHtml)
2148
+ # For binary files, hash the raw bytes; for text, hash the text
2149
+ if isBinary:
2150
+ hashValue = filehash(archiveContent.hex())
2151
+ else:
2152
+ hashValue = filehash(archiveHtml)
1747
2153
  fileName = hashValue
1748
2154
 
1749
2155
  # Determine extension of file from the content-type using the mimetypes library
@@ -1785,11 +2191,15 @@ def processArchiveUrl(url):
1785
2191
  extension = "css"
1786
2192
  elif "pdf" in extension:
1787
2193
  extension = "pdf"
2194
+ elif "zip" in extension:
2195
+ extension = "zip"
2196
+ elif "gzip" in extension or "x-gzip" in extension:
2197
+ extension = "gz"
1788
2198
  elif "plain" == extension:
1789
2199
  extension = "txt"
1790
2200
 
1791
2201
  # If extension is still blank, set to html if the content ends with HTML tag, otherwise set to unknown
1792
- if extension == "":
2202
+ if extension == "" and not isBinary:
1793
2203
  if (
1794
2204
  archiveHtml.lower().strip().endswith("</html>")
1795
2205
  or archiveHtml.lower()
@@ -1800,6 +2210,8 @@ def processArchiveUrl(url):
1800
2210
  extension = "html"
1801
2211
  else:
1802
2212
  extension = "unknown"
2213
+ elif extension == "" and isBinary:
2214
+ extension = "bin"
1803
2215
 
1804
2216
  fileName = fileName + "." + extension
1805
2217
 
@@ -1816,10 +2228,14 @@ def processArchiveUrl(url):
1816
2228
  + f"{fileName}"
1817
2229
  )
1818
2230
 
1819
- # Write the file
2231
+ # Write the file - binary mode for binary files, text mode for text files
1820
2232
  try:
1821
- responseFile = open(filePath, "w", encoding="utf8")
1822
- responseFile.write(archiveHtml)
2233
+ if isBinary:
2234
+ responseFile = open(filePath, "wb")
2235
+ responseFile.write(archiveContent)
2236
+ else:
2237
+ responseFile = open(filePath, "w", encoding="utf8")
2238
+ responseFile.write(archiveHtml)
1823
2239
  responseFile.close()
1824
2240
  fileCount = fileCount + 1
1825
2241
  except Exception as e:
@@ -1852,9 +2268,10 @@ def processArchiveUrl(url):
1852
2268
  )
1853
2269
  )
1854
2270
 
1855
- # FOR DEBUGGING PURPOSES
2271
+ # FOR DEBUGGING PURPOSES (only for text files)
1856
2272
  try:
1857
- if os.environ.get("USER") == "xnl":
2273
+ if os.environ.get("USER") == "xnl" and not isBinary:
2274
+
1858
2275
  debugText = ""
1859
2276
  if archiveHtml.lower().find("archive.org") > 0:
1860
2277
  debugText = "ARCHIVE.ORG"
@@ -1862,20 +2279,32 @@ def processArchiveUrl(url):
1862
2279
  debugText = "INTERNET ARCHIVE"
1863
2280
  elif archiveHtml.lower().find("wombat") > 0:
1864
2281
  debugText = "WOMBAT (JS)"
1865
- if debugText != "":
2282
+ if verbose() and debugText != "":
1866
2283
  writerr(
1867
2284
  colored(
1868
2285
  getSPACER(
1869
- '"'
2286
+ '[ DBG ] "'
1870
2287
  + fileName
1871
2288
  + '" CONTAINS '
1872
2289
  + debugText
1873
2290
  + " - CHECK ITS A VALID REFERENCE"
1874
2291
  ),
1875
2292
  "yellow",
2293
+ attrs=["dark"],
1876
2294
  )
1877
2295
  )
1878
- except Exception:
2296
+ except Exception as e:
2297
+ if verbose():
2298
+ writerr(
2299
+ colored(
2300
+ '[ DBG ] Error - Failed to output debug info for "'
2301
+ + archiveUrl
2302
+ + '": '
2303
+ + str(e),
2304
+ "red",
2305
+ attrs=["dark"],
2306
+ )
2307
+ )
1879
2308
  pass
1880
2309
 
1881
2310
  successCount = successCount + 1
@@ -2346,17 +2775,20 @@ def validateArgProviders(x):
2346
2775
  - urlscan
2347
2776
  - virustotal
2348
2777
  - intelx
2778
+ - ghostarchive
2349
2779
  """
2350
2780
  invalid = False
2351
2781
  x = x.lower()
2352
2782
  providers = x.split(",")
2353
2783
  for provider in providers:
2354
- if not re.fullmatch(r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx)", provider):
2784
+ if not re.fullmatch(
2785
+ r"(wayback|commoncrawl|otx|urlscan|virustotal|intelx|ghostarchive)", provider
2786
+ ):
2355
2787
  invalid = True
2356
2788
  break
2357
2789
  if invalid:
2358
2790
  raise argparse.ArgumentTypeError(
2359
- "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx"
2791
+ "Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive"
2360
2792
  )
2361
2793
  return x
2362
2794
 
@@ -2897,17 +3329,38 @@ def getURLScanDOM(originalUrl, domUrl):
2897
3329
  resp = session.get(
2898
3330
  domUrl, headers={"User-Agent": userAgent}, allow_redirects=True
2899
3331
  )
2900
- archiveHtml = str(resp.text)
3332
+
3333
+ # Get raw content bytes first
3334
+ contentBytes = resp.content
3335
+
3336
+ # Get content type from response headers
3337
+ try:
3338
+ contentType = resp.headers.get("Content-Type", "").split(";")[0].lower()
3339
+ except Exception:
3340
+ contentType = ""
3341
+
3342
+ # Determine if this is binary content based on actual content, Content-Type, and URL
3343
+ isBinary = isBinaryContent(contentBytes, contentType, originalUrl)
3344
+
3345
+ if isBinary:
3346
+ # For binary files, use raw bytes as-is
3347
+ archiveContent = contentBytes
3348
+ archiveHtml = None
3349
+ else:
3350
+ # For text files, decode to string
3351
+ archiveHtml = contentBytes.decode("utf-8", errors="replace")
3352
+ archiveContent = None
2901
3353
 
2902
3354
  # If there is a specific URLScan error in the response, raise an exception
2903
- if archiveHtml.lower().strip() == "not found!":
3355
+ if not isBinary and archiveHtml.lower().strip() == "not found!":
2904
3356
  raise WayBackException
2905
3357
 
2906
3358
  # Only create a file if there is a response
2907
- if len(archiveHtml) != 0:
3359
+ responseLength = len(archiveContent) if isBinary else len(archiveHtml)
3360
+ if responseLength != 0:
2908
3361
 
2909
- # Add the URL as a comment at the start of the response
2910
- if args.url_filename:
3362
+ # Add the URL as a comment at the start of the response (text files only)
3363
+ if not isBinary and args.url_filename:
2911
3364
  archiveHtml = "/* Original URL: " + originalUrl + " */\n" + archiveHtml
2912
3365
 
2913
3366
  # Create file name based on url or hash value of the response, depending on selection. Ensure the file name isn't over 255 characters
@@ -2915,7 +3368,11 @@ def getURLScanDOM(originalUrl, domUrl):
2915
3368
  fileName = originalUrl.replace("/", "-").replace(":", "")
2916
3369
  fileName = fileName[0:254]
2917
3370
  else:
2918
- hashValue = filehash(archiveHtml)
3371
+ # For binary files, hash the raw bytes; for text, hash the text
3372
+ if isBinary:
3373
+ hashValue = filehash(archiveContent.hex())
3374
+ else:
3375
+ hashValue = filehash(archiveHtml)
2919
3376
  fileName = hashValue
2920
3377
 
2921
3378
  # Determine extension of file from the content-type using the mimetypes library
@@ -2933,7 +3390,7 @@ def getURLScanDOM(originalUrl, domUrl):
2933
3390
  pass
2934
3391
 
2935
3392
  # If the extension is blank, numeric, longer than 4 characters or not alphanumeric - then set to html if the content ends with HTML tag, otherwise set to unknown
2936
- if extension == "":
3393
+ if extension == "" and not isBinary:
2937
3394
  if (
2938
3395
  archiveHtml.lower().strip().endswith("</html>")
2939
3396
  or archiveHtml.lower().strip().endswith("</body>")
@@ -2944,6 +3401,8 @@ def getURLScanDOM(originalUrl, domUrl):
2944
3401
  extension = "html"
2945
3402
  else:
2946
3403
  extension = "unknown"
3404
+ elif extension == "" and isBinary:
3405
+ extension = "bin"
2947
3406
 
2948
3407
  fileName = fileName + "." + extension
2949
3408
 
@@ -2960,10 +3419,14 @@ def getURLScanDOM(originalUrl, domUrl):
2960
3419
  + f"{fileName}"
2961
3420
  )
2962
3421
 
2963
- # Write the file
3422
+ # Write the file - binary mode for binary files, text mode for text files
2964
3423
  try:
2965
- responseFile = open(filePath, "w", encoding="utf8")
2966
- responseFile.write(archiveHtml)
3424
+ if isBinary:
3425
+ responseFile = open(filePath, "wb")
3426
+ responseFile.write(archiveContent)
3427
+ else:
3428
+ responseFile = open(filePath, "w", encoding="utf8")
3429
+ responseFile.write(archiveHtml)
2967
3430
  responseFile.close()
2968
3431
  fileCount = fileCount + 1
2969
3432
  except Exception as e:
@@ -3083,98 +3546,614 @@ def getURLScanDOM(originalUrl, domUrl):
3083
3546
  writerr(colored("ERROR getURLScanDOM 1: " + str(e), "red"))
3084
3547
 
3085
3548
 
3086
- def format_date_for_urlscan(date_str):
3087
- # Handle different lengths of input
3088
- if len(date_str) == 4: # YYYY
3089
- date_str += "0101"
3090
- elif len(date_str) == 6: # YYYYMM
3091
- date_str += "01"
3092
-
3093
- # Convert to YYYY-MM-DD format
3094
- try:
3095
- formatted_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
3096
- return formatted_date
3097
- except Exception:
3098
- return ""
3099
-
3100
-
3101
- def getURLScanUrls():
3549
+ def getGhostArchiveWARC(originalUrl, domUrl):
3102
3550
  """
3103
- Get URLs from the URLSCan API, urlscan.io
3551
+ Get the DOM for the passed GhostArchive link - parses WARC files containing multiple request/response pairs
3104
3552
  """
3105
- global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
3106
-
3107
- # Write the file of URL's for the passed domain/URL
3553
+ global stopProgram, successCount, failureCount, fileCount, DEFAULT_OUTPUT_DIR, totalResponses, indexFile, argsInput, argsInputHostname, REGEX_404, linksFound, extraWarcLinks, links_lock
3108
3554
  try:
3109
- requestsMade = 0
3110
- stopSourceURLScan = False
3111
- linksFoundURLScan = set()
3112
- totalUrls = 0
3113
- checkResponse = True
3114
-
3115
- # Set the URL to just the hostname
3116
- url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
3555
+ if stopProgram is None:
3117
3556
 
3118
- # If the --from-date or --to-date parameters were paassed then also add a date filter
3119
- if args.from_date or args.to_date:
3120
- if args.from_date:
3121
- fromDate = format_date_for_urlscan(str(args.from_date)[:8])
3122
- else:
3123
- fromDate = "2016-01-01" # The year URLScan started
3124
- if args.to_date:
3125
- toDate = format_date_for_urlscan(str(args.to_date)[:8])
3126
- else:
3127
- toDate = "now"
3128
- url = url.replace("{DATERANGE}", f"%20date:[{fromDate}%20TO%20{toDate}]")
3129
- else:
3130
- url = url.replace("{DATERANGE}", "")
3557
+ # The WARC files are found by replacing /archive with /chimurai4 and using the .warc file extension
3558
+ warcUrl = domUrl.replace("/archive", "/chimurai4") + ".warc"
3131
3559
 
3132
- if verbose():
3133
- if args.mode == "R":
3134
- write(
3135
- colored(
3136
- "URLScan - [ INFO ] The URLScan URL requested to get links for responses: ",
3137
- "magenta",
3138
- )
3139
- + colored(url + "\n", "white")
3140
- )
3141
- else:
3142
- write(
3143
- colored(
3144
- "URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
3145
- )
3146
- + colored(url + "\n", "white")
3147
- )
3560
+ # Get memory usage every 100 responses
3561
+ if (successCount + failureCount) % 100 == 0:
3562
+ try:
3563
+ getMemory()
3564
+ except Exception:
3565
+ pass
3148
3566
 
3149
- if args.mode in ("U", "B") and not args.check_only:
3150
- write(
3151
- colored(
3152
- "URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
3153
- "cyan",
3154
- )
3155
- )
3567
+ # Fetch content
3568
+ try:
3569
+ # Show progress bar
3570
+ fillTest = (successCount + failureCount) % 2
3571
+ fillChar = "o"
3572
+ if fillTest == 0:
3573
+ fillChar = "O"
3574
+ suffix = "Complete "
3156
3575
 
3157
- # Get the first page from urlscan.io
3158
- try:
3159
- # Choose a random user agent string to use for any requests
3160
- # For other sources we would use `random.choice(USER_AGENT)` to asignn a random user-agent, but it seems
3161
- # that there are a handful of those that ALWAYS return 429. Passing a specific one all the time seems to
3162
- # be successful all the time
3163
- userAgent = "waymore v" + __version__ + " by xnl-h4ck3r"
3164
- session = requests.Session()
3165
- session.mount("https://", HTTP_ADAPTER)
3166
- session.mount("http://", HTTP_ADAPTER)
3167
- # Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
3168
- resp = session.get(url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY})
3169
- requestsMade = requestsMade + 1
3170
- except Exception as e:
3171
- write(
3172
- colored(
3173
- "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
3174
- "red",
3576
+ printProgressBar(
3577
+ successCount + failureCount,
3578
+ totalResponses,
3579
+ prefix="Processing " + str(totalResponses) + " WARC files:",
3580
+ suffix=suffix,
3581
+ length=getProgressBarLength(),
3582
+ fill=fillChar,
3175
3583
  )
3176
- )
3177
- return
3584
+
3585
+ try:
3586
+ try:
3587
+ if verbose() and os.environ.get("USER") == "xnl":
3588
+ writerr(
3589
+ colored(
3590
+ "[ DBG ] Requesting file " + warcUrl,
3591
+ "yellow",
3592
+ attrs=["dark"],
3593
+ )
3594
+ )
3595
+ except Exception:
3596
+ pass
3597
+
3598
+ # Choose a random user agent string to use for any requests
3599
+ userAgent = random.choice(USER_AGENT)
3600
+ session = requests.Session()
3601
+ session.mount("https://", HTTP_ADAPTER)
3602
+ session.mount("http://", HTTP_ADAPTER)
3603
+
3604
+ # Retry loop for 503 or maintenance responses
3605
+ maxRetries = 3
3606
+ warcBytes = b""
3607
+ for attempt in range(maxRetries):
3608
+ resp = session.get(
3609
+ warcUrl,
3610
+ headers={"User-Agent": userAgent},
3611
+ allow_redirects=True,
3612
+ timeout=args.timeout,
3613
+ )
3614
+ warcBytes = resp.content
3615
+
3616
+ # Check if we need to retry (decode just for this check)
3617
+ try:
3618
+ warcTextCheck = warcBytes.decode("utf-8", errors="replace").lower()
3619
+ except Exception:
3620
+ warcTextCheck = ""
3621
+ if resp.status_code == 503 or "website under maintenance" in warcTextCheck:
3622
+ if attempt < maxRetries - 1:
3623
+ import time
3624
+
3625
+ time.sleep(0.5)
3626
+ continue
3627
+ break
3628
+
3629
+ # Parse the WARC file to extract multiple responses
3630
+ # WARC header lines are text, but response bodies may be binary
3631
+ # Split by line separator but keep bytes for body extraction
3632
+ lineBytes = warcBytes.split(b"\n")
3633
+ lines = [lb.decode("utf-8", errors="replace") for lb in lineBytes]
3634
+
3635
+ # State machine to track parsing
3636
+ currentTargetUri = ""
3637
+ inResponse = False
3638
+ contentType = ""
3639
+ responsesFound = (
3640
+ []
3641
+ ) # List of (targetUri, contentType, responseBytes, httpStatusCode)
3642
+
3643
+ i = 0
3644
+ skipCurrentResponse = False # Initialize before loop
3645
+ pendingResponseType = (
3646
+ False # Track if we saw WARC-Type: response and are waiting for Target-URI
3647
+ )
3648
+ responseStartIdx = -1 # Initialize before loop
3649
+ httpStatusCode = "" # Initialize before loop
3650
+ while i < len(lines) and stopProgram is None and not stopSourceGhostArchive:
3651
+ line = lines[i]
3652
+
3653
+ # When we see a new WARC record start, reset pending state
3654
+ if line.startswith("WARC/1.0"):
3655
+ # If we were in a response and collecting, save it before moving to new record
3656
+ if inResponse and responseStartIdx >= 0:
3657
+ responseBodyBytes = b"\n".join(lineBytes[responseStartIdx:i])
3658
+ responsesFound.append(
3659
+ (
3660
+ currentTargetUri,
3661
+ contentType,
3662
+ responseBodyBytes,
3663
+ httpStatusCode if "httpStatusCode" in dir() else "",
3664
+ )
3665
+ )
3666
+ inResponse = False
3667
+ responseStartIdx = -1
3668
+ contentType = ""
3669
+ httpStatusCode = ""
3670
+ pendingResponseType = False
3671
+ skipCurrentResponse = False
3672
+
3673
+ # Look for WARC-Type: response - mark that we're in a response record header
3674
+ elif line.startswith("WARC-Type: response"):
3675
+ pendingResponseType = True
3676
+ inResponse = False # Don't start capturing body yet
3677
+ responseStartIdx = -1
3678
+ contentType = ""
3679
+
3680
+ # Look for WARC-Target-URI to get the request URL
3681
+ elif line.startswith("WARC-Target-URI:"):
3682
+ currentTargetUri = line.split(":", 1)[1].strip()
3683
+ skipCurrentResponse = False
3684
+
3685
+ # Check: URL host must contain the input hostname
3686
+ if argsInputHostname:
3687
+ try:
3688
+ parsed = urlparse(currentTargetUri)
3689
+ host = parsed.netloc.lower()
3690
+ if argsInputHostname.lower() not in host:
3691
+ skipCurrentResponse = True
3692
+ except Exception:
3693
+ skipCurrentResponse = True
3694
+
3695
+ # Check: Filter by URL (FILTER_URL)
3696
+ if not skipCurrentResponse and FILTER_URL and currentTargetUri:
3697
+ filterUrls = [u.strip().lower() for u in FILTER_URL.split(",")]
3698
+ for filterUrl in filterUrls:
3699
+ if filterUrl in currentTargetUri.lower():
3700
+ skipCurrentResponse = True
3701
+ break
3702
+
3703
+ # If we were waiting for Target-URI after seeing WARC-Type: response, and it's valid, start response mode
3704
+ if pendingResponseType and not skipCurrentResponse:
3705
+ inResponse = True
3706
+ pendingResponseType = False
3707
+
3708
+ # If we're in a response section (after seeing both WARC-Type: response and valid WARC-Target-URI)
3709
+ elif inResponse:
3710
+ # Check for HTTP start and capture status code
3711
+ if line.startswith("HTTP"):
3712
+ # Extract status code (e.g., "HTTP/1.1 200 OK" -> "200")
3713
+ try:
3714
+ httpStatusCode = line.split()[1]
3715
+ except Exception:
3716
+ httpStatusCode = ""
3717
+
3718
+ # Early check: Filter by HTTP status code (FILTER_CODE)
3719
+ if FILTER_CODE and httpStatusCode:
3720
+ filterCodes = [c.strip() for c in FILTER_CODE.split(",")]
3721
+ if httpStatusCode in filterCodes:
3722
+ inResponse = False
3723
+ responseStartIdx = -1
3724
+ i += 1
3725
+ continue
3726
+
3727
+ responseStartIdx = i # Mark start of response
3728
+ elif responseStartIdx >= 0:
3729
+ # Capture Content-Type if present (case-insensitive check)
3730
+ if line.lower().startswith("content-type:"):
3731
+ try:
3732
+ contentType = (
3733
+ line.split(":", 1)[1].strip().split(";")[0].lower()
3734
+ )
3735
+ except Exception:
3736
+ pass
3737
+
3738
+ # Early check: Filter by MIME type (FILTER_MIME)
3739
+ if FILTER_MIME and contentType:
3740
+ filterMimes = [
3741
+ m.strip().lower() for m in FILTER_MIME.split(",")
3742
+ ]
3743
+ if contentType in filterMimes:
3744
+ inResponse = False
3745
+ responseStartIdx = -1
3746
+ i += 1
3747
+ continue
3748
+
3749
+ i += 1
3750
+
3751
+ if stopProgram is not None:
3752
+ return
3753
+
3754
+ # Don't forget the last response if file doesn't end with WARC/1.0
3755
+ if inResponse and responseStartIdx >= 0:
3756
+ responseBodyBytes = b"\n".join(lineBytes[responseStartIdx:])
3757
+ responsesFound.append(
3758
+ (
3759
+ currentTargetUri,
3760
+ contentType,
3761
+ responseBodyBytes,
3762
+ httpStatusCode if "httpStatusCode" in dir() else "",
3763
+ )
3764
+ )
3765
+
3766
+ # Process each response found
3767
+ for targetUri, contentType, responseBytes, httpStatusCode in responsesFound:
3768
+ if stopProgram is not None:
3769
+ break
3770
+
3771
+ if not responseBytes:
3772
+ continue
3773
+
3774
+ # Split HTTP header from body in bytes (look for \r\n\r\n or \n\n separator)
3775
+ if b"\r\n\r\n" in responseBytes:
3776
+ bodyBytes = responseBytes.split(b"\r\n\r\n", 1)[1]
3777
+ elif b"\n\n" in responseBytes:
3778
+ bodyBytes = responseBytes.split(b"\n\n", 1)[1]
3779
+ else:
3780
+ bodyBytes = responseBytes
3781
+
3782
+ # Skip empty bodies or "not found" responses
3783
+ if not bodyBytes or bodyBytes.lower().strip() == b"not found":
3784
+ continue
3785
+
3786
+ # If -f / --filter-responses-only is passed, track all URLs immediately (before filtering)
3787
+ if args.mode == "B" and args.filter_responses_only and targetUri:
3788
+ with links_lock:
3789
+ if targetUri not in linksFound and targetUri not in extraWarcLinks:
3790
+ extraWarcLinks.add(targetUri)
3791
+
3792
+ # Use isBinaryContent to detect if this is binary content
3793
+ isBinary = isBinaryContent(bodyBytes, contentType, targetUri)
3794
+
3795
+ if isBinary:
3796
+ # Binary file - save raw bytes
3797
+ archiveContent = bodyBytes
3798
+ archiveHtml = None
3799
+ else:
3800
+ # Text file - decode to string
3801
+ archiveHtml = bodyBytes.decode("utf-8", errors="replace")
3802
+ archiveContent = None
3803
+
3804
+ # Collapse multiple blank lines into one
3805
+ archiveHtml = re.sub(r"\n{3,}", "\n\n", archiveHtml)
3806
+
3807
+ # Skip if body is empty after processing
3808
+ if not archiveHtml.strip():
3809
+ continue
3810
+
3811
+ if stopProgram is not None:
3812
+ break
3813
+
3814
+ # Determine if this is HTML or JS based on content-type or URL
3815
+ isHtml = (
3816
+ contentType in ["text/html", "application/xhtml+xml"]
3817
+ or targetUri.lower().endswith(".html")
3818
+ or targetUri.lower().endswith(".htm")
3819
+ )
3820
+ isJs = contentType in [
3821
+ "text/javascript",
3822
+ "application/javascript",
3823
+ "application/x-javascript",
3824
+ ] or targetUri.lower().endswith(".js")
3825
+
3826
+ # Add the URL as a comment at the start of the response (only for text files)
3827
+ if not isBinary and args.url_filename:
3828
+ if isHtml:
3829
+ archiveHtml = (
3830
+ "<!-- Original URL: " + targetUri + " -->\n" + archiveHtml
3831
+ )
3832
+ elif isJs:
3833
+ archiveHtml = (
3834
+ "/* Original URL: " + targetUri + " */\n" + archiveHtml
3835
+ )
3836
+
3837
+ # Create file name based on url or hash value
3838
+ if args.url_filename:
3839
+ fileName = targetUri.replace("/", "-").replace(":", "")
3840
+ fileName = fileName[0:254]
3841
+ hashValue = ""
3842
+ else:
3843
+ # Hash the content to get the filename
3844
+ if isBinary:
3845
+ hashValue = filehash(archiveContent)
3846
+ else:
3847
+ hashValue = filehash(archiveHtml)
3848
+ fileName = hashValue
3849
+
3850
+ # Determine extension of file from the content-type or URL
3851
+ extension = ""
3852
+ try:
3853
+ # Get path extension from URL
3854
+ if "://" in targetUri:
3855
+ targetUrl = "https://" + targetUri.split("://")[1]
3856
+ parsed = urlparse(targetUrl.strip())
3857
+ path = parsed.path
3858
+ extension = path[path.rindex(".") + 1 :]
3859
+ if "/" in extension:
3860
+ extension = ""
3861
+ # If extension is over 6 characters, it's likely not a real extension (e.g. API endpoint ID)
3862
+ if len(extension) > 6:
3863
+ extension = ""
3864
+ except Exception:
3865
+ pass
3866
+
3867
+ # If extension is blank, determine from MIME type or content
3868
+ if extension == "":
3869
+ if isBinary:
3870
+ # Binary file extensions from MIME type
3871
+ if contentType:
3872
+ if "image/png" in contentType:
3873
+ extension = "png"
3874
+ elif (
3875
+ "image/jpeg" in contentType
3876
+ or "image/jpg" in contentType
3877
+ ):
3878
+ extension = "jpg"
3879
+ elif "image/gif" in contentType:
3880
+ extension = "gif"
3881
+ elif "image/webp" in contentType:
3882
+ extension = "webp"
3883
+ elif "application/pdf" in contentType:
3884
+ extension = "pdf"
3885
+ elif "application/zip" in contentType:
3886
+ extension = "zip"
3887
+ else:
3888
+ extension = "bin"
3889
+ else:
3890
+ extension = "bin"
3891
+ else:
3892
+ # Text file extensions
3893
+ if contentType and "javascript" in contentType.lower():
3894
+ extension = "js"
3895
+ elif contentType and "html" in contentType.lower():
3896
+ extension = "html"
3897
+ elif contentType and "json" in contentType.lower():
3898
+ extension = "json"
3899
+ elif contentType and "text" in contentType.lower():
3900
+ extension = "txt"
3901
+ elif archiveHtml and (
3902
+ archiveHtml.lower().strip().endswith("</html>")
3903
+ or archiveHtml.lower().strip().endswith("</body>")
3904
+ or archiveHtml.lower().strip().startswith("<!doctype html")
3905
+ or archiveHtml.lower().strip().startswith("<html")
3906
+ or archiveHtml.lower().strip().startswith("<head")
3907
+ ):
3908
+ extension = "html"
3909
+ else:
3910
+ extension = "unknown"
3911
+
3912
+ fileName = fileName + "." + extension
3913
+
3914
+ # Determine file path
3915
+ if args.output_responses != "":
3916
+ filePath = args.output_responses + "/" + f"{fileName}"
3917
+ else:
3918
+ filePath = (
3919
+ DEFAULT_OUTPUT_DIR
3920
+ + "/results/"
3921
+ + str(argsInput).replace("/", "-")
3922
+ + "/"
3923
+ + f"{fileName}"
3924
+ )
3925
+
3926
+ if stopProgram is not None:
3927
+ break
3928
+
3929
+ # Write the file
3930
+ try:
3931
+ if isBinary:
3932
+ # Binary file - write as bytes
3933
+ responseFile = open(filePath, "wb")
3934
+ responseFile.write(archiveContent)
3935
+ else:
3936
+ # Text file - write as UTF-8
3937
+ responseFile = open(filePath, "w", encoding="utf8")
3938
+ responseFile.write(archiveHtml)
3939
+ responseFile.close()
3940
+ with links_lock:
3941
+ fileCount = fileCount + 1
3942
+
3943
+ # Track extra URLs found in WARC files for mode B (only when -f is not passed, since we track earlier if it is)
3944
+ if args.mode == "B" and not args.filter_responses_only and targetUri:
3945
+ with links_lock:
3946
+ if (
3947
+ targetUri not in linksFound
3948
+ and targetUri not in extraWarcLinks
3949
+ ):
3950
+ extraWarcLinks.add(targetUri)
3951
+ except Exception as e:
3952
+ writerr(
3953
+ colored(
3954
+ "GhostArchive - [ ERR ] Failed to write file "
3955
+ + filePath
3956
+ + ": "
3957
+ + str(e),
3958
+ "red",
3959
+ )
3960
+ )
3961
+
3962
+ # Write the hash value and URL to the index file
3963
+ if not args.url_filename and hashValue:
3964
+ try:
3965
+ timestamp = str(datetime.now())
3966
+ indexFile.write(
3967
+ hashValue
3968
+ + ","
3969
+ + domUrl
3970
+ + "#"
3971
+ + targetUri
3972
+ + " ,"
3973
+ + timestamp
3974
+ + "\n"
3975
+ )
3976
+ indexFile.flush()
3977
+ except Exception as e:
3978
+ writerr(
3979
+ colored(
3980
+ 'GhostArchive - [ ERR ] Failed to write to waymore_index.txt for "'
3981
+ + warcUrl
3982
+ + '": '
3983
+ + str(e),
3984
+ "red",
3985
+ )
3986
+ )
3987
+
3988
+ successCount = successCount + 1
3989
+
3990
+ except WayBackException:
3991
+ failureCount = failureCount + 1
3992
+
3993
+ except Exception as e:
3994
+ failureCount = failureCount + 1
3995
+ if verbose():
3996
+ # Simplify common error messages
3997
+ if "connection broken" in str(e).lower():
3998
+ errorMsg = "Connection Broken"
3999
+ else:
4000
+ errorMsg = str(e)
4001
+ try:
4002
+ statusCode = (
4003
+ resp.status_code if "resp" in dir() and resp is not None else "ERR"
4004
+ )
4005
+ writerr(
4006
+ colored(
4007
+ "GhostArchive - [ "
4008
+ + str(statusCode)
4009
+ + ' ] Failed to get response for "'
4010
+ + warcUrl
4011
+ + '": '
4012
+ + errorMsg,
4013
+ "red",
4014
+ )
4015
+ )
4016
+ except Exception:
4017
+ writerr(
4018
+ colored(
4019
+ 'GhostArchive - [ ERR ] Failed to get response for "'
4020
+ + warcUrl
4021
+ + '": '
4022
+ + errorMsg,
4023
+ "red",
4024
+ )
4025
+ )
4026
+
4027
+ # Show memory usage if -v option chosen, and check memory every 25 responses (or if its the last)
4028
+ if (successCount + failureCount) % 25 == 1 or (
4029
+ successCount + failureCount
4030
+ ) == totalResponses:
4031
+ try:
4032
+ getMemory()
4033
+ if verbose():
4034
+ suffix = (
4035
+ "Complete (Mem Usage "
4036
+ + humanReadableSize(currentMemUsage)
4037
+ + ", Total Mem "
4038
+ + str(currentMemPercent)
4039
+ + "%) "
4040
+ )
4041
+ except Exception:
4042
+ if verbose():
4043
+ suffix = 'Complete (To show mem use, run "pip install psutil")'
4044
+ printProgressBar(
4045
+ successCount + failureCount,
4046
+ totalResponses,
4047
+ prefix="Processing " + str(totalResponses) + " WARC files:",
4048
+ suffix=suffix,
4049
+ length=getProgressBarLength(),
4050
+ fill=fillChar,
4051
+ )
4052
+
4053
+ except Exception as e:
4054
+ if verbose():
4055
+ writerr(
4056
+ colored(
4057
+ 'GhostArchive - [ ERR ] Error for "' + domUrl + '": ' + str(e), "red"
4058
+ )
4059
+ )
4060
+
4061
+ except Exception as e:
4062
+ writerr(colored("ERROR getGhostArchiveWARC 1: " + str(e), "red"))
4063
+
4064
+
4065
+ def format_date_for_urlscan(date_str):
4066
+ # Handle different lengths of input
4067
+ if len(date_str) == 4: # YYYY
4068
+ date_str += "0101"
4069
+ elif len(date_str) == 6: # YYYYMM
4070
+ date_str += "01"
4071
+
4072
+ # Convert to YYYY-MM-DD format
4073
+ try:
4074
+ formatted_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
4075
+ return formatted_date
4076
+ except Exception:
4077
+ return ""
4078
+
4079
+
4080
+ def getURLScanUrls():
4081
+ """
4082
+ Get URLs from the URLSCan API, urlscan.io
4083
+ """
4084
+ global URLSCAN_API_KEY, linksFound, linkMimes, waymorePath, subs, stopProgram, stopSourceURLScan, argsInput, checkURLScan, argsInputHostname, linkCountURLScan, linksFoundURLScan
4085
+
4086
+ # Write the file of URL's for the passed domain/URL
4087
+ try:
4088
+ requestsMade = 0
4089
+ stopSourceURLScan = False
4090
+ linksFoundURLScan = set()
4091
+ totalUrls = 0
4092
+ checkResponse = True
4093
+
4094
+ # Set the URL to just the hostname
4095
+ url = URLSCAN_URL.replace("{DOMAIN}", quote(argsInputHostname))
4096
+
4097
+ # If the --from-date or --to-date parameters were paassed then also add a date filter
4098
+ if args.from_date or args.to_date:
4099
+ if args.from_date:
4100
+ fromDate = format_date_for_urlscan(str(args.from_date)[:8])
4101
+ else:
4102
+ fromDate = "2016-01-01" # The year URLScan started
4103
+ if args.to_date:
4104
+ toDate = format_date_for_urlscan(str(args.to_date)[:8])
4105
+ else:
4106
+ toDate = "now"
4107
+ url = url.replace("{DATERANGE}", f"%20date:[{fromDate}%20TO%20{toDate}]")
4108
+ else:
4109
+ url = url.replace("{DATERANGE}", "")
4110
+
4111
+ if verbose():
4112
+ if args.mode == "R":
4113
+ write(
4114
+ colored(
4115
+ "URLScan - [ INFO ] The URLScan URL requested to get links for responses: ",
4116
+ "magenta",
4117
+ )
4118
+ + colored(url + "\n", "white")
4119
+ )
4120
+ else:
4121
+ write(
4122
+ colored(
4123
+ "URLScan - [ INFO ] The URLScan URL requested to get links: ", "magenta"
4124
+ )
4125
+ + colored(url + "\n", "white")
4126
+ )
4127
+
4128
+ if args.mode in ("U", "B") and not args.check_only:
4129
+ write(
4130
+ colored(
4131
+ "URLScan - [ INFO ] Getting links from urlscan.io API (this can take a while for some domains)...",
4132
+ "cyan",
4133
+ )
4134
+ )
4135
+
4136
+ # Get the first page from urlscan.io
4137
+ try:
4138
+ # Choose a random user agent string to use for any requests
4139
+ # For other sources we would use `random.choice(USER_AGENT)` to asignn a random user-agent, but it seems
4140
+ # that there are a handful of those that ALWAYS return 429. Passing a specific one all the time seems to
4141
+ # be successful all the time
4142
+ userAgent = "waymore v" + __version__ + " by xnl-h4ck3r"
4143
+ session = requests.Session()
4144
+ session.mount("https://", HTTP_ADAPTER)
4145
+ session.mount("http://", HTTP_ADAPTER)
4146
+ # Pass the API-Key header too. This can change the max endpoints per page, depending on URLScan subscription
4147
+ resp = session.get(url, headers={"User-Agent": userAgent, "API-Key": URLSCAN_API_KEY})
4148
+ requestsMade = requestsMade + 1
4149
+ except Exception as e:
4150
+ write(
4151
+ colored(
4152
+ "URLScan - [ ERR ] Unable to get links from urlscan.io: " + str(e),
4153
+ "red",
4154
+ )
4155
+ )
4156
+ return
3178
4157
 
3179
4158
  # If the rate limit was reached then determine if to wait and then try again
3180
4159
  if resp.status_code == 429:
@@ -3753,7 +4732,6 @@ def processWayBackPage(url):
3753
4732
  pass
3754
4733
  return
3755
4734
  else:
3756
- print("DEBUG: HERE END!") # DEBUG
3757
4735
  pass
3758
4736
  except Exception as e:
3759
4737
  if verbose():
@@ -4935,80 +5913,373 @@ def processIntelxType(target, credits):
4935
5913
  writerr(colored("ERROR processIntelxType 1: " + str(e), "red"))
4936
5914
 
4937
5915
 
def getIntelxAccountInfo() -> str:
    """
    Get the account info and return the number of Credits remaining from the /phonebook/search
    """
    initIntelxTls()
    try:
        resp = chooseIntelxBase(INTELX_API_KEY)
        # Anything other than a 200 means the account info can't be determined
        if resp is None or resp.status_code != 200:
            return "Unknown"
        # Drill down to the /phonebook/search entry once, then read both counters from it
        searchInfo = json.loads(resp.text.strip()).get("paths", {}).get("/phonebook/search", {})
        remaining = str(searchInfo.get("Credit", "Unknown"))
        maximum = str(searchInfo.get("CreditMax", "Unknown"))
        return remaining + "/" + maximum
    except Exception:
        # Best effort only - any parse/network problem is reported as Unknown
        return "Unknown"
5935
+
5936
+
5937
def getIntelxUrls():
    """
    Get URLs from the Intelligence X Phonebook search
    """
    global INTELX_API_KEY, linksFound, waymorePath, subs, stopProgram, stopSourceIntelx, argsInput, checkIntelx, argsInputHostname, intelxAPIIssue, linkCountIntelx, linksFoundIntelx

    try:
        # In check-only mode just report the minimum number of requests and bail out
        if args.check_only:
            checkOnlyMsg = colored("IntelX - [ INFO ] Get URLs from Intelligence X: ", "cyan")
            write(checkOnlyMsg + colored("minimum 4 requests", "white"))
            checkIntelx = 4
            return

        # Reset per-run source state before talking to the API
        stopSourceIntelx = False
        linksFoundIntelx = set()
        initIntelxTls()

        credits = getIntelxAccountInfo()
        if verbose():
            creditLabel = colored(
                "IntelX - [ INFO ] The Intelligence X URL requested to get links (Credits: "
                + credits
                + "): ",
                "magenta",
            )
            write(creditLabel + colored(intelx_tls.INTELX_SEARCH_URL + "\n", "white"))

        if not args.check_only:
            write(colored("IntelX - [ INFO ] Getting links from intelx.io API...", "cyan"))

        # Type 1 (domains) is only requested when subdomains are wanted (--no-subs not passed)
        if not args.no_subs:
            processIntelxType(1, credits)

        # Type 3 (URLs) is skipped if an earlier call flagged an API problem
        if not intelxAPIIssue:
            processIntelxType(3, credits)

        # Fold the per-source results into the global link set and report the count
        linkCountIntelx = len(linksFoundIntelx)
        countMsg = colored("IntelX - [ INFO ] Links found on intelx.io: ", "cyan")
        write(countMsg + colored(str(linkCountIntelx), "white"))
        linksFound.update(linksFoundIntelx)
        linksFoundIntelx.clear()

    except Exception as e:
        writerr(colored("ERROR getIntelxUrls 1: " + str(e), "red"))
5990
+
5991
+
5992
def processGhostArchiveUrl(url, ghostArchiveID=""):
    """
    Process a specific URL from ghostarchive.org to determine whether to save the link.

    Args:
        url: Candidate URL extracted from a GhostArchive results page.
        ghostArchiveID: The "/archive/<id>" path of the snapshot; when supplied and
                        a response mode (R/B) is active, the DOM download link is
                        queued in ghostArchiveRequestLinks (bounded by args.limit).
    """
    global argsInput, argsInputHostname, links_lock, linkCountGhostArchive, linksFoundGhostArchive

    addLink = True

    try:
        # Strip Wayback Machine prefix if present (e.g., https://web.archive.org/web/20230101120000_/https://example.com)
        waybackMatch = re.match(r"^https?://web\.archive\.org/[^/]+/[a-zA-Z0-9]+_/", url)
        if waybackMatch:
            url = url[waybackMatch.end() :]

        # If the input has a / in it, then a URL was passed, so the link will only be added if the URL matches
        if "/" in url:
            if argsInput not in url:
                addLink = False

        # If filters are required then test them
        if addLink and not args.filter_responses_only:

            # If the user requested -n / --no-subs then we don't want to add it if it has a sub domain (www. will not be classed as a sub domain)
            if args.no_subs:
                # FIX: character class was [A-za-z], which in ASCII also matches
                # "[", "\", "]", "^", "_" and "`" - corrected to [A-Za-z]
                match = re.search(
                    r"^[A-Za-z]*\:\/\/(www\.)?" + re.escape(argsInputHostname),
                    url,
                    flags=re.IGNORECASE,
                )
                if match is None:
                    addLink = False

            # Check the URL exclusions (FILTER_URL is a comma separated list turned into alternatives)
            if addLink:
                match = re.search(
                    r"(" + re.escape(FILTER_URL).replace(",", "|") + ")",
                    url,
                    flags=re.IGNORECASE,
                )
                if match is not None:
                    addLink = False

            # Set keywords filter if -ko argument passed ("#CONFIG" means use the configured keyword list)
            if addLink and args.keywords_only:
                if args.keywords_only == "#CONFIG":
                    match = re.search(
                        r"(" + re.escape(FILTER_KEYWORDS).replace(",", "|") + ")",
                        url,
                        flags=re.IGNORECASE,
                    )
                else:
                    match = re.search(r"(" + args.keywords_only + ")", url, flags=re.IGNORECASE)
                if match is None:
                    addLink = False

        # Add link if it passed filters
        if addLink:
            # Just get the hostname of the url
            tldExtract = tldextract.extract(url)
            subDomain = tldExtract.subdomain
            if subDomain != "":
                subDomain = subDomain + "."
            domainOnly = subDomain + tldExtract.domain + "." + tldExtract.suffix

            # GhostArchive might return URLs that aren't for the domain passed so we need to check for those and not process them
            # Check the URL host really is the target (or a subdomain of it)
            match = re.search(
                r"(^|\.)" + re.escape(argsInputHostname) + "$",
                domainOnly,
                flags=re.IGNORECASE,
            )
            if match is not None:
                if args.mode in ("U", "B"):
                    linksFoundAdd(url, linksFoundGhostArchive)
                # If Response mode is requested then add the DOM ID to try later, for the number of responses wanted
                if ghostArchiveID != "" and args.mode in ("R", "B"):
                    if args.limit == 0 or len(ghostArchiveRequestLinks) < args.limit:
                        with links_lock:
                            ghostArchiveRequestLinks.add(
                                (url, GHOSTARCHIVE_DOM_URL + ghostArchiveID)
                            )

    except Exception as e:
        writerr(colored("ERROR processGhostArchiveUrl 1: " + str(e), "red"))
6079
+
6080
+
6081
def getGhostArchiveUrls():
    """
    Get URLs from GhostArchive (ghostarchive.org)
    This source doesn't have an API, so we crawl the HTML pages directly.
    """
    global linksFound, path, subs, stopProgram, stopSourceGhostArchive, argsInput, checkGhostArchive, argsInputHostname, linkCountGhostArchive, linksFoundGhostArchive

    try:
        # Reset per-run source state
        stopSourceGhostArchive = False
        linksFoundGhostArchive = set()

        # Build the base URL.
        # If there is only one "." in the hostname we can guarantee that a subdomain wasn't passed,
        # so we can prefix the search with "." to only match links on the target domain.
        # Otherwise we get everything and confirm the actual host of each link later.
        if argsInputHostname.count(".") == 1:
            baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", "." + quote(argsInput))
        else:
            baseUrl = GHOSTARCHIVE_URL.replace("{DOMAIN}", quote(argsInput))

        if verbose():
            # The "0" appended is the first page number of the paginated results
            write(
                colored("GhostArchive - [ INFO ] The URL requested to get links: ", "magenta")
                + colored(baseUrl + "0\n", "white")
            )

        if not args.check_only and args.mode == "U":
            write(
                colored(
                    "GhostArchive - [ INFO ] Getting links from ghostarchive.org (this can take a while for some domains)...",
                    "cyan",
                )
            )

        # Set up session with cookie
        session = requests.Session()
        if HTTP_ADAPTER is not None:
            session.mount("https://", HTTP_ADAPTER)
            session.mount("http://", HTTP_ADAPTER)

        userAgent = random.choice(USER_AGENT)
        headers = {"User-Agent": userAgent}
        # NOTE(review): the "theme=original" cookie presumably pins the HTML layout that the
        # regexes below expect - confirm against the live site if parsing breaks
        cookies = {"theme": "original"}

        # Page counter for the paginated results (first page is 0)
        pageNum = 0

        # Crawl result pages until stopped, rate limited, or there are no more pages
        while stopProgram is None and not stopSourceGhostArchive:
            getMemory()

            url = baseUrl + str(pageNum)

            try:
                resp = session.get(url, headers=headers, cookies=cookies, timeout=DEFAULT_TIMEOUT)
            except Exception as e:
                writerr(
                    colored(
                        "GhostArchive - [ ERR ] Unable to get page " + str(pageNum) + ": " + str(e),
                        "red",
                    )
                )
                break

            if resp.status_code == 429:
                writerr(
                    colored(
                        "GhostArchive - [ 429 ] Rate limit reached at page " + str(pageNum) + ".",
                        "red",
                    )
                )
                break

            # Check for maintenance/end of results indicator
            if (
                resp.status_code == 503
                or "The site is under maintenance and will be back soon" in resp.text
                or "No archives for that site" in resp.text
            ):
                # NOTE(review): the checkGhostArchive assignment below only happens when
                # verbose() is true - confirm the check-only request count is meant to
                # depend on verbosity
                if verbose():
                    if pageNum == 0:
                        if args.check_only:
                            checkGhostArchive = 1
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
                                )
                                + colored("1 request", "white")
                            )
                        else:
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] No results found",
                                    "cyan",
                                )
                            )
                    else:
                        write(
                            colored(
                                "GhostArchive - [ INFO ] Retrieved all results from "
                                + str(pageNum)
                                + " pages",
                                "cyan",
                            )
                        )
                break
            if resp.status_code != 200:
                # Any other non-200 is treated as fatal for this source
                writerr(
                    colored(
                        "GhostArchive - [ ERR ] [ "
                        + str(resp.status_code)
                        + " ] at page "
                        + str(pageNum),
                        "red",
                    )
                )
                break

            # Check only mode - just count pages
            if args.check_only:
                # For check only, we check if there are results and try to get total count
                if pageNum == 0:
                    # Check if there are any results on the first page
                    if '<a href="/archive/' in resp.text:
                        # Try to find "out of X" to determine total results/pages
                        outOfMatch = re.search(r"out of (\d+)", resp.text)
                        if outOfMatch:
                            totalResults = int(outOfMatch.group(1))
                            checkGhostArchive = totalResults
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
                                )
                                + colored(f"{totalResults} requests (pagination required)", "white")
                            )
                        else:
                            checkGhostArchive = 1
                            write(
                                colored(
                                    "GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan"
                                )
                                + colored("unknown requests (pagination required)", "white")
                            )
                    else:
                        checkGhostArchive = 1
                        write(
                            colored("GhostArchive - [ INFO ] Get URLs from GhostArchive: ", "cyan")
                            + colored("1 request (no results)", "white")
                        )
                break

            # Use regex to extract URLs from anchor tag text content
            # Pattern matches: <a href="/archive/ID">URL_HERE</a> - captures both href path and URL
            pattern = r'<a href="(/archive/[^"]*)">([^<]+)</a>'
            matches = re.findall(pattern, resp.text)

            # If no matches found, we've reached the end of results
            if not matches:
                if verbose():
                    write(
                        colored(
                            "GhostArchive - [ INFO ] Retrieved all results from "
                            + str(pageNum + 1)
                            + " pages",
                            "cyan",
                        )
                    )
                break

            # Filter and record each candidate URL (and its snapshot ID for response mode)
            for match in matches:
                ghostArchiveId = match[0]  # e.g., "/archive/gkOOR"
                potentialUrl = match[1].strip()
                processGhostArchiveUrl(potentialUrl, ghostArchiveId)

            # Check if there's a "Next Page" link - if not, we've reached the last page
            # GhostArchive resets to Page 1 when exceeding actual pages, so checking for Next Page is essential
            if "Next Page" not in resp.text and ">»</a>" not in resp.text:
                if verbose():
                    write(
                        colored(
                            "GhostArchive - [ INFO ] Retrieved all results from "
                            + str(pageNum + 1)
                            + " pages",
                            "cyan",
                        )
                    )
                break

            pageNum += 1

        if not args.check_only:
            # Count links based on mode - in R mode, count response links; in U/B mode, count URL links
            if args.mode == "R":
                linkCountGhostArchive = len(ghostArchiveRequestLinks)
            else:
                linkCountGhostArchive = len(linksFoundGhostArchive)
        write(
            colored("GhostArchive - [ INFO ] Links found on ghostarchive.org: ", "cyan")
            + colored(str(linkCountGhostArchive), "white")
        )
        # Fold the per-source results into the global link set
        linksFound.update(linksFoundGhostArchive)
        linksFoundGhostArchive.clear()

    except Exception as e:
        writerr(colored("ERROR getGhostArchiveUrls 1: " + str(e), "red"))
5012
6283
 
5013
6284
 
5014
6285
  def processResponses():
@@ -5018,6 +6289,10 @@ def processResponses():
5018
6289
  global stopProgram, totalFileCount
5019
6290
  try:
5020
6291
 
6292
+ # Get responses from GhostArchive unless excluded
6293
+ if stopProgram is None and not args.xga:
6294
+ processResponsesGhostArchive()
6295
+
5021
6296
  # Get responses from URLScan unless excluded
5022
6297
  if stopProgram is None and not args.xus:
5023
6298
  processResponsesURLScan()
@@ -5039,6 +6314,235 @@ def processResponses():
5039
6314
  writerr(colored(getSPACER("ERROR processResponses 1: " + str(e)), "red"))
5040
6315
 
5041
6316
 
6317
def processResponsesGhostArchive():
    """
    Get archived responses from GhostArchive (ghostarchive.org).

    Builds the list of (original URL, DOM URL) pairs gathered by
    processGhostArchiveUrl, persists them to a temp file so an interrupted run
    can be resumed, downloads each response with a process pool, then - for
    mode B - appends any extra URLs found in the WARC files to the URL output.
    """
    global subs, path, indexFile, totalResponses, stopProgram, argsInput, successCount, fileCount, DEFAULT_OUTPUT_DIR, responseOutputDirectory, ghostArchiveRequestLinks, failureCount, totalFileCount, checkGhostArchive
    # FIX: initialise before the try so the finally clause can't raise NameError
    # if an exception occurs before linkRequests is assigned
    linkRequests = None
    try:
        fileCount = 0
        failureCount = 0
        if not args.check_only:
            # Create 'results' and domain directory if needed
            createDirs()

        # Get the path of the files, depending on whether -oR / --output_responses was passed
        try:
            responsesPath = responseOutputDirectory + "responses.GhostArchive.tmp"
            indexPath = responseOutputDirectory + "waymore_index.txt"
        except Exception as e:
            if verbose():
                writerr(colored("ERROR processResponsesGhostArchive 4: " + str(e), "red"))

        # Get URLs from GhostArchive if the DOM ID's haven't been retrieved yet
        if stopProgram is None and not args.check_only:
            if args.mode in ("R", "B"):
                write(
                    colored(
                        "GhostArchive - [ INFO ] Getting list of response links (this can take a while for some domains)...",
                        "cyan",
                    )
                )
            if args.mode == "R":
                getGhostArchiveUrls()

        # Check if a responses.GhostArchive.tmp file exists from a previously interrupted run
        if not args.check_only and os.path.exists(responsesPath):
            # FIX: previously the list loaded here was unconditionally overwritten
            # below, silently discarding the saved resume data
            with open(responsesPath, "rb") as fl:
                linkRequests = pickle.load(fl)

            # Set start point
            successCount = 0

        if linkRequests is None:
            # Get the GhostArchive DOM links collected by processGhostArchiveUrl
            linkRequests = list(ghostArchiveRequestLinks)

            # Write the links to a temp file so an interrupted run can be resumed
            if not args.check_only:
                with open(responsesPath, "wb") as f:
                    pickle.dump(linkRequests, f)

        # Get the total number of responses we will try to get and set the current file count to the success count
        totalResponses = len(linkRequests)
        checkGhostArchive = checkGhostArchive + totalResponses

        # If there are no responses to download, display an error and exit
        if args.mode != "R" and totalResponses == 0:
            writerr(
                colored(
                    getSPACER(
                        "Failed to get responses from GhostArchive (ghostarchive.org) - check input and try again."
                    ),
                    "red",
                )
            )
            return

        fileCount = successCount

        if args.check_only:
            writerr(
                colored("Downloading archived responses: ", "cyan")
                + colored("UNKNOWN requests", "cyan")
            )
            writerr(
                colored(
                    "\n-> Downloading the responses can vary depending on the target and the rate limiting on GhostArchive",
                    "green",
                )
            )
            write("")
        else:
            # If the limit has been set over the default, give a warning that this could take a long time!
            if totalResponses - successCount > DEFAULT_LIMIT:
                if successCount > 0:
                    writerr(
                        colored(
                            getSPACER(
                                "WARNING: Downloading remaining "
                                + str(totalResponses - successCount)
                                + " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
                            ),
                            "yellow",
                        )
                    )
                else:
                    writerr(
                        colored(
                            getSPACER(
                                "WARNING: Downloading "
                                + str(totalResponses)
                                + " responses may take a loooooooong time! Consider using arguments -ko, -l, -ci, -from and -to wisely!"
                            ),
                            "yellow",
                        )
                    )

            # Open the index file if hash value is going to be used (not URL)
            if not args.url_filename:
                indexFile = open(indexPath, "a")

            # Process the URLs from GhostArchive (get the GhostArchive DOM responses)
            if stopProgram is None:
                # Double the number of processes to speed up the download
                p = mp.Pool(args.processes * 2)
                p.starmap(getGhostArchiveWARC, linkRequests[successCount:])
                p.close()
                p.join()

            # Delete the tmp file now it has run successfully
            if stopProgram is None:
                try:
                    os.remove(responsesPath)
                except Exception:
                    pass

            # Close the index file if hash value is going to be used (not URL)
            # FIX: also check the file handle exists so check-only / url-filename
            # runs can't call close() on an unopened handle
            if not args.url_filename and indexFile is not None:
                indexFile.close()

        if not args.check_only:
            try:
                # Report where the responses were saved. The message is identical in
                # verbose and non-verbose mode, so build the common prefix once
                # (previously four duplicated write blocks).
                savedMsg = (
                    colored("GhostArchive - [ INFO ] Responses saved to ", "cyan")
                    + colored(responseOutputDirectory, "white")
                    + colored(" for " + subs + argsInput + ": ", "cyan")
                )
                if failureCount > 0:
                    write(
                        savedMsg
                        + colored(str(fileCount) + " 🤘", "white")
                        + colored(" (" + str(failureCount) + " not found)\n", "red")
                    )
                else:
                    write(savedMsg + colored(str(fileCount) + " 🤘\n", "white"))
            except Exception as e:
                if verbose():
                    writerr(colored("ERROR processResponsesGhostArchive 5: " + str(e), "red"))

        # Append extra links from WARC files to URL output file (for mode B)
        try:
            if args.mode == "B" and len(extraWarcLinks) > 0:
                # Determine URL output file path (same logic as processURLOutput)
                if args.output_urls == "":
                    if args.output_responses != "":
                        urlFilePath = args.output_responses + "/waymore.txt"
                    else:
                        urlFilePath = (
                            str(DEFAULT_OUTPUT_DIR)
                            + "/results/"
                            + str(argsInput).replace("/", "-")
                            + "/waymore.txt"
                        )
                else:
                    urlFilePath = args.output_urls

                # Load existing URLs from file to avoid duplicates
                existingUrls = set()
                try:
                    with open(urlFilePath) as f:
                        for line in f:
                            existingUrls.add(line.strip())
                except Exception:
                    # The file may not exist yet - nothing to de-duplicate against
                    pass

                # Append only new unique URLs
                newLinks = [
                    url
                    for url in extraWarcLinks
                    if url not in existingUrls and url not in linksFound
                ]
                if len(newLinks) > 0:
                    with open(urlFilePath, "a") as f:
                        for url in newLinks:
                            f.write(url + "\n")

                # Display message about extra links
                write(
                    colored("GhostArchive - [ INFO ] ", "cyan")
                    + colored(str(len(newLinks)), "white")
                    + colored(" extra links found in WARC files added to file ", "cyan")
                    + colored(urlFilePath, "white")
                    + "\n"
                )
        except Exception as e:
            if verbose():
                writerr(colored("ERROR processResponsesGhostArchive 6: " + str(e), "red"))

        totalFileCount = totalFileCount + fileCount
    except Exception as e:
        writerr(colored(getSPACER("ERROR processResponsesGhostArchive 1: " + str(e)), "red"))
    finally:
        # Release the (potentially large) list of link tuples
        linkRequests = None
6544
+
6545
+
5042
6546
  def processResponsesURLScan():
5043
6547
  """
5044
6548
  Get archived responses from URLScan (urlscan.io)
@@ -6254,6 +7758,12 @@ async def fetch_intelx_async():
6254
7758
  await loop.run_in_executor(None, getIntelxUrls)
6255
7759
 
6256
7760
 
7761
async def fetch_ghostarchive_async():
    """Async wrapper for getGhostArchiveUrls - runs in thread pool"""
    # Off-load the blocking crawl to the default executor so the event loop stays free
    await asyncio.get_event_loop().run_in_executor(None, getGhostArchiveUrls)
7765
+
7766
+
6257
7767
  async def fetch_all_sources_async():
6258
7768
  """
6259
7769
  Orchestrator function to fetch from all enabled sources concurrently.
@@ -6276,6 +7786,8 @@ async def fetch_all_sources_async():
6276
7786
  tasks.append(("VirusTotal", fetch_virustotal_async()))
6277
7787
  if not args.xix and INTELX_API_KEY != "" and stopProgram is None:
6278
7788
  tasks.append(("Intelligence X", fetch_intelx_async()))
7789
+ if not args.xga and stopProgram is None:
7790
+ tasks.append(("GhostArchive", fetch_ghostarchive_async()))
6279
7791
 
6280
7792
  if not tasks:
6281
7793
  return
@@ -6301,7 +7813,7 @@ async def fetch_all_sources_async():
6301
7813
 
6302
7814
  # Run waymore
6303
7815
  def main():
6304
- global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx
7816
+ global args, DEFAULT_TIMEOUT, inputValues, argsInput, linksFound, linkMimes, successCount, failureCount, fileCount, totalResponses, totalPages, indexFile, path, stopSource, stopProgram, VIRUSTOTAL_API_KEY, inputIsSubDomain, argsInputHostname, WEBHOOK_DISCORD, responseOutputDirectory, fileCount, INTELX_API_KEY, stopSourceAlienVault, stopSourceCommonCrawl, stopSourceWayback, stopSourceURLScan, stopSourceVirusTotal, stopSourceIntelx, stopSourceGhostArchive, extraWarcLinks
6305
7817
 
6306
7818
  # Tell Python to run the handler() function when SIGINT is received
6307
7819
  signal(SIGINT, handler)
@@ -6457,13 +7969,19 @@ def main():
6457
7969
  help="Exclude checks for links from intelx.io",
6458
7970
  default=False,
6459
7971
  )
7972
+ parser.add_argument(
7973
+ "-xga",
7974
+ action="store_true",
7975
+ help="Exclude checks for links from ghostarchive.org",
7976
+ default=False,
7977
+ )
6460
7978
  parser.add_argument(
6461
7979
  "--providers",
6462
7980
  action="store",
6463
- help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal and intelx. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.",
7981
+ help="A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan,virustotal,intelx and ghostarchive. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.",
6464
7982
  default=[],
6465
7983
  type=validateArgProviders,
6466
- metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx}",
7984
+ metavar="{wayback,commoncrawl,otx,urlscan,virustotal,intelx,ghostarchive}",
6467
7985
  )
6468
7986
  parser.add_argument(
6469
7987
  "-lcc",
@@ -6630,6 +8148,10 @@ def main():
6630
8148
  args.xix = True
6631
8149
  else:
6632
8150
  args.xix = False
8151
+ if "ghostarchive" not in args.providers:
8152
+ args.xga = True
8153
+ else:
8154
+ args.xga = False
6633
8155
 
6634
8156
  # If no input was given, raise an error
6635
8157
  if sys.stdin.isatty():
@@ -6700,6 +8222,7 @@ def main():
6700
8222
  # Reset global variables
6701
8223
  linksFound = set()
6702
8224
  linkMimes = set()
8225
+ extraWarcLinks = set()
6703
8226
  successCount = 0
6704
8227
  failureCount = 0
6705
8228
  fileCount = 0
@@ -6714,6 +8237,7 @@ def main():
6714
8237
  stopSourceURLScan = False
6715
8238
  stopSourceVirusTotal = False
6716
8239
  stopSourceIntelx = False
8240
+ stopSourceGhostArchive = False
6717
8241
 
6718
8242
  # Get the config settings from the config.yml file
6719
8243
  getConfig()