waymore 4.4__py3-none-any.whl → 4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__="4.4"
1
+ __version__="4.6"
waymore/waymore.py CHANGED
@@ -136,6 +136,7 @@ DEFAULT_FILTER_KEYWORDS = 'admin,login,logon,signin,signup,register,registration
136
136
  # Yaml config values
137
137
  FILTER_URL = ''
138
138
  FILTER_MIME = ''
139
+ MATCH_MIME = ''
139
140
  FILTER_CODE = ''
140
141
  MATCH_CODE = ''
141
142
  FILTER_KEYWORDS = ''
@@ -313,8 +314,21 @@ def showOptions():
313
314
  else:
314
315
  write(colored('-n: ' +str(args.no_subs), 'magenta')+colored(' Sub domains are included in the search.','white'))
315
316
 
316
- write(colored('-xwm: ' +str(args.xwm), 'magenta')+colored(' Whether to exclude checks for links from Wayback Machine (archive.org)','white'))
317
- write(colored('-xcc: ' +str(args.xcc), 'magenta')+colored(' Whether to exclude checks for links from commoncrawl.org','white'))
317
+ providers = ''
318
+ if not args.xwm:
319
+ providers = providers + 'Wayback, '
320
+ if not args.xcc:
321
+ providers = providers + 'CommonCrawl, '
322
+ if not args.xav:
323
+ providers = providers + 'Alien Vault OTX, '
324
+ if not args.xus:
325
+ providers = providers + 'URLScan, '
326
+ if not args.xvt:
327
+ providers = providers + 'VirusTotal, '
328
+ if providers == '':
329
+ providers = 'None'
330
+ write(colored('Providers: ' +str(providers.strip(', ')), 'magenta')+colored(' Which providers to check for URLs.','white'))
331
+
318
332
  if not args.xcc:
319
333
  if args.lcc ==0 and args.lcy == 0:
320
334
  write(colored('-lcc: ' +str(args.lcc), 'magenta')+colored(' Search ALL Common Crawl index collections.','white'))
@@ -325,13 +339,12 @@ def showOptions():
325
339
  if args.lcc != 0:
326
340
  write(colored('-lcc: ' +str(args.lcc), 'magenta')+colored(' The number of latest Common Crawl index collections to be searched.','white'))
327
341
  write(colored('-lcy: ' +str(args.lcy), 'magenta')+colored(' Search all Common Crawl index collections with data from year '+str(args.lcy)+' and after.','white'))
328
- write(colored('-xav: ' +str(args.xav), 'magenta')+colored(' Whether to exclude checks for links from alienvault.com','white'))
329
- write(colored('-xus: ' +str(args.xus), 'magenta')+colored(' Whether to exclude checks for links from urlscan.io','white'))
342
+
330
343
  if URLSCAN_API_KEY == '':
331
344
  write(colored('URLScan API Key:', 'magenta')+colored(' {none} - You can get a FREE or paid API Key at https://urlscan.io/user/signup which will let you get more back, and quicker.','white'))
332
345
  else:
333
346
  write(colored('URLScan API Key: ', 'magenta')+colored(URLSCAN_API_KEY))
334
- write(colored('-xvt: ' +str(args.xvt), 'magenta')+colored(' Whether to exclude checks for links from virustotal.com','white'))
347
+
335
348
  if VIRUSTOTAL_API_KEY == '':
336
349
  write(colored('VirusTotal API Key:', 'magenta')+colored(' {none} - You can get a FREE or paid API Key at https://www.virustotal.com/gui/join-us which will let you get some extra URLs.','white'))
337
350
  else:
@@ -382,11 +395,19 @@ def showOptions():
382
395
  write(colored('-mc: ' +str(args.mc), 'magenta')+colored(' Only retrieve URLs and Responses that match these HTTP Status codes.','white'))
383
396
  else:
384
397
  if args.fc:
385
- write(colored('-fc: ' +str(args.mc), 'magenta')+colored(' Don\'t retrieve URLs and Responses that match these HTTP Status codes.','white'))
386
- write(colored('MIME Type exclusions: ', 'magenta')+colored(FILTER_MIME))
398
+ write(colored('-fc: ' +str(args.fc), 'magenta')+colored(' Don\'t retrieve URLs and Responses that match these HTTP Status codes.','white'))
387
399
  if not args.mc and args.fc:
388
400
  write(colored('Response Code exclusions: ', 'magenta')+colored(FILTER_CODE))
389
401
  write(colored('Response URL exclusions: ', 'magenta')+colored(FILTER_URL))
402
+
403
+ if args.mt:
404
+ write(colored('-mt: ' +str(args.mt.lower()), 'magenta')+colored(' Only retrieve URLs and Responses that match these MIME Types.','white')+colored(' NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you','yellow'))
405
+ else:
406
+ if args.ft:
407
+ write(colored('-ft: ' +str(args.ft.lower()), 'magenta')+colored(' Don\'t retrieve URLs and Responses that match these MIME Types.','white')+colored(' NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you','yellow'))
408
+ else:
409
+ write(colored('MIME Type exclusions: ', 'magenta')+colored(FILTER_MIME)+colored(' Don\'t retrieve URLs and Responses that match these MIME Types.','white')+colored(' NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you','yellow'))
410
+
390
411
  if args.keywords_only and args.keywords_only == '#CONFIG':
391
412
  if FILTER_KEYWORDS == '':
392
413
  write(colored('Keywords only: ', 'magenta')+colored('It looks like no keywords have been set in config.yml file.','red'))
@@ -423,7 +444,7 @@ def getConfig():
423
444
  """
424
445
  Try to get the values from the config file, otherwise use the defaults
425
446
  """
426
- global FILTER_CODE, FILTER_MIME, FILTER_URL, FILTER_KEYWORDS, URLSCAN_API_KEY, VIRUSTOTAL_API_KEY, CONTINUE_RESPONSES_IF_PIPED, subs, path, waymorePath, inputIsDomainANDPath, HTTP_ADAPTER, HTTP_ADAPTER_CC, argsInput, terminalWidth, MATCH_CODE, WEBHOOK_DISCORD, DEFAULT_OUTPUT_DIR
447
+ global FILTER_CODE, FILTER_MIME, FILTER_URL, FILTER_KEYWORDS, URLSCAN_API_KEY, VIRUSTOTAL_API_KEY, CONTINUE_RESPONSES_IF_PIPED, subs, path, waymorePath, inputIsDomainANDPath, HTTP_ADAPTER, HTTP_ADAPTER_CC, argsInput, terminalWidth, MATCH_CODE, WEBHOOK_DISCORD, DEFAULT_OUTPUT_DIR, MATCH_MIME
427
448
  try:
428
449
 
429
450
  # Set terminal width
@@ -467,7 +488,7 @@ def getConfig():
467
488
  # Set up an HTTPAdaptor for retry strategy for Common Crawl when making requests
468
489
  try:
469
490
  retry= Retry(
470
- total=args.retries+20,
491
+ total=args.retries+3,
471
492
  backoff_factor=1.1,
472
493
  status_forcelist=[503],
473
494
  raise_on_status=False,
@@ -505,14 +526,22 @@ def getConfig():
505
526
  writerr(colored('Unable to read "FILTER_URL" from config.yml - default set', 'red'))
506
527
  FILTER_URL = DEFAULT_FILTER_URL
507
528
 
508
- try:
509
- FILTER_MIME = config.get('FILTER_MIME')
510
- if str(FILTER_MIME) == 'None':
511
- writerr(colored('No value for "FILTER_MIME" in config.yml - default set', 'yellow'))
512
- FILTER_MIME = ''
513
- except Exception as e:
514
- writerr(colored('Unable to read "FILTER_MIME" from config.yml - default set', 'red'))
515
- FILTER_MIME = DEFAULT_FILTER_MIME
529
+ # If the argument -ft was passed, don't try to get from the config
530
+ if args.ft:
531
+ FILTER_MIME = args.ft.lower()
532
+ else:
533
+ try:
534
+ FILTER_MIME = config.get('FILTER_MIME')
535
+ if str(FILTER_MIME) == 'None':
536
+ writerr(colored('No value for "FILTER_MIME" in config.yml - default set', 'yellow'))
537
+ FILTER_MIME = ''
538
+ except Exception as e:
539
+ writerr(colored('Unable to read "FILTER_MIME" from config.yml - default set', 'red'))
540
+ FILTER_MIME = DEFAULT_FILTER_MIME
541
+
542
+ # Set the match codes if they were passed
543
+ if args.mt:
544
+ MATCH_MIME = args.mt.lower()
516
545
 
517
546
  # If the argument -fc was passed, don't try to get from the config
518
547
  if args.fc:
@@ -530,7 +559,7 @@ def getConfig():
530
559
  # Set the match codes if they were passed
531
560
  if args.mc:
532
561
  MATCH_CODE = args.mc
533
-
562
+
534
563
  try:
535
564
  URLSCAN_API_KEY = config.get('URLSCAN_API_KEY')
536
565
  if str(URLSCAN_API_KEY) == 'None':
@@ -618,7 +647,9 @@ def getConfig():
618
647
  # Use defaults if required
619
648
  if useDefaults:
620
649
  FILTER_URL = DEFAULT_FILTER_URL
650
+ MATCH_MIME = ''
621
651
  FILTER_MIME = DEFAULT_FILTER_MIME
652
+ MATCH_CODE = ''
622
653
  FILTER_CODE = DEFAULT_FILTER_CODE
623
654
  URLSCAN_API_KEY = ''
624
655
  VIRUSTOTAL_API_KEY = ''
@@ -1224,6 +1255,44 @@ def validateArgStatusCodes(x):
1224
1255
  raise argparse.ArgumentTypeError('Pass HTTP status codes separated by a comma')
1225
1256
  return x
1226
1257
 
1258
+ def validateArgMimeTypes(x):
1259
+ """
1260
+ Validate the -ft and -mt arguments
1261
+ The passed values will be changed to lower case.
1262
+ Only values matching the regex '[a-z]+\/[a-z0-9\-\+]+' separated by a comma
1263
+ """
1264
+ invalid = False
1265
+ x = x.lower()
1266
+ mimeTypes = x.split(',')
1267
+ for mimeType in mimeTypes:
1268
+ if not re.fullmatch(r'[a-z]+/[a-z0-9\-\+]+', mimeType):
1269
+ invalid = True
1270
+ break
1271
+ if invalid:
1272
+ raise argparse.ArgumentTypeError('Pass MIME Types separated by a comma, e.g. text/html,text/xml')
1273
+ return x
1274
+
1275
+ def validateArgProviders(x):
1276
+ """
1277
+ Validate the --providers argument
1278
+ Only the following values in a comma separated list are accepted:
1279
+ - wayback
1280
+ - commoncrawl
1281
+ - otx
1282
+ - urlscan
1283
+ - virustotal
1284
+ """
1285
+ invalid = False
1286
+ x = x.lower()
1287
+ providers = x.split(',')
1288
+ for provider in providers:
1289
+ if not re.fullmatch(r'(wayback|commoncrawl|otx|urlscan|virustotal)', provider):
1290
+ invalid = True
1291
+ break
1292
+ if invalid:
1293
+ raise argparse.ArgumentTypeError('Pass providers separated by a comma, e.g. wayback,commoncrawl,otx,urlscan,virustotal')
1294
+ return x
1295
+
1227
1296
  def processAlienVaultPage(url):
1228
1297
  """
1229
1298
  Get URLs from a specific page of otx.alienvault.org API for the input domain
@@ -1384,11 +1453,15 @@ def getAlienVaultUrls():
1384
1453
  # Carry on if something was found
1385
1454
  if resp.text.lower().find('"error": "') < 0:
1386
1455
 
1387
- # Get the JSON response
1388
- jsonResp = json.loads(resp.text.strip())
1389
-
1390
- # Try to get the number of results
1391
- totalUrls = jsonResp['full_size']
1456
+ try:
1457
+ # Get the JSON response
1458
+ jsonResp = json.loads(resp.text.strip())
1459
+
1460
+ # Try to get the number of results
1461
+ totalUrls = int(jsonResp['full_size'])
1462
+ except:
1463
+ writerr(colored(getSPACER('[ ERR ] There was an unexpected response from the Alien Vault API'),'red'))
1464
+ totalUrls = 0
1392
1465
 
1393
1466
  # If there are results, carry on
1394
1467
  if totalUrls > 0 or args.check_only:
@@ -1454,7 +1527,6 @@ def processURLScanUrl(url, httpCode, mimeType):
1454
1527
  addLink = False
1455
1528
 
1456
1529
  # If the user didn't request -f / --filter-responses-only then check http code
1457
- # Note we can't check MIME filter because it is not returned by URLScan API
1458
1530
  if addLink and not args.filter_responses_only:
1459
1531
 
1460
1532
  # Compare the HTTP code against the Code exclusions and matches
@@ -1484,13 +1556,18 @@ def processURLScanUrl(url, httpCode, mimeType):
1484
1556
 
1485
1557
  # Check the MIME exclusions
1486
1558
  if mimeType != '':
1487
- match = re.search(r'('+re.escape(FILTER_MIME).replace(',','|')+')', mimeType, flags=re.IGNORECASE)
1488
- if match is not None:
1489
- addLink = False
1559
+ if MATCH_MIME != '':
1560
+ match = re.search(r'('+re.escape(MATCH_MIME).replace(',','|')+')', mimeType, flags=re.IGNORECASE)
1561
+ if match is None:
1562
+ addLink = False
1490
1563
  else:
1491
- # Add MIME Types if --verbose option was selected
1492
- if verbose():
1493
- linkMimes.add(mimeType)
1564
+ match = re.search(r'('+re.escape(FILTER_MIME).replace(',','|')+')', mimeType, flags=re.IGNORECASE)
1565
+ if match is not None:
1566
+ addLink = False
1567
+
1568
+ # Add MIME Types if --verbose option was selected
1569
+ if verbose():
1570
+ linkMimes.add(mimeType)
1494
1571
 
1495
1572
  # Add link if it passed filters
1496
1573
  if addLink:
@@ -1588,19 +1665,28 @@ def getURLScanUrls():
1588
1665
  writerr(colored(getSPACER('[ ' + str(resp.status_code) + ' ] Unable to get links from urlscan.io'),'red'))
1589
1666
  return
1590
1667
 
1591
- # Get the JSON response
1592
- jsonResp = json.loads(resp.text.strip())
1668
+ try:
1669
+ # Get the JSON response
1670
+ jsonResp = json.loads(resp.text.strip())
1593
1671
 
1594
- # Get the number of results
1595
- totalUrls = jsonResp['total']
1672
+ # Get the number of results
1673
+ totalUrls = int(jsonResp['total'])
1674
+ except:
1675
+ writerr(colored(getSPACER('[ ERR ] There was an unexpected response from the URLScan API'),'red'))
1676
+ totalUrls = 0
1596
1677
 
1678
+ # Carry on if something was found
1597
1679
  if args.check_only:
1598
- hasMore = jsonResp['has_more']
1599
- if hasMore:
1600
- write(colored('Get URLs from URLScan: ','cyan')+colored('UNKNOWN requests','white'))
1601
- else:
1602
- write(colored('Get URLs from URLScan: ','cyan')+colored('1 request','white'))
1680
+ try:
1681
+ hasMore = jsonResp['has_more']
1682
+ if hasMore:
1683
+ write(colored('Get URLs from URLScan: ','cyan')+colored('UNKNOWN requests','white'))
1684
+ else:
1685
+ write(colored('Get URLs from URLScan: ','cyan')+colored('1 request','white'))
1686
+ except:
1687
+ pass
1603
1688
  checkURLScan = 1
1689
+
1604
1690
  else:
1605
1691
  # Carry on if something was found
1606
1692
  if int(totalUrls) > 0:
@@ -1746,6 +1832,7 @@ def processWayBackPage(url):
1746
1832
  if not stopSource:
1747
1833
  try:
1748
1834
  # Choose a random user agent string to use for any requests
1835
+ resp = None
1749
1836
  userAgent = random.choice(USER_AGENT)
1750
1837
  page = url.split('page=')[1]
1751
1838
  session = requests.Session()
@@ -1817,8 +1904,11 @@ def processWayBackPage(url):
1817
1904
  results = line.decode("utf-8")
1818
1905
  foundUrl = fixArchiveOrgUrl(str(results).split(' ')[1])
1819
1906
 
1820
- # Check the URL exclusions
1821
- match = re.search(r'('+re.escape(FILTER_URL).replace(',','|')+')', foundUrl, flags=re.IGNORECASE)
1907
+ # If --filter-responses-only wasn't used, then check the URL exclusions
1908
+ if args.filter_responses_only:
1909
+ match = None
1910
+ else:
1911
+ match = re.search(r'('+re.escape(FILTER_URL).replace(',','|')+')', foundUrl, flags=re.IGNORECASE)
1822
1912
  if match is None:
1823
1913
  # Only get MIME Types if --verbose option was selected
1824
1914
  if verbose():
@@ -1852,8 +1942,14 @@ def getWaybackUrls():
1852
1942
  # Write the file of URL's for the passed domain/URL
1853
1943
  try:
1854
1944
  stopSource = False
1855
- # If there any + in the MIME types, e.g. image/svg+xml, then replace the + with a . otherwise the wayback API does not recognise it
1856
- filterMIME = '&filter=!mimetype:warc/revisit|' + re.escape(FILTER_MIME).replace(',','|').replace('+','.')
1945
+
1946
+ if MATCH_MIME != '':
1947
+ filterMIME = '&filter=mimetype:' + re.escape(MATCH_MIME).replace(',','|')
1948
+ else:
1949
+ filterMIME = '&filter=!mimetype:warc/revisit|' + re.escape(FILTER_MIME).replace(',','|')
1950
+ # If there are any \+ in the MIME types, e.g. image/svg\+xml (the backslash is because it was previously escaped), then replace the \+ with a . otherwise the wayback API does not recognise it
1951
+ filterMIME = filterMIME.replace('\+','.')
1952
+
1857
1953
  if MATCH_CODE != '':
1858
1954
  filterCode = '&filter=statuscode:' + re.escape(MATCH_CODE).replace(',','|')
1859
1955
  else:
@@ -1975,9 +2071,13 @@ def processCommonCrawlCollection(cdxApiUrl):
1975
2071
 
1976
2072
  if not stopSource:
1977
2073
  # Set mime content type filter
1978
- filterMIME = '&filter=!~mime:(warc/revisit|'
1979
- if FILTER_MIME.strip() != '':
1980
- filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
2074
+ if MATCH_MIME.strip() != '':
2075
+ filterMIME = '&filter=~mime:('
2076
+ filterMIME = filterMIME + re.escape(MATCH_MIME).replace(',','|')
2077
+ else:
2078
+ filterMIME = '&filter=!~mime:(warc/revisit|'
2079
+ if FILTER_MIME.strip() != '':
2080
+ filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
1981
2081
  filterMIME = filterMIME + ')'
1982
2082
 
1983
2083
  # Set status code filter
@@ -2169,9 +2269,13 @@ def getCommonCrawlUrls():
2169
2269
  originalLinkCount = len(linksFound)
2170
2270
 
2171
2271
  # Set mime content type filter
2172
- filterMIME = '&filter=!~mime:(warc/revisit|'
2173
- if FILTER_MIME.strip() != '':
2174
- filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
2272
+ if MATCH_MIME.strip() != '':
2273
+ filterMIME = '&filter=~mime:('
2274
+ filterMIME = filterMIME + re.escape(MATCH_MIME).replace(',','|')
2275
+ else:
2276
+ filterMIME = '&filter=!~mime:(warc/revisit|'
2277
+ if FILTER_MIME.strip() != '':
2278
+ filterMIME = filterMIME + re.escape(FILTER_MIME).replace(',','|')
2175
2279
  filterMIME = filterMIME + ')'
2176
2280
 
2177
2281
  # Set status code filter
@@ -2194,32 +2298,34 @@ def getCommonCrawlUrls():
2194
2298
  # Get the Common Crawl index collections
2195
2299
  cdxApiUrls = getCommonCrawlIndexes()
2196
2300
 
2197
- if args.check_only:
2198
- if args.lcc < len(cdxApiUrls):
2199
- checkCommonCrawl = args.lcc+1
2301
+ # If there were URLs returned then continue
2302
+ if cdxApiUrls:
2303
+ if args.check_only:
2304
+ if args.lcc < len(cdxApiUrls):
2305
+ checkCommonCrawl = args.lcc+1
2306
+ else:
2307
+ checkCommonCrawl = len(cdxApiUrls)+1
2308
+ write(colored('Get URLs from Common Crawl: ','cyan')+colored(str(checkCommonCrawl)+' requests','white'))
2200
2309
  else:
2201
- checkCommonCrawl = len(cdxApiUrls)+1
2202
- write(colored('Get URLs from Common Crawl: ','cyan')+colored(str(checkCommonCrawl)+' requests','white'))
2203
- else:
2204
- write(colored('\rGetting links from the latest ' + str(len(cdxApiUrls)) + ' commoncrawl.org index collections (this can take a while for some domains)...\r','cyan'))
2310
+ write(colored('\rGetting links from the latest ' + str(len(cdxApiUrls)) + ' commoncrawl.org index collections (this can take a while for some domains)...\r','cyan'))
2311
+
2312
+ # Process the URLs from common crawl
2313
+ if stopProgram is None:
2314
+ p = mp.Pool(args.processes)
2315
+ p.map(processCommonCrawlCollection, cdxApiUrls)
2316
+ p.close()
2317
+ p.join()
2318
+
2319
+ # Show the MIME types found (in case user wants to exclude more)
2320
+ if verbose() and len(linkMimes) > 0:
2321
+ linkMimes.discard('warc/revisit')
2322
+ write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')
2205
2323
 
2206
- # Process the URLs from common crawl
2207
- if stopProgram is None:
2208
- p = mp.Pool(args.processes)
2209
- p.map(processCommonCrawlCollection, cdxApiUrls)
2210
- p.close()
2211
- p.join()
2212
-
2213
- # Show the MIME types found (in case user wants to exclude more)
2214
- if verbose() and len(linkMimes) > 0:
2215
- linkMimes.discard('warc/revisit')
2216
- write(getSPACER(colored('MIME types found: ','magenta')+colored(str(linkMimes),'white'))+'\n')
2217
-
2218
- linkCount = len(linksFound) - originalLinkCount
2219
- if args.xwm:
2220
- write(getSPACER(colored('Links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
2221
- else:
2222
- write(getSPACER(colored('Extra links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
2324
+ linkCount = len(linksFound) - originalLinkCount
2325
+ if args.xwm:
2326
+ write(getSPACER(colored('Links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
2327
+ else:
2328
+ write(getSPACER(colored('Extra links found on commoncrawl.org: ', 'cyan')+colored(str(linkCount),'white'))+'\n')
2223
2329
 
2224
2330
  except Exception as e:
2225
2331
  writerr(colored('ERROR getCommonCrawlUrls 1: ' + str(e), 'red'))
@@ -2332,29 +2438,33 @@ def getVirusTotalUrls():
2332
2438
  return
2333
2439
 
2334
2440
  # Get the JSON response
2335
- jsonResp = json.loads(resp.text.strip())
2441
+ try:
2442
+ jsonResp = json.loads(resp.text.strip())
2336
2443
 
2337
- # Get the different URLs
2338
- if args.no_subs:
2339
- subDomains = []
2340
- else:
2444
+ # Get the different URLs
2445
+ if args.no_subs:
2446
+ subDomains = []
2447
+ else:
2448
+ try:
2449
+ subDomains = jsonResp['subdomains']
2450
+ except Exception as e:
2451
+ subDomains = []
2452
+ try:
2453
+ detectedUrls = [entry['url'] for entry in jsonResp.get('detected_urls', [])]
2454
+ except Exception as e:
2455
+ detectedUrls = []
2341
2456
  try:
2342
- subDomains = jsonResp['subdomains']
2457
+ undetectedUrls = [entry[0] for entry in jsonResp.get('undetected_urls', [])]
2343
2458
  except Exception as e:
2344
- subDomains = []
2345
- try:
2346
- detectedUrls = [entry['url'] for entry in jsonResp.get('detected_urls', [])]
2347
- except Exception as e:
2348
- detectedUrls = []
2349
- try:
2350
- undetectedUrls = [entry[0] for entry in jsonResp.get('undetected_urls', [])]
2351
- except Exception as e:
2352
- undetectedUrls = []
2353
- try:
2354
- totalUrls = set(subDomains + detectedUrls + undetectedUrls)
2355
- except Exception as e:
2459
+ undetectedUrls = []
2460
+ try:
2461
+ totalUrls = set(subDomains + detectedUrls + undetectedUrls)
2462
+ except Exception as e:
2463
+ totalUrls = []
2464
+ except:
2465
+ writerr(colored(getSPACER('[ ERR ] There was an unexpected response from the VirusTotal API'),'red'))
2356
2466
  totalUrls = []
2357
-
2467
+
2358
2468
  if args.check_only:
2359
2469
  write(colored('Get URLs from VirusTotal: ','cyan')+colored('1 request','white'))
2360
2470
  checkVirusTotal = 1
@@ -2457,8 +2567,11 @@ def processResponses():
2457
2567
  linksFound = set()
2458
2568
 
2459
2569
  # Set mime content type filter
2460
- filterMIME = '&filter=!mimetype:warc/revisit'
2461
- if FILTER_MIME.strip() != '':
2570
+ filterMIME = ''
2571
+ if MATCH_MIME.strip() != '':
2572
+ filterMIME = '&filter=mimetype:' + re.escape(MATCH_MIME).replace(',','|')
2573
+ else:
2574
+ filterMIME = '&filter=!mimetype:warc/revisit'
2462
2575
  filterMIME = filterMIME + '|' + re.escape(FILTER_MIME).replace(',','|')
2463
2576
 
2464
2577
  # Set status code filter
@@ -2928,12 +3041,24 @@ def main():
2928
3041
  help='Filter HTTP status codes for retrieved URLs and responses. Comma separated list of codes (default: the FILTER_CODE values from config.yml). Passing this argument will override the value from config.yml',
2929
3042
  type=validateArgStatusCodes,
2930
3043
  )
3044
+ parser.add_argument(
3045
+ '-ft',
3046
+ action='store',
3047
+ help='Filter MIME Types for retrieved URLs and responses. Comma separated list of MIME Types (default: the FILTER_MIME values from config.yml). Passing this argument will override the value from config.yml. NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.',
3048
+ type=validateArgMimeTypes,
3049
+ )
2931
3050
  parser.add_argument(
2932
3051
  '-mc',
2933
3052
  action='store',
2934
3053
  help='Only Match HTTP status codes for retrieved URLs and responses. Comma separated list of codes. Passing this argument overrides the config FILTER_CODE and -fc.',
2935
3054
  type=validateArgStatusCodes,
2936
3055
  )
3056
+ parser.add_argument(
3057
+ '-mt',
3058
+ action='store',
3059
+ help='Only MIME Types for retrieved URLs and responses. Comma separated list of MIME types. Passing this argument overrides the config FILTER_MIME and -ft. NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don\'t have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.',
3060
+ type=validateArgMimeTypes,
3061
+ )
2937
3062
  parser.add_argument(
2938
3063
  '-l',
2939
3064
  '--limit',
@@ -3009,11 +3134,19 @@ def main():
3009
3134
  help='Exclude checks for links from virustotal.com',
3010
3135
  default=False
3011
3136
  )
3137
+ parser.add_argument(
3138
+ '--providers',
3139
+ action='store',
3140
+ help='A comma separated list of source providers that you want to get URLs from. The values can be wayback,commoncrawl,otx,urlscan and virustotal. Passing this will override any exclude arguments (e.g. -xwm,-xcc, etc.) passed to exclude sources, and reset those based on what was passed with this argument.',
3141
+ default=[],
3142
+ type=validateArgProviders,
3143
+ metavar='{wayback,commoncrawl,otx,urlscan,virustotal}'
3144
+ )
3012
3145
  parser.add_argument(
3013
3146
  '-lcc',
3014
3147
  action='store',
3015
3148
  type=int,
3016
- help='Limit the number of Common Crawl index collections searched, e.g. \'-lcc 10\' will just search the latest 10 collections (default: 3). As of July 2023 there are currently 95 collections. Setting to 0 (default) will search ALL collections. If you don\'t want to search Common Crawl at all, use the -xcc option.'
3149
+ help='Limit the number of Common Crawl index collections searched, e.g. \'-lcc 10\' will just search the latest 10 collections (default: 1). As of November 2024 there are currently 106 collections. Setting to 0 (default) will search ALL collections. If you don\'t want to search Common Crawl at all, use the -xcc option.'
3017
3150
  )
3018
3151
  parser.add_argument(
3019
3152
  '-lcy',
@@ -3132,13 +3265,36 @@ def main():
3132
3265
  write(colored('Waymore - v' + __version__,'cyan'))
3133
3266
  sys.exit()
3134
3267
 
3135
- # If -lcc wasn't passed then set to the default of 3 if -lcy is 0. This will make them work together
3268
+ # If -lcc wasn't passed then set to the default of 1 if -lcy is 0. This will make them work together
3136
3269
  if args.lcc is None:
3137
3270
  if args.lcy == 0:
3138
- args.lcc = 3
3271
+ args.lcc = 1
3139
3272
  else:
3140
3273
  args.lcc = 0
3141
3274
 
3275
+ # If --providers was passed, then manually set the exclude arguments;
3276
+ if args.providers:
3277
+ if 'wayback' not in args.providers:
3278
+ args.xwm = True
3279
+ else:
3280
+ args.xwm = False
3281
+ if 'commoncrawl' not in args.providers:
3282
+ args.xcc = True
3283
+ else:
3284
+ args.xcc = False
3285
+ if 'otx' not in args.providers:
3286
+ args.xav = True
3287
+ else:
3288
+ args.xav = False
3289
+ if 'urlscan' not in args.providers:
3290
+ args.xus = True
3291
+ else:
3292
+ args.xus = False
3293
+ if 'virustotal' not in args.providers:
3294
+ args.xvt = True
3295
+ else:
3296
+ args.xvt = False
3297
+
3142
3298
  # If no input was given, raise an error
3143
3299
  if sys.stdin.isatty():
3144
3300
  if args.input is None:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: waymore
3
- Version: 4.4
3
+ Version: 4.6
4
4
  Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
5
5
  Home-page: https://github.com/xnl-h4ck3r/waymore
6
6
  Author: @xnl-h4ck3r
@@ -15,7 +15,7 @@ Requires-Dist: tldextract
15
15
 
16
16
  <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
17
17
 
18
- ## About - v4.4
18
+ ## About - v4.6
19
19
 
20
20
  The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
21
21
 
@@ -83,7 +83,9 @@ pipx install git+https://github.com/xnl-h4ck3r/waymore.git
83
83
  | -n | --no-subs | Don't include subdomains of the target domain (only used if input is not a domain with a specific path). |
84
84
  | -f | --filter-responses-only | The initial links from sources will not be filtered, only the responses that are downloaded, e.g. it maybe useful to still see all available paths from the links, even if you don't want to check the content. |
85
85
  | -fc | | Filter HTTP status codes for retrieved URLs and responses. Comma separated list of codes (default: the `FILTER_CODE` values from `config.yml`). Passing this argument will override the value from `config.yml` |
86
+ | -ft | | Filter MIME Types for retrieved URLs and responses. Comma separated list of MIME Types (default: the `FILTER_MIME` values from `config.yml`). Passing this argument will override the value from `config.yml`. **NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don't have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.**. |
86
87
  | -mc | | Only Match HTTP status codes for retrieved URLs and responses. Comma separated list of codes. Passing this argument overrides the config `FILTER_CODE` and `-fc`. |
88
+ | -mt | | Only MIME Types for retrieved URLs and responses. Comma separated list of MIME types. Passing this argument overrides the config `FILTER_MIME` and `-ft`. **NOTE: This will NOT be applied to Alien Vault OTX and Virus Total because they don't have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.**. |
87
89
  | -l | --limit | How many responses will be saved (if `-mode R` or `-mode B` is passed). A positive value will get the **first N** results, a negative value will get the **last N** results. A value of 0 will get **ALL** responses (default: 5000) |
88
90
  | -from | --from-date | What date to get responses from. If not specified it will get from the earliest possible results. A partial value can be passed, e.g. `2016`, `201805`, etc. |
89
91
  | -to | --to-date | What date to get responses to. If not specified it will get to the latest possible results. A partial value can be passed, e.g. `2021`, `202112`, etc. |
@@ -95,7 +97,7 @@ pipx install git+https://github.com/xnl-h4ck3r/waymore.git
95
97
  | -xav | | Exclude checks for links from alienvault.com |
96
98
  | -xus | | Exclude checks for links from urlscan.io |
97
99
  | -xvt | | Exclude checks for links from virustotal.com |
98
- | -lcc | | Limit the number of Common Crawl index collections searched, e.g. `-lcc 10` will just search the latest `10` collections (default: 3). As of July 2023 there are currently 95 collections. Setting to `0` (default) will search **ALL** collections. If you don't want to search Common Crawl at all, use the `-xcc` option. |
100
+ | -lcc | | Limit the number of Common Crawl index collections searched, e.g. `-lcc 10` will just search the latest `10` collections (default: 1). As of November 2024 there are currently 106 collections. Setting to `0` will search **ALL** collections. If you don't want to search Common Crawl at all, use the `-xcc` option. |
99
101
  | -lcy | | Limit the number of Common Crawl index collections searched by the year of the index data. The earliest index has data from 2008. Setting to 0 (default) will search collections of any year (but in conjunction with `-lcc`). For example, if you are only interested in data from 2015 and after, pass `-lcy 2015`. This will override the value of `-lcc` if passed. If you don't want to search Common Crawl at all, use the `-xcc` option. |
100
102
  | -t | --timeout | This is for archived responses only! How many seconds to wait for the server to send data before giving up (default: 30) |
101
103
  | -p | --processes | Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 1) |
@@ -154,8 +156,8 @@ If the input is just a domain, e.g. `redbull.com` then the `-mode` defaults to `
154
156
 
155
157
  The `config.yml` file (typically in `~/.config/waymore/`) has values that can be updated to suit your needs. Filters are all provided as comma separated lists:
156
158
 
157
- - `FILTER_CODE` - Exclusions used to exclude responses we will try to get from web.archive.org, and also for file names when `-i` is a directory, e.g. `301,302`. This can be overridden with the `-fc` argument. Passing the `-mc` (to match status codes instead of filter) will override any value in `FILTER_CODE` or `-fc`
158
- - `FILTER_MIME` - MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API, e.g. `'text/css,image/jpeg`
159
+ - `FILTER_CODE` - Exclusions used to exclude responses we will try to get from web.archive.org, and also for file names when `-i` is a directory, e.g. `301,302`. This can be overridden with the `-fc` argument. Passing the `-mc` (to match status codes instead of filter) will override any value in `FILTER_CODE` or `-fc`.
160
+ - `FILTER_MIME` - MIME Content-Type exclusions used to filter links and responses from web.archive.org through their API, e.g. `text/css,image/jpeg`. This can be overridden with the `-ft` argument. Passing the `-mt` (to match MIME types instead of filter) will override any value in `FILTER_MIME` or `-ft`.
159
161
  - `FILTER_URL` - URL exclusions we will use to filter links and responses from web.archive.org through their API, e.g. `.css,.jpg`
160
162
  - `FILTER_KEYWORDS` - Only links and responses will be returned that contain the specified keywords if the `-ko`/`--keywords-only` argument is passed (without providing an explicit value on the command line), e.g. `admin,portal`
161
163
  - `URLSCAN_API_KEY` - You can sign up to [urlscan.io](https://urlscan.io/user/signup) to get a **FREE** API key (there are also paid subscriptions available). It is recommended you get a key and put it into the config file so that you can get more back (and quicker) from their API. NOTE: You will get rate limited unless you have a full paid subscription.
@@ -163,7 +165,7 @@ The `config.yml` file (typically in `~/.config/waymore/`) have values that can b
163
165
  - `WEBHOOK_DISCORD` - If the `--notify-discord` argument is passed, `waymore` will send a notification to this Discord webhook when it completes.
164
166
  - `DEFAULT_OUTPUT_DIR` - This is the default location of any output files written if the `-oU` and `-oR` arguments are not used. If the value of this key is blank, then it will default to the location of the `config.yml` file.
165
167
 
166
- **NOTE: The MIME types cannot be filtered for Alien Vault results because they do not return that in the API response.**
168
+ **NOTE: The MIME types cannot be filtered for Alien Vault OTX and Virus Total because they don't have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined for a URL. In these cases, URLs will be included regardless of filter or match. Bear this in mind and consider excluding certain providers if this is important.**
167
169
 
168
170
  ## Output
169
171
 
@@ -201,6 +203,8 @@ The archive.org Wayback Machine CDX API can sometimes can sometimes require a hu
201
203
 
202
204
  There is also a problem with the Wayback Machine CDX API where the number of pages returned is not correct when filters are applied and can cause issues (see https://github.com/internetarchive/wayback/issues/243). Until that issue is resolved, setting the `-lr` argument to a sensible value can help with that problem in the short term.
203
205
 
206
+ The Common Crawl API has had a lot of issues for a long time. Including this source could make waymore take a lot longer to run and may not yield any extra results. You can check if there is an issue by visiting http://index.commoncrawl.org/collinfo.json and seeing if this is successful. Consider excluding Common Crawl altogether using the `--providers` argument and not including `commoncrawl`, or using the `-xcc` argument.
207
+
204
208
  **The provider API servers aren't designed to cope with huge volumes, so be sensible and considerate about what you hit them with!**
205
209
 
206
210
  When downloading archived responses, this can take a long time and can sometimes be killed by the machine for some reason, or manually killed by the user.
@@ -218,7 +222,7 @@ The URLs are saved in the same path as `config.yml` (typically `~/.config/waymor
218
222
 
219
223
  ### Example 2
220
224
 
221
- Get ALL the URLs from Wayback for `redbull.com` (no filters are applied in `mode U` with `-f`, and no URLs are retrieved from Commone Crawl, Alien Vault, URLScan and Virus Total, because `-xcc`, `-xav`, `-xus`, `-xvt` are passed respectively).
225
+ Get ALL the URLs from Wayback for `redbull.com` (no filters are applied in `mode U` with `-f`, and no URLs are retrieved from Common Crawl, Alien Vault, URLScan and Virus Total, because `-xcc`, `-xav`, `-xus`, `-xvt` are passed respectively. This can also be achieved by passing `--providers wayback` instead of the exclude arguments).
222
226
  Save the FIRST 200 responses that are found starting from 2022 (`-l 200 -from 2022`):
223
227
 
224
228
  <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/example2.png"></center>
@@ -0,0 +1,8 @@
1
+ waymore/__init__.py,sha256=nBVFOoDYjRXcFurm_7co-GNVg6LUhzeZpudhmYNojHw,17
2
+ waymore/waymore.py,sha256=FhSRlLoK9DBGojEX89rMQdZ-bEacPSxJg2BJwQfUJGA,177093
3
+ waymore-4.6.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
4
+ waymore-4.6.dist-info/METADATA,sha256=oQMMrr_MbK_QPmShWY-TrHf-bxOg9dtjdUK76QE29H8,49511
5
+ waymore-4.6.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
6
+ waymore-4.6.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
7
+ waymore-4.6.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
8
+ waymore-4.6.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: setuptools (75.3.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,8 +0,0 @@
1
- waymore/__init__.py,sha256=bb3D2cWPj3M9gB4ePNX8nrpDuS8IImWiON1Cc_z3vGg,17
2
- waymore/waymore.py,sha256=cnFkODCRHd4OxxBZVMWUwus5bTZ-ypTGAK_Aa9HPd-g,169799
3
- waymore-4.4.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
4
- waymore-4.4.dist-info/METADATA,sha256=gpUxWzvVUkCmZUB_Dd-gl_8w2P9UFh5tpfyob7wMe-o,47221
5
- waymore-4.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
6
- waymore-4.4.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
7
- waymore-4.4.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
8
- waymore-4.4.dist-info/RECORD,,
File without changes