waymore 4.2-py3-none-any.whl → 4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/__init__.py CHANGED
@@ -1 +1 @@
-__version__="4.2"
+__version__="4.4"
waymore/waymore.py CHANGED
@@ -83,7 +83,7 @@ argsInputHostname = ''
 responseOutputDirectory = ''
 
 # Source Provider URLs
-WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}&collapse={COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
+WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
 CCRAWL_INDEX_URL = 'https://index.commoncrawl.org/collinfo.json'
 ALIENVAULT_URL = 'https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500'
 URLSCAN_URL = 'https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}&size=10000'
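
The template change means the {COLLAPSE} placeholder now stands in for the whole optional collapse parameter (including the leading '&collapse='), so an empty value drops it from the query string entirely. A minimal sketch of how the new template expands; the helper and example domain below are illustrative, not waymore's code:

    WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'

    def build_cdx_url(domain, collapse=''):
        # collapse is either '' or a complete '&collapse=timestamp:N' fragment
        return WAYBACK_URL.replace('{DOMAIN}', domain).replace('{COLLAPSE}', collapse)

    print(build_cdx_url('example.com/*'))                           # no collapse parameter at all
    print(build_cdx_url('example.com/*', '&collapse=timestamp:8'))  # at most one capture per URL per day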
@@ -706,23 +706,55 @@ def fixArchiveOrgUrl(url):
         url = url[0:newline]
     return url
 
+# Add a link to the linksFound collection for archived responses (included timestamp preifx)
+def linksFoundResponseAdd(link):
+    global linksFound, argsInput, argsInputHostname
+
+    try:
+        if inputIsDomainANDPath:
+            checkInput = argsInput
+        else:
+            checkInput = argsInputHostname
+
+        # Remove the timestamp
+        linkWithoutTimestamp = link.split('/', 1)[-1]
+
+        # If the link specifies port 80 or 443, e.g. http://example.com:80, then remove the port
+        parsed = urlparse(linkWithoutTimestamp.strip())
+        if parsed.port in (80, 443):
+            new_netloc = parsed.hostname
+            parsed_url = parsed._replace(netloc=new_netloc).geturl()
+        else:
+            parsed_url = linkWithoutTimestamp
+
+        # Don't write it if the link does not contain the requested domain (this can sometimes happen)
+        if parsed_url.find(checkInput) >= 0:
+            linksFound.add(link)
+    except Exception as e:
+        linksFound.add(link)
+
 # Add a link to the linksFound collection
 def linksFoundAdd(link):
     global linksFound, argsInput, argsInputHostname
-    # If the link specifies port 80 or 443, e.g. http://example.com:80, then remove the port
+
     try:
         if inputIsDomainANDPath:
             checkInput = argsInput
         else:
             checkInput = argsInputHostname
+
+        # If the link specifies port 80 or 443, e.g. http://example.com:80, then remove the port
+        parsed = urlparse(link.strip())
+        if parsed.port in (80, 443):
+            new_netloc = parsed.hostname
+            parsed_url = parsed._replace(netloc=new_netloc).geturl()
+        else:
+            parsed_url = link
+
         # Don't write it if the link does not contain the requested domain (this can sometimes happen)
-        if link.find(checkInput) >= 0:
-            parsed = urlparse(link.strip())
-            if parsed.netloc.find(':80') >= 0 or parsed.netloc.fnd(':443') >= 0:
-                newNetloc = parsed.netloc.split(':')[0]
-                parsed = parsed._replace(netloc=newNetloc).geturl()
-            linksFound.add(parsed)
-    except:
+        if parsed_url.find(checkInput) >= 0:
+            linksFound.add(link)
+    except Exception as e:
         linksFound.add(link)
 
 def processArchiveUrl(url):
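
Both linksFoundAdd and the new linksFoundResponseAdd now normalize explicit default ports with urllib.parse instead of the old substring search (note the parsed.netloc.fnd typo in the removed code). A standalone sketch of just that normalization step, with made-up example links:

    from urllib.parse import urlparse

    def strip_default_port(link):
        # Drop an explicit :80 or :443 so e.g. http://example.com:80/a compares equal to http://example.com/a
        parsed = urlparse(link.strip())
        if parsed.port in (80, 443):
            return parsed._replace(netloc=parsed.hostname).geturl()
        return link

    print(strip_default_port('http://example.com:80/login'))     # http://example.com/login
    print(strip_default_port('https://example.com:8443/admin'))  # unchanged, non-default port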
@@ -1851,11 +1883,15 @@ def getWaybackUrls():
         session.mount('https://', HTTP_ADAPTER)
         session.mount('http://', HTTP_ADAPTER)
         resp = session.get(url+'&showNumPages=True', headers={"User-Agent":userAgent})
-        totalPages = int(resp.text.strip())
-
-        # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
-        if args.limit_requests != 0 and totalPages > args.limit_requests:
-            totalPages = args.limit_requests
+        # Try to get the total number of pages. If there is a problem, we'll return totalPages = 0 which means we'll get everything back in one request
+        try:
+            totalPages = int(resp.text.strip())
+
+            # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
+            if args.limit_requests != 0 and totalPages > args.limit_requests:
+                totalPages = args.limit_requests
+        except:
+            totalPages = -1
     except Exception as e:
         try:
             # If the rate limit was reached end now
@@ -1880,31 +1916,39 @@ def getWaybackUrls():
             else:
                 writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): ' + str(e)), 'red'))
         return
-
+
     if args.check_only:
-        checkWayback = totalPages
-        write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
+        if totalPages < 0:
+            write(colored('Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.','cyan'))
+        else:
+            checkWayback = totalPages
+            write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
     else:
         if verbose():
             write(colored('The archive URL requested to get links: ','magenta')+colored(url+'\n','white'))
 
-        # if the page number was found then display it, but otherwise we will just try to increment until we have everything
-        write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
+        if totalPages < 0:
+            write(colored('\rGetting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...\r','cyan'))
 
-        # Get a list of all the page URLs we need to visit
-        pages = []
-        if totalPages == 1:
-            pages.append(url)
+            processWayBackPage(url)
         else:
-            for page in range(0, totalPages):
-                pages.append(url+str(page))
-
-        # Process the URLs from web archive
-        if stopProgram is None:
-            p = mp.Pool(args.processes)
-            p.map(processWayBackPage, pages)
-            p.close()
-            p.join()
+            # if the page number was found then display it, but otherwise we will just try to increment until we have everything
+            write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
+
+            # Get a list of all the page URLs we need to visit
+            pages = []
+            if totalPages == 1:
+                pages.append(url)
+            else:
+                for page in range(0, totalPages):
+                    pages.append(url+str(page))
+
+            # Process the URLs from web archive
+            if stopProgram is None:
+                p = mp.Pool(args.processes)
+                p.map(processWayBackPage, pages)
+                p.close()
+                p.join()
 
     # Show the MIME types found (in case user wants to exclude more)
     if verbose() and len(linkMimes) > 0 :
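
The showNumPages handling is now defensive: if the CDX API no longer returns a parsable page count, totalPages becomes -1 and the whole result set is fetched with a single un-paginated request instead of fanning pages out over a multiprocessing pool. A rough sketch of that control flow, assuming the page URL is built by appending the page number and using a stand-in for processWayBackPage:

    import multiprocessing as mp

    def process_page(page_url):
        # placeholder for waymore's processWayBackPage
        print('would fetch', page_url)

    def get_wayback_pages(url, show_num_pages_text, processes=2, limit_requests=0):
        # Mirror of the new fallback: a non-numeric page count means "fetch everything in one request"
        try:
            total_pages = int(show_num_pages_text.strip())
            if limit_requests != 0 and total_pages > limit_requests:
                total_pages = limit_requests
        except ValueError:
            total_pages = -1

        if total_pages < 0 or total_pages == 1:
            process_page(url)                    # single request, no pagination
        else:
            pages = [url + str(page) for page in range(total_pages)]
            with mp.Pool(processes) as pool:     # same pool-based fan-out as before
                pool.map(process_page, pages)

    if __name__ == '__main__':
        get_wayback_pages('https://web.archive.org/cdx/search/cdx?url=example.com&page=', 'not a number')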
@@ -2335,7 +2379,7 @@ def getVirusTotalUrls():
 
     except Exception as e:
         writerr(colored('ERROR getVirusTotalUrls 1: ' + str(e), 'red'))
-
+
 def processResponses():
     """
     Get archived responses from Wayback Machine (archive.org)
@@ -2431,11 +2475,11 @@ def processResponses():
     if args.capture_interval == 'none': # get all
         collapse = ''
     elif args.capture_interval == 'h': # get at most 1 capture per URL per hour
-        collapse = 'timestamp:10,original'
+        collapse = '&collapse=timestamp:10'
     elif args.capture_interval == 'd': # get at most 1 capture per URL per day
-        collapse = 'timestamp:8,original'
+        collapse = '&collapse=timestamp:8'
     elif args.capture_interval == 'm': # get at most 1 capture per URL per month
-        collapse = 'timestamp:6,original'
+        collapse = '&collapse=timestamp:6'
 
     url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}',collapse) + filterMIME + filterCode + filterLimit + filterFrom + filterTo + filterKeywords
 
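The collapse values now carry the full query-string fragment (matching the WAYBACK_URL template change above) and drop the ',original' field. The digit count is a prefix length of the 14-digit CDX timestamp (YYYYMMDDhhmmss), which is how the per-hour/day/month deduplication is achieved: 10 digits cover the hour, 8 the day, 6 the month. A small illustrative mapping (names assumed, not from waymore):

    # Assumed mapping from capture-interval options to CDX collapse fragments.
    COLLAPSE_BY_INTERVAL = {
        'none': '',                         # keep every capture
        'h': '&collapse=timestamp:10',      # YYYYMMDDhh -> at most 1 capture per URL per hour
        'd': '&collapse=timestamp:8',       # YYYYMMDD   -> at most 1 capture per URL per day
        'm': '&collapse=timestamp:6',       # YYYYMM     -> at most 1 capture per URL per month
    }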
@@ -2501,13 +2545,14 @@ def processResponses():
         except:
             pass
 
-        # Go through the response to save the links found
+        # Go through the response to save the links found
         for line in resp.iter_lines():
             try:
                 results = line.decode("utf-8")
-                timestamp = results.split(' ')[0]
-                originalUrl = results.split(' ')[1]
-                linksFoundAdd(timestamp+'/'+originalUrl)
+                parts = results.split(' ', 2)
+                timestamp = parts[0]
+                originalUrl = parts[1]
+                linksFoundResponseAdd(timestamp+'/'+originalUrl)
             except Exception as e:
                 writerr(colored(getSPACER('ERROR processResponses 3: Cannot to get link from line: '+str(line)), 'red'))
 
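Each CDX line comes back space-separated in the order requested by fl= (timestamp, original, mimetype, statuscode, digest); splitting with a maxsplit of 2 keeps only the first two fields separate and leaves the remainder in one chunk before the timestamp-prefixed link is handed to the new linksFoundResponseAdd. A quick illustration with a made-up line:

    line = b'20230115120000 https://example.com/app.js text/javascript 200 ABC123DEF456'
    results = line.decode('utf-8')

    parts = results.split(' ', 2)  # ['20230115120000', 'https://example.com/app.js', 'text/javascript 200 ABC123DEF456']
    timestamp = parts[0]
    originalUrl = parts[1]

    print(timestamp + '/' + originalUrl)  # 20230115120000/https://example.com/app.js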
@@ -2528,6 +2573,16 @@ def processResponses():
 
     # Get the total number of responses we will try to get and set the current file count to the success count
     totalResponses = len(linkRequests)
+
+    # If there are no reponses to download, diaplay an error and exit
+    if totalResponses == 0:
+        try:
+            if originalUrl:
+                writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - there were results (e.g. "'+originalUrl+'") but they didn\'t match the input you gave. Check input and try again.'), 'red'))
+        except:
+            writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - check input and try again.'), 'red'))
+        return
+
     fileCount = successCount
 
     if args.check_only:
waymore-4.2.dist-info/METADATA → waymore-4.4.dist-info/METADATA CHANGED
@@ -1,12 +1,11 @@
 Metadata-Version: 2.1
 Name: waymore
-Version: 4.2
+Version: 4.4
 Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
 Home-page: https://github.com/xnl-h4ck3r/waymore
 Author: @xnl-h4ck3r
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: argparse
 Requires-Dist: requests
 Requires-Dist: pyyaml
 Requires-Dist: termcolor
@@ -16,7 +15,7 @@ Requires-Dist: tldextract
 
 <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
 
-## About - v4.2
+## About - v4.4
 
 The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
 
waymore-4.4.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+waymore/__init__.py,sha256=bb3D2cWPj3M9gB4ePNX8nrpDuS8IImWiON1Cc_z3vGg,17
+waymore/waymore.py,sha256=cnFkODCRHd4OxxBZVMWUwus5bTZ-ypTGAK_Aa9HPd-g,169799
+waymore-4.4.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
+waymore-4.4.dist-info/METADATA,sha256=gpUxWzvVUkCmZUB_Dd-gl_8w2P9UFh5tpfyob7wMe-o,47221
+waymore-4.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+waymore-4.4.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
+waymore-4.4.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
+waymore-4.4.dist-info/RECORD,,
waymore-4.2.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
-waymore/__init__.py,sha256=KS43n250T2U7gE8jspc0cFVyTllND-0M0RkTA_yyc88,17
-waymore/waymore.py,sha256=8HdTaKE5-SzIIWWWF5kskaBVzl0FRG0DpBChR11JWjs,167332
-waymore-4.2.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
-waymore-4.2.dist-info/METADATA,sha256=uSCOfxcJDVsAO9z_in0-n6l8BOfD9jPNHiAsDuhuhz0,47245
-waymore-4.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-waymore-4.2.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
-waymore-4.2.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
-waymore-4.2.dist-info/RECORD,,