waymore 4.2__tar.gz → 4.3__tar.gz

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: waymore
-Version: 4.2
+Version: 4.3
 Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
 Home-page: https://github.com/xnl-h4ck3r/waymore
 Author: @xnl-h4ck3r
@@ -16,7 +16,7 @@ Requires-Dist: tldextract
 
 <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
 
-## About - v4.2
+## About - v4.3
 
 The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
 
@@ -1,6 +1,6 @@
 <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
 
-## About - v4.2
+## About - v4.3
 
 The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
 
@@ -0,0 +1 @@
+__version__="4.3"
@@ -83,7 +83,7 @@ argsInputHostname = ''
 responseOutputDirectory = ''
 
 # Source Provider URLs
-WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}&collapse={COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
+WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
 CCRAWL_INDEX_URL = 'https://index.commoncrawl.org/collinfo.json'
 ALIENVAULT_URL = 'https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500'
 URLSCAN_URL = 'https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}&size=10000'
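Editor's note: the WAYBACK_URL change above moves the `&collapse=` prefix out of the template, so the `{COLLAPSE}` placeholder now receives either an empty string or a complete `&collapse=timestamp:N` fragment (the matching values are updated in the capture-interval hunk further down in this diff). A rough illustration of the resulting URLs, using a hypothetical domain that is not part of this diff:

```python
# Illustrative sketch only, not code from the package: it substitutes the new
# {COLLAPSE} fragments (shown later in this diff) into the updated WAYBACK_URL.
WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'

collapse_fragments = {
    'none': '',                      # all captures: no collapse parameter at all
    'h': '&collapse=timestamp:10',   # at most 1 capture per URL per hour
    'd': '&collapse=timestamp:8',    # at most 1 capture per URL per day
    'm': '&collapse=timestamp:6',    # at most 1 capture per URL per month
}

for interval, collapse in collapse_fragments.items():
    # example.com is a placeholder domain used only for this demonstration
    url = WAYBACK_URL.replace('{DOMAIN}', 'example.com').replace('{COLLAPSE}', collapse)
    print(f'{interval}: {url}')
```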
@@ -1851,11 +1851,15 @@ def getWaybackUrls():
             session.mount('https://', HTTP_ADAPTER)
             session.mount('http://', HTTP_ADAPTER)
             resp = session.get(url+'&showNumPages=True', headers={"User-Agent":userAgent})
-            totalPages = int(resp.text.strip())
-
-            # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
-            if args.limit_requests != 0 and totalPages > args.limit_requests:
-                totalPages = args.limit_requests
+            # Try to get the total number of pages. If there is a problem, we'll return totalPages = 0 which means we'll get everything back in one request
+            try:
+                totalPages = int(resp.text.strip())
+
+                # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
+                if args.limit_requests != 0 and totalPages > args.limit_requests:
+                    totalPages = args.limit_requests
+            except:
+                totalPages = -1
         except Exception as e:
             try:
                 # If the rate limit was reached end now
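Editor's note: the hunk above wraps the `int()` conversion of the `showNumPages=True` response in a try/except, so a response that is not a plain page count makes `totalPages` fall back to -1 (treated later in this diff as "fetch everything in one request"). A minimal standalone illustration of that guard, assuming a response body string rather than waymore's actual `resp` object:

```python
# Minimal illustration of the new fallback, not the package's own code.
def parse_total_pages(body: str, limit_requests: int = 0) -> int:
    """Return the Wayback CDX page count, capped at limit_requests, or -1 if unknown."""
    try:
        total_pages = int(body.strip())
        # If a request limit was given and the page count exceeds it, cap it
        if limit_requests != 0 and total_pages > limit_requests:
            total_pages = limit_requests
        return total_pages
    except ValueError:
        # The API did not return a plain number, so the page count is unknown
        return -1

print(parse_total_pages('14'))                   # -> 14
print(parse_total_pages('14', limit_requests=5)) # -> 5
print(parse_total_pages('<html>error</html>'))   # -> -1
```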
@@ -1880,31 +1884,39 @@ def getWaybackUrls():
             else:
                 writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): ' + str(e)), 'red'))
             return
-
+
         if args.check_only:
-            checkWayback = totalPages
-            write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
+            if totalPages < 0:
+                write(colored('Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.','cyan'))
+            else:
+                checkWayback = totalPages
+                write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
         else:
             if verbose():
                 write(colored('The archive URL requested to get links: ','magenta')+colored(url+'\n','white'))
 
-            # if the page number was found then display it, but otherwise we will just try to increment until we have everything
-            write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
+            if totalPages < 0:
+                write(colored('\rGetting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...\r','cyan'))
 
-            # Get a list of all the page URLs we need to visit
-            pages = []
-            if totalPages == 1:
-                pages.append(url)
+                processWayBackPage(url)
             else:
-                for page in range(0, totalPages):
-                    pages.append(url+str(page))
-
-            # Process the URLs from web archive
-            if stopProgram is None:
-                p = mp.Pool(args.processes)
-                p.map(processWayBackPage, pages)
-                p.close()
-                p.join()
+                # if the page number was found then display it, but otherwise we will just try to increment until we have everything
+                write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
+
+                # Get a list of all the page URLs we need to visit
+                pages = []
+                if totalPages == 1:
+                    pages.append(url)
+                else:
+                    for page in range(0, totalPages):
+                        pages.append(url+str(page))
+
+                # Process the URLs from web archive
+                if stopProgram is None:
+                    p = mp.Pool(args.processes)
+                    p.map(processWayBackPage, pages)
+                    p.close()
+                    p.join()
 
         # Show the MIME types found (in case user wants to exclude more)
         if verbose() and len(linkMimes) > 0 :
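Editor's note: the net effect of this hunk is a new branch on `totalPages`. When it is -1 the links are fetched with a single call to `processWayBackPage(url)` (and the check_only path prints a note instead of a request count); otherwise one URL per page is built and handed to a multiprocessing pool as before. A rough, self-contained sketch of that strategy, with `fetch_page` standing in for waymore's `processWayBackPage` and a made-up page-suffix URL:

```python
# Self-contained sketch of the paging strategy after this change; it is not
# the package's code, and fetch_page is a stand-in for processWayBackPage.
import multiprocessing as mp

def fetch_page(page_url: str) -> None:
    print('would request', page_url)   # placeholder for the real CDX request

def get_wayback_links(url: str, total_pages: int, processes: int = 2) -> None:
    if total_pages < 0:
        # Page count unknown (new in 4.3): get everything back in one request
        fetch_page(url)
    else:
        # Page count known: one request per page, spread over a process pool
        pages = [url] if total_pages == 1 else [url + str(p) for p in range(total_pages)]
        with mp.Pool(processes) as pool:
            pool.map(fetch_page, pages)

if __name__ == '__main__':
    # Hypothetical URL ending in a page parameter, for demonstration only
    base = 'https://web.archive.org/cdx/search/cdx?url=example.com&page='
    get_wayback_links(base, -1)   # single-request fallback
    get_wayback_links(base, 3)    # paged requests: &page=0, &page=1, &page=2
```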
@@ -2431,11 +2443,11 @@ def processResponses():
         if args.capture_interval == 'none': # get all
             collapse = ''
         elif args.capture_interval == 'h': # get at most 1 capture per URL per hour
-            collapse = 'timestamp:10,original'
+            collapse = '&collapse=timestamp:10'
         elif args.capture_interval == 'd': # get at most 1 capture per URL per day
-            collapse = 'timestamp:8,original'
+            collapse = '&collapse=timestamp:8'
         elif args.capture_interval == 'm': # get at most 1 capture per URL per month
-            collapse = 'timestamp:6,original'
+            collapse = '&collapse=timestamp:6'
 
         url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}',collapse) + filterMIME + filterCode + filterLimit + filterFrom + filterTo + filterKeywords
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: waymore
-Version: 4.2
+Version: 4.3
 Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
 Home-page: https://github.com/xnl-h4ck3r/waymore
 Author: @xnl-h4ck3r
@@ -16,7 +16,7 @@ Requires-Dist: tldextract
 
 <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
 
-## About - v4.2
+## About - v4.3
 
 The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
 
@@ -1 +0,0 @@
-__version__="4.2"