waymore-4.1-py3-none-any.whl → waymore-4.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/__init__.py CHANGED
@@ -1 +1 @@
-__version__="4.1"
+__version__="4.3"
waymore/waymore.py CHANGED
@@ -83,7 +83,7 @@ argsInputHostname = ''
 responseOutputDirectory = ''
 
 # Source Provider URLs
-WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}&collapse={COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
+WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
 CCRAWL_INDEX_URL = 'https://index.commoncrawl.org/collinfo.json'
 ALIENVAULT_URL = 'https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500'
 URLSCAN_URL = 'https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}&size=10000'
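The only functional change in this block: the Wayback Machine CDX template no longer hard-codes the `&collapse=` key, so whatever replaces `{COLLAPSE}` must now carry the whole query fragment, or nothing at all. A minimal sketch of the difference (templates abridged, `example.com` as a stand-in domain):

    # Abridged copies of the old and new templates; only {COLLAPSE} differs.
    OLD = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}&collapse={COLLAPSE}&fl=timestamp,original'
    NEW = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original'

    collapse = ''  # what waymore builds when args.capture_interval == 'none'
    print(OLD.replace('{DOMAIN}', 'example.com').replace('{COLLAPSE}', collapse))
    # ...url=example.com&collapse=&fl=...  <- dangling empty collapse parameter
    print(NEW.replace('{DOMAIN}', 'example.com').replace('{COLLAPSE}', collapse))
    # ...url=example.com&fl=...            <- parameter omitted entirely

The counterpart change, where the collapse values gain their own `&collapse=` prefix, is in the capture-interval hunk further down.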
@@ -1780,23 +1780,31 @@ def processWayBackPage(url):
                 return
 
             # Get the URLs and MIME types. Each line is a separate JSON string
-            for line in resp.iter_lines():
-                results = line.decode("utf-8")
-                # Only get MIME Types if --verbose option was selected
-                if verbose():
-                    try:
-                        linkMimes.add(str(results).split(' ')[2])
-                    except Exception as e:
-                        if verbose():
-                            writerr(colored(getSPACER('ERROR processWayBackPage 2: Cannot get MIME type from line: ' + str(line)),'red'))
-                            write(resp.text)
-                try:
+            try:
+                for line in resp.iter_lines():
+                    results = line.decode("utf-8")
                     foundUrl = fixArchiveOrgUrl(str(results).split(' ')[1])
-                    linksFoundAdd(foundUrl)
-                except Exception as e:
-                    if verbose():
-                        writerr(colored(getSPACER('ERROR processWayBackPage 3: Cannot get link from line: ' + str(line)),'red'))
-                        write(resp.text)
+
+                    # Check the URL exclusions
+                    match = re.search(r'('+re.escape(FILTER_URL).replace(',','|')+')', foundUrl, flags=re.IGNORECASE)
+                    if match is None:
+                        # Only get MIME Types if --verbose option was selected
+                        if verbose():
+                            try:
+                                linkMimes.add(str(results).split(' ')[2])
+                            except Exception as e:
+                                if verbose():
+                                    writerr(colored(getSPACER('ERROR processWayBackPage 2: Cannot get MIME type from line: ' + str(line)),'red'))
+                                    write(resp.text)
+                        try:
+                            linksFoundAdd(foundUrl)
+                        except Exception as e:
+                            if verbose():
+                                writerr(colored(getSPACER('ERROR processWayBackPage 3: Cannot get link from line: ' + str(line)),'red'))
+                                write(resp.text)
+            except Exception as e:
+                if verbose():
+                    writerr(colored(getSPACER('ERROR processWayBackPage 4: ' + str(line)),'red'))
         else:
             pass
     except Exception as e:
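Two changes here: the whole CDX parsing loop is now wrapped in a catch-all `try` (reported as `processWayBackPage 4`), and each URL is checked against waymore's `FILTER_URL` exclusion list before its MIME type and link are recorded. The check compiles the comma-separated list into one case-insensitive alternation. A self-contained sketch of that technique, with a hypothetical three-entry list standing in for waymore's real default:

    import re

    # Hypothetical stand-in for waymore's comma-separated FILTER_URL setting.
    FILTER_URL = '.css,.jpg,.woff'

    def is_excluded(found_url):
        # re.escape() (Python 3.7+) escapes the dots but leaves the commas
        # untouched, so replacing ',' with '|' yields \.css|\.jpg|\.woff
        pattern = r'(' + re.escape(FILTER_URL).replace(',', '|') + r')'
        return re.search(pattern, found_url, flags=re.IGNORECASE) is not None

    print(is_excluded('https://example.com/style.CSS'))  # True (case-insensitive)
    print(is_excluded('https://example.com/api/users'))  # False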
@@ -1843,11 +1851,15 @@ def getWaybackUrls():
         session.mount('https://', HTTP_ADAPTER)
         session.mount('http://', HTTP_ADAPTER)
         resp = session.get(url+'&showNumPages=True', headers={"User-Agent":userAgent})
-        totalPages = int(resp.text.strip())
-
-        # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
-        if args.limit_requests != 0 and totalPages > args.limit_requests:
-            totalPages = args.limit_requests
+        # Try to get the total number of pages. If there is a problem, we'll return totalPages = -1 which means we'll get everything back in one request
+        try:
+            totalPages = int(resp.text.strip())
+
+            # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
+            if args.limit_requests != 0 and totalPages > args.limit_requests:
+                totalPages = args.limit_requests
+        except:
+            totalPages = -1
     except Exception as e:
         try:
             # If the rate limit was reached end now
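Previously a non-numeric body from the `&showNumPages=True` query raised an unhandled exception; now it falls back to the sentinel `-1`, which later code treats as "retrieve everything in one request". The same parse-with-fallback pattern as a standalone sketch, with `parse_total_pages` as a hypothetical helper name:

    def parse_total_pages(resp_text, limit_requests=0):
        # Return the page count (capped at limit_requests when set), or the
        # sentinel -1 meaning "retrieve all results in a single request".
        try:
            total_pages = int(resp_text.strip())
            if limit_requests != 0 and total_pages > limit_requests:
                total_pages = limit_requests
            return total_pages
        except ValueError:
            return -1

    print(parse_total_pages('12'))                    # 12
    print(parse_total_pages('12', limit_requests=5))  # 5
    print(parse_total_pages('<html>error</html>'))    # -1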
@@ -1872,31 +1884,39 @@ def getWaybackUrls():
             else:
                 writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): ' + str(e)), 'red'))
             return
-
+
     if args.check_only:
-        checkWayback = totalPages
-        write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
+        if totalPages < 0:
+            write(colored('Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.','cyan'))
+        else:
+            checkWayback = totalPages
+            write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
     else:
         if verbose():
             write(colored('The archive URL requested to get links: ','magenta')+colored(url+'\n','white'))
 
-        # if the page number was found then display it, but otherwise we will just try to increment until we have everything
-        write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
+        if totalPages < 0:
+            write(colored('\rGetting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...\r','cyan'))
 
-        # Get a list of all the page URLs we need to visit
-        pages = []
-        if totalPages == 1:
-            pages.append(url)
+            processWayBackPage(url)
         else:
-            for page in range(0, totalPages):
-                pages.append(url+str(page))
-
-        # Process the URLs from web archive
-        if stopProgram is None:
-            p = mp.Pool(args.processes)
-            p.map(processWayBackPage, pages)
-            p.close()
-            p.join()
+            # if the page number was found then display it, but otherwise we will just try to increment until we have everything
+            write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
+
+            # Get a list of all the page URLs we need to visit
+            pages = []
+            if totalPages == 1:
+                pages.append(url)
+            else:
+                for page in range(0, totalPages):
+                    pages.append(url+str(page))
+
+            # Process the URLs from web archive
+            if stopProgram is None:
+                p = mp.Pool(args.processes)
+                p.map(processWayBackPage, pages)
+                p.close()
+                p.join()
 
     # Show the MIME types found (in case user wants to exclude more)
     if verbose() and len(linkMimes) > 0 :
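The retrieval logic now branches on that sentinel: `totalPages < 0` triggers a single `processWayBackPage(url)` call, while a known page count still fans the page URLs out over a `multiprocessing` pool exactly as before. A runnable sketch of the dispatch pattern, with `process_page` and `fetch_all` as hypothetical stand-ins (the real base URL ends in `page=`, which is why the page number is appended directly):

    import multiprocessing as mp

    def process_page(page_url):
        # Stand-in for waymore's processWayBackPage() worker.
        print('fetching ' + page_url)

    def fetch_all(base_url, total_pages, processes=4):
        if total_pages < 0:
            process_page(base_url)  # sentinel: one unpaginated request
            return
        # One URL per page, processed in parallel.
        pages = [base_url + str(p) for p in range(total_pages)]
        p = mp.Pool(processes)
        p.map(process_page, pages)
        p.close()
        p.join()

    if __name__ == '__main__':  # needed for multiprocessing on spawn platforms
        fetch_all('https://web.archive.org/cdx/search/cdx?url=example.com&page=', 3)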
@@ -2422,12 +2442,12 @@ def processResponses():
     # This is useful for filtering out captures that are 'too dense' or when looking for unique captures."
     if args.capture_interval == 'none': # get all
         collapse = ''
-    elif args.capture_interval == 'h': # get at most 1 capture per hour
-        collapse = 'timestamp:10'
-    elif args.capture_interval == 'd': # get at most 1 capture per day
-        collapse = 'timestamp:8'
-    elif args.capture_interval == 'm': # get at most 1 capture per month
-        collapse = 'timestamp:6'
+    elif args.capture_interval == 'h': # get at most 1 capture per URL per hour
+        collapse = '&collapse=timestamp:10'
+    elif args.capture_interval == 'd': # get at most 1 capture per URL per day
+        collapse = '&collapse=timestamp:8'
+    elif args.capture_interval == 'm': # get at most 1 capture per URL per month
+        collapse = '&collapse=timestamp:6'
 
     url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}',collapse) + filterMIME + filterCode + filterLimit + filterFrom + filterTo + filterKeywords
 
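The interval values collapse captures on a prefix of the 14-digit CDX timestamp (`YYYYMMDDhhmmss`): the first 10, 8, or 6 digits identify an hour, day, or month, so at most one capture per URL survives per period. Each value now also carries its own `&collapse=` prefix to match the new `WAYBACK_URL` template above. The mapping as a lookup-table sketch, with `build_collapse` as a hypothetical helper:

    # Prefix length of the YYYYMMDDhhmmss timestamp that captures collapse on.
    COLLAPSE_FRAGMENTS = {
        'none': '',                     # keep every capture
        'h': '&collapse=timestamp:10',  # YYYYMMDDhh -> one per URL per hour
        'd': '&collapse=timestamp:8',   # YYYYMMDD   -> one per URL per day
        'm': '&collapse=timestamp:6',   # YYYYMM     -> one per URL per month
    }

    def build_collapse(capture_interval):
        return COLLAPSE_FRAGMENTS.get(capture_interval, '')

    print(build_collapse('d'))  # &collapse=timestamp:8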
waymore-4.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: waymore
-Version: 4.1
+Version: 4.3
 Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
 Home-page: https://github.com/xnl-h4ck3r/waymore
 Author: @xnl-h4ck3r
@@ -16,7 +16,7 @@ Requires-Dist: tldextract
 
 <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
 
-## About - v4.1
+## About - v4.3
 
 The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
 
waymore-4.3.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+waymore/__init__.py,sha256=-TFFyw9iukscHpq2I58oEz2oLQ995GbajzvC6Iz9ddM,17
+waymore/waymore.py,sha256=SWTqBUa-btDe6cWjRcL3w-ef1uK45LpfztCgvgtQPSM,168145
+waymore-4.3.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
+waymore-4.3.dist-info/METADATA,sha256=q7_uq3p1kLMMARqUWTnA33rhxUluolsyAykMv7Ot598,47245
+waymore-4.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+waymore-4.3.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
+waymore-4.3.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
+waymore-4.3.dist-info/RECORD,,
waymore-4.1.dist-info/RECORD REMOVED
@@ -1,8 +0,0 @@
-waymore/__init__.py,sha256=W33igiZPTC8XKkjVNoDhbalnEpvc0URlY2v823612DE,17
-waymore/waymore.py,sha256=0BC0QZbyQap8f8xMp9_IXYG6yC_ZC9Hgi6L7_FER4Tg,166754
-waymore-4.1.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
-waymore-4.1.dist-info/METADATA,sha256=ePRZd4A8GtQQHFlsRDCcPPW6GvrRe1XcgHDoYVav-qI,47245
-waymore-4.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-waymore-4.1.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
-waymore-4.1.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
-waymore-4.1.dist-info/RECORD,,