waymore-4.1-py3-none-any.whl → waymore-4.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/__init__.py CHANGED
@@ -1 +1 @@
-__version__="4.1"
+__version__="4.3"
waymore/waymore.py CHANGED
@@ -83,7 +83,7 @@ argsInputHostname = ''
 responseOutputDirectory = ''
 
 # Source Provider URLs
-WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}&collapse={COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
+WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
 CCRAWL_INDEX_URL = 'https://index.commoncrawl.org/collinfo.json'
 ALIENVAULT_URL = 'https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500'
 URLSCAN_URL = 'https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}&size=10000'
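The only functional change in this block: the Wayback Machine CDX template no longer hard-codes the `&collapse=` key, so whatever replaces `{COLLAPSE}` must now carry the whole query fragment, or nothing at all. A minimal sketch of the difference (templates abridged, `example.com` as a stand-in domain):

    # Abridged copies of the old and new templates; only {COLLAPSE} differs.
    OLD = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}&collapse={COLLAPSE}&fl=timestamp,original'
    NEW = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original'

    collapse = ''  # what waymore builds when args.capture_interval == 'none'
    print(OLD.replace('{DOMAIN}', 'example.com').replace('{COLLAPSE}', collapse))
    # ...url=example.com&collapse=&fl=...  <- dangling empty collapse parameter
    print(NEW.replace('{DOMAIN}', 'example.com').replace('{COLLAPSE}', collapse))
    # ...url=example.com&fl=...            <- parameter omitted entirely

The counterpart change, where the collapse values gain their own `&collapse=` prefix, is in the capture-interval hunk further down.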
@@ -1780,23 +1780,31 @@ def processWayBackPage(url):
                 return
 
             # Get the URLs and MIME types. Each line is a separate JSON string
-            for line in resp.iter_lines():
-                results = line.decode("utf-8")
-                # Only get MIME Types if --verbose option was selected
-                if verbose():
-                    try:
-                        linkMimes.add(str(results).split(' ')[2])
-                    except Exception as e:
-                        if verbose():
-                            writerr(colored(getSPACER('ERROR processWayBackPage 2: Cannot get MIME type from line: ' + str(line)),'red'))
-                            write(resp.text)
-                try:
+            try:
+                for line in resp.iter_lines():
+                    results = line.decode("utf-8")
                     foundUrl = fixArchiveOrgUrl(str(results).split(' ')[1])
-                    linksFoundAdd(foundUrl)
-                except Exception as e:
-                    if verbose():
-                        writerr(colored(getSPACER('ERROR processWayBackPage 3: Cannot get link from line: ' + str(line)),'red'))
-                        write(resp.text)
+
+                    # Check the URL exclusions
+                    match = re.search(r'('+re.escape(FILTER_URL).replace(',','|')+')', foundUrl, flags=re.IGNORECASE)
+                    if match is None:
+                        # Only get MIME Types if --verbose option was selected
+                        if verbose():
+                            try:
+                                linkMimes.add(str(results).split(' ')[2])
+                            except Exception as e:
+                                if verbose():
+                                    writerr(colored(getSPACER('ERROR processWayBackPage 2: Cannot get MIME type from line: ' + str(line)),'red'))
+                                    write(resp.text)
+                        try:
+                            linksFoundAdd(foundUrl)
+                        except Exception as e:
+                            if verbose():
+                                writerr(colored(getSPACER('ERROR processWayBackPage 3: Cannot get link from line: ' + str(line)),'red'))
+                                write(resp.text)
+            except Exception as e:
+                if verbose():
+                    writerr(colored(getSPACER('ERROR processWayBackPage 4: ' + str(line)),'red'))
         else:
             pass
     except Exception as e:
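Two changes here: the whole CDX parsing loop is now wrapped in a catch-all `try` (reported as `processWayBackPage 4`), and each URL is checked against waymore's `FILTER_URL` exclusion list before its MIME type and link are recorded. The check compiles the comma-separated list into one case-insensitive alternation. A self-contained sketch of that technique, with a hypothetical three-entry list standing in for waymore's real default:

    import re

    # Hypothetical stand-in for waymore's comma-separated FILTER_URL setting.
    FILTER_URL = '.css,.jpg,.woff'

    def is_excluded(found_url):
        # re.escape() (Python 3.7+) escapes the dots but leaves the commas
        # untouched, so replacing ',' with '|' yields \.css|\.jpg|\.woff
        pattern = r'(' + re.escape(FILTER_URL).replace(',', '|') + r')'
        return re.search(pattern, found_url, flags=re.IGNORECASE) is not None

    print(is_excluded('https://example.com/style.CSS'))  # True (case-insensitive)
    print(is_excluded('https://example.com/api/users'))  # False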
@@ -1843,11 +1851,15 @@ def getWaybackUrls():
         session.mount('https://', HTTP_ADAPTER)
         session.mount('http://', HTTP_ADAPTER)
         resp = session.get(url+'&showNumPages=True', headers={"User-Agent":userAgent})
-        totalPages = int(resp.text.strip())
-
-        # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
-        if args.limit_requests != 0 and totalPages > args.limit_requests:
-            totalPages = args.limit_requests
+        # Try to get the total number of pages. If there is a problem, we'll return totalPages = -1 which means we'll get everything back in one request
+        try:
+            totalPages = int(resp.text.strip())
+
+            # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
+            if args.limit_requests != 0 and totalPages > args.limit_requests:
+                totalPages = args.limit_requests
+        except:
+            totalPages = -1
     except Exception as e:
         try:
             # If the rate limit was reached end now
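Previously a non-numeric body from the `&showNumPages=True` query raised an unhandled exception; now it falls back to the sentinel `-1`, which later code treats as "retrieve everything in one request". The same parse-with-fallback pattern as a standalone sketch, with `parse_total_pages` as a hypothetical helper name:

    def parse_total_pages(resp_text, limit_requests=0):
        # Return the page count (capped at limit_requests when set), or the
        # sentinel -1 meaning "retrieve all results in a single request".
        try:
            total_pages = int(resp_text.strip())
            if limit_requests != 0 and total_pages > limit_requests:
                total_pages = limit_requests
            return total_pages
        except ValueError:
            return -1

    print(parse_total_pages('12'))                    # 12
    print(parse_total_pages('12', limit_requests=5))  # 5
    print(parse_total_pages('<html>error</html>'))    # -1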
@@ -1872,31 +1884,39 @@ def getWaybackUrls():
             else:
                 writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): ' + str(e)), 'red'))
             return
-
+
     if args.check_only:
-        checkWayback = totalPages
-        write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
+        if totalPages < 0:
+            write(colored('Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.','cyan'))
+        else:
+            checkWayback = totalPages
+            write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
     else:
         if verbose():
             write(colored('The archive URL requested to get links: ','magenta')+colored(url+'\n','white'))
 
-        # if the page number was found then display it, but otherwise we will just try to increment until we have everything
-        write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
+        if totalPages < 0:
+            write(colored('\rGetting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...\r','cyan'))
 
-        # Get a list of all the page URLs we need to visit
-        pages = []
-        if totalPages == 1:
-            pages.append(url)
+            processWayBackPage(url)
         else:
-            for page in range(0, totalPages):
-                pages.append(url+str(page))
-
-        # Process the URLs from web archive
-        if stopProgram is None:
-            p = mp.Pool(args.processes)
-            p.map(processWayBackPage, pages)
-            p.close()
-            p.join()
+            # if the page number was found then display it, but otherwise we will just try to increment until we have everything
+            write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
+
+            # Get a list of all the page URLs we need to visit
+            pages = []
+            if totalPages == 1:
+                pages.append(url)
+            else:
+                for page in range(0, totalPages):
+                    pages.append(url+str(page))
+
+            # Process the URLs from web archive
+            if stopProgram is None:
+                p = mp.Pool(args.processes)
+                p.map(processWayBackPage, pages)
+                p.close()
+                p.join()
 
     # Show the MIME types found (in case user wants to exclude more)
     if verbose() and len(linkMimes) > 0 :
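The retrieval logic now branches on that sentinel: `totalPages < 0` triggers a single `processWayBackPage(url)` call, while a known page count still fans the page URLs out over a `multiprocessing` pool exactly as before. A runnable sketch of the dispatch pattern, with `process_page` and `fetch_all` as hypothetical stand-ins (the real base URL ends in `page=`, which is why the page number is appended directly):

    import multiprocessing as mp

    def process_page(page_url):
        # Stand-in for waymore's processWayBackPage() worker.
        print('fetching ' + page_url)

    def fetch_all(base_url, total_pages, processes=4):
        if total_pages < 0:
            process_page(base_url)  # sentinel: one unpaginated request
            return
        # One URL per page, processed in parallel.
        pages = [base_url + str(p) for p in range(total_pages)]
        p = mp.Pool(processes)
        p.map(process_page, pages)
        p.close()
        p.join()

    if __name__ == '__main__':  # needed for multiprocessing on spawn platforms
        fetch_all('https://web.archive.org/cdx/search/cdx?url=example.com&page=', 3)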
@@ -2422,12 +2442,12 @@ def processResponses():
     # This is useful for filtering out captures that are 'too dense' or when looking for unique captures."
     if args.capture_interval == 'none': # get all
         collapse = ''
-    elif args.capture_interval == 'h': # get at most 1 capture per hour
-        collapse = 'timestamp:10'
-    elif args.capture_interval == 'd': # get at most 1 capture per day
-        collapse = 'timestamp:8'
-    elif args.capture_interval == 'm': # get at most 1 capture per month
-        collapse = 'timestamp:6'
+    elif args.capture_interval == 'h': # get at most 1 capture per URL per hour
+        collapse = '&collapse=timestamp:10'
+    elif args.capture_interval == 'd': # get at most 1 capture per URL per day
+        collapse = '&collapse=timestamp:8'
+    elif args.capture_interval == 'm': # get at most 1 capture per URL per month
+        collapse = '&collapse=timestamp:6'
 
     url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}',collapse) + filterMIME + filterCode + filterLimit + filterFrom + filterTo + filterKeywords
 
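The interval values collapse captures on a prefix of the 14-digit CDX timestamp (`YYYYMMDDhhmmss`): the first 10, 8, or 6 digits identify an hour, day, or month, so at most one capture per URL survives per period. Each value now also carries its own `&collapse=` prefix to match the new `WAYBACK_URL` template above. The mapping as a lookup-table sketch, with `build_collapse` as a hypothetical helper:

    # Prefix length of the YYYYMMDDhhmmss timestamp that captures collapse on.
    COLLAPSE_FRAGMENTS = {
        'none': '',                     # keep every capture
        'h': '&collapse=timestamp:10',  # YYYYMMDDhh -> one per URL per hour
        'd': '&collapse=timestamp:8',   # YYYYMMDD   -> one per URL per day
        'm': '&collapse=timestamp:6',   # YYYYMM     -> one per URL per month
    }

    def build_collapse(capture_interval):
        return COLLAPSE_FRAGMENTS.get(capture_interval, '')

    print(build_collapse('d'))  # &collapse=timestamp:8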
waymore-4.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: waymore
-Version: 4.1
+Version: 4.3
 Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
 Home-page: https://github.com/xnl-h4ck3r/waymore
 Author: @xnl-h4ck3r
@@ -16,7 +16,7 @@ Requires-Dist: tldextract
 
 <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
 
-## About - v4.1
+## About - v4.3
 
 The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
 
waymore-4.3.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+waymore/__init__.py,sha256=-TFFyw9iukscHpq2I58oEz2oLQ995GbajzvC6Iz9ddM,17
+waymore/waymore.py,sha256=SWTqBUa-btDe6cWjRcL3w-ef1uK45LpfztCgvgtQPSM,168145
+waymore-4.3.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
+waymore-4.3.dist-info/METADATA,sha256=q7_uq3p1kLMMARqUWTnA33rhxUluolsyAykMv7Ot598,47245
+waymore-4.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+waymore-4.3.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
+waymore-4.3.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
+waymore-4.3.dist-info/RECORD,,
waymore-4.1.dist-info/RECORD REMOVED
@@ -1,8 +0,0 @@
-waymore/__init__.py,sha256=W33igiZPTC8XKkjVNoDhbalnEpvc0URlY2v823612DE,17
-waymore/waymore.py,sha256=0BC0QZbyQap8f8xMp9_IXYG6yC_ZC9Hgi6L7_FER4Tg,166754
-waymore-4.1.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
-waymore-4.1.dist-info/METADATA,sha256=ePRZd4A8GtQQHFlsRDCcPPW6GvrRe1XcgHDoYVav-qI,47245
-waymore-4.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-waymore-4.1.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
-waymore-4.1.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
-waymore-4.1.dist-info/RECORD,,