waymore 4.2__tar.gz → 4.3__tar.gz

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: waymore
-Version: 4.2
+Version: 4.3
 Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
 Home-page: https://github.com/xnl-h4ck3r/waymore
 Author: @xnl-h4ck3r
@@ -16,7 +16,7 @@ Requires-Dist: tldextract
 
 <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
 
-## About - v4.2
+## About - v4.3
 
 The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
 
@@ -1,6 +1,6 @@
 <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
 
-## About - v4.2
+## About - v4.3
 
 The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
 
@@ -0,0 +1 @@
+__version__="4.3"
@@ -83,7 +83,7 @@ argsInputHostname = ''
 responseOutputDirectory = ''
 
 # Source Provider URLs
-WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}&collapse={COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
+WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
 CCRAWL_INDEX_URL = 'https://index.commoncrawl.org/collinfo.json'
 ALIENVAULT_URL = 'https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500'
 URLSCAN_URL = 'https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}&size=10000'
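Editor's note: the WAYBACK_URL change above moves the `&collapse=` prefix out of the template, so the `{COLLAPSE}` placeholder now receives either an empty string or a complete `&collapse=timestamp:N` fragment (the matching values are updated in the capture-interval hunk further down in this diff). A rough illustration of the resulting URLs, using a hypothetical domain that is not part of this diff:

```python
# Illustrative sketch only, not code from the package: it substitutes the new
# {COLLAPSE} fragments (shown later in this diff) into the updated WAYBACK_URL.
WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'

collapse_fragments = {
    'none': '',                      # all captures: no collapse parameter at all
    'h': '&collapse=timestamp:10',   # at most 1 capture per URL per hour
    'd': '&collapse=timestamp:8',    # at most 1 capture per URL per day
    'm': '&collapse=timestamp:6',    # at most 1 capture per URL per month
}

for interval, collapse in collapse_fragments.items():
    # example.com is a placeholder domain used only for this demonstration
    url = WAYBACK_URL.replace('{DOMAIN}', 'example.com').replace('{COLLAPSE}', collapse)
    print(f'{interval}: {url}')
```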
@@ -1851,11 +1851,15 @@ def getWaybackUrls():
             session.mount('https://', HTTP_ADAPTER)
             session.mount('http://', HTTP_ADAPTER)
             resp = session.get(url+'&showNumPages=True', headers={"User-Agent":userAgent})
-            totalPages = int(resp.text.strip())
-
-            # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
-            if args.limit_requests != 0 and totalPages > args.limit_requests:
-                totalPages = args.limit_requests
+            # Try to get the total number of pages. If there is a problem, we'll return totalPages = 0 which means we'll get everything back in one request
+            try:
+                totalPages = int(resp.text.strip())
+
+                # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
+                if args.limit_requests != 0 and totalPages > args.limit_requests:
+                    totalPages = args.limit_requests
+            except:
+                totalPages = -1
         except Exception as e:
             try:
                 # If the rate limit was reached end now
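Editor's note: the hunk above wraps the `int()` conversion of the `showNumPages=True` response in a try/except, so a response that is not a plain page count makes `totalPages` fall back to -1 (treated later in this diff as "fetch everything in one request"). A minimal standalone illustration of that guard, assuming a response body string rather than waymore's actual `resp` object:

```python
# Minimal illustration of the new fallback, not the package's own code.
def parse_total_pages(body: str, limit_requests: int = 0) -> int:
    """Return the Wayback CDX page count, capped at limit_requests, or -1 if unknown."""
    try:
        total_pages = int(body.strip())
        # If a request limit was given and the page count exceeds it, cap it
        if limit_requests != 0 and total_pages > limit_requests:
            total_pages = limit_requests
        return total_pages
    except ValueError:
        # The API did not return a plain number, so the page count is unknown
        return -1

print(parse_total_pages('14'))                   # -> 14
print(parse_total_pages('14', limit_requests=5)) # -> 5
print(parse_total_pages('<html>error</html>'))   # -> -1
```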
@@ -1880,31 +1884,39 @@ def getWaybackUrls():
             else:
                 writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): ' + str(e)), 'red'))
             return
-
+
         if args.check_only:
-            checkWayback = totalPages
-            write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
+            if totalPages < 0:
+                write(colored('Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.','cyan'))
+            else:
+                checkWayback = totalPages
+                write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
         else:
             if verbose():
                 write(colored('The archive URL requested to get links: ','magenta')+colored(url+'\n','white'))
 
-            # if the page number was found then display it, but otherwise we will just try to increment until we have everything
-            write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
+            if totalPages < 0:
+                write(colored('\rGetting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...\r','cyan'))
 
-            # Get a list of all the page URLs we need to visit
-            pages = []
-            if totalPages == 1:
-                pages.append(url)
+                processWayBackPage(url)
             else:
-                for page in range(0, totalPages):
-                    pages.append(url+str(page))
-
-            # Process the URLs from web archive
-            if stopProgram is None:
-                p = mp.Pool(args.processes)
-                p.map(processWayBackPage, pages)
-                p.close()
-                p.join()
+                # if the page number was found then display it, but otherwise we will just try to increment until we have everything
+                write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
+
+                # Get a list of all the page URLs we need to visit
+                pages = []
+                if totalPages == 1:
+                    pages.append(url)
+                else:
+                    for page in range(0, totalPages):
+                        pages.append(url+str(page))
+
+                # Process the URLs from web archive
+                if stopProgram is None:
+                    p = mp.Pool(args.processes)
+                    p.map(processWayBackPage, pages)
+                    p.close()
+                    p.join()
 
         # Show the MIME types found (in case user wants to exclude more)
         if verbose() and len(linkMimes) > 0 :
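Editor's note: the net effect of this hunk is a new branch on `totalPages`. When it is -1 the links are fetched with a single call to `processWayBackPage(url)` (and the check_only path prints a note instead of a request count); otherwise one URL per page is built and handed to a multiprocessing pool as before. A rough, self-contained sketch of that strategy, with `fetch_page` standing in for waymore's `processWayBackPage` and a made-up page-suffix URL:

```python
# Self-contained sketch of the paging strategy after this change; it is not
# the package's code, and fetch_page is a stand-in for processWayBackPage.
import multiprocessing as mp

def fetch_page(page_url: str) -> None:
    print('would request', page_url)   # placeholder for the real CDX request

def get_wayback_links(url: str, total_pages: int, processes: int = 2) -> None:
    if total_pages < 0:
        # Page count unknown (new in 4.3): get everything back in one request
        fetch_page(url)
    else:
        # Page count known: one request per page, spread over a process pool
        pages = [url] if total_pages == 1 else [url + str(p) for p in range(total_pages)]
        with mp.Pool(processes) as pool:
            pool.map(fetch_page, pages)

if __name__ == '__main__':
    # Hypothetical URL ending in a page parameter, for demonstration only
    base = 'https://web.archive.org/cdx/search/cdx?url=example.com&page='
    get_wayback_links(base, -1)   # single-request fallback
    get_wayback_links(base, 3)    # paged requests: &page=0, &page=1, &page=2
```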
@@ -2431,11 +2443,11 @@ def processResponses():
         if args.capture_interval == 'none': # get all
             collapse = ''
         elif args.capture_interval == 'h': # get at most 1 capture per URL per hour
-            collapse = 'timestamp:10,original'
+            collapse = '&collapse=timestamp:10'
         elif args.capture_interval == 'd': # get at most 1 capture per URL per day
-            collapse = 'timestamp:8,original'
+            collapse = '&collapse=timestamp:8'
         elif args.capture_interval == 'm': # get at most 1 capture per URL per month
-            collapse = 'timestamp:6,original'
+            collapse = '&collapse=timestamp:6'
 
         url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}',collapse) + filterMIME + filterCode + filterLimit + filterFrom + filterTo + filterKeywords
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: waymore
-Version: 4.2
+Version: 4.3
 Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
 Home-page: https://github.com/xnl-h4ck3r/waymore
 Author: @xnl-h4ck3r
@@ -16,7 +16,7 @@ Requires-Dist: tldextract
 
 <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
 
-## About - v4.2
+## About - v4.3
 
 The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
 
@@ -1 +0,0 @@
-__version__="4.2"