waymore 4.2-py3-none-any.whl → 4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waymore/__init__.py CHANGED
@@ -1 +1 @@
-__version__="4.2"
+__version__="4.4"
waymore/waymore.py CHANGED
@@ -83,7 +83,7 @@ argsInputHostname = ''
 responseOutputDirectory = ''
 
 # Source Provider URLs
-WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}&collapse={COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
+WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'
 CCRAWL_INDEX_URL = 'https://index.commoncrawl.org/collinfo.json'
 ALIENVAULT_URL = 'https://otx.alienvault.com/api/v1/indicators/{TYPE}/{DOMAIN}/url_list?limit=500'
 URLSCAN_URL = 'https://urlscan.io/api/v1/search/?q=domain:{DOMAIN}&size=10000'
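
The template change means the {COLLAPSE} placeholder now stands in for the whole optional collapse parameter (including the leading '&collapse='), so an empty value drops it from the query string entirely. A minimal sketch of how the new template expands; the helper and example domain below are illustrative, not waymore's code:

    WAYBACK_URL = 'https://web.archive.org/cdx/search/cdx?url={DOMAIN}{COLLAPSE}&fl=timestamp,original,mimetype,statuscode,digest'

    def build_cdx_url(domain, collapse=''):
        # collapse is either '' or a complete '&collapse=timestamp:N' fragment
        return WAYBACK_URL.replace('{DOMAIN}', domain).replace('{COLLAPSE}', collapse)

    print(build_cdx_url('example.com/*'))                           # no collapse parameter at all
    print(build_cdx_url('example.com/*', '&collapse=timestamp:8'))  # at most one capture per URL per day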
@@ -706,23 +706,55 @@ def fixArchiveOrgUrl(url):
         url = url[0:newline]
     return url
 
+# Add a link to the linksFound collection for archived responses (included timestamp preifx)
+def linksFoundResponseAdd(link):
+    global linksFound, argsInput, argsInputHostname
+
+    try:
+        if inputIsDomainANDPath:
+            checkInput = argsInput
+        else:
+            checkInput = argsInputHostname
+
+        # Remove the timestamp
+        linkWithoutTimestamp = link.split('/', 1)[-1]
+
+        # If the link specifies port 80 or 443, e.g. http://example.com:80, then remove the port
+        parsed = urlparse(linkWithoutTimestamp.strip())
+        if parsed.port in (80, 443):
+            new_netloc = parsed.hostname
+            parsed_url = parsed._replace(netloc=new_netloc).geturl()
+        else:
+            parsed_url = linkWithoutTimestamp
+
+        # Don't write it if the link does not contain the requested domain (this can sometimes happen)
+        if parsed_url.find(checkInput) >= 0:
+            linksFound.add(link)
+    except Exception as e:
+        linksFound.add(link)
+
 # Add a link to the linksFound collection
 def linksFoundAdd(link):
     global linksFound, argsInput, argsInputHostname
-    # If the link specifies port 80 or 443, e.g. http://example.com:80, then remove the port
+
     try:
         if inputIsDomainANDPath:
             checkInput = argsInput
         else:
             checkInput = argsInputHostname
+
+        # If the link specifies port 80 or 443, e.g. http://example.com:80, then remove the port
+        parsed = urlparse(link.strip())
+        if parsed.port in (80, 443):
+            new_netloc = parsed.hostname
+            parsed_url = parsed._replace(netloc=new_netloc).geturl()
+        else:
+            parsed_url = link
+
         # Don't write it if the link does not contain the requested domain (this can sometimes happen)
-        if link.find(checkInput) >= 0:
-            parsed = urlparse(link.strip())
-            if parsed.netloc.find(':80') >= 0 or parsed.netloc.fnd(':443') >= 0:
-                newNetloc = parsed.netloc.split(':')[0]
-                parsed = parsed._replace(netloc=newNetloc).geturl()
-            linksFound.add(parsed)
-    except:
+        if parsed_url.find(checkInput) >= 0:
+            linksFound.add(link)
+    except Exception as e:
         linksFound.add(link)
 
 def processArchiveUrl(url):
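
Both linksFoundAdd and the new linksFoundResponseAdd now normalize explicit default ports with urllib.parse instead of the old substring search (note the parsed.netloc.fnd typo in the removed code). A standalone sketch of just that normalization step, with made-up example links:

    from urllib.parse import urlparse

    def strip_default_port(link):
        # Drop an explicit :80 or :443 so e.g. http://example.com:80/a compares equal to http://example.com/a
        parsed = urlparse(link.strip())
        if parsed.port in (80, 443):
            return parsed._replace(netloc=parsed.hostname).geturl()
        return link

    print(strip_default_port('http://example.com:80/login'))     # http://example.com/login
    print(strip_default_port('https://example.com:8443/admin'))  # unchanged, non-default port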
@@ -1851,11 +1883,15 @@ def getWaybackUrls():
         session.mount('https://', HTTP_ADAPTER)
         session.mount('http://', HTTP_ADAPTER)
         resp = session.get(url+'&showNumPages=True', headers={"User-Agent":userAgent})
-        totalPages = int(resp.text.strip())
-
-        # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
-        if args.limit_requests != 0 and totalPages > args.limit_requests:
-            totalPages = args.limit_requests
+        # Try to get the total number of pages. If there is a problem, we'll return totalPages = 0 which means we'll get everything back in one request
+        try:
+            totalPages = int(resp.text.strip())
+
+            # If the argument to limit the requests was passed and the total pages is larger than that, set to the limit
+            if args.limit_requests != 0 and totalPages > args.limit_requests:
+                totalPages = args.limit_requests
+        except:
+            totalPages = -1
     except Exception as e:
         try:
             # If the rate limit was reached end now
@@ -1880,31 +1916,39 @@ def getWaybackUrls():
             else:
                 writerr(colored(getSPACER('[ ERR ] Unable to get links from Wayback Machine (archive.org): ' + str(e)), 'red'))
         return
-
+
     if args.check_only:
-        checkWayback = totalPages
-        write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
+        if totalPages < 0:
+            write(colored('Due to a change in Wayback Machine API, all URLs will be retrieved in one request and it is not possible to determine how long it will take, so please ignore this.','cyan'))
+        else:
+            checkWayback = totalPages
+            write(colored('Get URLs from Wayback Machine: ','cyan')+colored(str(checkWayback)+' requests','white'))
     else:
         if verbose():
             write(colored('The archive URL requested to get links: ','magenta')+colored(url+'\n','white'))
 
-        # if the page number was found then display it, but otherwise we will just try to increment until we have everything
-        write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
+        if totalPages < 0:
+            write(colored('\rGetting links from Wayback Machine (archive.org) with one request (this can take a while for some domains)...\r','cyan'))
 
-        # Get a list of all the page URLs we need to visit
-        pages = []
-        if totalPages == 1:
-            pages.append(url)
+            processWayBackPage(url)
         else:
-            for page in range(0, totalPages):
-                pages.append(url+str(page))
-
-        # Process the URLs from web archive
-        if stopProgram is None:
-            p = mp.Pool(args.processes)
-            p.map(processWayBackPage, pages)
-            p.close()
-            p.join()
+            # if the page number was found then display it, but otherwise we will just try to increment until we have everything
+            write(colored('\rGetting links from ' + str(totalPages) + ' Wayback Machine (archive.org) API requests (this can take a while for some domains)...\r','cyan'))
+
+            # Get a list of all the page URLs we need to visit
+            pages = []
+            if totalPages == 1:
+                pages.append(url)
+            else:
+                for page in range(0, totalPages):
+                    pages.append(url+str(page))
+
+            # Process the URLs from web archive
+            if stopProgram is None:
+                p = mp.Pool(args.processes)
+                p.map(processWayBackPage, pages)
+                p.close()
+                p.join()
 
     # Show the MIME types found (in case user wants to exclude more)
     if verbose() and len(linkMimes) > 0 :
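
The showNumPages handling is now defensive: if the CDX API no longer returns a parsable page count, totalPages becomes -1 and the whole result set is fetched with a single un-paginated request instead of fanning pages out over a multiprocessing pool. A rough sketch of that control flow, assuming the page URL is built by appending the page number and using a stand-in for processWayBackPage:

    import multiprocessing as mp

    def process_page(page_url):
        # placeholder for waymore's processWayBackPage
        print('would fetch', page_url)

    def get_wayback_pages(url, show_num_pages_text, processes=2, limit_requests=0):
        # Mirror of the new fallback: a non-numeric page count means "fetch everything in one request"
        try:
            total_pages = int(show_num_pages_text.strip())
            if limit_requests != 0 and total_pages > limit_requests:
                total_pages = limit_requests
        except ValueError:
            total_pages = -1

        if total_pages < 0 or total_pages == 1:
            process_page(url)                    # single request, no pagination
        else:
            pages = [url + str(page) for page in range(total_pages)]
            with mp.Pool(processes) as pool:     # same pool-based fan-out as before
                pool.map(process_page, pages)

    if __name__ == '__main__':
        get_wayback_pages('https://web.archive.org/cdx/search/cdx?url=example.com&page=', 'not a number')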
@@ -2335,7 +2379,7 @@ def getVirusTotalUrls():
 
     except Exception as e:
         writerr(colored('ERROR getVirusTotalUrls 1: ' + str(e), 'red'))
-
+
 def processResponses():
     """
     Get archived responses from Wayback Machine (archive.org)
@@ -2431,11 +2475,11 @@ def processResponses():
     if args.capture_interval == 'none': # get all
         collapse = ''
     elif args.capture_interval == 'h': # get at most 1 capture per URL per hour
-        collapse = 'timestamp:10,original'
+        collapse = '&collapse=timestamp:10'
     elif args.capture_interval == 'd': # get at most 1 capture per URL per day
-        collapse = 'timestamp:8,original'
+        collapse = '&collapse=timestamp:8'
     elif args.capture_interval == 'm': # get at most 1 capture per URL per month
-        collapse = 'timestamp:6,original'
+        collapse = '&collapse=timestamp:6'
 
     url = WAYBACK_URL.replace('{DOMAIN}',subs + quote(argsInput) + path).replace('{COLLAPSE}',collapse) + filterMIME + filterCode + filterLimit + filterFrom + filterTo + filterKeywords
 
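The collapse values now carry the full query-string fragment (matching the WAYBACK_URL template change above) and drop the ',original' field. The digit count is a prefix length of the 14-digit CDX timestamp (YYYYMMDDhhmmss), which is how the per-hour/day/month deduplication is achieved: 10 digits cover the hour, 8 the day, 6 the month. A small illustrative mapping (names assumed, not from waymore):

    # Assumed mapping from capture-interval options to CDX collapse fragments.
    COLLAPSE_BY_INTERVAL = {
        'none': '',                         # keep every capture
        'h': '&collapse=timestamp:10',      # YYYYMMDDhh -> at most 1 capture per URL per hour
        'd': '&collapse=timestamp:8',       # YYYYMMDD   -> at most 1 capture per URL per day
        'm': '&collapse=timestamp:6',       # YYYYMM     -> at most 1 capture per URL per month
    }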
@@ -2501,13 +2545,14 @@ def processResponses():
         except:
             pass
 
-        # Go through the response to save the links found
+        # Go through the response to save the links found
         for line in resp.iter_lines():
             try:
                 results = line.decode("utf-8")
-                timestamp = results.split(' ')[0]
-                originalUrl = results.split(' ')[1]
-                linksFoundAdd(timestamp+'/'+originalUrl)
+                parts = results.split(' ', 2)
+                timestamp = parts[0]
+                originalUrl = parts[1]
+                linksFoundResponseAdd(timestamp+'/'+originalUrl)
             except Exception as e:
                 writerr(colored(getSPACER('ERROR processResponses 3: Cannot to get link from line: '+str(line)), 'red'))
 
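Each CDX line comes back space-separated in the order requested by fl= (timestamp, original, mimetype, statuscode, digest); splitting with a maxsplit of 2 keeps only the first two fields separate and leaves the remainder in one chunk before the timestamp-prefixed link is handed to the new linksFoundResponseAdd. A quick illustration with a made-up line:

    line = b'20230115120000 https://example.com/app.js text/javascript 200 ABC123DEF456'
    results = line.decode('utf-8')

    parts = results.split(' ', 2)  # ['20230115120000', 'https://example.com/app.js', 'text/javascript 200 ABC123DEF456']
    timestamp = parts[0]
    originalUrl = parts[1]

    print(timestamp + '/' + originalUrl)  # 20230115120000/https://example.com/app.js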
@@ -2528,6 +2573,16 @@ def processResponses():
 
     # Get the total number of responses we will try to get and set the current file count to the success count
     totalResponses = len(linkRequests)
+
+    # If there are no reponses to download, diaplay an error and exit
+    if totalResponses == 0:
+        try:
+            if originalUrl:
+                writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - there were results (e.g. "'+originalUrl+'") but they didn\'t match the input you gave. Check input and try again.'), 'red'))
+        except:
+            writerr(colored(getSPACER('Failed to get links from Wayback Machine (archive.org) - check input and try again.'), 'red'))
+        return
+
     fileCount = successCount
 
     if args.check_only:
waymore-4.2.dist-info/METADATA → waymore-4.4.dist-info/METADATA CHANGED
@@ -1,12 +1,11 @@
 Metadata-Version: 2.1
 Name: waymore
-Version: 4.2
+Version: 4.4
 Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan & VirusTotal!
 Home-page: https://github.com/xnl-h4ck3r/waymore
 Author: @xnl-h4ck3r
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: argparse
 Requires-Dist: requests
 Requires-Dist: pyyaml
 Requires-Dist: termcolor
@@ -16,7 +15,7 @@ Requires-Dist: tldextract
 
 <center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
 
-## About - v4.2
+## About - v4.4
 
 The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
 
waymore-4.4.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+waymore/__init__.py,sha256=bb3D2cWPj3M9gB4ePNX8nrpDuS8IImWiON1Cc_z3vGg,17
+waymore/waymore.py,sha256=cnFkODCRHd4OxxBZVMWUwus5bTZ-ypTGAK_Aa9HPd-g,169799
+waymore-4.4.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
+waymore-4.4.dist-info/METADATA,sha256=gpUxWzvVUkCmZUB_Dd-gl_8w2P9UFh5tpfyob7wMe-o,47221
+waymore-4.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+waymore-4.4.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
+waymore-4.4.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
+waymore-4.4.dist-info/RECORD,,
waymore-4.2.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
-waymore/__init__.py,sha256=KS43n250T2U7gE8jspc0cFVyTllND-0M0RkTA_yyc88,17
-waymore/waymore.py,sha256=8HdTaKE5-SzIIWWWF5kskaBVzl0FRG0DpBChR11JWjs,167332
-waymore-4.2.dist-info/LICENSE,sha256=o_jq62xZ1YxI8tqzQKbNtqr3RW2i5sh0rk6ixCJEroU,1068
-waymore-4.2.dist-info/METADATA,sha256=uSCOfxcJDVsAO9z_in0-n6l8BOfD9jPNHiAsDuhuhz0,47245
-waymore-4.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-waymore-4.2.dist-info/entry_points.txt,sha256=YHy5EUf3r_7OTkt9jvylLjNeg7Z5yvIVm5RUAyfNcN4,49
-waymore-4.2.dist-info/top_level.txt,sha256=RFTphkWaRu1N7lUWIPUjabgCPQ3ETmNllF7qze4JJ_s,8
-waymore-4.2.dist-info/RECORD,,