waymore 6.5__tar.gz → 6.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {waymore-6.5/waymore.egg-info → waymore-6.6}/PKG-INFO +4 -5
- {waymore-6.5 → waymore-6.6}/README.md +3 -4
- {waymore-6.5 → waymore-6.6}/pyproject.toml +3 -0
- {waymore-6.5 → waymore-6.6}/setup.py +9 -22
- {waymore-6.5 → waymore-6.6}/tests/test_import.py +1 -0
- waymore-6.6/waymore/__init__.py +1 -0
- {waymore-6.5 → waymore-6.6}/waymore/waymore.py +346 -551
- {waymore-6.5 → waymore-6.6/waymore.egg-info}/PKG-INFO +4 -5
- waymore-6.5/waymore/__init__.py +0 -1
- {waymore-6.5 → waymore-6.6}/LICENSE +0 -0
- {waymore-6.5 → waymore-6.6}/requirements.txt +0 -0
- {waymore-6.5 → waymore-6.6}/setup.cfg +0 -0
- {waymore-6.5 → waymore-6.6}/waymore.egg-info/SOURCES.txt +0 -0
- {waymore-6.5 → waymore-6.6}/waymore.egg-info/dependency_links.txt +0 -0
- {waymore-6.5 → waymore-6.6}/waymore.egg-info/entry_points.txt +0 -0
- {waymore-6.5 → waymore-6.6}/waymore.egg-info/requires.txt +0 -0
- {waymore-6.5 → waymore-6.6}/waymore.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: waymore
|
|
3
|
-
Version: 6.5
|
|
3
|
+
Version: 6.6
|
|
4
4
|
Summary: Find way more from the Wayback Machine, Common Crawl, Alien Vault OTX, URLScan, VirusTotal & Intelligence X!
|
|
5
5
|
Home-page: https://github.com/xnl-h4ck3r/waymore
|
|
6
6
|
Author: xnl-h4ck3r
|
|
@@ -18,7 +18,7 @@ Requires-Dist: tldextract
|
|
|
18
18
|
|
|
19
19
|
<center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
|
|
20
20
|
|
|
21
|
-
## About - v6.5
|
|
21
|
+
## About - v6.6
|
|
22
22
|
|
|
23
23
|
The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
|
|
24
24
|
|
|
@@ -91,8 +91,8 @@ pipx install git+https://github.com/xnl-h4ck3r/waymore.git
|
|
|
91
91
|
| -mc | | Only Match HTTP status codes for retrieved URLs and responses. Comma separated list of codes. Passing this argument overrides the config `FILTER_CODE` and `-fc`. |
|
|
92
92
|
| -mt | | Only MIME Types for retrieved URLs and responses. Comma separated list of MIME types. Passing this argument overrides the config `FILTER_MIME` and `-ft`. **NOTE: This will NOT be applied to Alien Vault OTX, Virus Total and Intelligence X because they don't have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.**. |
|
|
93
93
|
| -l | --limit | How many responses will be saved (if `-mode R` or `-mode B` is passed). A positive value will get the **first N** results, a negative value will get the **last N** results. A value of 0 will get **ALL** responses (default: 5000) |
|
|
94
|
-
| -from | --from-date | What date to get
|
|
95
|
-
| -to | --to-date | What date to get
|
|
94
|
+
| -from | --from-date | What date to get data from. If not specified it will get from the earliest possible results. A partial value can be passed, e.g. `2016`, `201805`, etc. **IMPORTANT: There are some exceptions with sources unable to get URLs within date limits: Virus Total - all known sub domains will still be returned; Intelligence X - all URLs will still be returned.** |
|
|
95
|
+
| -to | --to-date | What date to get data to. If not specified it will get to the latest possible results. A partial value can be passed, e.g. `2021`, `202112`, etc. **IMPORTANT: There are some exceptions with sources unable to get URLs within date limits: Virus Total - all known sub domains will still be returned; Intelligence X - all URLs will still be returned.** |
|
|
96
96
|
| -ci | --capture-interval | Filters the search on archive.org to only get at most 1 capture per hour (`h`), day (`d`) or month (`m`). This filter is used for responses only. The default is `d` but can also be set to `none` to not filter anything and get all responses. |
|
|
97
97
|
| -ra | --regex-after | RegEx for filtering purposes against links found from all sources of URLs AND responses downloaded. Only positive matches will be output. |
|
|
98
98
|
| -url-filename | | Set the file name of downloaded responses to the URL that generated the response, otherwise it will be set to the hash value of the response. Using the hash value means multiple URLs that generated the same response will only result in one file being saved for that response. |
|
|
@@ -103,7 +103,6 @@ pipx install git+https://github.com/xnl-h4ck3r/waymore.git
|
|
|
103
103
|
| -xvt | | Exclude checks for links from virustotal.com |
|
|
104
104
|
| -xix | | Exclude checks for links from Intelligence X.com |
|
|
105
105
|
| -lcc | | Limit the number of Common Crawl index collections searched, e.g. `-lcc 10` will just search the latest `10` collections (default: 1). As of November 2024 there are currently 106 collections. Setting to `0` will search **ALL** collections. If you don't want to search Common Crawl at all, use the `-xcc` option. |
|
|
106
|
-
| -lcy | | Limit the number of Common Crawl index collections searched by the year of the index data. The earliest index has data from 2008. Setting to 0 (default) will search collections of any year (but in conjunction with `-lcc`). For example, if you are only interested in data from 2015 and after, pass `-lcy 2015`. This will override the value of `-lcc` if passed. If you don't want to search Common Crawl at all, use the `-xcc` option. |
|
|
107
106
|
| -t | --timeout | This is for archived responses only! How many seconds to wait for the server to send data before giving up (default: 30) |
|
|
108
107
|
| -p | --processes | Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 1) |
|
|
109
108
|
| -r | --retries | The number of retries for requests that get connection error or rate limited (default: 1). |
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<center><img src="https://github.com/xnl-h4ck3r/waymore/blob/main/waymore/images/title.png"></center>
|
|
2
2
|
|
|
3
|
-
## About - v6.5
|
|
3
|
+
## About - v6.6
|
|
4
4
|
|
|
5
5
|
The idea behind **waymore** is to find even more links from the Wayback Machine than other existing tools.
|
|
6
6
|
|
|
@@ -73,8 +73,8 @@ pipx install git+https://github.com/xnl-h4ck3r/waymore.git
|
|
|
73
73
|
| -mc | | Only Match HTTP status codes for retrieved URLs and responses. Comma separated list of codes. Passing this argument overrides the config `FILTER_CODE` and `-fc`. |
|
|
74
74
|
| -mt | | Only MIME Types for retrieved URLs and responses. Comma separated list of MIME types. Passing this argument overrides the config `FILTER_MIME` and `-ft`. **NOTE: This will NOT be applied to Alien Vault OTX, Virus Total and Intelligence X because they don't have the ability to filter on MIME Type. Sometimes URLScan does not have a MIME Type defined - these will always be included. Consider excluding sources if this matters to you.**. |
|
|
75
75
|
| -l | --limit | How many responses will be saved (if `-mode R` or `-mode B` is passed). A positive value will get the **first N** results, a negative value will get the **last N** results. A value of 0 will get **ALL** responses (default: 5000) |
|
|
76
|
-
| -from | --from-date | What date to get
|
|
77
|
-
| -to | --to-date | What date to get
|
|
76
|
+
| -from | --from-date | What date to get data from. If not specified it will get from the earliest possible results. A partial value can be passed, e.g. `2016`, `201805`, etc. **IMPORTANT: There are some exceptions with sources unable to get URLs within date limits: Virus Total - all known sub domains will still be returned; Intelligence X - all URLs will still be returned.** |
|
|
77
|
+
| -to | --to-date | What date to get data to. If not specified it will get to the latest possible results. A partial value can be passed, e.g. `2021`, `202112`, etc. **IMPORTANT: There are some exceptions with sources unable to get URLs within date limits: Virus Total - all known sub domains will still be returned; Intelligence X - all URLs will still be returned.** |
|
|
78
78
|
| -ci | --capture-interval | Filters the search on archive.org to only get at most 1 capture per hour (`h`), day (`d`) or month (`m`). This filter is used for responses only. The default is `d` but can also be set to `none` to not filter anything and get all responses. |
|
|
79
79
|
| -ra | --regex-after | RegEx for filtering purposes against links found from all sources of URLs AND responses downloaded. Only positive matches will be output. |
|
|
80
80
|
| -url-filename | | Set the file name of downloaded responses to the URL that generated the response, otherwise it will be set to the hash value of the response. Using the hash value means multiple URLs that generated the same response will only result in one file being saved for that response. |
|
|
@@ -85,7 +85,6 @@ pipx install git+https://github.com/xnl-h4ck3r/waymore.git
|
|
|
85
85
|
| -xvt | | Exclude checks for links from virustotal.com |
|
|
86
86
|
| -xix | | Exclude checks for links from Intelligence X.com |
|
|
87
87
|
| -lcc | | Limit the number of Common Crawl index collections searched, e.g. `-lcc 10` will just search the latest `10` collections (default: 1). As of November 2024 there are currently 106 collections. Setting to `0` will search **ALL** collections. If you don't want to search Common Crawl at all, use the `-xcc` option. |
|
|
88
|
-
| -lcy | | Limit the number of Common Crawl index collections searched by the year of the index data. The earliest index has data from 2008. Setting to 0 (default) will search collections of any year (but in conjunction with `-lcc`). For example, if you are only interested in data from 2015 and after, pass `-lcy 2015`. This will override the value of `-lcc` if passed. If you don't want to search Common Crawl at all, use the `-xcc` option. |
|
|
89
88
|
| -t | --timeout | This is for archived responses only! How many seconds to wait for the server to send data before giving up (default: 30) |
|
|
90
89
|
| -p | --processes | Basic multithreading is done when getting requests for a file of URLs. This argument determines the number of processes (threads) used (default: 1) |
|
|
91
90
|
| -r | --retries | The number of retries for requests that get connection error or rate limited (default: 1). |
|
|
@@ -21,7 +21,10 @@ version = { attr = "waymore.__version__" }
|
|
|
21
21
|
[tool.ruff]
|
|
22
22
|
line-length = 100
|
|
23
23
|
target-version = "py39"
|
|
24
|
+
|
|
25
|
+
[tool.ruff.lint]
|
|
24
26
|
select = ["E", "F", "I", "UP"]
|
|
27
|
+
ignore = ["E501"] # Ignore line length violations for existing code
|
|
25
28
|
|
|
26
29
|
[tool.black]
|
|
27
30
|
line-length = 100
|
|
@@ -2,15 +2,15 @@
|
|
|
2
2
|
import os
|
|
3
3
|
import re
|
|
4
4
|
import shutil
|
|
5
|
-
|
|
5
|
+
|
|
6
|
+
from setuptools import find_packages, setup
|
|
6
7
|
|
|
7
8
|
# Read version from __init__.py without importing
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
def get_version():
|
|
11
|
-
init_path = os.path.join(os.path.dirname(
|
|
12
|
-
|
|
13
|
-
with open(init_path, "r", encoding="utf-8") as f:
|
|
12
|
+
init_path = os.path.join(os.path.dirname(__file__), "waymore", "__init__.py")
|
|
13
|
+
with open(init_path, encoding="utf-8") as f:
|
|
14
14
|
content = f.read()
|
|
15
15
|
match = re.search(r'__version__\s*=\s*["\']([^"\']+)["\']', content)
|
|
16
16
|
if match:
|
|
@@ -25,10 +25,7 @@ target_directory = (
|
|
|
25
25
|
os.path.join(os.path.expanduser("~"), ".config", "waymore")
|
|
26
26
|
if os.name == "posix"
|
|
27
27
|
else (
|
|
28
|
-
os.path.join(
|
|
29
|
-
os.path.expanduser(
|
|
30
|
-
"~"), "Library", "Application Support", "waymore"
|
|
31
|
-
)
|
|
28
|
+
os.path.join(os.path.expanduser("~"), "Library", "Application Support", "waymore")
|
|
32
29
|
if os.name == "darwin"
|
|
33
30
|
else None
|
|
34
31
|
)
|
|
@@ -42,16 +39,10 @@ if target_directory and os.path.isfile("config.yml"):
|
|
|
42
39
|
# If file already exists, create a new one
|
|
43
40
|
if os.path.isfile(target_directory + "/config.yml"):
|
|
44
41
|
configNew = True
|
|
45
|
-
os.rename(
|
|
46
|
-
target_directory + "/config.yml", target_directory + "/config.yml.OLD"
|
|
47
|
-
)
|
|
42
|
+
os.rename(target_directory + "/config.yml", target_directory + "/config.yml.OLD")
|
|
48
43
|
shutil.copy("config.yml", target_directory)
|
|
49
|
-
os.rename(
|
|
50
|
-
|
|
51
|
-
)
|
|
52
|
-
os.rename(
|
|
53
|
-
target_directory + "/config.yml.OLD", target_directory + "/config.yml"
|
|
54
|
-
)
|
|
44
|
+
os.rename(target_directory + "/config.yml", target_directory + "/config.yml.NEW")
|
|
45
|
+
os.rename(target_directory + "/config.yml.OLD", target_directory + "/config.yml")
|
|
55
46
|
else:
|
|
56
47
|
shutil.copy("config.yml", target_directory)
|
|
57
48
|
|
|
@@ -86,8 +77,4 @@ if configNew:
|
|
|
86
77
|
+ "/config.yml already exists.\nCreating config.yml.NEW but leaving existing config.\nIf you need the new file, then remove the current one and rename config.yml.NEW to config.yml\n\033[0m"
|
|
87
78
|
)
|
|
88
79
|
else:
|
|
89
|
-
print(
|
|
90
|
-
"\n\033[92mThe file "
|
|
91
|
-
+ target_directory
|
|
92
|
-
+ "/config.yml has been created.\n\033[0m"
|
|
93
|
-
)
|
|
80
|
+
print("\n\033[92mThe file " + target_directory + "/config.yml has been created.\n\033[0m")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "6.6"
|