waybackprov 0.0.9__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {waybackprov-0.0.9/waybackprov.egg-info → waybackprov-0.1.0}/PKG-INFO +43 -22
- waybackprov-0.0.9/PKG-INFO → waybackprov-0.1.0/README.md +38 -27
- waybackprov-0.1.0/pyproject.toml +30 -0
- waybackprov-0.1.0/src/waybackprov/__init__.py +269 -0
- waybackprov-0.0.9/README.md +0 -87
- waybackprov-0.0.9/setup.cfg +0 -4
- waybackprov-0.0.9/setup.py +0 -19
- waybackprov-0.0.9/waybackprov.egg-info/SOURCES.txt +0 -8
- waybackprov-0.0.9/waybackprov.egg-info/dependency_links.txt +0 -1
- waybackprov-0.0.9/waybackprov.egg-info/entry_points.txt +0 -3
- waybackprov-0.0.9/waybackprov.egg-info/top_level.txt +0 -1
- waybackprov-0.0.9/waybackprov.py +0 -249
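Taken together, the list above is mostly a packaging move: the `setup.py`/`setup.cfg`/egg-info files from 0.0.9 are replaced in 0.1.0 by a `pyproject.toml` and an `src/waybackprov/__init__.py` module, and the README gains `uv` instructions. For anyone who wants to verify a diff like this independently, here is a minimal sketch (not part of the package; the PyPI JSON API usage and the member-matching logic are assumptions) that fetches both sdists and prints a unified diff of one member file:

```python
# Hypothetical helper, not part of waybackprov: download the 0.0.9 and 0.1.0
# sdists from PyPI and print a unified diff of a single archive member.
import io
import json
import tarfile
import difflib
from urllib.request import urlopen


def sdist(version):
    # the PyPI JSON API lists the uploaded files for every release
    meta = json.load(urlopen("https://pypi.org/pypi/waybackprov/json"))
    url = next(f["url"] for f in meta["releases"][version] if f["packagetype"] == "sdist")
    return io.BytesIO(urlopen(url).read())


def member_lines(archive, suffix):
    # pull one file (e.g. PKG-INFO) out of the .tar.gz archive
    with tarfile.open(fileobj=archive, mode="r:gz") as tar:
        member = next(m for m in tar.getmembers() if m.name.endswith(suffix))
        return tar.extractfile(member).read().decode("utf-8").splitlines()


old = member_lines(sdist("0.0.9"), "/PKG-INFO")
new = member_lines(sdist("0.1.0"), "/PKG-INFO")
print("\n".join(difflib.unified_diff(old, new, "0.0.9/PKG-INFO", "0.1.0/PKG-INFO", lineterm="")))
```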
{waybackprov-0.0.9/waybackprov.egg-info → waybackprov-0.1.0}/PKG-INFO

@@ -1,35 +1,49 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.3
 Name: waybackprov
-Version: 0.0.9
+Version: 0.1.0
 Summary: Checks the provenance of a URL in the Wayback machine
-Home-page: https://github.com/edsu/waybackprov
 Author: Ed Summers
-Author-email: ehs@pobox.com
-License: UNKNOWN
-Platform: UNKNOWN
+Author-email: Ed Summers <ehs@pobox.com>
 Requires-Python: >=3.0
+Project-URL: repository, https://github.com/docnow/waybackprov
 Description-Content-Type: text/markdown

+# waybackprov
+
+[](https://github.com/DocNow/waybackprov/actions/workflows/test.yml)
+
 Give *waybackprov* a URL and it will summarize which Internet Archive
 collections have archived the URL. This kind of information can sometimes
 provide insight about why a particular web resource or set of web resources were
 archived from the web.

-## Install
+## Run
+
+If you have [uv] installed you can run `waybackprov` easily without installing anything:
+
+```
+uvx waybackprov
+```

-    pip install waybackprov
+Otherwise you'll probably want to install it with `pip`:
+
+```
+pip install waybackprov
+```

 ## Basic Usage

 To check a particular URL here's how it works:

-    % waybackprov https://twitter.com/EPAScottPruitt
-    364 https://archive.org/details/focused_crawls
-    306 https://archive.org/details/edgi_monitor
-    151 https://archive.org/details/www3.epa.gov
-    60 https://archive.org/details/epa.gov4
-    47 https://archive.org/details/epa.gov5
-    ...
+```shell
+waybackprov https://twitter.com/EPAScottPruitt
+364 https://archive.org/details/focused_crawls
+306 https://archive.org/details/edgi_monitor
+151 https://archive.org/details/www3.epa.gov
+60 https://archive.org/details/epa.gov4
+47 https://archive.org/details/epa.gov5
+...
+```

 The first column contains the number of crawls for a particular URL, and the
 second column contains the URL for the Internet Archive collection that added
@@ -40,14 +54,18 @@ it.
 By default waybackprov will only look at the current year. If you would like it
 to examine a range of years use the `--start` and `--end` options:

-    % waybackprov --start 2016 --end 2018 https://twitter.com/EPAScottPruitt
+```shell
+waybackprov --start 2016 --end 2018 https://twitter.com/EPAScottPruitt
+```

 ## Multiple Pages

 If you would like to look at all URLs at a particular URL prefix you can use the
 `--prefix` option:

-    % waybackprov --prefix https://twitter.com/EPAScottPruitt
+```shell
+waybackprov --prefix https://twitter.com/EPAScottPruitt
+```

 This will use the Internet Archive's [CDX API](https://github.com/webrecorder/pywb/wiki/CDX-Server-API) to also include URLs that are extensions of the URL you supply, so it would include for example:

@@ -63,7 +81,9 @@ interested in is highly recommended since it prevents lots of lookups for CSS,
 JavaScript and image files that are components of the resource that was
 initially crawled.

-    % waybackprov --prefix --match 'status/\d+$' https://twitter.com/EPAScottPruitt
+```
+waybackprov --prefix --match 'status/\d+$' https://twitter.com/EPAScottPruitt
+```

 ## Collections

@@ -88,14 +108,15 @@ rather than a summary.
 If you would like to see detailed information about what *waybackprov* is doing
 use the `--log` option to supply the a file path to log to:

-    % waybackprov --log waybackprov.log https://example.com/
+```shell
+waybackprov --log waybackprov.log https://example.com/
+```

 ## Test

 If you would like to test it first install [pytest] and then:

-    pytest test.py
+    uv run pytest test.py

 [pytest]: https://docs.pytest.org/en/latest/
-
-
+[uv]: https://docs.astral.sh/uv/
waybackprov-0.0.9/PKG-INFO → waybackprov-0.1.0/README.md

@@ -1,35 +1,39 @@
-Metadata-Version: 2.1
-Name: waybackprov
-Version: 0.0.9
-Summary: Checks the provenance of a URL in the Wayback machine
-Home-page: https://github.com/edsu/waybackprov
-Author: Ed Summers
-Author-email: ehs@pobox.com
-License: UNKNOWN
-Platform: UNKNOWN
-Requires-Python: >=3.0
-Description-Content-Type: text/markdown
+# waybackprov
+
+[](https://github.com/DocNow/waybackprov/actions/workflows/test.yml)

 Give *waybackprov* a URL and it will summarize which Internet Archive
 collections have archived the URL. This kind of information can sometimes
 provide insight about why a particular web resource or set of web resources were
 archived from the web.

-## Install
+## Run
+
+If you have [uv] installed you can run `waybackprov` easily without installing anything:
+
+```
+uvx waybackprov
+```

-    pip install waybackprov
+Otherwise you'll probably want to install it with `pip`:
+
+```
+pip install waybackprov
+```

 ## Basic Usage

 To check a particular URL here's how it works:

-    % waybackprov https://twitter.com/EPAScottPruitt
-    364 https://archive.org/details/focused_crawls
-    306 https://archive.org/details/edgi_monitor
-    151 https://archive.org/details/www3.epa.gov
-    60 https://archive.org/details/epa.gov4
-    47 https://archive.org/details/epa.gov5
-    ...
+```shell
+waybackprov https://twitter.com/EPAScottPruitt
+364 https://archive.org/details/focused_crawls
+306 https://archive.org/details/edgi_monitor
+151 https://archive.org/details/www3.epa.gov
+60 https://archive.org/details/epa.gov4
+47 https://archive.org/details/epa.gov5
+...
+```

 The first column contains the number of crawls for a particular URL, and the
 second column contains the URL for the Internet Archive collection that added
@@ -40,14 +44,18 @@ it.
 By default waybackprov will only look at the current year. If you would like it
 to examine a range of years use the `--start` and `--end` options:

-    % waybackprov --start 2016 --end 2018 https://twitter.com/EPAScottPruitt
+```shell
+waybackprov --start 2016 --end 2018 https://twitter.com/EPAScottPruitt
+```

 ## Multiple Pages

 If you would like to look at all URLs at a particular URL prefix you can use the
 `--prefix` option:

-    % waybackprov --prefix https://twitter.com/EPAScottPruitt
+```shell
+waybackprov --prefix https://twitter.com/EPAScottPruitt
+```

 This will use the Internet Archive's [CDX API](https://github.com/webrecorder/pywb/wiki/CDX-Server-API) to also include URLs that are extensions of the URL you supply, so it would include for example:

@@ -63,7 +71,9 @@ interested in is highly recommended since it prevents lots of lookups for CSS,
 JavaScript and image files that are components of the resource that was
 initially crawled.

-    % waybackprov --prefix --match 'status/\d+$' https://twitter.com/EPAScottPruitt
+```
+waybackprov --prefix --match 'status/\d+$' https://twitter.com/EPAScottPruitt
+```

 ## Collections

@@ -88,14 +98,15 @@ rather than a summary.
 If you would like to see detailed information about what *waybackprov* is doing
 use the `--log` option to supply the a file path to log to:

-    % waybackprov --log waybackprov.log https://example.com/
+```shell
+waybackprov --log waybackprov.log https://example.com/
+```

 ## Test

 If you would like to test it first install [pytest] and then:

-    pytest test.py
+    uv run pytest test.py

 [pytest]: https://docs.pytest.org/en/latest/
-
-
+[uv]: https://docs.astral.sh/uv/
waybackprov-0.1.0/pyproject.toml

@@ -0,0 +1,30 @@
+[project]
+name = "waybackprov"
+version = "0.1.0"
+description = "Checks the provenance of a URL in the Wayback machine"
+readme = "README.md"
+authors = [
+    { name = "Ed Summers", email = "ehs@pobox.com" }
+]
+requires-python = ">=3.0"
+dependencies = []
+
+[project.urls]
+repository = "https://github.com/docnow/waybackprov"
+
+[project.scripts]
+waybackprov = "waybackprov:main"
+
+[build-system]
+requires = ["uv_build>=0.9.8,<0.10.0"]
+build-backend = "uv_build"
+
+[dependency-groups]
+dev = [
+    "pytest>=4.6.11",
+]
+
+[tool.pytest.ini_options]
+addopts = "-v -s"
+log_file = "test.log"
+log_file_level = "DEBUG"
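The `[project.scripts]` table above is what replaces the old `entry_points` in `setup.py` (shown further down): installing the wheel generates a `waybackprov` command that imports `main` from the `waybackprov` package and calls it. A rough sketch of that equivalence (the argument list here is just an example, not from the package):

```python
# What the installed waybackprov console script effectively does:
# load main() from the waybackprov package and run it against sys.argv.
import sys

from waybackprov import main

# example arguments only; any URL accepted by the CLI would work here
sys.argv = ["waybackprov", "--start", "2016", "--end", "2018",
            "https://twitter.com/EPAScottPruitt"]
main()  # optparse reads sys.argv and the collection summary is printed
```

Builds go through the `uv_build` backend declared in `[build-system]`, so `uv build` produces the sdist and wheel that carry this entry point.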
waybackprov-0.1.0/src/waybackprov/__init__.py

@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+
+import re
+import csv
+import sys
+import json
+import time
+import codecs
+import logging
+import operator
+import datetime
+import optparse
+import collections
+
+from functools import reduce
+from urllib.parse import quote
+from urllib.request import urlopen
+
+colls = {}
+
+
+def main():
+    now = datetime.datetime.now()
+
+    parser = optparse.OptionParser("waybackprov [options] <url>")
+    parser.add_option("--start", default=now.year - 1, help="start year")
+    parser.add_option("--end", default=now.year, help="end year")
+    parser.add_option(
+        "--format", choices=["text", "csv", "json"], default="text", help="output data"
+    )
+    parser.add_option(
+        "--collapse", action="store_true", help="only display most specific collection"
+    )
+    parser.add_option("--prefix", action="store_true", help="use url as a prefix")
+    parser.add_option("--match", help="limit to urls that match pattern")
+    parser.add_option("--log", help="where to log activity to")
+    opts, args = parser.parse_args()
+
+    if opts.log:
+        logging.basicConfig(
+            filename=opts.log,
+            format="%(asctime)s - %(levelname)s - %(message)s",
+            level=logging.INFO,
+        )
+    else:
+        logging.basicConfig(
+            format="%(asctime)s - %(levelname)s - %(message)s", level=logging.WARNING
+        )
+    if len(args) != 1:
+        parser.error("You must supply a URL to lookup")
+
+    url = args[0]
+
+    crawl_data = get_crawls(
+        url,
+        start_year=opts.start,
+        end_year=opts.end,
+        collapse=opts.collapse,
+        prefix=opts.prefix,
+        match=opts.match,
+    )
+
+    if opts.format == "text":
+        crawls = 0
+        coll_urls = {}
+        coll_counter = collections.Counter()
+        for crawl in crawl_data:
+            crawls += 1
+            coll_counter.update(crawl["collections"])
+            for coll in crawl["collections"]:
+                # keep track of urls in each collection
+                if coll not in coll_urls:
+                    coll_urls[coll] = set()
+                coll_urls[coll].add(crawl["url"])
+
+        if len(coll_counter) == 0:
+            print(
+                "No results for %s-%s, consider using --start and --end to broaden."
+                % (opts.start, opts.end)
+            )
+            return
+
+        max_pos = str(len(str(coll_counter.most_common(1)[0][1])))
+        if opts.prefix:
+            str_format = (
+                "%" + max_pos + "i %" + max_pos + "i https://archive.org/details/%s"
+            )
+        else:
+            str_format = "%" + max_pos + "i https://archive.org/details/%s"
+
+        for coll_id, count in coll_counter.most_common():
+            if opts.prefix:
+                print(str_format % (count, len(coll_urls[coll_id]), coll_id))
+            else:
+                print(str_format % (count, coll_id))
+
+        print("")
+        print("total crawls %s-%s: %s" % (opts.start, opts.end, crawls))
+        if opts.prefix:
+            total_urls = len(reduce(operator.or_, coll_urls.values()))
+            print("total urls: %s" % total_urls)
+
+    elif opts.format == "json":
+        data = list(crawl_data)
+        print(json.dumps(data, indent=2))
+
+    elif opts.format == "csv":
+        w = csv.DictWriter(
+            sys.stdout,
+            fieldnames=["timestamp", "status", "collections", "url", "wayback_url"],
+        )
+        for crawl in crawl_data:
+            crawl["collections"] = ",".join(crawl["collections"])
+            w.writerow(crawl)
+
+
+def get_crawls(
+    url, start_year=None, end_year=None, collapse=False, prefix=False, match=None
+):
+    if prefix is True:
+        for year, sub_url in cdx(
+            url, match=match, start_year=start_year, end_year=end_year
+        ):
+            yield from get_crawls(sub_url, start_year=year, end_year=year)
+
+    if start_year is None:
+        start_year = datetime.datetime.now().year - 1
+    else:
+        start_year = int(start_year)
+    if end_year is None:
+        end_year = datetime.datetime.now().year
+    else:
+        end_year = int(end_year)
+
+    api = "https://web.archive.org/__wb/calendarcaptures?url=%s&selected_year=%s"
+    for year in range(start_year, end_year + 1):
+        # This calendar data structure reflects the layout of a calendar
+        # month. So some spots in the first and last row are null. Not
+        # every day has any data if the URL wasn't crawled then.
+        logging.info("getting calendar year %s for %s", year, url)
+        cal = get_json(api % (url, year))
+        for month in cal:
+            for week in month:
+                for day in week:
+                    if day is None or day == {}:
+                        continue
+                    # note: we can't seem to rely on 'cnt' as a count
+                    for i in range(0, len(day["st"])):
+                        c = {
+                            "status": day["st"][i],
+                            "timestamp": day["ts"][i],
+                            "collections": day["why"][i],
+                            "url": url,
+                        }
+                        c["wayback_url"] = "https://web.archive.org/web/%s/%s" % (
+                            c["timestamp"],
+                            url,
+                        )
+                        if c["collections"] is None:
+                            continue
+                        if collapse and len(c["collections"]) > 0:
+                            c["collections"] = [deepest_collection(c["collections"])]
+                        logging.info("found crawl %s", c)
+                        yield c
+
+
+def deepest_collection(coll_ids):
+    return max(coll_ids, key=get_depth)
+
+
+def get_collection(coll_id):
+    # no need to fetch twice
+    if coll_id in colls:
+        return colls[coll_id]
+
+    logging.info("fetching collection %s", coll_id)
+
+    # get the collection metadata
+    url = "https://archive.org/metadata/%s" % coll_id
+    data = get_json(url)["metadata"]
+
+    # make collection into reliable array
+    if "collection" in data:
+        if type(data["collection"]) is str:
+            data["collection"] = [data["collection"]]
+    else:
+        data["collection"] = []
+
+    # so we don't have to look it up again
+    colls[coll_id] = data
+
+    return data
+
+
+def get_depth(coll_id, seen_colls=None):
+    coll = get_collection(coll_id)
+    if "depth" in coll:
+        return coll["depth"]
+
+    logging.info("calculating depth of %s", coll_id)
+
+    if len(coll["collection"]) == 0:
+        return 0
+
+    # prevent recursive loops
+    if seen_colls is None:
+        seen_colls = set()
+    if coll_id in seen_colls:
+        return 0
+    seen_colls.add(coll_id)
+
+    depth = max(map(lambda id: get_depth(id, seen_colls) + 1, coll["collection"]))
+
+    coll["depth"] = depth
+    logging.info("depth %s = %s", coll_id, depth)
+    return depth
+
+
+def get_json(url):
+    count = 0
+    while True:
+        count += 1
+        if count >= 10:
+            logging.error("giving up on fetching JSON from %s", url)
+        try:
+            resp = urlopen(url)
+            reader = codecs.getreader("utf-8")
+            return json.load(reader(resp))
+        except Exception as e:
+            logging.error("caught exception: %s", e)
+            logging.info("sleeping for %s seconds", count * 10)
+            time.sleep(count * 10)
+    raise (Exception("unable to get JSON for %s", url))
+
+
+def cdx(url, match=None, start_year=None, end_year=None):
+    logging.info("searching cdx for %s with regex %s", url, match)
+
+    if match:
+        try:
+            pattern = re.compile(match)
+        except Exception as e:
+            sys.exit("invalid regular expression: {}".format(e))
+    else:
+        pattern = None
+
+    cdx_url = "http://web.archive.org/cdx/search/cdx?url={}&matchType=prefix&from={}&to={}".format(
+        quote(url), start_year, end_year
+    )
+    seen = set()
+    results = codecs.decode(urlopen(cdx_url).read(), encoding="utf8")
+
+    for line in results.split("\n"):
+        parts = line.split(" ")
+        if len(parts) == 7:
+            year = int(parts[1][0:4])
+            url = parts[2]
+            seen_key = "{}:{}".format(year, url)
+            if seen_key in seen:
+                continue
+            if pattern and not pattern.search(url):
+                continue
+            seen.add(seen_key)
+            logging.info("cdx found %s", url)
+            yield (year, url)
+
+
+if __name__ == "__main__":
+    main()
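Besides the `main()` entry point, the module keeps the same helper functions the old `waybackprov.py` had, so the crawl data can also be consumed programmatically. A small usage sketch (not shipped with the package) that rebuilds the CLI's text summary from `get_crawls()`:

```python
# get_crawls() yields one dict per crawl with "timestamp", "status",
# "collections", "url" and "wayback_url" keys; counting the collection
# ids reproduces the summary that the text output mode prints.
import collections

from waybackprov import get_crawls

counter = collections.Counter()
for crawl in get_crawls("https://twitter.com/EPAScottPruitt",
                        start_year=2016, end_year=2018):
    counter.update(crawl["collections"])

for coll_id, count in counter.most_common(5):
    print(count, "https://archive.org/details/%s" % coll_id)
```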
waybackprov-0.0.9/README.md
DELETED
@@ -1,87 +0,0 @@
-Give *waybackprov* a URL and it will summarize which Internet Archive
-collections have archived the URL. This kind of information can sometimes
-provide insight about why a particular web resource or set of web resources were
-archived from the web.
-
-## Install
-
-    pip install waybackprov
-
-## Basic Usage
-
-To check a particular URL here's how it works:
-
-    % waybackprov https://twitter.com/EPAScottPruitt
-    364 https://archive.org/details/focused_crawls
-    306 https://archive.org/details/edgi_monitor
-    151 https://archive.org/details/www3.epa.gov
-    60 https://archive.org/details/epa.gov4
-    47 https://archive.org/details/epa.gov5
-    ...
-
-The first column contains the number of crawls for a particular URL, and the
-second column contains the URL for the Internet Archive collection that added
-it.
-
-## Time
-
-By default waybackprov will only look at the current year. If you would like it
-to examine a range of years use the `--start` and `--end` options:
-
-    % waybackprov --start 2016 --end 2018 https://twitter.com/EPAScottPruitt
-
-## Multiple Pages
-
-If you would like to look at all URLs at a particular URL prefix you can use the
-`--prefix` option:
-
-    % waybackprov --prefix https://twitter.com/EPAScottPruitt
-
-This will use the Internet Archive's [CDX API](https://github.com/webrecorder/pywb/wiki/CDX-Server-API) to also include URLs that are extensions of the URL you supply, so it would include for example:
-
-    https://twitter.com/EPAScottPruitt/status/1309839080398339
-
-But it can also include things you may not want, such as:
-
-    https://twitter.com/EPAScottPruitt/status/1309839080398339/media/1
-
-To further limit the URLs use the `--match` parameter to specify a regular
-expression only check particular URLs. Further specifying the URLs you are
-interested in is highly recommended since it prevents lots of lookups for CSS,
-JavaScript and image files that are components of the resource that was
-initially crawled.
-
-    % waybackprov --prefix --match 'status/\d+$' https://twitter.com/EPAScottPruitt
-
-## Collections
-
-One thing to remember when interpreting this data is that collections can
-contain other collections. For example the *edgi_monitor* collection is a
-sub-collection of *focused_crawls*.
-
-If you use the `--collapse` option only the most specific collection will be
-reported for a given crawl. So if *coll1* is part of *coll2* which is part of
-*coll3*, only *coll1* will be reported instead of *coll1*, *coll2* and *coll3*.
-This does involve collection metadata lookups at the Internet Archive API, so it
-does slow performance significantly.
-
-## JSON and CSV
-
-If you would rather see the raw data as JSON or CSV use the `--format` option.
-When you use either of these formats you will see the metadata for each crawl,
-rather than a summary.
-
-## Log
-
-If you would like to see detailed information about what *waybackprov* is doing
-use the `--log` option to supply the a file path to log to:
-
-    % waybackprov --log waybackprov.log https://example.com/
-
-## Test
-
-If you would like to test it first install [pytest] and then:
-
-    pytest test.py
-
-[pytest]: https://docs.pytest.org/en/latest/
waybackprov-0.0.9/setup.cfg
DELETED
waybackprov-0.0.9/setup.py
DELETED
@@ -1,19 +0,0 @@
-from setuptools import setup
-
-with open("README.md") as f:
-    long_description = f.read()
-
-if __name__ == "__main__":
-    setup(
-        name='waybackprov',
-        version='0.0.9',
-        url='https://github.com/edsu/waybackprov',
-        author='Ed Summers',
-        author_email='ehs@pobox.com',
-        py_modules=['waybackprov', ],
-        description='Checks the provenance of a URL in the Wayback machine',
-        long_description=long_description,
-        long_description_content_type="text/markdown",
-        python_requires='>=3.0',
-        entry_points={'console_scripts': ['waybackprov = waybackprov:main']}
-    )
waybackprov-0.0.9/waybackprov.egg-info/dependency_links.txt
DELETED

@@ -1 +0,0 @@
-
waybackprov-0.0.9/waybackprov.egg-info/top_level.txt
DELETED

@@ -1 +0,0 @@
-waybackprov
waybackprov-0.0.9/waybackprov.py
DELETED
@@ -1,249 +0,0 @@
-#!/usr/bin/env python3
-
-import re
-import csv
-import sys
-import json
-import time
-import codecs
-import logging
-import operator
-import datetime
-import optparse
-import collections
-
-from functools import reduce
-from urllib.parse import quote
-from urllib.request import urlopen
-
-colls = {}
-
-def main():
-    now = datetime.datetime.now()
-
-    parser = optparse.OptionParser('waybackprov.py [options] <url>')
-    parser.add_option('--start', default=now.year -1, help='start year')
-    parser.add_option('--end', default=now.year, help='end year')
-    parser.add_option('--format', choices=['text', 'csv', 'json'],
-        default='text', help='output data')
-    parser.add_option('--collapse', action='store_true',
-        help='only display most specific collection')
-    parser.add_option('--prefix', action='store_true',
-        help='use url as a prefix')
-    parser.add_option('--match', help='limit to urls that match pattern')
-    parser.add_option('--log', help='where to log activity to')
-    opts, args = parser.parse_args()
-
-    if opts.log:
-        logging.basicConfig(
-            filename=opts.log,
-            format='%(asctime)s - %(levelname)s - %(message)s',
-            level=logging.INFO
-        )
-    else:
-        logging.basicConfig(
-            format='%(asctime)s - %(levelname)s - %(message)s',
-            level=logging.WARNING
-        )
-    if len(args) != 1:
-        parser.error('You must supply a URL to lookup')
-
-    url = args[0]
-
-    crawl_data = get_crawls(url,
-        start_year=opts.start,
-        end_year=opts.end,
-        collapse=opts.collapse,
-        prefix=opts.prefix,
-        match=opts.match
-    )
-
-    if opts.format == 'text':
-        crawls = 0
-        coll_urls = {}
-        coll_counter = collections.Counter()
-        for crawl in crawl_data:
-            crawls += 1
-            coll_counter.update(crawl['collections'])
-            for coll in crawl['collections']:
-                # keep track of urls in each collection
-                if coll not in coll_urls:
-                    coll_urls[coll] = set()
-                coll_urls[coll].add(crawl['url'])
-
-        if len(coll_counter) == 0:
-            print('No results for %s-%s, consider using --start and --end to broaden.' % (opts.start, opts.end))
-            return
-
-        max_pos = str(len(str(coll_counter.most_common(1)[0][1])))
-        if opts.prefix:
-            str_format = '%' + max_pos + 'i %' + max_pos + 'i https://archive.org/details/%s'
-        else:
-            str_format = '%' + max_pos + 'i https://archive.org/details/%s'
-
-        for coll_id, count in coll_counter.most_common():
-            if opts.prefix:
-                print(str_format % (count, len(coll_urls[coll_id]), coll_id))
-            else:
-                print(str_format % (count, coll_id))
-
-        print('')
-        print('total crawls %s-%s: %s' % (opts.start, opts.end, crawls))
-        if (opts.prefix):
-            total_urls = len(reduce(operator.or_, coll_urls.values()))
-            print('total urls: %s' % total_urls)
-
-    elif opts.format == 'json':
-        data = list(crawl_data)
-        print(json.dumps(data, indent=2))
-
-    elif opts.format == 'csv':
-        w = csv.DictWriter(sys.stdout,
-            fieldnames=['timestamp', 'status', 'collections', 'url', 'wayback_url'])
-        for crawl in crawl_data:
-            crawl['collections'] = ','.join(crawl['collections'])
-            w.writerow(crawl)
-
-def get_crawls(url, start_year=None, end_year=None, collapse=False,
-        prefix=False, match=None):
-
-    if prefix == True:
-        for year, sub_url in cdx(url, match=match, start_year=start_year,
-            end_year=end_year):
-            yield from get_crawls(sub_url, start_year=year, end_year=year)
-
-    if start_year is None:
-        start_year = datetime.datetime.now().year - 1
-    else:
-        start_year = int(start_year)
-    if end_year is None:
-        end_year = datetime.datetime.now().year
-    else:
-        end_year = int(end_year)
-
-    api = 'https://web.archive.org/__wb/calendarcaptures?url=%s&selected_year=%s'
-    for year in range(start_year, end_year + 1):
-        # This calendar data structure reflects the layout of a calendar
-        # month. So some spots in the first and last row are null. Not
-        # every day has any data if the URL wasn't crawled then.
-        logging.info("getting calendar year %s for %s", year, url)
-        cal = get_json(api % (url, year))
-        found = False
-        for month in cal:
-            for week in month:
-                for day in week:
-                    if day is None or day == {}:
-                        continue
-                    # note: we can't seem to rely on 'cnt' as a count
-                    for i in range(0, len(day['st'])):
-                        c = {
-                            'status': day['st'][i],
-                            'timestamp': day['ts'][i],
-                            'collections': day['why'][i],
-                            'url': url
-                        }
-                        c['wayback_url'] = 'https://web.archive.org/web/%s/%s' % (c['timestamp'], url)
-                        if c['collections'] is None:
-                            continue
-                        if collapse and len(c['collections']) > 0:
-                            c['collections'] = [deepest_collection(c['collections'])]
-                        logging.info('found crawl %s', c)
-                        found = True
-                        yield c
-
-def deepest_collection(coll_ids):
-    return max(coll_ids, key=get_depth)
-
-def get_collection(coll_id):
-    # no need to fetch twice
-    if coll_id in colls:
-        return colls[coll_id]
-
-    logging.info('fetching collection %s', coll_id)
-
-    # get the collection metadata
-    url = 'https://archive.org/metadata/%s' % coll_id
-    data = get_json(url)['metadata']
-
-    # make collection into reliable array
-    if 'collection' in data:
-        if type(data['collection']) == str:
-            data['collection'] = [data['collection']]
-    else:
-        data['collection'] = []
-
-    # so we don't have to look it up again
-    colls[coll_id] = data
-
-    return data
-
-def get_depth(coll_id, seen_colls=None):
-    coll = get_collection(coll_id)
-    if 'depth' in coll:
-        return coll['depth']
-
-    logging.info('calculating depth of %s', coll_id)
-
-    if len(coll['collection']) == 0:
-        return 0
-
-    # prevent recursive loops
-    if seen_colls == None:
-        seen_colls = set()
-    if coll_id in seen_colls:
-        return 0
-    seen_colls.add(coll_id)
-
-    depth = max(map(lambda id: get_depth(id, seen_colls) + 1, coll['collection']))
-
-    coll['depth'] = depth
-    logging.info('depth %s = %s', coll_id, depth)
-    return depth
-
-def get_json(url):
-    count = 0
-    while True:
-        count += 1
-        if count >= 10:
-            logging.error("giving up on fetching JSON from %s", url)
-        try:
-            resp = urlopen(url)
-            reader = codecs.getreader('utf-8')
-            return json.load(reader(resp))
-        except Exception as e:
-            logging.error('caught exception: %s', e)
-            logging.info('sleeping for %s seconds', count * 10)
-            time.sleep(count * 10)
-    raise(Exception("unable to get JSON for %s", url))
-
-def cdx(url, match=None, start_year=None, end_year=None):
-    logging.info('searching cdx for %s with regex %s', url, match)
-
-    if match:
-        try:
-            pattern = re.compile(match)
-        except Exception as e:
-            sys.exit('invalid regular expression: {}'.format(e))
-    else:
-        pattern = None
-
-    cdx_url = 'http://web.archive.org/cdx/search/cdx?url={}&matchType=prefix&from={}&to={}'.format(quote(url), start_year, end_year)
-    seen = set()
-    results = codecs.decode(urlopen(cdx_url).read(), encoding='utf8')
-
-    for line in results.split('\n'):
-        parts = line.split(' ')
-        if len(parts) == 7:
-            year = int(parts[1][0:4])
-            url = parts[2]
-            seen_key = '{}:{}'.format(year, url)
-            if seen_key in seen:
-                continue
-            if pattern and not pattern.search(url):
-                continue
-            seen.add(seen_key)
-            logging.info('cdx found %s', url)
-            yield(year, url)
-
-if __name__ == "__main__":
-    main()