waybackprov-0.1.0-py3-none-any.whl

waybackprov/__init__.py
@@ -0,0 +1,269 @@
+ #!/usr/bin/env python3
+
+ import re
+ import csv
+ import sys
+ import json
+ import time
+ import codecs
+ import logging
+ import operator
+ import datetime
+ import optparse
+ import collections
+
+ from functools import reduce
+ from urllib.parse import quote
+ from urllib.request import urlopen
+
+ colls = {}
+
+
+ def main():
+     now = datetime.datetime.now()
+
+     parser = optparse.OptionParser("waybackprov [options] <url>")
+     parser.add_option("--start", default=now.year - 1, help="start year")
+     parser.add_option("--end", default=now.year, help="end year")
+     parser.add_option(
+         "--format", choices=["text", "csv", "json"], default="text", help="output data"
+     )
+     parser.add_option(
+         "--collapse", action="store_true", help="only display most specific collection"
+     )
+     parser.add_option("--prefix", action="store_true", help="use url as a prefix")
+     parser.add_option("--match", help="limit to urls that match pattern")
+     parser.add_option("--log", help="where to log activity to")
+     opts, args = parser.parse_args()
+
+     if opts.log:
+         logging.basicConfig(
+             filename=opts.log,
+             format="%(asctime)s - %(levelname)s - %(message)s",
+             level=logging.INFO,
+         )
+     else:
+         logging.basicConfig(
+             format="%(asctime)s - %(levelname)s - %(message)s", level=logging.WARNING
+         )
+     if len(args) != 1:
+         parser.error("You must supply a URL to lookup")
+
+     url = args[0]
+
+     crawl_data = get_crawls(
+         url,
+         start_year=opts.start,
+         end_year=opts.end,
+         collapse=opts.collapse,
+         prefix=opts.prefix,
+         match=opts.match,
+     )
+
+     if opts.format == "text":
+         crawls = 0
+         coll_urls = {}
+         coll_counter = collections.Counter()
+         for crawl in crawl_data:
+             crawls += 1
+             coll_counter.update(crawl["collections"])
+             for coll in crawl["collections"]:
+                 # keep track of urls in each collection
+                 if coll not in coll_urls:
+                     coll_urls[coll] = set()
+                 coll_urls[coll].add(crawl["url"])
+
+         if len(coll_counter) == 0:
+             print(
+                 "No results for %s-%s, consider using --start and --end to broaden."
+                 % (opts.start, opts.end)
+             )
+             return
+
+         max_pos = str(len(str(coll_counter.most_common(1)[0][1])))
+         if opts.prefix:
+             str_format = (
+                 "%" + max_pos + "i %" + max_pos + "i https://archive.org/details/%s"
+             )
+         else:
+             str_format = "%" + max_pos + "i https://archive.org/details/%s"
+
+         for coll_id, count in coll_counter.most_common():
+             if opts.prefix:
+                 print(str_format % (count, len(coll_urls[coll_id]), coll_id))
+             else:
+                 print(str_format % (count, coll_id))
+
+         print("")
+         print("total crawls %s-%s: %s" % (opts.start, opts.end, crawls))
+         if opts.prefix:
+             total_urls = len(reduce(operator.or_, coll_urls.values()))
+             print("total urls: %s" % total_urls)
+
+     elif opts.format == "json":
+         data = list(crawl_data)
+         print(json.dumps(data, indent=2))
+
+     elif opts.format == "csv":
+         w = csv.DictWriter(
+             sys.stdout,
+             fieldnames=["timestamp", "status", "collections", "url", "wayback_url"],
+         )
+         for crawl in crawl_data:
+             crawl["collections"] = ",".join(crawl["collections"])
+             w.writerow(crawl)
+
+
+ def get_crawls(
+     url, start_year=None, end_year=None, collapse=False, prefix=False, match=None
+ ):
+     if prefix is True:
+         for year, sub_url in cdx(
+             url, match=match, start_year=start_year, end_year=end_year
+         ):
+             yield from get_crawls(sub_url, start_year=year, end_year=year)
+
+     if start_year is None:
+         start_year = datetime.datetime.now().year - 1
+     else:
+         start_year = int(start_year)
+     if end_year is None:
+         end_year = datetime.datetime.now().year
+     else:
+         end_year = int(end_year)
+
+     api = "https://web.archive.org/__wb/calendarcaptures?url=%s&selected_year=%s"
+     for year in range(start_year, end_year + 1):
+         # This calendar data structure reflects the layout of a calendar
+         # month. So some spots in the first and last row are null. Not
+         # every day has any data if the URL wasn't crawled then.
+         logging.info("getting calendar year %s for %s", year, url)
+         cal = get_json(api % (url, year))
+         for month in cal:
+             for week in month:
+                 for day in week:
+                     if day is None or day == {}:
+                         continue
+                     # note: we can't seem to rely on 'cnt' as a count
+                     for i in range(0, len(day["st"])):
+                         c = {
+                             "status": day["st"][i],
+                             "timestamp": day["ts"][i],
+                             "collections": day["why"][i],
+                             "url": url,
+                         }
+                         c["wayback_url"] = "https://web.archive.org/web/%s/%s" % (
+                             c["timestamp"],
+                             url,
+                         )
+                         if c["collections"] is None:
+                             continue
+                         if collapse and len(c["collections"]) > 0:
+                             c["collections"] = [deepest_collection(c["collections"])]
+                         logging.info("found crawl %s", c)
+                         yield c
+
+
+ def deepest_collection(coll_ids):
+     return max(coll_ids, key=get_depth)
+
+
+ def get_collection(coll_id):
+     # no need to fetch twice
+     if coll_id in colls:
+         return colls[coll_id]
+
+     logging.info("fetching collection %s", coll_id)
+
+     # get the collection metadata
+     url = "https://archive.org/metadata/%s" % coll_id
+     data = get_json(url)["metadata"]
+
+     # make collection into reliable array
+     if "collection" in data:
+         if type(data["collection"]) is str:
+             data["collection"] = [data["collection"]]
+     else:
+         data["collection"] = []
+
+     # so we don't have to look it up again
+     colls[coll_id] = data
+
+     return data
+
+
+ def get_depth(coll_id, seen_colls=None):
+     coll = get_collection(coll_id)
+     if "depth" in coll:
+         return coll["depth"]
+
+     logging.info("calculating depth of %s", coll_id)
+
+     if len(coll["collection"]) == 0:
+         return 0
+
+     # prevent recursive loops
+     if seen_colls is None:
+         seen_colls = set()
+     if coll_id in seen_colls:
+         return 0
+     seen_colls.add(coll_id)
+
+     depth = max(map(lambda id: get_depth(id, seen_colls) + 1, coll["collection"]))
+
+     coll["depth"] = depth
+     logging.info("depth %s = %s", coll_id, depth)
+     return depth
+
+
+ def get_json(url):
+     count = 0
+     while True:
+         count += 1
+         if count >= 10:
+             logging.error("giving up on fetching JSON from %s", url)
+             raise Exception("unable to get JSON for %s" % url)
+         try:
+             resp = urlopen(url)
+             reader = codecs.getreader("utf-8")
+             return json.load(reader(resp))
+         except Exception as e:
+             logging.error("caught exception: %s", e)
+             logging.info("sleeping for %s seconds", count * 10)
+             time.sleep(count * 10)
+
+
+ def cdx(url, match=None, start_year=None, end_year=None):
+     logging.info("searching cdx for %s with regex %s", url, match)
+
+     if match:
+         try:
+             pattern = re.compile(match)
+         except Exception as e:
+             sys.exit("invalid regular expression: {}".format(e))
+     else:
+         pattern = None
+
+     cdx_url = "http://web.archive.org/cdx/search/cdx?url={}&matchType=prefix&from={}&to={}".format(
+         quote(url), start_year, end_year
+     )
+     seen = set()
+     results = codecs.decode(urlopen(cdx_url).read(), encoding="utf8")
+
+     for line in results.split("\n"):
+         parts = line.split(" ")
+         if len(parts) == 7:
+             year = int(parts[1][0:4])
+             url = parts[2]
+             seen_key = "{}:{}".format(year, url)
+             if seen_key in seen:
+                 continue
+             if pattern and not pattern.search(url):
+                 continue
+             seen.add(seen_key)
+             logging.info("cdx found %s", url)
+             yield (year, url)
+
+
+ if __name__ == "__main__":
+     main()

waybackprov-0.1.0.dist-info/METADATA
@@ -0,0 +1,122 @@
+ Metadata-Version: 2.3
+ Name: waybackprov
+ Version: 0.1.0
+ Summary: Checks the provenance of a URL in the Wayback machine
+ Author: Ed Summers
+ Author-email: Ed Summers <ehs@pobox.com>
+ Requires-Python: >=3.0
+ Project-URL: repository, https://github.com/docnow/waybackprov
+ Description-Content-Type: text/markdown
+
+ # waybackprov
+
+ [![Test](https://github.com/DocNow/waybackprov/actions/workflows/test.yml/badge.svg)](https://github.com/DocNow/waybackprov/actions/workflows/test.yml)
+
+ Give *waybackprov* a URL and it will summarize which Internet Archive
+ collections have archived the URL. This kind of information can sometimes
+ provide insight into why a particular web resource, or set of web resources,
+ was archived from the web.
+
+ ## Run
+
+ If you have [uv] installed, you can run `waybackprov` easily without installing anything:
+
+ ```
+ uvx waybackprov
+ ```
+
+ Otherwise you'll probably want to install it with `pip`:
+
+ ```
+ pip install waybackprov
+ ```
+
+ ## Basic Usage
+
+ To check a particular URL, here's how it works:
+
+ ```shell
+ waybackprov https://twitter.com/EPAScottPruitt
+ 364 https://archive.org/details/focused_crawls
+ 306 https://archive.org/details/edgi_monitor
+ 151 https://archive.org/details/www3.epa.gov
+  60 https://archive.org/details/epa.gov4
+  47 https://archive.org/details/epa.gov5
+ ...
+ ```
+
+ The first column contains the number of crawls for a particular URL, and the
+ second column contains the URL for the Internet Archive collection that added
+ it.
+
+ ## Time
+
+ By default waybackprov will only look at the current year. If you would like it
+ to examine a range of years, use the `--start` and `--end` options:
+
+ ```shell
+ waybackprov --start 2016 --end 2018 https://twitter.com/EPAScottPruitt
+ ```
+
+ ## Multiple Pages
+
+ If you would like to look at all URLs under a particular URL prefix, you can
+ use the `--prefix` option:
+
+ ```shell
+ waybackprov --prefix https://twitter.com/EPAScottPruitt
+ ```
+
+ This will use the Internet Archive's [CDX API](https://github.com/webrecorder/pywb/wiki/CDX-Server-API) to also include URLs that are extensions of the URL you supply, so it would include, for example:
+
+     https://twitter.com/EPAScottPruitt/status/1309839080398339
+
+ But it can also include things you may not want, such as:
+
+     https://twitter.com/EPAScottPruitt/status/1309839080398339/media/1
+
+ To further limit the URLs, use the `--match` parameter to supply a regular
+ expression so that only matching URLs are checked. Narrowing down the URLs you
+ are interested in is highly recommended, since it prevents lots of lookups for
+ CSS, JavaScript and image files that are components of the resource that was
+ initially crawled.
+
+ ```
+ waybackprov --prefix --match 'status/\d+$' https://twitter.com/EPAScottPruitt
+ ```
+
+ ## Collections
+
+ One thing to remember when interpreting this data is that collections can
+ contain other collections. For example, the *edgi_monitor* collection is a
+ sub-collection of *focused_crawls*.
+
+ If you use the `--collapse` option, only the most specific collection will be
+ reported for a given crawl. So if *coll1* is part of *coll2*, which is part of
+ *coll3*, only *coll1* will be reported instead of *coll1*, *coll2* and *coll3*.
+ This does involve collection metadata lookups against the Internet Archive API,
+ so it slows things down significantly.
+
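+ For example, something like this (reusing the example URL from above) should
+ report each crawl under its most specific collection only:
+
+ ```shell
+ waybackprov --collapse --start 2016 --end 2018 https://twitter.com/EPAScottPruitt
+ ```
+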
+ ## JSON and CSV
+
+ If you would rather see the raw data as JSON or CSV, use the `--format` option.
+ When you use either of these formats you will see the metadata for each crawl
+ rather than a summary.
+
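+ For example, something like this (again reusing the example URL from above,
+ with an arbitrary output file name) should write one JSON record per crawl,
+ including its timestamp, status, collections, url and wayback_url fields:
+
+ ```shell
+ waybackprov --format json --start 2016 --end 2018 https://twitter.com/EPAScottPruitt > crawls.json
+ ```
+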
+ ## Log
+
+ If you would like to see detailed information about what *waybackprov* is doing,
+ use the `--log` option to supply a file path to log to:
+
+ ```shell
+ waybackprov --log waybackprov.log https://example.com/
+ ```
+
+ ## Test
+
+ If you would like to test it, first install [pytest] and then run:
+
+     uv run pytest test.py
+
+ [pytest]: https://docs.pytest.org/en/latest/
+ [uv]: https://docs.astral.sh/uv/

waybackprov-0.1.0.dist-info/RECORD
@@ -0,0 +1,5 @@
+ waybackprov/__init__.py,sha256=_eRtNRoC5yHya-UFEwKNq2aQqrW_0eo2Dui-rDm4ius,8435
+ waybackprov-0.1.0.dist-info/WHEEL,sha256=DpNsHFUm_gffZe1FgzmqwuqiuPC6Y-uBCzibcJcdupM,78
+ waybackprov-0.1.0.dist-info/entry_points.txt,sha256=gm4N2tl-7pQpgZYbZPtL1t9CBp8p-c0V7_f1uLfnOQk,50
+ waybackprov-0.1.0.dist-info/METADATA,sha256=VFErj5hoWeJJ2dUL-V3wBXBVP-_np0xN7BNl2v3oNvA,3833
+ waybackprov-0.1.0.dist-info/RECORD,,

waybackprov-0.1.0.dist-info/WHEEL
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: uv 0.9.8
+ Root-Is-Purelib: true
+ Tag: py3-none-any

waybackprov-0.1.0.dist-info/entry_points.txt
@@ -0,0 +1,3 @@
+ [console_scripts]
+ waybackprov = waybackprov:main
+