waybackprov 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: waybackprov
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Checks the provenance of a URL in the Wayback machine
|
|
5
5
|
Author: Ed Summers
|
|
6
6
|
Author-email: Ed Summers <ehs@pobox.com>
|
|
@@ -37,18 +37,21 @@ To check a particular URL here's how it works:
|
|
|
37
37
|
|
|
38
38
|
```shell
|
|
39
39
|
waybackprov https://twitter.com/EPAScottPruitt
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
40
|
+
|
|
41
|
+
crawls collections
|
|
42
|
+
364 https://archive.org/details/focused_crawls
|
|
43
|
+
306 https://archive.org/details/edgi_monitor
|
|
44
|
+
151 https://archive.org/details/www3.epa.gov
|
|
45
|
+
60 https://archive.org/details/epa.gov4
|
|
46
|
+
47 https://archive.org/details/epa.gov5
|
|
46
47
|
```
|
|
47
48
|
|
|
48
49
|
The first column contains the number of crawls for a particular URL, and the
|
|
49
50
|
second column contains the URL for the Internet Archive collection that added
|
|
50
51
|
it.
|
|
51
52
|
|
|
53
|
+
When evaluating the counts it's important to remember that collections can be contained in other collections. So `epa.gov4` in the example above is part of the `edgi_monitor` collection.
|
|
54
|
+
|
|
52
55
|
## Time
|
|
53
56
|
|
|
54
57
|
By default waybackprov will only look at the current year. If you would like it
|
|
@@ -27,18 +27,21 @@ To check a particular URL here's how it works:
|
|
|
27
27
|
|
|
28
28
|
```shell
|
|
29
29
|
waybackprov https://twitter.com/EPAScottPruitt
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
30
|
+
|
|
31
|
+
crawls collections
|
|
32
|
+
364 https://archive.org/details/focused_crawls
|
|
33
|
+
306 https://archive.org/details/edgi_monitor
|
|
34
|
+
151 https://archive.org/details/www3.epa.gov
|
|
35
|
+
60 https://archive.org/details/epa.gov4
|
|
36
|
+
47 https://archive.org/details/epa.gov5
|
|
36
37
|
```
|
|
37
38
|
|
|
38
39
|
The first column contains the number of crawls for a particular URL, and the
|
|
39
40
|
second column contains the URL for the Internet Archive collection that added
|
|
40
41
|
it.
|
|
41
42
|
|
|
43
|
+
When evaluating the counts it's important to remember that collections can be contained in other collections. So `epa.gov4` in the example above is part of the `edgi_monitor` collection.
|
|
44
|
+
|
|
42
45
|
## Time
|
|
43
46
|
|
|
44
47
|
By default waybackprov will only look at the current year. If you would like it
|
|
@@ -7,12 +7,10 @@ import json
|
|
|
7
7
|
import time
|
|
8
8
|
import codecs
|
|
9
9
|
import logging
|
|
10
|
-
import operator
|
|
11
10
|
import datetime
|
|
12
11
|
import optparse
|
|
13
12
|
import collections
|
|
14
13
|
|
|
15
|
-
from functools import reduce
|
|
16
14
|
from urllib.parse import quote
|
|
17
15
|
from urllib.request import urlopen
|
|
18
16
|
|
|
@@ -61,12 +59,19 @@ def main():
|
|
|
61
59
|
)
|
|
62
60
|
|
|
63
61
|
if opts.format == "text":
|
|
64
|
-
|
|
62
|
+
# coll_urls is a dictionary where the key is a collection id and the
|
|
63
|
+
# value is a set of URLs that have been crawled
|
|
65
64
|
coll_urls = {}
|
|
65
|
+
|
|
66
|
+
# coll_counter is a Counter that counts the number of crawls that are
|
|
67
|
+
# in a collection
|
|
66
68
|
coll_counter = collections.Counter()
|
|
69
|
+
|
|
67
70
|
for crawl in crawl_data:
|
|
68
|
-
crawls += 1
|
|
69
71
|
coll_counter.update(crawl["collections"])
|
|
72
|
+
|
|
73
|
+
# a crawl can appear in multiple collections because of how
|
|
74
|
+
# collections can contain other collections
|
|
70
75
|
for coll in crawl["collections"]:
|
|
71
76
|
# keep track of urls in each collection
|
|
72
77
|
if coll not in coll_urls:
|
|
@@ -80,25 +85,19 @@ def main():
|
|
|
80
85
|
)
|
|
81
86
|
return
|
|
82
87
|
|
|
83
|
-
max_pos = str(len(str(coll_counter.most_common(1)[0][1])))
|
|
84
88
|
if opts.prefix:
|
|
85
|
-
str_format =
|
|
86
|
-
|
|
87
|
-
)
|
|
89
|
+
str_format = "%6s %6s %s"
|
|
90
|
+
print(str_format % ("crawls", "urls", "collection"))
|
|
88
91
|
else:
|
|
89
|
-
str_format = "%
|
|
92
|
+
str_format = "%6s %s"
|
|
93
|
+
print(str_format % ("crawls", "collection"))
|
|
90
94
|
|
|
91
95
|
for coll_id, count in coll_counter.most_common():
|
|
96
|
+
coll_url = f"https://archive.org/details/{coll_id}"
|
|
92
97
|
if opts.prefix:
|
|
93
|
-
print(str_format % (count, len(coll_urls[coll_id]),
|
|
98
|
+
print(str_format % (count, len(coll_urls[coll_id]), coll_url))
|
|
94
99
|
else:
|
|
95
|
-
print(str_format % (count,
|
|
96
|
-
|
|
97
|
-
print("")
|
|
98
|
-
print("total crawls %s-%s: %s" % (opts.start, opts.end, crawls))
|
|
99
|
-
if opts.prefix:
|
|
100
|
-
total_urls = len(reduce(operator.or_, coll_urls.values()))
|
|
101
|
-
print("total urls: %s" % total_urls)
|
|
100
|
+
print(str_format % (count, coll_url))
|
|
102
101
|
|
|
103
102
|
elif opts.format == "json":
|
|
104
103
|
data = list(crawl_data)
|
|
@@ -227,8 +226,8 @@ def get_json(url):
|
|
|
227
226
|
reader = codecs.getreader("utf-8")
|
|
228
227
|
return json.load(reader(resp))
|
|
229
228
|
except Exception as e:
|
|
230
|
-
logging.
|
|
231
|
-
logging.
|
|
229
|
+
logging.debug("caught exception: %s", e)
|
|
230
|
+
logging.debug("sleeping for %s seconds", count * 10)
|
|
232
231
|
time.sleep(count * 10)
|
|
233
232
|
raise (Exception("unable to get JSON for %s", url))
|
|
234
233
|
|