waybackprov 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: waybackprov
-Version: 0.1.0
+Version: 0.1.1
 Summary: Checks the provenance of a URL in the Wayback machine
 Author: Ed Summers
 Author-email: Ed Summers <ehs@pobox.com>
@@ -37,18 +37,21 @@ To check a particular URL here's how it works:
 
 ```shell
 waybackprov https://twitter.com/EPAScottPruitt
-364 https://archive.org/details/focused_crawls
-306 https://archive.org/details/edgi_monitor
-151 https://archive.org/details/www3.epa.gov
- 60 https://archive.org/details/epa.gov4
- 47 https://archive.org/details/epa.gov5
-...
+
+crawls collections
+   364 https://archive.org/details/focused_crawls
+   306 https://archive.org/details/edgi_monitor
+   151 https://archive.org/details/www3.epa.gov
+    60 https://archive.org/details/epa.gov4
+    47 https://archive.org/details/epa.gov5
 ```
 
 The first column contains the number of crawls for a particular URL, and the
 second column contains the URL for the Internet Archive collection that added
 it.
 
+When evaluating the counts it's important to remember that collections can be contained in other collections. So `epa.gov4` in the example above is part of the `edgi_monitor` collection.
+
 ## Time
 
 By default waybackprov will only look at the current year. If you would like it
@@ -27,18 +27,21 @@ To check a particular URL here's how it works:
 
 ```shell
 waybackprov https://twitter.com/EPAScottPruitt
-364 https://archive.org/details/focused_crawls
-306 https://archive.org/details/edgi_monitor
-151 https://archive.org/details/www3.epa.gov
- 60 https://archive.org/details/epa.gov4
- 47 https://archive.org/details/epa.gov5
-...
+
+crawls collections
+   364 https://archive.org/details/focused_crawls
+   306 https://archive.org/details/edgi_monitor
+   151 https://archive.org/details/www3.epa.gov
+    60 https://archive.org/details/epa.gov4
+    47 https://archive.org/details/epa.gov5
 ```
 
 The first column contains the number of crawls for a particular URL, and the
 second column contains the URL for the Internet Archive collection that added
 it.
 
+When evaluating the counts it's important to remember that collections can be contained in other collections. So `epa.gov4` in the example above is part of the `edgi_monitor` collection.
+
 ## Time
 
 By default waybackprov will only look at the current year. If you would like it
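The note added to the README is worth dwelling on: because Internet Archive collections nest, a single crawl can be credited to both a child collection and its parents, so the per-collection counts overlap rather than sum. A minimal sketch of how that double counting arises (the membership lists here are invented, not taken from the package):

```python
import collections

# Invented crawl records: each crawl lists every collection it belongs
# to, including parent collections such as edgi_monitor.
crawls = [
    {"collections": ["edgi_monitor", "epa.gov4"]},
    {"collections": ["edgi_monitor", "epa.gov4"]},
    {"collections": ["edgi_monitor", "epa.gov5"]},
]

counter = collections.Counter()
for crawl in crawls:
    counter.update(crawl["collections"])

# edgi_monitor is credited with all three crawls even though each was
# made by one of its child collections, so the counts overlap.
print(counter)
# Counter({'edgi_monitor': 3, 'epa.gov4': 2, 'epa.gov5': 1})
```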
@@ -1,6 +1,6 @@
 [project]
 name = "waybackprov"
-version = "0.1.0"
+version = "0.1.1"
 description = "Checks the provenance of a URL in the Wayback machine"
 readme = "README.md"
 authors = [
@@ -7,12 +7,10 @@ import json
 import time
 import codecs
 import logging
-import operator
 import datetime
 import optparse
 import collections
 
-from functools import reduce
 from urllib.parse import quote
 from urllib.request import urlopen
 
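`operator` and `functools.reduce` can be dropped because the only code that used them, the total-URL tally in `main()`, is removed in a later hunk. For reference, that pattern folds per-collection URL sets into one union so shared URLs are counted once; a sketch with made-up data:

```python
import operator
from functools import reduce

# Made-up per-collection URL sets; "/b" appears in both collections.
coll_urls = {
    "edgi_monitor": {"https://epa.gov/a", "https://epa.gov/b"},
    "epa.gov4": {"https://epa.gov/b", "https://epa.gov/c"},
}

# reduce(operator.or_, ...) unions the sets, deduplicating shared URLs.
total_urls = len(reduce(operator.or_, coll_urls.values()))
print(total_urls)  # 3

# set().union(*...) is an equivalent spelling that needs no imports.
assert total_urls == len(set().union(*coll_urls.values()))
```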
@@ -61,12 +59,19 @@ def main():
     )
 
     if opts.format == "text":
-        crawls = 0
+        # coll_urls is a dictionary where the key is a collection id and the
+        # value is a set of URLs that have been crawled
         coll_urls = {}
+
+        # coll_counter is a Counter that counts the number of crawls that are
+        # in a collection
         coll_counter = collections.Counter()
+
         for crawl in crawl_data:
-            crawls += 1
             coll_counter.update(crawl["collections"])
+
+            # a crawl can appear in multiple collections because of how
+            # collections can contain other collections
             for coll in crawl["collections"]:
                 # keep track of urls in each collection
                 if coll not in coll_urls:
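The new comments describe two pieces of bookkeeping: `coll_counter` tallies crawls per collection id, while `coll_urls` maps each collection id to the set of distinct URLs crawled under it. A condensed, self-contained sketch of that loop, using invented crawl records in the shape the comments describe:

```python
import collections

crawl_data = [
    {"url": "https://epa.gov/a", "collections": ["edgi_monitor", "epa.gov4"]},
    {"url": "https://epa.gov/b", "collections": ["edgi_monitor"]},
]

coll_urls = {}                         # collection id -> set of URLs
coll_counter = collections.Counter()   # collection id -> crawl count

for crawl in crawl_data:
    coll_counter.update(crawl["collections"])
    for coll in crawl["collections"]:
        # one set per collection; a set keeps the URLs distinct
        coll_urls.setdefault(coll, set()).add(crawl["url"])

print(coll_counter.most_common())
# [('edgi_monitor', 2), ('epa.gov4', 1)]
print({coll: len(urls) for coll, urls in coll_urls.items()})
# {'edgi_monitor': 2, 'epa.gov4': 1}
```

The sketch uses `dict.setdefault` for brevity; the package itself does the explicit `if coll not in coll_urls` check shown above.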
@@ -80,25 +85,19 @@ def main():
             )
             return
 
-        max_pos = str(len(str(coll_counter.most_common(1)[0][1])))
         if opts.prefix:
-            str_format = (
-                "%" + max_pos + "i %" + max_pos + "i https://archive.org/details/%s"
-            )
+            str_format = "%6s %6s %s"
+            print(str_format % ("crawls", "urls", "collection"))
         else:
-            str_format = "%" + max_pos + "i https://archive.org/details/%s"
+            str_format = "%6s %s"
+            print(str_format % ("crawls", "collection"))
 
         for coll_id, count in coll_counter.most_common():
+            coll_url = f"https://archive.org/details/{coll_id}"
             if opts.prefix:
-                print(str_format % (count, len(coll_urls[coll_id]), coll_id))
+                print(str_format % (count, len(coll_urls[coll_id]), coll_url))
             else:
-                print(str_format % (count, coll_id))
-
-        print("")
-        print("total crawls %s-%s: %s" % (opts.start, opts.end, crawls))
-        if opts.prefix:
-            total_urls = len(reduce(operator.or_, coll_urls.values()))
-            print("total urls: %s" % total_urls)
+                print(str_format % (count, coll_url))
 
     elif opts.format == "json":
         data = list(crawl_data)
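The `%6s` format strings replace the old dynamically sized `%i` columns (whose width came from `max_pos`) with fixed six-character columns plus a header row, and the bare collection id is now expanded to a full archive.org URL before printing. A small sketch of the resulting layout, with invented counts:

```python
str_format = "%6s %6s %s"
print(str_format % ("crawls", "urls", "collection"))

# Invented rows shaped like (count, distinct urls, collection id).
for count, urls, coll_id in [(364, 120, "focused_crawls"), (60, 12, "epa.gov4")]:
    coll_url = f"https://archive.org/details/{coll_id}"
    print(str_format % (count, urls, coll_url))
```

This prints right-aligned columns:

```
crawls   urls collection
   364    120 https://archive.org/details/focused_crawls
    60     12 https://archive.org/details/epa.gov4
```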
@@ -227,8 +226,8 @@ def get_json(url):
             reader = codecs.getreader("utf-8")
             return json.load(reader(resp))
         except Exception as e:
-            logging.error("caught exception: %s", e)
-            logging.info("sleeping for %s seconds", count * 10)
+            logging.debug("caught exception: %s", e)
+            logging.debug("sleeping for %s seconds", count * 10)
             time.sleep(count * 10)
     raise (Exception("unable to get JSON for %s", url))
 
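The demoted log calls live in `get_json()`'s retry loop, which sleeps `count * 10` seconds after each failure, a linear backoff, before giving up with the final `raise`. The full function isn't visible in this diff; the following is a minimal sketch of that shape, where the bound of ten attempts is an assumption:

```python
import codecs
import json
import logging
import time
from urllib.request import urlopen

def get_json(url, tries=10):
    # NOTE: sketch of the retry pattern implied by the hunk above; the
    # `tries` bound is assumed, not taken from the package source.
    for count in range(1, tries + 1):
        try:
            resp = urlopen(url)
            reader = codecs.getreader("utf-8")
            return json.load(reader(resp))
        except Exception as e:
            # failures now log at DEBUG, so normal runs stay quiet
            logging.debug("caught exception: %s", e)
            logging.debug("sleeping for %s seconds", count * 10)
            time.sleep(count * 10)
    raise Exception("unable to get JSON for %s" % url)
```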