PyPI - waybackprov - Versions diffs - 0.1.0__tar.gz → 0.1.1__tar.gz - Mend

waybackprov 0.1.0.tar.gz → 0.1.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

{waybackprov-0.1.0 → waybackprov-0.1.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: waybackprov
-Version: 0.1.0
+Version: 0.1.1
 Summary: Checks the provenance of a URL in the Wayback machine
 Author: Ed Summers
 Author-email: Ed Summers <ehs@pobox.com>
@@ -37,18 +37,21 @@ To check a particular URL here's how it works:
 ```shell
 waybackprov https://twitter.com/EPAScottPruitt
-364 https://archive.org/details/focused_crawls
-306 https://archive.org/details/edgi_monitor
-151 https://archive.org/details/www3.epa.gov
- 60 https://archive.org/details/epa.gov4
- 47 https://archive.org/details/epa.gov5
-  ...
+crawls collections
+   364 https://archive.org/details/focused_crawls
+   306 https://archive.org/details/edgi_monitor
+   151 https://archive.org/details/www3.epa.gov
+    60 https://archive.org/details/epa.gov4
+    47 https://archive.org/details/epa.gov5
 ```
 The first column contains the number of crawls for a particular URL, and the
 second column contains the URL for the Internet Archive collection that added
 it.
+When evaluating the counts it's important to remember that collections can be contained in other collections. So `epa.gov4` in the example above is part of the `edgi_monitor` collection.
 ## Time
 By default waybackprov will only look at the current year. If you would like it

{waybackprov-0.1.0 → waybackprov-0.1.1}/README.md RENAMED Viewed

@@ -27,18 +27,21 @@ To check a particular URL here's how it works:
 ```shell
 waybackprov https://twitter.com/EPAScottPruitt
-364 https://archive.org/details/focused_crawls
-306 https://archive.org/details/edgi_monitor
-151 https://archive.org/details/www3.epa.gov
- 60 https://archive.org/details/epa.gov4
- 47 https://archive.org/details/epa.gov5
-  ...
+crawls collections
+   364 https://archive.org/details/focused_crawls
+   306 https://archive.org/details/edgi_monitor
+   151 https://archive.org/details/www3.epa.gov
+    60 https://archive.org/details/epa.gov4
+    47 https://archive.org/details/epa.gov5
 ```
 The first column contains the number of crawls for a particular URL, and the
 second column contains the URL for the Internet Archive collection that added
 it.
+When evaluating the counts it's important to remember that collections can be contained in other collections. So `epa.gov4` in the example above is part of the `edgi_monitor` collection.
 ## Time
 By default waybackprov will only look at the current year. If you would like it

{waybackprov-0.1.0 → waybackprov-0.1.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "waybackprov"
-version = "0.1.0"
+version = "0.1.1"
 description = "Checks the provenance of a URL in the Wayback machine"
 readme = "README.md"
 authors = [

{waybackprov-0.1.0 → waybackprov-0.1.1}/src/waybackprov/__init__.py RENAMED Viewed

@@ -7,12 +7,10 @@ import json
 import time
 import codecs
 import logging
-import operator
 import datetime
 import optparse
 import collections
-from functools import reduce
 from urllib.parse import quote
 from urllib.request import urlopen
@@ -61,12 +59,19 @@ def main():
     )
     if opts.format == "text":
-        crawls = 0
+        # coll_urls is a dictionary where the key is a collection id and the
+        # value is a set of URLs that have been crawled
         coll_urls = {}
+        # coll_counter is a Counter that counts the number of crawls that are
+        # in a collection
         coll_counter = collections.Counter()
         for crawl in crawl_data:
-            crawls += 1
             coll_counter.update(crawl["collections"])
+            # a crawl can appear in multiple collections because of how
+            # collections can contain other collections
             for coll in crawl["collections"]:
                 # keep track of urls in each collection
                 if coll not in coll_urls:
@@ -80,25 +85,19 @@ def main():
             )
             return
-        max_pos = str(len(str(coll_counter.most_common(1)[0][1])))
         if opts.prefix:
-            str_format = (
-                "%" + max_pos + "i %" + max_pos + "i https://archive.org/details/%s"
-            )
+            str_format = "%6s %6s %s"
+            print(str_format % ("crawls", "urls", "collection"))
         else:
-            str_format = "%" + max_pos + "i https://archive.org/details/%s"
+            str_format = "%6s %s"
+            print(str_format % ("crawls", "collection"))
         for coll_id, count in coll_counter.most_common():
+            coll_url = f"https://archive.org/details/{coll_id}"
             if opts.prefix:
-                print(str_format % (count, len(coll_urls[coll_id]), coll_id))
+                print(str_format % (count, len(coll_urls[coll_id]), coll_url))
             else:
-                print(str_format % (count, coll_id))
-        print("")
-        print("total crawls %s-%s: %s" % (opts.start, opts.end, crawls))
-        if opts.prefix:
-            total_urls = len(reduce(operator.or_, coll_urls.values()))
-            print("total urls: %s" % total_urls)
+                print(str_format % (count, coll_url))
     elif opts.format == "json":
         data = list(crawl_data)
@@ -227,8 +226,8 @@ def get_json(url):
             reader = codecs.getreader("utf-8")
             return json.load(reader(resp))
         except Exception as e:
-            logging.error("caught exception: %s", e)
-        logging.info("sleeping for %s seconds", count * 10)
+            logging.debug("caught exception: %s", e)
+        logging.debug("sleeping for %s seconds", count * 10)
         time.sleep(count * 10)
     raise (Exception("unable to get JSON for %s", url))

waybackprov 0.1.0__tar.gz → 0.1.1__tar.gz

waybackprov 0.1.0.tar.gz → 0.1.1.tar.gz