warn-scraper 1.2.55-py3-none-any.whl → 1.2.57-py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
warn/scrapers/ct.py CHANGED
@@ -3,6 +3,7 @@ from datetime import datetime
 from pathlib import Path
 
 from bs4 import BeautifulSoup
+import requests
 
 from .. import utils
 from ..cache import Cache
@@ -40,8 +41,20 @@ def scrape(
     # We start in 2015
     current_year = datetime.now().year
 
-    # Get the full range of years
-    year_range = range(2015, current_year + 1)
+    if cache.exists(f"ct/{current_year}.html"):
+        # Get the full range of years
+        year_range = range(2015, current_year + 1)
+    else:
+        url = f"https://www.ctdol.state.ct.us/progsupt/bussrvce/warnreports/warn{current_year}.htm"
+        r = requests.head(url)
+        if r.ok:
+            logger.debug(f"Found first entry for {current_year}")
+            year_range = range(2015, current_year + 1)
+        else:
+            logger.debug(
+                f"No data for {current_year} found at {url}. Dropping back a year."
+            )
+            year_range = range(2015, current_year + 0)
 
     output_rows = []
     for year in year_range:
@@ -100,7 +113,6 @@ def _scrape_table(table) -> list:
     row_list = []
     # loop over table to process each row, skipping the header
     for table_row in table[0].find_all("tr")[1:]:
-
         # Get all the cells
         table_cells = table_row.find_all("td")
 
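The new ct.py logic only extends the scrape range to the current year once that year's report page is known to exist, probing with a cheap HEAD request when nothing is cached yet. A minimal sketch of the same pattern follows; the helper name and the url_template parameter are illustrative, not part of the package:

```python
import requests


def probe_year_range(current_year: int, url_template: str) -> range:
    """Pick the range of years to scrape, probing the newest page first."""
    # A HEAD request fetches only headers, so we learn whether the
    # current year's page exists without downloading its body.
    if requests.head(url_template.format(year=current_year)).ok:
        return range(2015, current_year + 1)
    # No page published yet for the current year; stop at the prior one.
    return range(2015, current_year)
```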
warn/scrapers/dc.py CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
 from pathlib import Path
 
 from bs4 import BeautifulSoup
+import requests
 
 from .. import utils
 from ..cache import Cache
@@ -38,13 +39,22 @@ def scrape(
     # Get the root page
     today = datetime.today()
     current_year = today.year
-    url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
+    targetfile = f"dc/{current_year}.html"
+    if not cache.exists(targetfile):  # Check if we have an entry for the latest year
+        url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
+        r = requests.head(url)
+        if not r.ok:
+            logger.debug(f"Still no data found for {current_year}. Falling back.")
+            current_year = today.year - 1
+            targetfile = f"dc/{current_year}.html"
+    url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
+
     r = utils.get_url(url)
     r.encoding = "utf-8"
     root_html = r.text
 
     # Save it to the cache
-    cache.write(f"dc/{current_year}.html", root_html)
+    cache.write(targetfile, root_html)
 
     # Parse the list of links
     soup = BeautifulSoup(root_html, "html5lib")
@@ -70,7 +80,6 @@ def scrape(
         root_html,
     ]
     for href in link_lookup.values():
-
         # Request the HTML
         r = utils.get_url(href)
         r.encoding = "utf-8"
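dc.py applies the same probe but keys everything off a cache path, so a year that was already fetched never triggers a network check, and a missing live page drops the target back one year. A sketch under those assumptions; pick_year is a hypothetical helper, and cache is assumed to expose the package's Cache.exists interface:

```python
import requests

URL_TEMPLATE = (
    "https://does.dc.gov/page/industry-closings-and-layoffs-"
    "warn-notifications-{year}"
)


def pick_year(cache, year: int) -> int:
    """Return the newest year that is cached locally or live on the site."""
    if cache.exists(f"dc/{year}.html"):
        return year  # Already fetched once; trust the cache.
    if requests.head(URL_TEMPLATE.format(year=year)).ok:
        return year  # The page exists, so the current year is scrapeable.
    return year - 1  # Nothing published yet; fall back one year.
```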
warn/scrapers/hi.py CHANGED
@@ -1,13 +1,14 @@
 import datetime
 import logging
 from pathlib import Path
+from urllib.parse import quote
 
 from bs4 import BeautifulSoup
 
 from .. import utils
 
 __authors__ = ["Ash1R", "stucka"]
-__tags__ = ["html"]
+__tags__ = ["html", "pdf"]
 __source__ = {
     "name": "Workforce Development Hawaii",
     "url": "https://labor.hawaii.gov/wdc/real-time-warn-updates/",
@@ -28,15 +29,17 @@ def scrape(
     cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
     Returns: the Path where the file is written
     """
-    firstpage = utils.get_url("https://labor.hawaii.gov/wdc/real-time-warn-updates/")
+    cacheprefix = "https://webcache.googleusercontent.com/search?q=cache%3A"  # Use Google Cache, per #600
+
+    firstpage = utils.get_url(cacheprefix + quote("https://labor.hawaii.gov/wdc/real-time-warn-updates/"))
     soup = BeautifulSoup(firstpage.text, features="html5lib")
     pagesection = soup.select("div.primary-content")[0]
     subpageurls = []
     for atag in pagesection.find_all("a"):
         href = atag["href"]
         if href.endswith("/"):
-            href = href[:-1]
-        subpageurls.append(href)
+            href = href  # [:-1]
+        subpageurls.append(cacheprefix + quote(href))
 
     headers = ["Company", "Date", "PDF url", "location", "jobs"]
     data = [headers]
@@ -85,8 +88,8 @@ def scrape(
         row.append(dates[i])
 
         row.append(url)
-        row.append(None) # location
-        row.append(None) # jobs
+        row.append(None)  # location
+        row.append(None)  # jobs
         data.append(row)
 
     output_csv = data_dir / "hi.csv"
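hi.py now routes every request through Google's web cache by prefixing the target URL, percent-encoded with quote() so it survives as the cache query value. A minimal sketch of that rewrite; via_google_cache is an illustrative name, not a function in the package:

```python
from urllib.parse import quote

# "cache%3A" is the percent-encoded "cache:" operator.
CACHE_PREFIX = "https://webcache.googleusercontent.com/search?q=cache%3A"


def via_google_cache(url: str) -> str:
    """Rewrite a URL so it is fetched through Google's web cache."""
    # quote() percent-encodes the colon and other reserved characters
    # but leaves "/" intact by default, matching the scraper's usage.
    return CACHE_PREFIX + quote(url)


print(via_google_cache("https://labor.hawaii.gov/wdc/real-time-warn-updates/"))
```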
{warn_scraper-1.2.55.dist-info → warn_scraper-1.2.57.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: warn-scraper
-Version: 1.2.55
+Version: 1.2.57
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
{warn_scraper-1.2.55.dist-info → warn_scraper-1.2.57.dist-info}/RECORD RENAMED
@@ -31,12 +31,12 @@ warn/scrapers/al.py,sha256=D0rT9GQ0vwfkRuveVAt-Po-T6b2TI1EPGeLOBy2m3_M,2240
 warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
 warn/scrapers/ca.py,sha256=ZXz6sceWpbKwSt4YA-WgD1SISN8vGJlwmlWhK5sURsk,8180
 warn/scrapers/co.py,sha256=EPWaXzDvV1bIGVNaTtaltY6enfc13Nsh4afb6MWbwfo,6930
-warn/scrapers/ct.py,sha256=-VTCWENDltBUHzWpDBaK4QWWtMMhccyxzBOrYqE9xL0,4228
-warn/scrapers/dc.py,sha256=WRmCiDTX_arr63az10c1Sj9VjtD9jY4Am4b_jGpIk1o,4021
+warn/scrapers/ct.py,sha256=yOvQy9ljtW3WJBq4MQIegkbkneXzFFos2l0qwnqcyQE,4771
+warn/scrapers/dc.py,sha256=kAWmERzEIOtGHla9tn8hK0NfP4B-aP4jknNGUiPw4C4,4493
 warn/scrapers/de.py,sha256=yMpCFAAlIO2f4DVUQpPKKLzm52_Zpn9IuPPFBNX1pjQ,1386
 warn/scrapers/fl.py,sha256=mHymxjwNGtYXRxAwjUSIG1qeSp4Y_zzr4XDxPz9LAfY,9560
 warn/scrapers/ga.py,sha256=EuqBrMlBojH6eXOHisNqJAQcsnb8FPHDwWomNopw9Ys,7285
-warn/scrapers/hi.py,sha256=ua32erw1Syc1SaAt28O0H_4p6WdUVVYwhg0EppyPWzI,3526
+warn/scrapers/hi.py,sha256=IrwgUMNPqsHExiLZ8dFM25am7KTtVRrLDjIesNtJIsk,3736
 warn/scrapers/ia.py,sha256=zOncaA9M0d6paT4pB7UU_4D_yxUgeUiGRcnpKi9DsRA,1999
 warn/scrapers/id.py,sha256=rRkE9g9ZOL7JgTdIz46kyGOeetzSla3e1Xr6gJ1v_74,5443
 warn/scrapers/il.py,sha256=sygdvsNuB_Gvu3o_HidtpSP4FLz0szKb1zEHqGxVtlI,1563
@@ -66,9 +66,9 @@ warn/scrapers/va.py,sha256=DDuR4_2Jpaxg9nVmuM6PAR8v8xz3VgxTBG5sWJgz2q0,1582
 warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
 warn/scrapers/wa.py,sha256=Il3RmJpKr7SbwUBxHxlhEFLoxy7zSiduyo8F2EddB2Y,4021
 warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
-warn_scraper-1.2.55.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-warn_scraper-1.2.55.dist-info/METADATA,sha256=8sKtv4Dpmf84Lfo2Th7AL0MbDdnFZDa6z6hAvrHUJSM,2025
-warn_scraper-1.2.55.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-warn_scraper-1.2.55.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
-warn_scraper-1.2.55.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
-warn_scraper-1.2.55.dist-info/RECORD,,
+warn_scraper-1.2.57.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+warn_scraper-1.2.57.dist-info/METADATA,sha256=Z__kggmqalQaYaWaynUtNFXqNiBW9X8u7SDh0Y3qjkI,2025
+warn_scraper-1.2.57.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+warn_scraper-1.2.57.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
+warn_scraper-1.2.57.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
+warn_scraper-1.2.57.dist-info/RECORD,,