warn-scraper 1.2.55-py3-none-any.whl → 1.2.57-py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
warn/scrapers/ct.py CHANGED
@@ -3,6 +3,7 @@ from datetime import datetime
 from pathlib import Path
 
 from bs4 import BeautifulSoup
+import requests
 
 from .. import utils
 from ..cache import Cache
@@ -40,8 +41,20 @@ def scrape(
     # We start in 2015
     current_year = datetime.now().year
 
-    # Get the full range of years
-    year_range = range(2015, current_year + 1)
+    if cache.exists(f"ct/{current_year}.html"):
+        # Get the full range of years
+        year_range = range(2015, current_year + 1)
+    else:
+        url = f"https://www.ctdol.state.ct.us/progsupt/bussrvce/warnreports/warn{current_year}.htm"
+        r = requests.head(url)
+        if r.ok:
+            logger.debug(f"Found first entry for {current_year}")
+            year_range = range(2015, current_year + 1)
+        else:
+            logger.debug(
+                f"No data for {current_year} found at {url}. Dropping back a year."
+            )
+            year_range = range(2015, current_year + 0)
 
     output_rows = []
     for year in year_range:
@@ -100,7 +113,6 @@ def _scrape_table(table) -> list:
     row_list = []
     # loop over table to process each row, skipping the header
     for table_row in table[0].find_all("tr")[1:]:
-
         # Get all the cells
         table_cells = table_row.find_all("td")
 
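The new ct.py logic only extends the scrape range to the current year once that year's report page is known to exist, probing with a cheap HEAD request when nothing is cached yet. A minimal sketch of the same pattern follows; the helper name and the url_template parameter are illustrative, not part of the package:

```python
import requests


def probe_year_range(current_year: int, url_template: str) -> range:
    """Pick the range of years to scrape, probing the newest page first."""
    # A HEAD request fetches only headers, so we learn whether the
    # current year's page exists without downloading its body.
    if requests.head(url_template.format(year=current_year)).ok:
        return range(2015, current_year + 1)
    # No page published yet for the current year; stop at the prior one.
    return range(2015, current_year)
```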
warn/scrapers/dc.py CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
 from pathlib import Path
 
 from bs4 import BeautifulSoup
+import requests
 
 from .. import utils
 from ..cache import Cache
@@ -38,13 +39,22 @@ def scrape(
     # Get the root page
     today = datetime.today()
     current_year = today.year
-    url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
+    targetfile = f"dc/{current_year}.html"
+    if not cache.exists(targetfile):  # Check if we have an entry for the latest year
+        url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
+        r = requests.head(url)
+        if not r.ok:
+            logger.debug(f"Still no data found for {current_year}. Falling back.")
+            current_year = today.year - 1
+            targetfile = f"dc/{current_year}.html"
+    url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
+
     r = utils.get_url(url)
     r.encoding = "utf-8"
     root_html = r.text
 
     # Save it to the cache
-    cache.write(f"dc/{current_year}.html", root_html)
+    cache.write(targetfile, root_html)
 
     # Parse the list of links
     soup = BeautifulSoup(root_html, "html5lib")
@@ -70,7 +80,6 @@ def scrape(
         root_html,
     ]
     for href in link_lookup.values():
-
         # Request the HTML
         r = utils.get_url(href)
         r.encoding = "utf-8"
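dc.py applies the same probe but keys everything off a cache path, so a year that was already fetched never triggers a network check, and a missing live page drops the target back one year. A sketch under those assumptions; pick_year is a hypothetical helper, and cache is assumed to expose the package's Cache.exists interface:

```python
import requests

URL_TEMPLATE = (
    "https://does.dc.gov/page/industry-closings-and-layoffs-"
    "warn-notifications-{year}"
)


def pick_year(cache, year: int) -> int:
    """Return the newest year that is cached locally or live on the site."""
    if cache.exists(f"dc/{year}.html"):
        return year  # Already fetched once; trust the cache.
    if requests.head(URL_TEMPLATE.format(year=year)).ok:
        return year  # The page exists, so the current year is scrapeable.
    return year - 1  # Nothing published yet; fall back one year.
```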
warn/scrapers/hi.py CHANGED
@@ -1,13 +1,14 @@
 import datetime
 import logging
 from pathlib import Path
+from urllib.parse import quote
 
 from bs4 import BeautifulSoup
 
 from .. import utils
 
 __authors__ = ["Ash1R", "stucka"]
-__tags__ = ["html"]
+__tags__ = ["html", "pdf"]
 __source__ = {
     "name": "Workforce Development Hawaii",
     "url": "https://labor.hawaii.gov/wdc/real-time-warn-updates/",
@@ -28,15 +29,17 @@ def scrape(
     cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
     Returns: the Path where the file is written
     """
-    firstpage = utils.get_url("https://labor.hawaii.gov/wdc/real-time-warn-updates/")
+    cacheprefix = "https://webcache.googleusercontent.com/search?q=cache%3A"  # Use Google Cache, per #600
+
+    firstpage = utils.get_url(cacheprefix + quote("https://labor.hawaii.gov/wdc/real-time-warn-updates/"))
     soup = BeautifulSoup(firstpage.text, features="html5lib")
     pagesection = soup.select("div.primary-content")[0]
     subpageurls = []
     for atag in pagesection.find_all("a"):
         href = atag["href"]
         if href.endswith("/"):
-            href = href[:-1]
-        subpageurls.append(href)
+            href = href  # [:-1]
+        subpageurls.append(cacheprefix + quote(href))
 
     headers = ["Company", "Date", "PDF url", "location", "jobs"]
     data = [headers]
@@ -85,8 +88,8 @@ def scrape(
         row.append(dates[i])
 
         row.append(url)
-        row.append(None) # location
-        row.append(None) # jobs
+        row.append(None)  # location
+        row.append(None)  # jobs
         data.append(row)
 
     output_csv = data_dir / "hi.csv"
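hi.py now routes every request through Google's web cache by prefixing the target URL, percent-encoded with quote() so it survives as the cache query value. A minimal sketch of that rewrite; via_google_cache is an illustrative name, not a function in the package:

```python
from urllib.parse import quote

# "cache%3A" is the percent-encoded "cache:" operator.
CACHE_PREFIX = "https://webcache.googleusercontent.com/search?q=cache%3A"


def via_google_cache(url: str) -> str:
    """Rewrite a URL so it is fetched through Google's web cache."""
    # quote() percent-encodes the colon and other reserved characters
    # but leaves "/" intact by default, matching the scraper's usage.
    return CACHE_PREFIX + quote(url)


print(via_google_cache("https://labor.hawaii.gov/wdc/real-time-warn-updates/"))
```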
{warn_scraper-1.2.55.dist-info → warn_scraper-1.2.57.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: warn-scraper
-Version: 1.2.55
+Version: 1.2.57
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
{warn_scraper-1.2.55.dist-info → warn_scraper-1.2.57.dist-info}/RECORD RENAMED
@@ -31,12 +31,12 @@ warn/scrapers/al.py,sha256=D0rT9GQ0vwfkRuveVAt-Po-T6b2TI1EPGeLOBy2m3_M,2240
 warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
 warn/scrapers/ca.py,sha256=ZXz6sceWpbKwSt4YA-WgD1SISN8vGJlwmlWhK5sURsk,8180
 warn/scrapers/co.py,sha256=EPWaXzDvV1bIGVNaTtaltY6enfc13Nsh4afb6MWbwfo,6930
-warn/scrapers/ct.py,sha256=-VTCWENDltBUHzWpDBaK4QWWtMMhccyxzBOrYqE9xL0,4228
-warn/scrapers/dc.py,sha256=WRmCiDTX_arr63az10c1Sj9VjtD9jY4Am4b_jGpIk1o,4021
+warn/scrapers/ct.py,sha256=yOvQy9ljtW3WJBq4MQIegkbkneXzFFos2l0qwnqcyQE,4771
+warn/scrapers/dc.py,sha256=kAWmERzEIOtGHla9tn8hK0NfP4B-aP4jknNGUiPw4C4,4493
 warn/scrapers/de.py,sha256=yMpCFAAlIO2f4DVUQpPKKLzm52_Zpn9IuPPFBNX1pjQ,1386
 warn/scrapers/fl.py,sha256=mHymxjwNGtYXRxAwjUSIG1qeSp4Y_zzr4XDxPz9LAfY,9560
 warn/scrapers/ga.py,sha256=EuqBrMlBojH6eXOHisNqJAQcsnb8FPHDwWomNopw9Ys,7285
-warn/scrapers/hi.py,sha256=ua32erw1Syc1SaAt28O0H_4p6WdUVVYwhg0EppyPWzI,3526
+warn/scrapers/hi.py,sha256=IrwgUMNPqsHExiLZ8dFM25am7KTtVRrLDjIesNtJIsk,3736
 warn/scrapers/ia.py,sha256=zOncaA9M0d6paT4pB7UU_4D_yxUgeUiGRcnpKi9DsRA,1999
 warn/scrapers/id.py,sha256=rRkE9g9ZOL7JgTdIz46kyGOeetzSla3e1Xr6gJ1v_74,5443
 warn/scrapers/il.py,sha256=sygdvsNuB_Gvu3o_HidtpSP4FLz0szKb1zEHqGxVtlI,1563
@@ -66,9 +66,9 @@ warn/scrapers/va.py,sha256=DDuR4_2Jpaxg9nVmuM6PAR8v8xz3VgxTBG5sWJgz2q0,1582
 warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
 warn/scrapers/wa.py,sha256=Il3RmJpKr7SbwUBxHxlhEFLoxy7zSiduyo8F2EddB2Y,4021
 warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
-warn_scraper-1.2.55.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-warn_scraper-1.2.55.dist-info/METADATA,sha256=8sKtv4Dpmf84Lfo2Th7AL0MbDdnFZDa6z6hAvrHUJSM,2025
-warn_scraper-1.2.55.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-warn_scraper-1.2.55.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
-warn_scraper-1.2.55.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
-warn_scraper-1.2.55.dist-info/RECORD,,
+warn_scraper-1.2.57.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+warn_scraper-1.2.57.dist-info/METADATA,sha256=Z__kggmqalQaYaWaynUtNFXqNiBW9X8u7SDh0Y3qjkI,2025
+warn_scraper-1.2.57.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+warn_scraper-1.2.57.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
+warn_scraper-1.2.57.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
+warn_scraper-1.2.57.dist-info/RECORD,,