warn-scraper 1.2.55__py3-none-any.whl → 1.2.57__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warn/scrapers/ct.py +15 -3
- warn/scrapers/dc.py +12 -3
- warn/scrapers/hi.py +9 -6
- {warn_scraper-1.2.55.dist-info → warn_scraper-1.2.57.dist-info}/METADATA +1 -1
- {warn_scraper-1.2.55.dist-info → warn_scraper-1.2.57.dist-info}/RECORD +9 -9
- {warn_scraper-1.2.55.dist-info → warn_scraper-1.2.57.dist-info}/LICENSE +0 -0
- {warn_scraper-1.2.55.dist-info → warn_scraper-1.2.57.dist-info}/WHEEL +0 -0
- {warn_scraper-1.2.55.dist-info → warn_scraper-1.2.57.dist-info}/entry_points.txt +0 -0
- {warn_scraper-1.2.55.dist-info → warn_scraper-1.2.57.dist-info}/top_level.txt +0 -0
warn/scrapers/ct.py
CHANGED
@@ -3,6 +3,7 @@ from datetime import datetime
|
|
3
3
|
from pathlib import Path
|
4
4
|
|
5
5
|
from bs4 import BeautifulSoup
|
6
|
+
import requests
|
6
7
|
|
7
8
|
from .. import utils
|
8
9
|
from ..cache import Cache
|
@@ -40,8 +41,20 @@ def scrape(
|
|
40
41
|
# We start in 2015
|
41
42
|
current_year = datetime.now().year
|
42
43
|
|
43
|
-
|
44
|
-
|
44
|
+
if cache.exists(f"ct/{current_year}.html"):
|
45
|
+
# Get the full range of years
|
46
|
+
year_range = range(2015, current_year + 1)
|
47
|
+
else:
|
48
|
+
url = f"https://www.ctdol.state.ct.us/progsupt/bussrvce/warnreports/warn{current_year}.htm"
|
49
|
+
r = requests.head(url)
|
50
|
+
if r.ok:
|
51
|
+
logger.debug(f"Found first entry for {current_year}")
|
52
|
+
year_range = range(2015, current_year + 1)
|
53
|
+
else:
|
54
|
+
logger.debug(
|
55
|
+
f"No data for {current_year} found at {url}. Dropping back a year."
|
56
|
+
)
|
57
|
+
year_range = range(2015, current_year + 0)
|
45
58
|
|
46
59
|
output_rows = []
|
47
60
|
for year in year_range:
|
@@ -100,7 +113,6 @@ def _scrape_table(table) -> list:
|
|
100
113
|
row_list = []
|
101
114
|
# loop over table to process each row, skipping the header
|
102
115
|
for table_row in table[0].find_all("tr")[1:]:
|
103
|
-
|
104
116
|
# Get all the cells
|
105
117
|
table_cells = table_row.find_all("td")
|
106
118
|
|
warn/scrapers/dc.py
CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
|
|
5
5
|
from pathlib import Path
|
6
6
|
|
7
7
|
from bs4 import BeautifulSoup
|
8
|
+
import requests
|
8
9
|
|
9
10
|
from .. import utils
|
10
11
|
from ..cache import Cache
|
@@ -38,13 +39,22 @@ def scrape(
|
|
38
39
|
# Get the root page
|
39
40
|
today = datetime.today()
|
40
41
|
current_year = today.year
|
41
|
-
|
42
|
+
targetfile = f"dc/{current_year}.html"
|
43
|
+
if not cache.exists(targetfile): # Check if we have an entry for the latest year
|
44
|
+
url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
|
45
|
+
r = requests.head(url)
|
46
|
+
if not r.ok:
|
47
|
+
logger.debug(f"Still no data found for {current_year}. Falling back.")
|
48
|
+
current_year = today.year - 1
|
49
|
+
targetfile = f"dc/{current_year}.html"
|
50
|
+
url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
|
51
|
+
|
42
52
|
r = utils.get_url(url)
|
43
53
|
r.encoding = "utf-8"
|
44
54
|
root_html = r.text
|
45
55
|
|
46
56
|
# Save it to the cache
|
47
|
-
cache.write(
|
57
|
+
cache.write(targetfile, root_html)
|
48
58
|
|
49
59
|
# Parse the list of links
|
50
60
|
soup = BeautifulSoup(root_html, "html5lib")
|
@@ -70,7 +80,6 @@ def scrape(
|
|
70
80
|
root_html,
|
71
81
|
]
|
72
82
|
for href in link_lookup.values():
|
73
|
-
|
74
83
|
# Request the HTML
|
75
84
|
r = utils.get_url(href)
|
76
85
|
r.encoding = "utf-8"
|
warn/scrapers/hi.py
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
import datetime
|
2
2
|
import logging
|
3
3
|
from pathlib import Path
|
4
|
+
from urllib.parse import quote
|
4
5
|
|
5
6
|
from bs4 import BeautifulSoup
|
6
7
|
|
7
8
|
from .. import utils
|
8
9
|
|
9
10
|
__authors__ = ["Ash1R", "stucka"]
|
10
|
-
__tags__ = ["html"]
|
11
|
+
__tags__ = ["html", "pdf"]
|
11
12
|
__source__ = {
|
12
13
|
"name": "Workforce Development Hawaii",
|
13
14
|
"url": "https://labor.hawaii.gov/wdc/real-time-warn-updates/",
|
@@ -28,15 +29,17 @@ def scrape(
|
|
28
29
|
cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
|
29
30
|
Returns: the Path where the file is written
|
30
31
|
"""
|
31
|
-
|
32
|
+
cacheprefix = "https://webcache.googleusercontent.com/search?q=cache%3A" # Use Google Cache, per #600
|
33
|
+
|
34
|
+
firstpage = utils.get_url(cacheprefix + quote("https://labor.hawaii.gov/wdc/real-time-warn-updates/"))
|
32
35
|
soup = BeautifulSoup(firstpage.text, features="html5lib")
|
33
36
|
pagesection = soup.select("div.primary-content")[0]
|
34
37
|
subpageurls = []
|
35
38
|
for atag in pagesection.find_all("a"):
|
36
39
|
href = atag["href"]
|
37
40
|
if href.endswith("/"):
|
38
|
-
href = href[:-1]
|
39
|
-
subpageurls.append(href)
|
41
|
+
href = href # [:-1]
|
42
|
+
subpageurls.append(cacheprefix + quote(href))
|
40
43
|
|
41
44
|
headers = ["Company", "Date", "PDF url", "location", "jobs"]
|
42
45
|
data = [headers]
|
@@ -85,8 +88,8 @@ def scrape(
|
|
85
88
|
row.append(dates[i])
|
86
89
|
|
87
90
|
row.append(url)
|
88
|
-
row.append(None)
|
89
|
-
row.append(None)
|
91
|
+
row.append(None) # location
|
92
|
+
row.append(None) # jobs
|
90
93
|
data.append(row)
|
91
94
|
|
92
95
|
output_csv = data_dir / "hi.csv"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: warn-scraper
|
3
|
-
Version: 1.2.55
|
3
|
+
Version: 1.2.57
|
4
4
|
Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
|
5
5
|
Home-page: https://github.com/biglocalnews/warn-scraper
|
6
6
|
Author: Big Local News
|
@@ -31,12 +31,12 @@ warn/scrapers/al.py,sha256=D0rT9GQ0vwfkRuveVAt-Po-T6b2TI1EPGeLOBy2m3_M,2240
|
|
31
31
|
warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
|
32
32
|
warn/scrapers/ca.py,sha256=ZXz6sceWpbKwSt4YA-WgD1SISN8vGJlwmlWhK5sURsk,8180
|
33
33
|
warn/scrapers/co.py,sha256=EPWaXzDvV1bIGVNaTtaltY6enfc13Nsh4afb6MWbwfo,6930
|
34
|
-
warn/scrapers/ct.py,sha256
|
35
|
-
warn/scrapers/dc.py,sha256=
|
34
|
+
warn/scrapers/ct.py,sha256=yOvQy9ljtW3WJBq4MQIegkbkneXzFFos2l0qwnqcyQE,4771
|
35
|
+
warn/scrapers/dc.py,sha256=kAWmERzEIOtGHla9tn8hK0NfP4B-aP4jknNGUiPw4C4,4493
|
36
36
|
warn/scrapers/de.py,sha256=yMpCFAAlIO2f4DVUQpPKKLzm52_Zpn9IuPPFBNX1pjQ,1386
|
37
37
|
warn/scrapers/fl.py,sha256=mHymxjwNGtYXRxAwjUSIG1qeSp4Y_zzr4XDxPz9LAfY,9560
|
38
38
|
warn/scrapers/ga.py,sha256=EuqBrMlBojH6eXOHisNqJAQcsnb8FPHDwWomNopw9Ys,7285
|
39
|
-
warn/scrapers/hi.py,sha256=
|
39
|
+
warn/scrapers/hi.py,sha256=IrwgUMNPqsHExiLZ8dFM25am7KTtVRrLDjIesNtJIsk,3736
|
40
40
|
warn/scrapers/ia.py,sha256=zOncaA9M0d6paT4pB7UU_4D_yxUgeUiGRcnpKi9DsRA,1999
|
41
41
|
warn/scrapers/id.py,sha256=rRkE9g9ZOL7JgTdIz46kyGOeetzSla3e1Xr6gJ1v_74,5443
|
42
42
|
warn/scrapers/il.py,sha256=sygdvsNuB_Gvu3o_HidtpSP4FLz0szKb1zEHqGxVtlI,1563
|
@@ -66,9 +66,9 @@ warn/scrapers/va.py,sha256=DDuR4_2Jpaxg9nVmuM6PAR8v8xz3VgxTBG5sWJgz2q0,1582
|
|
66
66
|
warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
|
67
67
|
warn/scrapers/wa.py,sha256=Il3RmJpKr7SbwUBxHxlhEFLoxy7zSiduyo8F2EddB2Y,4021
|
68
68
|
warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
|
69
|
-
warn_scraper-1.2.55.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
70
|
-
warn_scraper-1.2.
|
71
|
-
warn_scraper-1.2.55.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
72
|
-
warn_scraper-1.2.55.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
|
73
|
-
warn_scraper-1.2.55.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
|
74
|
-
warn_scraper-1.2.55.dist-info/RECORD,,
|
69
|
+
warn_scraper-1.2.57.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
70
|
+
warn_scraper-1.2.57.dist-info/METADATA,sha256=Z__kggmqalQaYaWaynUtNFXqNiBW9X8u7SDh0Y3qjkI,2025
|
71
|
+
warn_scraper-1.2.57.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
72
|
+
warn_scraper-1.2.57.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
|
73
|
+
warn_scraper-1.2.57.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
|
74
|
+
warn_scraper-1.2.57.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|