warn-scraper 1.2.108-py3-none-any.whl → 1.2.110-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to its public registry. It is provided for informational purposes only.
- warn/scrapers/co.py +139 -12
- warn/scrapers/va.py +14 -4
- {warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/METADATA +1 -1
- {warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/RECORD +8 -8
- {warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/WHEEL +0 -0
- {warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/entry_points.txt +0 -0
- {warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/licenses/LICENSE +0 -0
- {warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/top_level.txt +0 -0
warn/scrapers/co.py
CHANGED
@@ -6,7 +6,7 @@ from bs4 import BeautifulSoup, Tag
 from .. import utils
 from ..cache import Cache
 
-__authors__ = ["anikasikka"]
+__authors__ = ["anikasikka", "stucka"]
 __tags__ = ["html"]
 __source__ = {
     "name": "Colorado Department of Labor and Employment",
@@ -53,13 +53,29 @@ def scrape(
     else:
         raise ValueError("Could not find Google Sheet link")
 
+    # Scraper had been working off partially loaded impression of the HTML in the dOM.
+    # This keyboard is not helping.
+    # Anyway, instead of trying to get a partially complete version and parse the HTML there,
+    # let's try to get the actual HTML export of the page.
+    # 2016 has a different filename schema we need to account for.
+
+    if "/edit" in current_href:
+        better_link = current_href.split("/edit")[0] + "/gviz/tq?tqx=out:html"  # type: ignore
+    elif "drive.google.com/open?id=" in current_href:  # Work from the ID
+        better_link = "https://docs.google.com/spreadsheets/d/"
+        better_link += current_href.split("open?id=")[-1]  # type: ignore
+        better_link += "/gviz/tq?tqx=out:html"
+    else:
+        raise ValueError(f"Could not adapt {current_href} to find HTML export.")
+
     # Open the Google Sheet
-    current_page = utils.get_url(
+    current_page = utils.get_url(better_link)
     current_html = current_page.text
 
     # Parse the Google Sheet
     soup_current = BeautifulSoup(current_html, "html5lib")
-    table = soup_current.find(class_="waffle")
+    # table = soup_current.find(class_="waffle")
+    table = soup_current.find("table")
     cleaned_data = scrape_google_sheets(table)
 
     # Goes through the accordion links to get past data
@@ -78,9 +94,26 @@ def scrape(
     link_list = [a for a in accordion.find_all("a") if "feedback" not in a.text]
     logger.debug(f"Requesting {len(link_list)} discovered links")
     for link in link_list:
-
+        bad_url = link["href"]
+        # Scraper had been working off partially loaded impression of the HTML in the dOM.
+        # This keyboard is not helping.
+        # Anyway, instead of trying to get a partially complete version and parse the HTML there,
+        # let's try to get the actual HTML export of the page.
+        # 2016 has a different filename schema we need to account for.
+
+        if "/edit" in bad_url:
+            better_link = bad_url.split("/edit")[0] + "/gviz/tq?tqx=out:html"
+        elif "drive.google.com/open?id=" in bad_url:
+            better_link = "https://docs.google.com/spreadsheets/d/"
+            better_link += bad_url.split("open?id=")[-1]  # Get just the Id
+            better_link += "/gviz/tq?tqx=out:html"
+        else:
+            raise ValueError(f"Could not adapt {bad_url} to find HTML export.")
+
+        page = utils.get_url(better_link)
+
         soup = BeautifulSoup(page.text, "html5lib")
-        table = soup.find(
+        table = soup.find("table")
         if "2017" in link.text:
             header_list = [
                 "Company",
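For readability, the Google Sheets link rewrite that appears in both hunks above amounts to the standalone helper below. The function name is hypothetical; co.py performs this logic inline rather than through a shared function.

```python
def to_html_export(href: str) -> str:
    """Rewrite a Google Sheets share link into its plain-HTML export URL."""
    if "/edit" in href:
        # .../spreadsheets/d/<id>/edit#gid=0 -> .../spreadsheets/d/<id>/gviz/tq?tqx=out:html
        return href.split("/edit")[0] + "/gviz/tq?tqx=out:html"
    if "drive.google.com/open?id=" in href:
        # https://drive.google.com/open?id=<id> -> build the export URL from the ID
        sheet_id = href.split("open?id=")[-1]
        return f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:html"
    raise ValueError(f"Could not adapt {href} to find HTML export.")
```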
@@ -105,9 +138,9 @@ def scrape(
 
     # Clean up the headers
     header_crosswalk = {
-        "Name": "company",
         "Company Name": "company",
         "Company": "company",
+        "Name": "company",
         "WARN Date": "notice_date",
         "Total Layoffs": "jobs",
         "NAICS": "naics",
@@ -151,17 +184,108 @@ def scrape(
         "@dropdown": "dropdown",
         "Received": "received_date",
         "Notes": "notes",
+        # Only add new matches above here, not below here.
     }
+
+    header_garbage = {
+        # And then it got ugly with some columns getting unhidden.
+        "Timestamp": "timestamp",
+        "Email Address": "email_address",
+        "Is this a NEW WARN or a REVISION?": "is_this_a_new_warn_or_a_revision",
+        "Total number of employees with reduced hours": "total_number_of_employees_with_reduced_hours",
+        "Include the total number of employees on or expected to be on a Workshare plan.": "include_the_total_number_of_employees_on_or_expected_to_be_on_a_workshare_plan",
+        "Expected date of second job losses at location 1": "expected_date_of_second_job_losses_at_location_1",
+        "Expected end date of second job losses at location 1": "expected_end_date_of_second_job_losses_at_location_1",
+        "Expected date of third job losses at location 1": "expected_date_of_third_job_losses_at_location_1",
+        "Expected end date of third job losses at location 1": "expected_end_date_of_third_job_losses_at_location_1",
+        "Do the employees have bumping rights?": "do_the_employees_have_bumping_rights",
+        "Are the employees represented by a union?": "are_the_employees_represented_by_a_union",
+        "If you selected Rural Consortium for the workforce area, please choose a subarea using the map.": "if_you_selected_rural_consortium_for_the_workforce_area_please_choose_a_subarea_using_the_map",
+        "Name of union(s)": "name_of_unions",
+        "Contact phone number for union representative(s)": "contact_phone_number_for_union_representatives",
+        "Email address for union representative(s)": "email_address_for_union_representatives",
+        "Address, City, ZIP for Union 1": "address_city_zip_for_union_1",
+        "Has a second location been impacted?": "has_a_second_location_been_impacted",
+        "Location 2 Address": "location_2_address",
+        "Total number of employees at location 2": "total_number_of_employees_at_location_2",
+        "Total number of permanent layoffs at location 2": "total_number_of_permanent_layoffs_at_location_2",
+        "Total number of temporary layoffs at location 2": "total_number_of_temporary_layoffs_at_location_2",
+        "Total number of furloughs at location 2": "total_number_of_furloughs_at_location_2",
+        "Total number of employees with reduced hours at location 2": "total_number_of_employees_with_reduced_hours_at_location_2",
+        "Total number of employees on workshare plan at location 2": "total_number_of_employees_on_workshare_plan_at_location_2",
+        "Occupations Impacted at location 2": "occupations_impacted_at_location_2",
+        "Expected date of first job losses at location 2": "expected_date_of_first_job_losses_at_location_2",
+        "Contact name(s) for union representative(s)": "contact_names_for_union_representatives",
+        "Expected end date of first job losses at location 2": "expected_end_date_of_first_job_losses_at_location_2",
+        "Expected date of second job losses at location 2": "expected_date_of_second_job_losses_at_location_2",
+        "Expected end date of second job losses at location 2": "expected_end_date_of_second_job_losses_at_location_2",
+        "Expected date of third job losses at location 2": "expected_date_of_third_job_losses_at_location_2",
+        "Expected end date of third job losses at location 2": "expected_end_date_of_third_job_losses_at_location_2",
+        "Reason for Layoffs at location 2": "reason_for_layoffs_at_location_2",
+        "Do employees at location 2 having bumping rights?": "do_employees_at_location_2_having_bumping_rights",
+        "Are employees at location 2 represented by a union?": "are_employees_at_location_2_represented_by_a_union",
+        "Select the workforce area for location 2": "select_the_workforce_area_for_location_2",
+        "If you selected Other/Sub-Area, please choose a location from the following dropdown menu:": "if_you_selected_othersub_area_please_choose_a_location_from_the_following_dropdown_menu",
+        "Name of Union 2": "name_of_union_2",
+        "Contact name for Union 2": "contact_name_for_union_2",
+        "Contact phone number for Union 2": "contact_phone_number_for_union_2",
+        "Email address for Union 2": "email_address_for_union_2",
+        "Address, City, ZIP for Union 2": "address_city_zip_for_union_2",
+        "Has a third location been impacted?": "has_a_third_location_been_impacted",
+        "Location 3 Address": "location_3_address",
+        "Total number of employees at location 3": "total_number_of_employees_at_location_3",
+        "Total number of permanent layoffs at location 3": "total_number_of_permanent_layoffs_at_location_3",
+        "Total number of temporary layoffs at location 3": "total_number_of_temporary_layoffs_at_location_3",
+        "Total number of furloughs at location 3": "total_number_of_furloughs_at_location_3",
+        "Total number of employees with reduced hours at location 3": "total_number_of_employees_with_reduced_hours_at_location_3",
+        "Total number of employees on workshare plan at location 3": "total_number_of_employees_on_workshare_plan_at_location_3",
+        "Occupations Impacted at location 3": "occupations_impacted_at_location_3",
+        "Expected date of first job losses at location 3": "expected_date_of_first_job_losses_at_location_3",
+        "Expected end date of first job losses at location 3": "expected_end_date_of_first_job_losses_at_location_3",
+        "Expected date of second job losses at location 3": "expected_date_of_second_job_losses_at_location_3",
+        "Expected end date of second job losses at location 3": "expected_end_date_of_second_job_losses_at_location_3",
+        "Expected date of third job losses at location 3": "expected_date_of_third_job_losses_at_location_3",
+        "Expected end date of third job losses at location 3": "expected_end_date_of_third_job_losses_at_location_3",
+        "Reason for Layoffs at location 3": "reason_for_layoffs_at_location_3",
+        "Do employees at location 3 having bumping rights?": "do_employees_at_location_3_having_bumping_rights",
+        "Are employees at location 3 represented by a union?": "are_employees_at_location_3_represented_by_a_union",
+        "Select the workforce area for location 3": "select_the_workforce_area_for_location_3",
+        "Name of Union 3": "name_of_union_3",
+        "Contact name for Union 3": "contact_name_for_union_3",
+        "Contact phone number for Union 3": "contact_phone_number_for_union_3",
+        "Email address for Union 3": "email_address_for_union_3",
+        "Address, City, ZIP for Union 3": "address_city_zip_for_union_3",
+        "Include here any comments or additional details": "include_here_any_comments_or_additional_details",
+        # This is for garbage, not legit crosswalk. You probably do not want to add here.
+    }
+
     standardized_data = []
     for row in cleaned_data:
         row_dict = {}
+        mangled = []
+        for key in row:
+            if (
+                key not in header_crosswalk and key not in header_garbage
+            ):  # Get all missing keys at once
+                mangled.append(key)
+        if len(mangled) > 0:
+            logger.warning(f"Missing a bunch of keys: {'|'.join(mangled)}")
+
         for key, value in row.items():
-
-
+            if (
+                key not in header_crosswalk and key not in header_garbage
+            ):  # If we've never seen this before
+                logger.warning(f"Could not find {key} in header_crosswalk")
+                logger.warning(row)
+            if key not in header_garbage:  # if it's in the crosswalk, if it's legit
+                standardized_key = header_crosswalk[key]
+                row_dict[standardized_key] = value
         if len(row_dict["company"]) < 3 and row_dict["letter"] == "Avis Budget Group":
             row_dict["company"] = "Avis Budget Group"
         if len(row_dict["company"]) < 3:  # or len(row_dict['naics']) <5:
             logger.debug(f"Dropping row of questionable quality: {row_dict}")
+        elif "begin_date" in row_dict and row_dict["begin_date"] == "Layoff Date(s)":
+            logger.debug(f"Dropping row of questionable quality: {row_dict}")
         else:
             standardized_data.append(row_dict)
 
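As a toy illustration of what the new crosswalk/garbage split is meant to do: columns found in header_crosswalk are kept under standardized names, columns found in header_garbage are dropped, and anything else is logged as unexpected. The miniature dictionaries and sample row below are stand-ins for the full ones in co.py.

```python
header_crosswalk = {"Company Name": "company", "WARN Date": "notice_date"}
header_garbage = {"Timestamp": "timestamp"}

row = {"Company Name": "Acme Corp", "WARN Date": "2024-01-05", "Timestamp": "x"}
row_dict = {}
for key, value in row.items():
    if key in header_crosswalk:
        row_dict[header_crosswalk[key]] = value  # keep it, under the standard name
    elif key in header_garbage:
        pass  # known noise column from the unhidden form fields; drop it
    else:
        print(f"Could not find {key} in header_crosswalk")  # unexpected column

print(row_dict)  # {'company': 'Acme Corp', 'notice_date': '2024-01-05'}
```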
@@ -188,23 +312,26 @@ def scrape_google_sheets(table, header_list=None):
 
     Returns: The parsed data as a list of dictionaries
     """
+    # logger.debug(table)
     # If a header list isn't provided, pull one out automatically
     if not header_list:
         # Pull out the header row
-        header_soup = table.find_all("tr")[1]
-
+        # header_soup = table.find_all("tr")[1]
+        header_soup = table.find_all("tr")[0]
         # Parse the header row into a list,
         # preserving its order in the sheet
         header_list = []
-        for cell in header_soup.find_all("td"):
+        for cellindex, cell in enumerate(header_soup.find_all("td")):
             cell_text = cell.text.strip()
             # Skip empty headers
             if cell_text:
                 header_list.append(cell_text)
+            if not cell_text and cellindex == 0:
+                header_list.append("Company Name")
 
     # Loop through all the data rows, which start
     # after the header and the little bar
-    tr_list = table.find_all("tr")[
+    tr_list = table.find_all("tr")[1:]
     logger.debug(f"Parsing {len(tr_list)} rows")
     row_list = []
     for row in tr_list:
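A rough end-to-end sketch of what the revised scrape_google_sheets() now assumes about the gviz HTML export: the header is the first table row and data rows follow immediately. The spreadsheet URL is a placeholder, and requests stands in for the scraper's own utils.get_url().

```python
import requests
from bs4 import BeautifulSoup

# Placeholder sheet ID; the scraper derives the real URL from links on the CDLE page.
url = "https://docs.google.com/spreadsheets/d/SHEET_ID/gviz/tq?tqx=out:html"
soup = BeautifulSoup(requests.get(url).text, "html5lib")  # html5lib package required
table = soup.find("table")

rows = table.find_all("tr")
# Header is row 0 in the export, hence find_all("tr")[0] above; data is everything after it.
header = [td.text.strip() for td in rows[0].find_all("td")]
data = [
    dict(zip(header, (td.text.strip() for td in tr.find_all("td"))))
    for tr in rows[1:]
]
```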
warn/scrapers/va.py
CHANGED
@@ -189,10 +189,20 @@ def scrape(
 
     logger.debug(f"Attempting to fetch {start_page}")
     driver.get(start_page)
-    sleep((4 * random()) +
-
-
-    ).
+    sleep((4 * random()) + 8)
+    # with open("va-weird.html", "w") as outfile:
+    #     outfile.write(driver.page_source)
+    # driver.find_element(By.ID, "warn-notice-well").find_element(
+    #     By.PARTIAL_LINK_TEXT, "Download"
+    # ).click()
+
+    # driver.find_element(By.PARTIAL_LINK_TEXT, "Download Full List of WARN notices").click()
+
+    # element = driver.find_element(By.CSS_SELECTOR, "#warn-notice-well a img")
+    element = driver.find_element(By.CSS_SELECTOR, "#warn-notice-well a")
+    logger.debug(f"Element found: {element.get_attribute('outerHTML')}")
+    driver.execute_script("arguments[0].click();", element)
+    # element.click()
 
     logger.debug(f"Attempting to fetch {csv_url}")
     # driver.get(csv_url)
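The va.py change replaces a direct .click() on the download link with a JavaScript click. A minimal standalone version of that pattern, assuming Selenium 4 with a Chrome driver and the same CSS selector, looks like this; the start URL is a placeholder.

```python
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://example.com/warn-notices")  # placeholder for the real start page

# Locate the download link inside the WARN notice well.
element = driver.find_element(By.CSS_SELECTOR, "#warn-notice-well a")

# Clicking via JavaScript sidesteps "element not interactable" or
# click-intercepted errors that a plain element.click() can raise.
driver.execute_script("arguments[0].click();", element)

driver.quit()
```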
{warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warn-scraper
-Version: 1.2.108
+Version: 1.2.110
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
{warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/RECORD
CHANGED
@@ -29,7 +29,7 @@ warn/scrapers/ak.py,sha256=h7BYMTV0whwWAPhbzVDVKMMoVCFphKly70aiTHabPq4,1847
 warn/scrapers/al.py,sha256=D0rT9GQ0vwfkRuveVAt-Po-T6b2TI1EPGeLOBy2m3_M,2240
 warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
 warn/scrapers/ca.py,sha256=VQOfjHXPCc-jYwh-EPGVVfnzvXB7pdmCt2uJ6QnMPRM,8600
-warn/scrapers/co.py,sha256=
+warn/scrapers/co.py,sha256=YZP7BHluM6tmwdQ45ZpYzLvq1S7jHGBH-4ZXlSKDP5M,17509
 warn/scrapers/ct.py,sha256=PKeZtlB0-z2wCmYmGl_WYoVo2gzwKV36upZcJVaJxjM,4852
 warn/scrapers/dc.py,sha256=p1_c7O2R3O-41DmvcLVUIRhQKUewvZZKkzWkBxytN5M,5165
 warn/scrapers/de.py,sha256=GyM92A-lFwZAfRxgbO-sIWhRfmBEKirzchaPIv-u0o4,1364
@@ -61,13 +61,13 @@ warn/scrapers/sd.py,sha256=_4R19Ybzsyx1PvcWV3_laJmJ3etrwVGfhNEQm6njwoA,1904
 warn/scrapers/tn.py,sha256=i1H7c09Ea3CDrTXqqRMLBMPT_34QtGA0-x7T8rm_j5Q,2945
 warn/scrapers/tx.py,sha256=watfR1gyN9w7nluiAOnnIghEmoq3eShNUzYSZ8SkZy4,4438
 warn/scrapers/ut.py,sha256=iUh38YIjbvv5MyyKacsiZNe8KjfdBeDaOf-qMQEF_kc,2245
-warn/scrapers/va.py,sha256=
+warn/scrapers/va.py,sha256=7Nle7qL0VNPiE653XyaP9HQqSfuJFDRr2kEkjOqLvFM,11269
 warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
 warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
 warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
+warn_scraper-1.2.110.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
+warn_scraper-1.2.110.dist-info/METADATA,sha256=oyI5DrPOS62n4q0ElEvm9R8fwzTkNOus3MLwzzQYLic,2385
+warn_scraper-1.2.110.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+warn_scraper-1.2.110.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
+warn_scraper-1.2.110.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
+warn_scraper-1.2.110.dist-info/RECORD,,
{warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/WHEEL
File without changes
{warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/entry_points.txt
File without changes
{warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/licenses/LICENSE
File without changes
{warn_scraper-1.2.108.dist-info → warn_scraper-1.2.110.dist-info}/top_level.txt
File without changes