warn-scraper 1.2.107-py3-none-any.whl → 1.2.109-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
warn/scrapers/co.py CHANGED
@@ -6,7 +6,7 @@ from bs4 import BeautifulSoup, Tag
  from .. import utils
  from ..cache import Cache
 
- __authors__ = ["anikasikka"]
+ __authors__ = ["anikasikka", "stucka"]
  __tags__ = ["html"]
  __source__ = {
  "name": "Colorado Department of Labor and Employment",
@@ -53,13 +53,29 @@ def scrape(
  else:
  raise ValueError("Could not find Google Sheet link")
 
+ # The scraper had been working off a partially loaded impression of the HTML in the DOM.
+ # That partial render is not reliable.
+ # So instead of trying to get a partially complete version and parse the HTML there,
+ # let's try to get the actual HTML export of the page.
+ # 2016 has a different filename schema we need to account for.
+
+ if "/edit" in current_href:
+ better_link = current_href.split("/edit")[0] + "/gviz/tq?tqx=out:html" # type: ignore
+ elif "drive.google.com/open?id=" in current_href: # Work from the ID
+ better_link = "https://docs.google.com/spreadsheets/d/"
+ better_link += current_href.split("open?id=")[-1] # type: ignore
+ better_link += "/gviz/tq?tqx=out:html"
+ else:
+ raise ValueError(f"Could not adapt {current_href} to find HTML export.")
+
  # Open the Google Sheet
- current_page = utils.get_url(current_href)
+ current_page = utils.get_url(better_link)
  current_html = current_page.text
 
  # Parse the Google Sheet
  soup_current = BeautifulSoup(current_html, "html5lib")
- table = soup_current.find(class_="waffle")
+ # table = soup_current.find(class_="waffle")
+ table = soup_current.find("table")
  cleaned_data = scrape_google_sheets(table)
 
  # Goes through the accordion links to get past data
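Both co.py hunks apply the same URL rewrite: a Google Sheets `/edit` link, or a `drive.google.com/open?id=` link, is converted to the sheet's gviz HTML export, which serves the full table as static HTML with no JavaScript rendering required. As a minimal sketch, the shared transformation could be factored into a helper (the name `to_html_export_url` is hypothetical, not part of the package):

```python
def to_html_export_url(href: str) -> str:
    """Rewrite a Google Sheets link to its static gviz HTML export URL."""
    if "/edit" in href:
        # e.g. https://docs.google.com/spreadsheets/d/<sheet-id>/edit#gid=0
        return href.split("/edit")[0] + "/gviz/tq?tqx=out:html"
    if "drive.google.com/open?id=" in href:
        # e.g. https://drive.google.com/open?id=<sheet-id>
        sheet_id = href.split("open?id=")[-1]
        return f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:html"
    raise ValueError(f"Could not adapt {href} to find HTML export.")
```

Because the export renders the sheet as a plain `<table>`, the hunks below can also swap `soup.find(class_="waffle")` for `soup.find("table")`.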
@@ -78,9 +94,26 @@ def scrape(
  link_list = [a for a in accordion.find_all("a") if "feedback" not in a.text]
  logger.debug(f"Requesting {len(link_list)} discovered links")
  for link in link_list:
- page = utils.get_url(link["href"])
+ bad_url = link["href"]
+ # The scraper had been working off a partially loaded impression of the HTML in the DOM.
+ # That partial render is not reliable.
+ # So instead of trying to get a partially complete version and parse the HTML there,
+ # let's try to get the actual HTML export of the page.
+ # 2016 has a different filename schema we need to account for.
+
+ if "/edit" in bad_url:
+ better_link = bad_url.split("/edit")[0] + "/gviz/tq?tqx=out:html"
+ elif "drive.google.com/open?id=" in bad_url:
+ better_link = "https://docs.google.com/spreadsheets/d/"
+ better_link += bad_url.split("open?id=")[-1] # Get just the Id
+ better_link += "/gviz/tq?tqx=out:html"
+ else:
+ raise ValueError(f"Could not adapt {bad_url} to find HTML export.")
+
+ page = utils.get_url(better_link)
+
  soup = BeautifulSoup(page.text, "html5lib")
- table = soup.find(class_="waffle")
+ table = soup.find("table")
  if "2017" in link.text:
  header_list = [
  "Company",
@@ -105,9 +138,9 @@ def scrape(
 
  # Clean up the headers
  header_crosswalk = {
- "Name": "company",
  "Company Name": "company",
  "Company": "company",
+ "Name": "company",
  "WARN Date": "notice_date",
  "Total Layoffs": "jobs",
  "NAICS": "naics",
@@ -151,17 +184,108 @@ def scrape(
  "@dropdown": "dropdown",
  "Received": "received_date",
  "Notes": "notes",
+ # Only add new matches above here, not below here.
  }
+
+ header_garbage = {
+ # And then it got ugly with some columns getting unhidden.
+ "Timestamp": "timestamp",
+ "Email Address": "email_address",
+ "Is this a NEW WARN or a REVISION?": "is_this_a_new_warn_or_a_revision",
+ "Total number of employees with reduced hours": "total_number_of_employees_with_reduced_hours",
+ "Include the total number of employees on or expected to be on a Workshare plan.": "include_the_total_number_of_employees_on_or_expected_to_be_on_a_workshare_plan",
+ "Expected date of second job losses at location 1": "expected_date_of_second_job_losses_at_location_1",
+ "Expected end date of second job losses at location 1": "expected_end_date_of_second_job_losses_at_location_1",
+ "Expected date of third job losses at location 1": "expected_date_of_third_job_losses_at_location_1",
+ "Expected end date of third job losses at location 1": "expected_end_date_of_third_job_losses_at_location_1",
+ "Do the employees have bumping rights?": "do_the_employees_have_bumping_rights",
+ "Are the employees represented by a union?": "are_the_employees_represented_by_a_union",
+ "If you selected Rural Consortium for the workforce area, please choose a subarea using the map.": "if_you_selected_rural_consortium_for_the_workforce_area_please_choose_a_subarea_using_the_map",
+ "Name of union(s)": "name_of_unions",
+ "Contact phone number for union representative(s)": "contact_phone_number_for_union_representatives",
+ "Email address for union representative(s)": "email_address_for_union_representatives",
+ "Address, City, ZIP for Union 1": "address_city_zip_for_union_1",
+ "Has a second location been impacted?": "has_a_second_location_been_impacted",
+ "Location 2 Address": "location_2_address",
+ "Total number of employees at location 2": "total_number_of_employees_at_location_2",
+ "Total number of permanent layoffs at location 2": "total_number_of_permanent_layoffs_at_location_2",
+ "Total number of temporary layoffs at location 2": "total_number_of_temporary_layoffs_at_location_2",
+ "Total number of furloughs at location 2": "total_number_of_furloughs_at_location_2",
+ "Total number of employees with reduced hours at location 2": "total_number_of_employees_with_reduced_hours_at_location_2",
+ "Total number of employees on workshare plan at location 2": "total_number_of_employees_on_workshare_plan_at_location_2",
+ "Occupations Impacted at location 2": "occupations_impacted_at_location_2",
+ "Expected date of first job losses at location 2": "expected_date_of_first_job_losses_at_location_2",
+ "Contact name(s) for union representative(s)": "contact_names_for_union_representatives",
+ "Expected end date of first job losses at location 2": "expected_end_date_of_first_job_losses_at_location_2",
+ "Expected date of second job losses at location 2": "expected_date_of_second_job_losses_at_location_2",
+ "Expected end date of second job losses at location 2": "expected_end_date_of_second_job_losses_at_location_2",
+ "Expected date of third job losses at location 2": "expected_date_of_third_job_losses_at_location_2",
+ "Expected end date of third job losses at location 2": "expected_end_date_of_third_job_losses_at_location_2",
+ "Reason for Layoffs at location 2": "reason_for_layoffs_at_location_2",
+ "Do employees at location 2 having bumping rights?": "do_employees_at_location_2_having_bumping_rights",
+ "Are employees at location 2 represented by a union?": "are_employees_at_location_2_represented_by_a_union",
+ "Select the workforce area for location 2": "select_the_workforce_area_for_location_2",
+ "If you selected Other/Sub-Area, please choose a location from the following dropdown menu:": "if_you_selected_othersub_area_please_choose_a_location_from_the_following_dropdown_menu",
+ "Name of Union 2": "name_of_union_2",
+ "Contact name for Union 2": "contact_name_for_union_2",
+ "Contact phone number for Union 2": "contact_phone_number_for_union_2",
+ "Email address for Union 2": "email_address_for_union_2",
+ "Address, City, ZIP for Union 2": "address_city_zip_for_union_2",
+ "Has a third location been impacted?": "has_a_third_location_been_impacted",
+ "Location 3 Address": "location_3_address",
+ "Total number of employees at location 3": "total_number_of_employees_at_location_3",
+ "Total number of permanent layoffs at location 3": "total_number_of_permanent_layoffs_at_location_3",
+ "Total number of temporary layoffs at location 3": "total_number_of_temporary_layoffs_at_location_3",
+ "Total number of furloughs at location 3": "total_number_of_furloughs_at_location_3",
+ "Total number of employees with reduced hours at location 3": "total_number_of_employees_with_reduced_hours_at_location_3",
+ "Total number of employees on workshare plan at location 3": "total_number_of_employees_on_workshare_plan_at_location_3",
+ "Occupations Impacted at location 3": "occupations_impacted_at_location_3",
+ "Expected date of first job losses at location 3": "expected_date_of_first_job_losses_at_location_3",
+ "Expected end date of first job losses at location 3": "expected_end_date_of_first_job_losses_at_location_3",
+ "Expected date of second job losses at location 3": "expected_date_of_second_job_losses_at_location_3",
+ "Expected end date of second job losses at location 3": "expected_end_date_of_second_job_losses_at_location_3",
+ "Expected date of third job losses at location 3": "expected_date_of_third_job_losses_at_location_3",
+ "Expected end date of third job losses at location 3": "expected_end_date_of_third_job_losses_at_location_3",
+ "Reason for Layoffs at location 3": "reason_for_layoffs_at_location_3",
+ "Do employees at location 3 having bumping rights?": "do_employees_at_location_3_having_bumping_rights",
+ "Are employees at location 3 represented by a union?": "are_employees_at_location_3_represented_by_a_union",
+ "Select the workforce area for location 3": "select_the_workforce_area_for_location_3",
+ "Name of Union 3": "name_of_union_3",
+ "Contact name for Union 3": "contact_name_for_union_3",
+ "Contact phone number for Union 3": "contact_phone_number_for_union_3",
+ "Email address for Union 3": "email_address_for_union_3",
+ "Address, City, ZIP for Union 3": "address_city_zip_for_union_3",
+ "Include here any comments or additional details": "include_here_any_comments_or_additional_details",
+ # This is for garbage, not legit crosswalk. You probably do not want to add here.
+ }
+
  standardized_data = []
  for row in cleaned_data:
  row_dict = {}
+ mangled = []
+ for key in row:
+ if (
+ key not in header_crosswalk and key not in header_garbage
+ ): # Get all missing keys at once
+ mangled.append(key)
+ if len(mangled) > 0:
+ logger.warning(f"Missing a bunch of keys: {'|'.join(mangled)}")
+
  for key, value in row.items():
- standardized_key = header_crosswalk[key]
- row_dict[standardized_key] = value
+ if (
+ key not in header_crosswalk and key not in header_garbage
+ ): # If we've never seen this key before
+ logger.warning(f"Could not find {key} in header_crosswalk")
+ logger.warning(row)
+ if key not in header_garbage: # It's in the crosswalk, so it's legit
+ standardized_key = header_crosswalk[key]
+ row_dict[standardized_key] = value
  if len(row_dict["company"]) < 3 and row_dict["letter"] == "Avis Budget Group":
  row_dict["company"] = "Avis Budget Group"
  if len(row_dict["company"]) < 3: # or len(row_dict['naics']) <5:
  logger.debug(f"Dropping row of questionable quality: {row_dict}")
+ elif "begin_date" in row_dict and row_dict["begin_date"] == "Layoff Date(s)":
+ logger.debug(f"Dropping row of questionable quality: {row_dict}")
  else:
  standardized_data.append(row_dict)
 
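Pulled out of the diff, the new row-normalization loop behaves like this (a minimal runnable sketch; the two dictionaries are abbreviated stand-ins for the full crosswalks above):

```python
import logging

logger = logging.getLogger(__name__)

# Abbreviated stand-ins for the real dictionaries
header_crosswalk = {"Company Name": "company", "WARN Date": "notice_date"}
header_garbage = {"Timestamp": "timestamp", "Email Address": "email_address"}


def standardize_row(row: dict) -> dict:
    """Map raw sheet headers to standard names; drop known-garbage columns."""
    row_dict = {}
    for key, value in row.items():
        if key not in header_crosswalk and key not in header_garbage:
            # Unknown header: warn loudly so it can be triaged into one dict or the other
            logger.warning(f"Could not find {key} in header_crosswalk")
        if key not in header_garbage:  # It's in the crosswalk, so it's legit
            row_dict[header_crosswalk[key]] = value
    return row_dict


print(standardize_row({"Company Name": "Acme", "Timestamp": "2024-01-01"}))
# {'company': 'Acme'}
```

As in the scraper itself, a header missing from both dictionaries still raises `KeyError` right after the warning, so new columns fail loudly rather than being silently dropped.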
@@ -188,23 +312,26 @@ def scrape_google_sheets(table, header_list=None):
 
  Returns: The parsed data as a list of dictionaries
  """
+ # logger.debug(table)
  # If a header list isn't provided, pull one out automatically
  if not header_list:
  # Pull out the header row
- header_soup = table.find_all("tr")[1]
-
+ # header_soup = table.find_all("tr")[1]
+ header_soup = table.find_all("tr")[0]
  # Parse the header row into a list,
  # preserving its order in the sheet
  header_list = []
- for cell in header_soup.find_all("td"):
+ for cellindex, cell in enumerate(header_soup.find_all("td")):
  cell_text = cell.text.strip()
  # Skip empty headers
  if cell_text:
  header_list.append(cell_text)
+ if not cell_text and cellindex == 0:
+ header_list.append("Company Name")
 
  # Loop through all the data rows, which start
  # after the header and the little bar
- tr_list = table.find_all("tr")[3:]
+ tr_list = table.find_all("tr")[1:]
  logger.debug(f"Parsing {len(tr_list)} rows")
  row_list = []
  for row in tr_list:
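The gviz HTML export carries none of the "waffle" grid chrome: headers sit in the first `<tr>`, data starts at the second, and the first header cell can come back empty (backfilled to "Company Name" above). A runnable sketch of the new parsing behavior, using a made-up table for illustration:

```python
from bs4 import BeautifulSoup

# Illustrative stand-in for a gviz HTML export
html = """
<table>
  <tr><td></td><td>WARN Date</td></tr>
  <tr><td>Acme Corp</td><td>2024-01-01</td></tr>
</table>
"""
table = BeautifulSoup(html, "html5lib").find("table")

# Headers now come from row 0, with a blank first cell backfilled
header_list = []
for cellindex, cell in enumerate(table.find_all("tr")[0].find_all("td")):
    cell_text = cell.text.strip()
    if cell_text:
        header_list.append(cell_text)
    if not cell_text and cellindex == 0:
        header_list.append("Company Name")

# Data rows now start at index 1 instead of 3
row_list = [
    dict(zip(header_list, (td.text.strip() for td in tr.find_all("td"))))
    for tr in table.find_all("tr")[1:]
]
print(row_list)  # [{'Company Name': 'Acme Corp', 'WARN Date': '2024-01-01'}]
```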
warn/scrapers/dc.py CHANGED
@@ -48,7 +48,9 @@ def scrape(
  url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year - 1}"
  success, content = utils.save_if_good_url(targetfile, url)
 
- root_html = cache.read(targetfile) # Explicitly re-read as text for regex to work
+ root_html = cache.read(
+ "/".join(str(targetfile).split("/")[-2:])
+ ) # Explicitly re-read as text for regex to work
 
  # A June 2025 entry includes a weird table inside a table cell.
  # This is an ugly patch.
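The dc.py fix hands `cache.read()` a key relative to the cache root rather than a full filesystem path, by keeping only the last two path segments. With a hypothetical path for illustration:

```python
# Hypothetical cache location; cache.read() appears to expect a key
# like "dc/2024.html" relative to the cache root, not an absolute path.
targetfile = "/home/user/.warn-scraper/cache/dc/2024.html"

cache_key = "/".join(str(targetfile).split("/")[-2:])
print(cache_key)  # dc/2024.html
```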
warn_scraper-1.2.109.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: warn-scraper
- Version: 1.2.107
+ Version: 1.2.109
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
  Home-page: https://github.com/biglocalnews/warn-scraper
  Author: Big Local News
warn_scraper-1.2.109.dist-info/RECORD CHANGED
@@ -29,9 +29,9 @@ warn/scrapers/ak.py,sha256=h7BYMTV0whwWAPhbzVDVKMMoVCFphKly70aiTHabPq4,1847
  warn/scrapers/al.py,sha256=D0rT9GQ0vwfkRuveVAt-Po-T6b2TI1EPGeLOBy2m3_M,2240
  warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
  warn/scrapers/ca.py,sha256=VQOfjHXPCc-jYwh-EPGVVfnzvXB7pdmCt2uJ6QnMPRM,8600
- warn/scrapers/co.py,sha256=hUfqrzlhXQBkP4vxewVRrMZrgInoLer5S2MZlyYIQE4,7878
+ warn/scrapers/co.py,sha256=YZP7BHluM6tmwdQ45ZpYzLvq1S7jHGBH-4ZXlSKDP5M,17509
  warn/scrapers/ct.py,sha256=PKeZtlB0-z2wCmYmGl_WYoVo2gzwKV36upZcJVaJxjM,4852
- warn/scrapers/dc.py,sha256=C0JwgGX7A4JMxlahTrfzbFKpmyPxF7y6wsnUf-sE3OU,5120
+ warn/scrapers/dc.py,sha256=p1_c7O2R3O-41DmvcLVUIRhQKUewvZZKkzWkBxytN5M,5165
  warn/scrapers/de.py,sha256=GyM92A-lFwZAfRxgbO-sIWhRfmBEKirzchaPIv-u0o4,1364
  warn/scrapers/fl.py,sha256=YJ6Qt-jJZ7_iUKlHDaQuaV2gRmae8AJKS5dwwChadBE,9563
  warn/scrapers/ga.py,sha256=o_OF4zPQ3vJM8USQPD7l_ThyRWAzUZkwrwLHCvWmHMI,7429
@@ -65,9 +65,9 @@ warn/scrapers/va.py,sha256=hOPuiAjnTmtXCOdnBM_jAJuz9_u6oCxtbm2F-9m3ot0,10732
  warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
  warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
  warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
- warn_scraper-1.2.107.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
- warn_scraper-1.2.107.dist-info/METADATA,sha256=NYe0Bakge_0cvILQQr7jVI7xnpeHzZ2dDRYhJAMOK4Q,2385
- warn_scraper-1.2.107.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
- warn_scraper-1.2.107.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
- warn_scraper-1.2.107.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
- warn_scraper-1.2.107.dist-info/RECORD,,
+ warn_scraper-1.2.109.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
+ warn_scraper-1.2.109.dist-info/METADATA,sha256=mFd0sQWXifEMeyUPZvh1xacXtHWbXMFuSFoDPNvfQVs,2385
+ warn_scraper-1.2.109.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+ warn_scraper-1.2.109.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
+ warn_scraper-1.2.109.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
+ warn_scraper-1.2.109.dist-info/RECORD,,