warn-scraper 1.2.116-py3-none-any.whl → 1.2.118-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
warn/scrapers/co.py CHANGED
@@ -184,6 +184,7 @@ def scrape(
         "@dropdown": "dropdown",
         "Received": "received_date",
         "Notes": "notes",
+        "12/1/25`": "company",
         # Only add new matches above here, not below here.
     }
 
warn/scrapers/ky.py CHANGED
@@ -60,34 +60,62 @@ def scrape(
     # Open it up
     workbook = load_workbook(filename=latest_path)
 
-    dirty_list: list = []
+    crosswalk = {
+        "Closure or Layoff?": "closure_or_layoff",
+        "Company Name": "company",
+        "Company: Company Name": "company",
+        "County": "county",
+        "Date Received": "date_received",
+        "Employees": "employees",
+        "NAICS": "NAICS",
+        "NAICS Code": "NAICS",
+        "Notice Link": "notice_url",
+        "Notice Type": "source",
+        "Notice URL": "notice_url",
+        "Notice: Notice Number": "notice_number",
+        "Number of Employees Affected": "employees",
+        "Projected Date": "date_effective",
+        "Region": "region",
+        "Trade": "trade",
+        "Type of Employees Affected": "union_affected",
+        "Workforce Board": "region",
+        "address": "address",
+        "comments": "comments",
+        "congressional": "congressional",
+        "contact": "contact",
+        "industry": "industry",
+        "neg": "neg",  # Rarely seen, only in historical data, maybe all with "N"
+        "occupations": "industry",
+        "source": "source",
+        "union": "union",  # Unclear if different than types of employees affected/union_affected
+    }
+
+    masterlist: list = []
     for sheet in workbook.worksheets:
         localrows = parse_xlsx(sheet)
-        dirty_list.extend(localrows)
 
-    headers = dirty_list[0]
-    row_list = []
-    for rowindex, row in enumerate(dirty_list):
-        if (
-            row != headers
-        ):  # Filter out headers, but also double-check when headers may change
-            if row[0] == "Date Received":
-                logger.debug(
-                    f"Dropping dirty row that doesn't quite match headers in row {rowindex}"
-                )
-                logger.debug(f"Want: {headers}")
-                logger.debug(f"Got : {row}")
+        # Traverse each tab. Assume the first line is a header. Check if the second line is bogus.
+        # Build a list of dicts.
+        localheadersraw: list = localrows[0]
+        localheaders: list = []
+        for entry in localheadersraw:
+            if entry not in crosswalk:
+                logger.error(f"Potential header {entry} not found in crosswalk.")
             else:
-                line = {}
-                for i, fieldname in enumerate(headers):
+                localheaders.append(crosswalk[entry])
+        for row in localrows[1:]:  # Skip the header row
+            if row[0] != "Date Received":  # Check for fake second header
+                line: dict = {}
+                for i, fieldname in enumerate(localheaders):
                     line[fieldname] = row[i]
-                row_list.append(line)
-    # dirty_list = None
-    logger.debug(
-        f"Successfully merged {len(row_list)-1:,} records from new spreadsheet."
-    )
+                    if isinstance(row[i], str):
+                        line[fieldname] = row[i].strip()
+                masterlist.append(line)
 
-    # Need to double-check this archived file code, and make sure headers match
+    logger.debug(f"Successfully merged {len(masterlist)} records from new spreadsheet.")
+
+    # Earlier versions of this code needed the archived data to match the new data.
+    # We can no longer expect that since October 2025 data revisions.
 
     archive_url = "https://storage.googleapis.com/bln-data-public/warn-layoffs/ky-historical-normalized.csv"
 
@@ -96,24 +124,25 @@ def scrape(
 
     reader = list(csv.reader(r.text.splitlines()))
 
-    headerlength = len(headers)
-
-    assert reader[0][:headerlength] == headers
-    logger.debug(
-        f"Historical data matches current headers. Merging {len(reader)-1:,} records."
-    )
-
+    localheadersraw = reader[0]
+    localheaders: list = []  # type: ignore
+    for entry in localheadersraw:
+        if entry not in crosswalk:
+            logger.error(f"Cannot match possible header value of {entry} to crosswalk.")
+        else:
+            localheaders.append(crosswalk[entry])
     for row in reader[1:]:  # Skip header row
-        line = {}
-        for i, item in enumerate(headers):
-            line[item] = row[
-                i
-            ]  # Make this a list of dictionaries to match earlier input
-        row_list.append(line)
+        line: dict = {}  # type: ignore
+        for i, fieldname in enumerate(localheaders):
+            line[fieldname] = row[i]
+            if isinstance(row[i], str):
+                line[fieldname] = row[i].strip()
+        masterlist.append(line)
+    logger.debug("Historical records folded in.")
 
     # Write out the results
     data_path = data_dir / "ky.csv"
-    utils.write_dict_rows_to_csv(data_path, headers, row_list, extrasaction="ignore")
+    utils.write_disparate_dict_rows_to_csv(data_path, masterlist)
 
     # Pass it out
     return data_path
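
The rewritten ky.py logic replaces the old positional header check with a crosswalk that maps every raw header variant it has seen to a normalized field name, drops the occasional bogus second header row, and strips stray whitespace from string cells. Here is a minimal, self-contained sketch of that pattern; the crosswalk entries and sample rows are illustrative stand-ins, not the full mapping from the diff:

```python
# Illustrative sketch of the crosswalk-driven header normalization in ky.py.
# The crosswalk and rows below are made-up samples; the real mapping is above.
crosswalk = {
    "Company Name": "company",
    "Date Received": "date_received",
    "Number of Employees Affected": "employees",
}

rows = [
    ["Company Name", "Date Received", "Number of Employees Affected"],  # header
    ["Date Received", "", ""],  # the occasional fake second header
    ["Acme Corp ", "1/2/2025", "150"],
]

# Translate raw headers to normalized names, skipping unknowns.
headers = [crosswalk[h] for h in rows[0] if h in crosswalk]

masterlist = []
for row in rows[1:]:
    if row[0] == "Date Received":  # drop the fake second header
        continue
    line = {}
    for i, fieldname in enumerate(headers):
        value = row[i]
        line[fieldname] = value.strip() if isinstance(value, str) else value
    masterlist.append(line)

print(masterlist)
# [{'company': 'Acme Corp', 'date_received': '1/2/2025', 'employees': '150'}]
```

Because each row becomes a dict keyed by normalized field names, tabs and archives whose columns differ can be merged without positional alignment, which is what the new `utils.write_disparate_dict_rows_to_csv` call depends on below.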
warn/utils.py CHANGED
@@ -232,6 +232,41 @@ def write_dict_rows_to_csv(output_path, headers, rows, mode="w", extrasaction="r
             writer.writerow(row)
 
 
+def write_disparate_dict_rows_to_csv(output_path, rows, mode="w"):
+    """Write the provided list of dictionaries to the provided path as comma-separated values, while determining a header.
+
+    Args:
+        output_path (Path): the Path were the result will be saved
+        rows (list): the list of dictionaries to be saved; can have disparate dict keys
+        mode (str): the mode to be used when opening the file (default 'w')
+    """
+    create_directory(output_path, is_file=True)
+    headers: set = set()  # Get all the potential header names
+    for row in rows:
+        for item in row:
+            headers.add(item)
+    headers = list(sorted(headers))
+    logger.debug(f"Found {len(headers):,} header entries in list of dicts.")
+    logger.debug(f"Writing {len(rows)} rows to {output_path}")
+    with open(output_path, mode, newline="") as outfile:
+        # Create the writer object
+        writer = csv.writer(outfile)
+        # If we are writing a new row ...
+        if mode == "w":
+            # ... drop in the headers
+            writer.writerow(headers)
+        # Loop through the dicts and write them in one by one.
+        for row in rows:
+            line = {}
+            for item in headers:
+                if item in row:
+                    line[item] = row[item]
+                else:
+                    line[item] = None
+            writer.writerow(list(line.values()))
+    return
+
+
 def get_all_scrapers():
     """Get all the states and territories that have scrapers.
 
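
Unlike `write_dict_rows_to_csv`, the new helper takes no header list from the caller: it builds the header as the sorted union of keys across all rows and writes `None` (an empty cell) wherever a row lacks a key. A hypothetical usage sketch, with made-up rows and an illustrative output path:

```python
from pathlib import Path

from warn import utils

# Rows with disparate keys; the header becomes the sorted union of all keys.
rows = [
    {"company": "Acme Corp", "employees": 150},
    {"company": "Globex", "notice_url": "https://example.com/notice.pdf"},
]

utils.write_disparate_dict_rows_to_csv(Path("ky-sample.csv"), rows)

# Expected ky-sample.csv contents:
# company,employees,notice_url
# Acme Corp,150,
# Globex,,https://example.com/notice.pdf
```

A consequence of the sorted-set approach is that columns come out in alphabetical order rather than source order, which keeps the output deterministic even when input tabs disagree about column layout.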
warn_scraper-1.2.118.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warn-scraper
-Version: 1.2.116
+Version: 1.2.118
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
warn_scraper-1.2.118.dist-info/RECORD CHANGED
@@ -17,7 +17,7 @@ warn/__init__.py,sha256=A07JFY1TyaPtVIndBa7IvTk13DETqIkLgRdk0A-MCoE,85
 warn/cache.py,sha256=hyta04_G-ALGwcKl4xNc7EgHS_xklyVD5d8SXNrJekY,5520
 warn/cli.py,sha256=ZqyJwICdHFkn2hEgbArj_upbElR9-TSDlYDqyEGeexE,2019
 warn/runner.py,sha256=oeGRybGwpnkQKlPzRMlKxhsDt1GN4PZoX-vUwrsPgos,1894
-warn/utils.py,sha256=Jd1pIVtfUXxDweKa_6vHTNX13E47Ms7FHSw110unDHk,10408
+warn/utils.py,sha256=67ltJ1ZDCqLfZoFcI8kp5BaTbv28ZzOfzaDvlbnVBfM,11821
 warn/platforms/__init__.py,sha256=wIZRDf4tbTuC8oKM4ZrTAtwNgbtMQGzPXMwDYCFyrog,81
 warn/platforms/job_center/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 warn/platforms/job_center/cache.py,sha256=yhA3sE46lNFg8vEewSoRYVByi0YSlkBiKm7qoSUiTdM,1868
@@ -29,7 +29,7 @@ warn/scrapers/ak.py,sha256=h7BYMTV0whwWAPhbzVDVKMMoVCFphKly70aiTHabPq4,1847
 warn/scrapers/al.py,sha256=D0rT9GQ0vwfkRuveVAt-Po-T6b2TI1EPGeLOBy2m3_M,2240
 warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
 warn/scrapers/ca.py,sha256=VQOfjHXPCc-jYwh-EPGVVfnzvXB7pdmCt2uJ6QnMPRM,8600
-warn/scrapers/co.py,sha256=fnd_dz4esjulFm1C27VtYyXMZoqtZkVl6gOWc3wNn6E,17914
+warn/scrapers/co.py,sha256=8Y0KJwTqX1c8AB1V9fksAT0VI-39_rINU2c_Z7w66pY,17945
 warn/scrapers/ct.py,sha256=PKeZtlB0-z2wCmYmGl_WYoVo2gzwKV36upZcJVaJxjM,4852
 warn/scrapers/dc.py,sha256=p1_c7O2R3O-41DmvcLVUIRhQKUewvZZKkzWkBxytN5M,5165
 warn/scrapers/de.py,sha256=GyM92A-lFwZAfRxgbO-sIWhRfmBEKirzchaPIv-u0o4,1364
@@ -41,7 +41,7 @@ warn/scrapers/id.py,sha256=qJLcLgCgAfKzLpuwW32JqNwXn9NxZRZvQ50nZZKUhmE,6674
 warn/scrapers/il.py,sha256=sygdvsNuB_Gvu3o_HidtpSP4FLz0szKb1zEHqGxVtlI,1563
 warn/scrapers/in.py,sha256=dAT40ROhhKiwLcwa_YJ6EyhsYBLe0IX2rOWXmNa6JMs,2026
 warn/scrapers/ks.py,sha256=F_3biEMF7zgCX2XVuUACR74Vyzapta4SaM9SY3EuZCU,1266
-warn/scrapers/ky.py,sha256=XjIojMpaoKbypa7l23IybP02jBijBCJG5UGqfO-EYjg,4365
+warn/scrapers/ky.py,sha256=IDIzULH5h-UqGCvKvvipYbi5Gg3_rmue_o9SgF7QWqs,5843
 warn/scrapers/la.py,sha256=ORkMOQErl33SEiagOli4agDLdTt0R1MxxBmqOg3hNv8,13175
 warn/scrapers/md.py,sha256=hwgxXQnhyBWm8qF1dvxIThAX1MkrZbXLwRI9inO5t8g,4060
 warn/scrapers/me.py,sha256=q36F4yJ7hvZsLayA3uBS1romo4X3Qf-sEi2Y7LAQCi8,1172
@@ -65,9 +65,9 @@ warn/scrapers/va.py,sha256=7Nle7qL0VNPiE653XyaP9HQqSfuJFDRr2kEkjOqLvFM,11269
 warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
 warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
 warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
-warn_scraper-1.2.116.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
-warn_scraper-1.2.116.dist-info/METADATA,sha256=azCMcdV2gteQF6fCJKmmGV54nWi3877lo1OYYcVwBhw,2385
-warn_scraper-1.2.116.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
-warn_scraper-1.2.116.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
-warn_scraper-1.2.116.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
-warn_scraper-1.2.116.dist-info/RECORD,,
+warn_scraper-1.2.118.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
+warn_scraper-1.2.118.dist-info/METADATA,sha256=K3igkvyJ1uKPecz9UpRkx75hq1MN_GYy3GDRYHzX24k,2385
+warn_scraper-1.2.118.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+warn_scraper-1.2.118.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
+warn_scraper-1.2.118.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
+warn_scraper-1.2.118.dist-info/RECORD,,