warn-scraper: warn_scraper-1.2.117-py3-none-any.whl → warn_scraper-1.2.118-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warn/scrapers/ky.py +65 -36
- warn/utils.py +35 -0
- {warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/METADATA +1 -1
- {warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/RECORD +8 -8
- {warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/WHEEL +0 -0
- {warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/entry_points.txt +0 -0
- {warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/licenses/LICENSE +0 -0
- {warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/top_level.txt +0 -0
warn/scrapers/ky.py
CHANGED

```diff
@@ -60,34 +60,62 @@ def scrape(
     # Open it up
     workbook = load_workbook(filename=latest_path)
 
-
+    crosswalk = {
+        "Closure or Layoff?": "closure_or_layoff",
+        "Company Name": "company",
+        "Company: Company Name": "company",
+        "County": "county",
+        "Date Received": "date_received",
+        "Employees": "employees",
+        "NAICS": "NAICS",
+        "NAICS Code": "NAICS",
+        "Notice Link": "notice_url",
+        "Notice Type": "source",
+        "Notice URL": "notice_url",
+        "Notice: Notice Number": "notice_number",
+        "Number of Employees Affected": "employees",
+        "Projected Date": "date_effective",
+        "Region": "region",
+        "Trade": "trade",
+        "Type of Employees Affected": "union_affected",
+        "Workforce Board": "region",
+        "address": "address",
+        "comments": "comments",
+        "congressional": "congressional",
+        "contact": "contact",
+        "industry": "industry",
+        "neg": "neg",  # Rarely seen, only in historical data, maybe all with "N"
+        "occupations": "industry",
+        "source": "source",
+        "union": "union",  # Unclear if different than types of employees affected/union_affected
+    }
+
+    masterlist: list = []
     for sheet in workbook.worksheets:
         localrows = parse_xlsx(sheet)
-        dirty_list.extend(localrows)
 
-
-
-
-
-
-
-
-            logger.debug(
-                f"Dropping dirty row that doesn't quite match headers in row {rowindex}"
-            )
-            logger.debug(f"Want: {headers}")
-            logger.debug(f"Got : {row}")
+        # Traverse each tab. Assume the first line is a header. Check if the second line is bogus.
+        # Build a list of dicts.
+        localheadersraw: list = localrows[0]
+        localheaders: list = []
+        for entry in localheadersraw:
+            if entry not in crosswalk:
+                logger.error(f"Potential header {entry} not found in crosswalk.")
             else:
-
-
+                localheaders.append(crosswalk[entry])
+        for row in localrows[1:]:  # Skip the header row
+            if row[0] != "Date Received":  # Check for fake second header
+                line: dict = {}
+                for i, fieldname in enumerate(localheaders):
                     line[fieldname] = row[i]
-
-
-
-            f"Successfully merged {len(row_list)-1:,} records from new spreadsheet."
-        )
+                    if isinstance(row[i], str):
+                        line[fieldname] = row[i].strip()
+                masterlist.append(line)
 
-
+    logger.debug(f"Successfully merged {len(masterlist)} records from new spreadsheet.")
+
+    # Earlier versions of this code needed the archived data to match the new data.
+    # We can no longer expect that since October 2025 data revisions.
 
     archive_url = "https://storage.googleapis.com/bln-data-public/warn-layoffs/ky-historical-normalized.csv"
 
@@ -96,24 +124,25 @@ def scrape(
 
     reader = list(csv.reader(r.text.splitlines()))
 
-
-
-
-
-
-
-
+    localheadersraw = reader[0]
+    localheaders: list = []  # type: ignore
+    for entry in localheadersraw:
+        if entry not in crosswalk:
+            logger.error(f"Cannot match possible header value of {entry} to crosswalk.")
+        else:
+            localheaders.append(crosswalk[entry])
     for row in reader[1:]:  # Skip header row
-        line = {}
-        for i,
-            line[
-
-
-
+        line: dict = {}  # type: ignore
+        for i, fieldname in enumerate(localheaders):
+            line[fieldname] = row[i]
+            if isinstance(row[i], str):
+                line[fieldname] = row[i].strip()
+        masterlist.append(line)
+    logger.debug("Historical records folded in.")
 
     # Write out the results
     data_path = data_dir / "ky.csv"
-    utils.
+    utils.write_disparate_dict_rows_to_csv(data_path, masterlist)
 
     # Pass it out
     return data_path
```
warn/utils.py
CHANGED

```diff
@@ -232,6 +232,41 @@ def write_dict_rows_to_csv(output_path, headers, rows, mode="w", extrasaction="r
         writer.writerow(row)
 
 
+def write_disparate_dict_rows_to_csv(output_path, rows, mode="w"):
+    """Write the provided list of dictionaries to the provided path as comma-separated values, while determining a header.
+
+    Args:
+        output_path (Path): the Path where the result will be saved
+        rows (list): the list of dictionaries to be saved; can have disparate dict keys
+        mode (str): the mode to be used when opening the file (default 'w')
+    """
+    create_directory(output_path, is_file=True)
+    headers: set = set()  # Get all the potential header names
+    for row in rows:
+        for item in row:
+            headers.add(item)
+    headers = list(sorted(headers))
+    logger.debug(f"Found {len(headers):,} header entries in list of dicts.")
+    logger.debug(f"Writing {len(rows)} rows to {output_path}")
+    with open(output_path, mode, newline="") as outfile:
+        # Create the writer object
+        writer = csv.writer(outfile)
+        # If we are writing a new file ...
+        if mode == "w":
+            # ... drop in the headers
+            writer.writerow(headers)
+        # Loop through the dicts and write them in one by one.
+        for row in rows:
+            line = {}
+            for item in headers:
+                if item in row:
+                    line[item] = row[item]
+                else:
+                    line[item] = None
+            writer.writerow(list(line.values()))
+    return
+
+
 def get_all_scrapers():
     """Get all the states and territories that have scrapers.
 
```
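Unlike `write_dict_rows_to_csv` above it, the new helper derives the header itself as the sorted union of every key seen, and fills missing fields with `None` (empty cells in the CSV). A quick usage sketch, assuming the 1.2.118 wheel is installed; the output path and toy rows are illustrative:

```python
from pathlib import Path

from warn import utils

# Rows with disparate keys, like the mix of current and historical KY records.
rows = [
    {"company": "Acme Corp", "employees": 120},
    {"company": "Globex", "date_received": "2025-10-01", "county": "Fayette"},
]

out = Path("/tmp/ky-demo.csv")  # illustrative path
utils.write_disparate_dict_rows_to_csv(out, rows)

print(out.read_text())
# company,county,date_received,employees
# Acme Corp,,,120
# Globex,Fayette,2025-10-01,
```

One caveat worth noting: with `mode="a"` the column order still comes from the current rows' keys, so appended rows only line up with an earlier write if the key set is unchanged.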
{warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warn-scraper
-Version: 1.2.117
+Version: 1.2.118
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
```
{warn_scraper-1.2.117.dist-info → warn_scraper-1.2.118.dist-info}/RECORD
CHANGED

```diff
@@ -17,7 +17,7 @@ warn/__init__.py,sha256=A07JFY1TyaPtVIndBa7IvTk13DETqIkLgRdk0A-MCoE,85
 warn/cache.py,sha256=hyta04_G-ALGwcKl4xNc7EgHS_xklyVD5d8SXNrJekY,5520
 warn/cli.py,sha256=ZqyJwICdHFkn2hEgbArj_upbElR9-TSDlYDqyEGeexE,2019
 warn/runner.py,sha256=oeGRybGwpnkQKlPzRMlKxhsDt1GN4PZoX-vUwrsPgos,1894
-warn/utils.py,sha256=
+warn/utils.py,sha256=67ltJ1ZDCqLfZoFcI8kp5BaTbv28ZzOfzaDvlbnVBfM,11821
 warn/platforms/__init__.py,sha256=wIZRDf4tbTuC8oKM4ZrTAtwNgbtMQGzPXMwDYCFyrog,81
 warn/platforms/job_center/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 warn/platforms/job_center/cache.py,sha256=yhA3sE46lNFg8vEewSoRYVByi0YSlkBiKm7qoSUiTdM,1868
@@ -41,7 +41,7 @@ warn/scrapers/id.py,sha256=qJLcLgCgAfKzLpuwW32JqNwXn9NxZRZvQ50nZZKUhmE,6674
 warn/scrapers/il.py,sha256=sygdvsNuB_Gvu3o_HidtpSP4FLz0szKb1zEHqGxVtlI,1563
 warn/scrapers/in.py,sha256=dAT40ROhhKiwLcwa_YJ6EyhsYBLe0IX2rOWXmNa6JMs,2026
 warn/scrapers/ks.py,sha256=F_3biEMF7zgCX2XVuUACR74Vyzapta4SaM9SY3EuZCU,1266
-warn/scrapers/ky.py,sha256=
+warn/scrapers/ky.py,sha256=IDIzULH5h-UqGCvKvvipYbi5Gg3_rmue_o9SgF7QWqs,5843
 warn/scrapers/la.py,sha256=ORkMOQErl33SEiagOli4agDLdTt0R1MxxBmqOg3hNv8,13175
 warn/scrapers/md.py,sha256=hwgxXQnhyBWm8qF1dvxIThAX1MkrZbXLwRI9inO5t8g,4060
 warn/scrapers/me.py,sha256=q36F4yJ7hvZsLayA3uBS1romo4X3Qf-sEi2Y7LAQCi8,1172
@@ -65,9 +65,9 @@ warn/scrapers/va.py,sha256=7Nle7qL0VNPiE653XyaP9HQqSfuJFDRr2kEkjOqLvFM,11269
 warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
 warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
 warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
-warn_scraper-1.2.
+warn_scraper-1.2.118.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
+warn_scraper-1.2.118.dist-info/METADATA,sha256=K3igkvyJ1uKPecz9UpRkx75hq1MN_GYy3GDRYHzX24k,2385
+warn_scraper-1.2.118.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+warn_scraper-1.2.118.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
+warn_scraper-1.2.118.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
+warn_scraper-1.2.118.dist-info/RECORD,,
```