warn-scraper 1.2.151.dev0__py3-none-any.whl → 1.2.153.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warn/pdfrodent/__init__.py +0 -0
- warn/pdfrodent/pdfrodent.py +361 -0
- warn/scrapers/al.py +5 -1
- warn/scrapers/ms.py +150 -0
- warn/utils.py +26 -8
- {warn_scraper-1.2.151.dev0.dist-info → warn_scraper-1.2.153.dev0.dist-info}/METADATA +1 -1
- {warn_scraper-1.2.151.dev0.dist-info → warn_scraper-1.2.153.dev0.dist-info}/RECORD +11 -8
- {warn_scraper-1.2.151.dev0.dist-info → warn_scraper-1.2.153.dev0.dist-info}/WHEEL +0 -0
- {warn_scraper-1.2.151.dev0.dist-info → warn_scraper-1.2.153.dev0.dist-info}/entry_points.txt +0 -0
- {warn_scraper-1.2.151.dev0.dist-info → warn_scraper-1.2.153.dev0.dist-info}/licenses/LICENSE +0 -0
- {warn_scraper-1.2.151.dev0.dist-info → warn_scraper-1.2.153.dev0.dist-info}/top_level.txt +0 -0
|
File without changes
|
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
import camelot # pip install camelot-py==1.0.9 for now
|
|
6
|
+
|
|
7
|
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def clean_cell(text: str) -> str:
    """
    Normalize the whitespace of a single PDF cell.

    Keyword arguments:
    text -- the raw cell text; None is accepted and treated as blank

    Returns: the text with each run of whitespace collapsed to one space
    and both ends trimmed, or an empty string when text is None
    """
    # A missing cell is reported the same way as an empty one.
    if text is None:
        return ""

    # Collapse newlines, tabs and repeated spaces first, then trim the ends.
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def clean_row(row: list):
    """Normalize the whitespace of every cell in a row.

    args:
        row (list): list of strings (None entries are allowed)
    returns:
        list: a new list of the same length, each cell passed through clean_cell
    """
    # Build a fresh list so the caller's row is left untouched.
    return [clean_cell(entry) for entry in row]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def is_empty(row: list) -> bool:
    """
    Report whether a row contains no populated cells.

    Keyword arguments:
    row -- the row to check

    Returns: True if every cell is falsy (e.g. "" or None) or the row has no
    cells at all, False otherwise
    """
    # A single truthy cell is enough to make the row non-empty.
    return not any(row)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def is_mostly_empty(row: list) -> bool:
    """
    Report whether a row has at most two populated cells.

    Used to spot fragmentary rows, such as ones carried over from a previous page.

    Keyword arguments:
    row -- the row to check

    Returns: True if two or fewer cells are truthy, False otherwise
    """
    populated = sum(1 for cell in row if cell)
    return populated <= 2
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def has_content(value):
    """Check if a particular value has any content, e.g. is it a null or an empty string.

    Args:
        value: The value to inspect; any type is accepted.

    Returns:
        bool: False for None and for values whose string form is blank after
        stripping whitespace; True for everything else. Lists and dicts always
        count as content, even when empty.
    """
    if value is None:
        return False
    # isinstance, not "value is list": the identity test compared against the
    # type object itself and therefore never matched an actual list or dict.
    if isinstance(value, (list, dict)):
        return True
    return len(str(value).strip()) > 0
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def count_data_items(row: dict, prefixes=None) -> int:
    """
    Count number of non-blank non-null data items in a row that aren't an internal variable.

    Args:
        row (dict): The row to check, keyed by field name
        prefixes (list) optional: Field-name prefixes that mark internal fields to skip.
            If None, defaults to ["int_", "_int_"]. Pass an empty list to count every field.
    Returns:
        Integer of how many non-blank non-internal data items there are
    """
    # Only None selects the defaults; "if not prefixes" would also replace an
    # explicit empty list, making it impossible to switch the filter off.
    if prefixes is None:
        prefixes = ["int_", "_int_"]
    # str.startswith accepts a tuple of prefixes; an empty tuple matches nothing.
    skipped = tuple(prefixes)

    good_items = 0
    for field, value in row.items():
        if field.startswith(skipped):
            continue
        if has_content(value):
            good_items += 1
    return good_items
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def drop_thin_rows(rows: list, cutnumber: int, prefixes=None):
    """
    Drop rows with an improperly low count of valid entries, ignoring prefixed internal fields.

    Args:
        rows: List of dicts
        cutnumber: Cut rows with X or fewer full items. X + 1, then, is the minimum count kept.
        prefixes: list, optional. Field-name prefixes that should not be counted.
            If None, defaults to ["int_", "_int_"]. An explicit list, including an
            empty one, is forwarded unchanged to count_data_items.
    Returns:
        list: the dicts from rows that have more than cutnumber counted items, in their original order
    """
    # Only None selects the defaults, so a caller-supplied empty list is not
    # silently replaced here.
    if prefixes is None:
        prefixes = ["int_", "_int_"]
    return [
        row for row in rows if count_data_items(row, prefixes=prefixes) > cutnumber
    ]
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
internal_documentation_such_as_it_is = """
|
|
129
|
+
OK, this is going to be messy. The higher-level overview:
|
|
130
|
+
We get lists of strings from the PDF, an ostensible PDF row.
|
|
131
|
+
|
|
132
|
+
Some of these lists are going to be headers. The headers, of course, need to be detected initially.
|
|
133
|
+
|
|
134
|
+
And sometimes the headers show up as their own table, with nothing else.
|
|
135
|
+
If this is the case, they need to be applied as the headers to subsequent tables.
|
|
136
|
+
|
|
137
|
+
But headers can also repeat across pages, so we need to detect them.
|
|
138
|
+
|
|
139
|
+
To add to the fun, each of these rows from the PDF may be just part of another logical row,
|
|
140
|
+
from when cells are divided horizontally to hold multiple data points.
|
|
141
|
+
|
|
142
|
+
We need to detect those fragmentary lines, mostly by checking to see if most cells are empty.
|
|
143
|
+
|
|
144
|
+
If they're a fragment of a header, we need to track it somehow and build a structure to hold the fragment.
|
|
145
|
+
And remember header fragments may occur on multiple pages with multipage headers.
|
|
146
|
+
That means we need to build an initial structure to hold the headers, then skip some rows if we see the header again.
|
|
147
|
+
|
|
148
|
+
For non-header fragments, we need to append the data to the previous line in an appropriate data structure.
|
|
149
|
+
|
|
150
|
+
But wait! There's more!
|
|
151
|
+
|
|
152
|
+
PDF data tends to be really dirty, lots of junky white space.
|
|
153
|
+
|
|
154
|
+
Some people will use multiline data to show multiple data points in a single cell, such as Company name<newline>, City, State ZIP.
|
|
155
|
+
If we strip off white space, we're losing a way to segregate and process that data later. So we can't clean it up until later.
|
|
156
|
+
Unless it's for fragmentary rows, because we need to know that they're fragmentary and white space will wreck the count.
|
|
157
|
+
|
|
158
|
+
And of course lots of rows are entirely white space, just blank data rows left in a PDF. Those we just drop.
|
|
159
|
+
|
|
160
|
+
To sum up:
|
|
161
|
+
Just about every PDF row can be
|
|
162
|
+
An orphaned header, alone in the table
|
|
163
|
+
A full header row
|
|
164
|
+
A fragmentary header
|
|
165
|
+
A full data row
|
|
166
|
+
A fragmentary data row
|
|
167
|
+
A blank row
|
|
168
|
+
|
|
169
|
+
We need many little trackers to go through here and figure out what we're looking at.
|
|
170
|
+
|
|
171
|
+
We need code to clean up whitespace in cells and rows.
|
|
172
|
+
|
|
173
|
+
We need a function to delete rows with fewer than a certain number of data points (e.g., contents of a summary table).
|
|
174
|
+
|
|
175
|
+
We need a function that allows us to standardize header names.
|
|
176
|
+
|
|
177
|
+
We probably want code that tells us what PDF this is pulled from, on which row.
|
|
178
|
+
"""
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _patch_headers(rawheader: list, field_fixes: dict) -> list:
    """Clean each raw header cell and swap in its field_fixes replacement where one exists."""
    patchedheaders = []
    for item in clean_row(rawheader):
        if item in field_fixes:
            patchedheaders.append(field_fixes[item])
        else:
            logger.debug(
                f"New header type found: {item}, not in {' '.join(sorted(list(field_fixes.keys())))}"
            )
            patchedheaders.append(item)
    return patchedheaders


def parse_pdf(pdffile: str, field_fixes: dict | None = None):
    """Parse a PDF file to extract data from tables.

    Every table camelot finds is walked row by row. Each row is classified as
    an orphaned header (a one-row table), the table's index/header row, a
    repeated header, an empty row, a mostly-empty fragment, or a regular data
    row. Regular rows become dicts keyed by the patched header names; fragments
    seen after data are merged into the previous data row.

    Args:
        pdffile (string): Path of the PDF to read
        field_fixes (dict or None): If supplied, a dictionary of header lookup
            values with values of the target name

    Returns:
        tuple: (filelist, filerowholder)
            filelist: A list of dictionaries of data rows keyed to headers, each
                extended with "_int_"-prefixed bookkeeping fields
            filerowholder: Debugging data showing how row types were determined
    """
    if not field_fixes:
        logger.debug(
            "No 'field_fixes' variable submitted to pdfrodent.parse_pdf function."
        )
        field_fixes = {}
    else:
        logger.debug(f"{len(field_fixes):,} field_fixes to be used to clean headers.")
    filelist = []
    filerowholder = []
    logger.debug(f"Opening {pdffile} for PDF parsing")
    tables = camelot.read_pdf(pdffile, pages="all")
    # Base name only, whichever path separator was used; str() lets a Path be passed too.
    pdf_basename = str(pdffile).split("/")[-1].split("\\")[-1]
    # The orphan state deliberately outlives a single table: a header found
    # alone in one table is applied to the table that follows it.
    orphanedheader = False
    orphanholder = None
    for tableindex, table in enumerate(tables):
        locallist: list = []
        logger.debug(f"Processing table {tableindex} of {pdffile}")
        filerowholder.append(f"Processing table {tableindex} of {pdffile}")
        rawheader = None
        headerfirst = []
        headersupplement: dict = {}
        isheader = True
        seendata = False
        logger.debug(
            f"Processing table {tableindex} with {len(table.rows)} of {pdffile}"
        )

        # If the table has only one row, it's a stray header and should be used with the next table.
        if len(table.rows) == 1:
            logger.debug("\tOrphaned header detected!")
            filerowholder.append("\tOrphaned header detected!")
            orphanedheader = True
            rawheader = table.data[0]
            orphanholder = {
                "rawheader": rawheader,
                "patchedheaders": _patch_headers(rawheader, field_fixes),
            }
            logger.debug(f"{orphanholder}")
            filerowholder.append(f"{orphanholder}")
        # If there are multiple rows, there are a bunch of possibilities we need to poke ...
        else:
            # If we have a header from a one-row table, prepare to use the orphaned header
            if orphanedheader:
                isheader = True
                rawheader = orphanholder["rawheader"]  # type: ignore
                headerfirst = orphanholder["patchedheaders"]  # type: ignore

            for rowindex, row in enumerate(table.data):
                filerowholder.append(row)
                line: dict = {}  # rows in, lines out
                # If it's the first row in a table and we don't have an orphaned header,
                # it's an index row
                if rowindex == 0 and not orphanedheader:
                    rawheader = row
                    headerfirst = _patch_headers(rawheader, field_fixes)
                    isheader = True
                    filerowholder.append("\tIndex row!")

                elif row == rawheader:  # Later instance of a page header
                    isheader = True
                    filerowholder.append("\tRepeated header")

                # Drop blank rows entirely
                elif is_empty(clean_row(row)):
                    filerowholder.append("\tEmpty row")

                # Handle fragmentary records
                elif is_mostly_empty(clean_row(row)):
                    filerowholder.append("\tMostly empty row!")
                    if not seendata:  # Is this part of the initial header?
                        filerowholder.append("\tMostly empty row, haven't seen data")
                        for cellindex, cell in enumerate(row):
                            cleancell = clean_cell(cell)
                            if len(cleancell) > 0:  # If we have good data
                                # NOTE(review): this key ("supplementN", value None) is never
                                # matched by the integer cellindex lookup further down; the
                                # intended use is unclear, so it is left exactly as it was.
                                fieldname = f"supplement{cellindex}"
                                headersupplement[fieldname] = None  # type: ignore
                        isheader = False
                        orphanedheader = False

                    elif isheader:  # Supplement to a header on a latter page
                        filerowholder.append(
                            "\tMostly empty row, seems to be appending to a header"
                        )
                        for cellindex, cell in enumerate(row):
                            cleancell = clean_cell(cell)
                            if len(cleancell) > 0:  # If we have good data
                                if cleancell not in headersupplement:
                                    # Record the header text for this column. The dict used
                                    # to be stored inside itself here, which later surfaced
                                    # as an unhashable field name for supplemental rows.
                                    headersupplement[cellindex] = cleancell
                                    logger.debug(
                                        f"Added {cleancell} to headersupplement, which now holds: {headersupplement}"
                                    )
                        isheader = False

                    else:  # Not a header, have seen data; must be a regular row supplement
                        orphanedheader = False
                        isheader = False
                        filerowholder.append(
                            "\tMostly empty row, seems to be detailed info for a regular row"
                        )
                        for cellindex, cell in enumerate(row):
                            cleancell = clean_cell(cell)
                            if len(cleancell) > 0:  # If we have good data
                                if cellindex in headersupplement:
                                    fieldname = headersupplement[cellindex]  # type: ignore
                                else:
                                    fieldname = f"supplement_{cellindex}"
                                    logger.warning(
                                        f"Found {fieldname} as {cleancell} but not located in supplemental headers: {headersupplement}"
                                    )
                                if fieldname in field_fixes:
                                    logger.debug(
                                        f"Shifting cell with {fieldname} to {field_fixes[fieldname]}"
                                    )
                                    fieldname = field_fixes[fieldname]
                                # Add it to the previous line. seendata is only set when a
                                # line has been appended, so locallist is non-empty here.
                                locallist[-1][fieldname] = cleancell

                else:
                    # It's not an orphaned header
                    # It's not the initial header
                    # It's not a supplemental header
                    # It's not an empty row
                    # It's not a supplemental data row
                    # We ... actually have a regular data row here.
                    orphanedheader = False
                    filerowholder.append("\tSeems to be a regular row.")
                    isheader = False
                    seendata = True
                    for cellindex, cell in enumerate(row):
                        line[headerfirst[cellindex]] = clean_cell(cell)
                    filerowholder.append(f"\t\t{line}")
                    locallist.append(line)

        report = table.parsing_report

        # Stamp every extracted line with where it came from. Key order matters:
        # _int_raw_fields snapshots the values present at that moment.
        for line in locallist:
            line["_int_accuracy"] = report["accuracy"]
            line["_int_pdf_filename"] = pdf_basename
            line["_int_page"] = report["page"]
            line["_int_table_number"] = report["order"]
            line["_int_raw_fields"] = json.dumps(list(line.values()))
            line["_int_data_items"] = count_data_items(line)  # type: ignore
            if "Event Number" in line:
                line["Event Number"] = line["Event Number"].replace("\n", "")

        filelist.extend(locallist)
    return (filelist, filerowholder)
|
warn/scrapers/al.py
CHANGED
|
@@ -32,11 +32,15 @@ def scrape(
|
|
|
32
32
|
Returns: the Path where the file is written
|
|
33
33
|
"""
|
|
34
34
|
output_csv = data_dir / "al.csv"
|
|
35
|
-
page = utils.get_url("https://www.madeinalabama.com/warn-list/")
|
|
35
|
+
# page = utils.get_url("https://www.madeinalabama.com/warn-list/")
|
|
36
|
+
# URL change in June 2026, maybe led to a HTTP 415 error
|
|
37
|
+
page = utils.get_url("https://workforce.alabama.gov/warn-list/")
|
|
38
|
+
|
|
36
39
|
# can't see 2020 listings when I open web page, but they are on the summary in the google search
|
|
37
40
|
soup = BeautifulSoup(page.text, "html.parser")
|
|
38
41
|
table = soup.find_all("table") # output is list-type
|
|
39
42
|
table_rows = table[0].find_all("tr")
|
|
43
|
+
logger.debug(f"{len(table_rows):,} total table rows (including header) found")
|
|
40
44
|
# Handle the header
|
|
41
45
|
raw_header = table_rows.pop(0)
|
|
42
46
|
header_row = _extract_fields_from_row(raw_header, "th")
|
warn/scrapers/ms.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from pyquery import PyQuery as pq
|
|
6
|
+
|
|
7
|
+
from warn.pdfrodent import pdfrodent as pdfrodent
|
|
8
|
+
|
|
9
|
+
from .. import utils
|
|
10
|
+
from ..cache import Cache
|
|
11
|
+
|
|
12
|
+
# Scraper metadata.
__authors__ = ["Ash1R", "stucka"]
__tags__ = ["pdf"]
# "url" is the index page that scrape() downloads and searches for PDF links.
__source__ = {
    "name": "Mississippi Department of Employment Security",
    "url": "https://mdes.ms.gov/information-center/warn-information/",
}

logger = logging.getLogger(__name__)
# When true, scrape() also writes the row-classification trace to
# ms/debugging.txt in the cache directory.
want_debugging_file = True
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def scrape(
    data_dir: Path = utils.WARN_DATA_DIR,
    cache_dir: Path = utils.WARN_CACHE_DIR,
) -> Path:
    """
    Scrape data from Mississippi.

    Downloads the index page, collects the PDF links in its page content,
    fetches those PDFs into the cache, parses every cached PDF with pdfrodent,
    drops thin rows and writes the remainder to ms.csv.

    Keyword arguments:
    data_dir -- the Path were the result will be saved (default WARN_DATA_DIR)
    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)

    Returns: the Path where the file is written
    """
    cache = Cache(cache_dir)
    indexurl = __source__["url"]
    # Scheme and host only, used to absolutize site-relative links.
    urlprefix = indexurl.split(".gov")[0] + ".gov"

    html = utils.get_url(indexurl).text
    cache.write("ms/index.html", html)

    content = pq(html)("div#page_content")
    anchors = pq(content)("a")

    # Parse HTML to identify relevant PDFs
    urlswanted = []
    for anchor in anchors:
        href = pq(anchor).attr("href")
        # Anchors without an href (e.g. named anchors) yield None, which would
        # crash the substring test below; they cannot point at a PDF anyway.
        if not href:
            continue
        remoteurl = href if "http" in href else urlprefix + href
        if remoteurl.endswith(".pdf") and not remoteurl.endswith("map.pdf"):
            urlswanted.append(remoteurl)

    # Get the files. The five first-listed files, we want fresh.
    # That should cover every quarter in the latest year, and one quarter of the previous year, at least.
    for i, urlwanted in enumerate(urlswanted):
        basefilename = urlwanted.split("/")[-1]
        localfilename = cache_dir / f"ms/{basefilename}"
        if i <= 4:  # Get the five newest files to ensure proper overlap
            logger.debug(f"Fetching fresh copy of {localfilename}")
            utils.save_if_good_url(localfilename, urlwanted)
        else:
            logger.debug(f"Getting copy of {localfilename} if needed")
            utils.fetch_if_not_cached(localfilename, urlwanted)

    pdffiles = sorted(cache.files(subdir="ms/", glob_pattern="*.pdf"))

    # Maps header text as extracted from the PDFs to the output column name.
    headerfixes = {
        "": "blank_entry",
        "# Affected": "affected",
        "# Of Notices Received": "notices_received",
        "City": "city",
        "Company Name": "company",
        "Company Name (City) (County)": "company",
        "Company Name (City) (County) (Zip)": "company",
        "Company Name City (County)": "company",
        "Company Name City, (County)": "company",
        "Company Name, City (County)": "company",
        "Company Name, City, County": "company",
        "County": "county",
        "Date of Action": "date_effective",
        "Date of Notice": "date_notice",
        "Date of WARN Notice": "date_notice",
        "Event Number": "event_number",
        "NAICS CODE & Description": "naics",
        "NAICS CODE – Description": "naics",
        "Notices Received": "notices_received",
        "Number Of Notices Received": "notices_received",
        "Number Of Notices Received October 2024 – December 2024": "notices_received",
        "Number Affected": "affected",
        "Reason / Comments": "reason",
        "Reason – Comments": "reason",
        "Type of Action": "action_type",
        "Type of Action # Affected": "action_type",
        "T ypes of Notice": "notice_types",
        "T ypes of Notices Received": "notice_types",
        "Type of Notice": "notice_types",
        "Types of Notice": "notice_types",
        "Types of Notices": "notice_types",
        "Types of Notices Received": "notice_types",
        "Workforc e Area": "workforce_area",
        "Workforce Area": "workforce_area",
        "_int_accuracy": "_int_accuracy",
        "_int_data_items": "_int_data_items",
        "_int_page": "_int_page",
        "_int_pdf_filename": "_int_pdf_filename",
        "_int_raw_fields": "_int_raw_fields",
        "_int_table_number": "_int_table_number",
        "supplement_0": "supplement_0",
        "supplement_1": "supplement_1",
        "supplement_2": "supplement_2",
        "supplement_5": "affected",  # Only carries from 2025sq2
    }

    masterlist = []
    rowholder = []
    for pdffile in pdffiles:
        locallist, localrows = pdfrodent.parse_pdf(pdffile, headerfixes)
        masterlist.extend(locallist)
        rowholder.extend(localrows)

    # Identify all header elements, even in the ones we're about to remove.
    allheaders = {item for row in masterlist for item in row}
    # Written as a ready-to-fill dict skeleton, one line per header seen.
    text = "".join(f"\t\t'{item}': ,\n" for item in sorted(allheaders))
    with open(Path(cache_dir) / "ms/allheaders.txt", "w", encoding="utf-8") as outfile:
        outfile.write(text)

    targetfilename = data_dir / "ms.csv"
    logger.debug(f"Found {len(masterlist):,} extracted rows from the PDFs.")
    # Keep only rows with more than six populated, non-internal fields.
    cleaned = pdfrodent.drop_thin_rows(masterlist, 6)
    logger.debug(
        f"After filtering out thin rows, we have {len(cleaned):,} rows of data meeting standards."
    )
    utils.write_disparate_dict_rows_to_csv(targetfilename, cleaned)

    if want_debugging_file:
        with open(
            Path(cache_dir) / "ms/debugging.txt", "w", encoding="utf-8"
        ) as outfile:
            for row in rowholder:
                outfile.write(json.dumps(row) + "\r\n")

    return targetfilename
|
warn/utils.py
CHANGED
|
@@ -213,7 +213,7 @@ def write_rows_to_csv(output_path: Path, rows: list, mode="w"):
|
|
|
213
213
|
mode (str): the mode to be used when opening the file (default 'w')
|
|
214
214
|
"""
|
|
215
215
|
create_directory(output_path, is_file=True)
|
|
216
|
-
logger.debug(f"Writing {len(rows)} rows to {output_path}")
|
|
216
|
+
logger.debug(f"Writing {len(rows):,} rows to {output_path}")
|
|
217
217
|
with open(output_path, mode, newline="", encoding="utf-8") as f:
|
|
218
218
|
writer = csv.writer(f)
|
|
219
219
|
writer.writerows(rows)
|
|
@@ -230,7 +230,7 @@ def write_dict_rows_to_csv(output_path, headers, rows, mode="w", extrasaction="r
|
|
|
230
230
|
extrasaction (str): what to do if the if a field isn't in the headers (default 'raise')
|
|
231
231
|
"""
|
|
232
232
|
create_directory(output_path, is_file=True)
|
|
233
|
-
logger.debug(f"Writing {len(rows)} rows to {output_path}")
|
|
233
|
+
logger.debug(f"Writing {len(rows):,} rows to {output_path}")
|
|
234
234
|
with open(output_path, mode, newline="") as f:
|
|
235
235
|
# Create the writer object
|
|
236
236
|
writer = csv.DictWriter(f, fieldnames=headers, extrasaction=extrasaction)
|
|
@@ -243,22 +243,40 @@ def write_dict_rows_to_csv(output_path, headers, rows, mode="w", extrasaction="r
|
|
|
243
243
|
writer.writerow(row)
|
|
244
244
|
|
|
245
245
|
|
|
246
|
-
def write_disparate_dict_rows_to_csv(
|
|
246
|
+
def write_disparate_dict_rows_to_csv(
|
|
247
|
+
output_path, rows, mode="w", prefixes: None | list = None
|
|
248
|
+
):
|
|
247
249
|
"""Write the provided list of dictionaries to the provided path as comma-separated values, while determining a header.
|
|
248
250
|
|
|
249
251
|
Args:
|
|
250
252
|
output_path (Path): the Path were the result will be saved
|
|
251
253
|
rows (list): the list of dictionaries to be saved; can have disparate dict keys
|
|
252
254
|
mode (str): the mode to be used when opening the file (default 'w')
|
|
255
|
+
prefixes(list|None): text strings that determine whether fields should arrive after other fields.
|
|
256
|
+
Send an empty list, [], to run without any prefixes.
|
|
257
|
+
Send None or don't send to use default prefixes of _int_ and int_
|
|
253
258
|
"""
|
|
259
|
+
if not prefixes:
|
|
260
|
+
prefixes = ["int_", "_int_"]
|
|
261
|
+
logger.debug(f"Writing {(len(rows)+1):,} rows to {output_path}")
|
|
254
262
|
create_directory(output_path, is_file=True)
|
|
255
|
-
headers:
|
|
263
|
+
headers: list = [] # We want to preserve order, and set won't do it.
|
|
264
|
+
headerextras: list = [] # stuff that should be at the right of the field list
|
|
256
265
|
for row in rows:
|
|
257
266
|
for item in row:
|
|
258
|
-
headers
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
267
|
+
if item not in headers and item not in headerextras:
|
|
268
|
+
prefixhere = False
|
|
269
|
+
for prefix in prefixes:
|
|
270
|
+
if item.startswith(prefix):
|
|
271
|
+
prefixhere = True
|
|
272
|
+
if prefixhere:
|
|
273
|
+
headerextras.append(item)
|
|
274
|
+
else:
|
|
275
|
+
headers.append(item)
|
|
276
|
+
logger.debug(
|
|
277
|
+
f"Found {(len(headers) + len(headerextras)):,} header entries in the supplied list of dicts."
|
|
278
|
+
)
|
|
279
|
+
headers.extend(headerextras)
|
|
262
280
|
with open(output_path, mode, newline="") as outfile:
|
|
263
281
|
# Create the writer object
|
|
264
282
|
writer = csv.writer(outfile)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: warn-scraper
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.153.dev0
|
|
4
4
|
Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
|
|
5
5
|
Author-email: Big Local News <biglocalnews@stanford.edu>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -2,7 +2,9 @@ warn/__init__.py,sha256=A07JFY1TyaPtVIndBa7IvTk13DETqIkLgRdk0A-MCoE,85
|
|
|
2
2
|
warn/cache.py,sha256=QBSHycchvRTkOQfHptOtZeTYiPgLP383jS8MTiGln_c,5969
|
|
3
3
|
warn/cli.py,sha256=ZqyJwICdHFkn2hEgbArj_upbElR9-TSDlYDqyEGeexE,2019
|
|
4
4
|
warn/runner.py,sha256=oeGRybGwpnkQKlPzRMlKxhsDt1GN4PZoX-vUwrsPgos,1894
|
|
5
|
-
warn/utils.py,sha256
|
|
5
|
+
warn/utils.py,sha256=-JF8DnSg-80CbCIswM-rtB0CWf9zSVU56iJNpRw3V-o,13086
|
|
6
|
+
warn/pdfrodent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
warn/pdfrodent/pdfrodent.py,sha256=IajvUyzVuUlph7F3LqaPU0HxDCkHb8YfnP1js4vOoTs,14632
|
|
6
8
|
warn/platforms/__init__.py,sha256=wIZRDf4tbTuC8oKM4ZrTAtwNgbtMQGzPXMwDYCFyrog,81
|
|
7
9
|
warn/platforms/job_center/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
10
|
warn/platforms/job_center/cache.py,sha256=yhA3sE46lNFg8vEewSoRYVByi0YSlkBiKm7qoSUiTdM,1868
|
|
@@ -11,7 +13,7 @@ warn/platforms/job_center/urls.py,sha256=IWhpuzN_xcNdHh23GbZPGvuHCsMcmb03qx3pRn1
|
|
|
11
13
|
warn/platforms/job_center/utils.py,sha256=HdUKgKirmpPP7e4Cu_ZyB3zPVS_p-_ylo-lXFhxK2QM,5696
|
|
12
14
|
warn/scrapers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
15
|
warn/scrapers/ak.py,sha256=h7BYMTV0whwWAPhbzVDVKMMoVCFphKly70aiTHabPq4,1847
|
|
14
|
-
warn/scrapers/al.py,sha256=
|
|
16
|
+
warn/scrapers/al.py,sha256=XSDEGC7F6_3GZ2m_uSiIG-1v8jMNH9pw_wNUCZyaMK0,2460
|
|
15
17
|
warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
|
|
16
18
|
warn/scrapers/ca.py,sha256=VQOfjHXPCc-jYwh-EPGVVfnzvXB7pdmCt2uJ6QnMPRM,8600
|
|
17
19
|
warn/scrapers/co.py,sha256=83OdikIrWGxt22mlI-_zLSNqJg1NO5C2Xjm3FF6DPYY,18252
|
|
@@ -32,6 +34,7 @@ warn/scrapers/md.py,sha256=hwgxXQnhyBWm8qF1dvxIThAX1MkrZbXLwRI9inO5t8g,4060
|
|
|
32
34
|
warn/scrapers/me.py,sha256=q36F4yJ7hvZsLayA3uBS1romo4X3Qf-sEi2Y7LAQCi8,1172
|
|
33
35
|
warn/scrapers/mi.py,sha256=Ppyawp4nbzSBODuzDKeqnO9_9do5MFwK4Y_f3uc6blE,5846
|
|
34
36
|
warn/scrapers/mo.py,sha256=wnnwQAiVPwuheMqptMXZpyQdiKNghhKwTO-Bnh9oXoU,3492
|
|
37
|
+
warn/scrapers/ms.py,sha256=BZZoMw3TNtwzBeBsqKLsPznBcDjPaO49I5-yBU0e9AI,5502
|
|
35
38
|
warn/scrapers/mt.py,sha256=t2MP4OCcuCEnrnvNgOu289P0eekZq4XaCK65qzgZX88,2457
|
|
36
39
|
warn/scrapers/ne.py,sha256=JawuGJ3tCKvMd-N-p03gnltB4rol4QUJshMk2oyMPO4,4143
|
|
37
40
|
warn/scrapers/nj.py,sha256=nwbMbeQuUJbYRVoyUyKZBmNqvqsXu3Habt-10r8DvZE,2230
|
|
@@ -51,9 +54,9 @@ warn/scrapers/va.py,sha256=7Nle7qL0VNPiE653XyaP9HQqSfuJFDRr2kEkjOqLvFM,11269
|
|
|
51
54
|
warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
|
|
52
55
|
warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
|
|
53
56
|
warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
|
|
54
|
-
warn_scraper-1.2.
|
|
55
|
-
warn_scraper-1.2.
|
|
56
|
-
warn_scraper-1.2.
|
|
57
|
-
warn_scraper-1.2.
|
|
58
|
-
warn_scraper-1.2.
|
|
59
|
-
warn_scraper-1.2.
|
|
57
|
+
warn_scraper-1.2.153.dev0.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
|
|
58
|
+
warn_scraper-1.2.153.dev0.dist-info/METADATA,sha256=wEonbrS1LWIOVFTuLq75_rlLbDGb2g3-7w30N1x_JAc,1780
|
|
59
|
+
warn_scraper-1.2.153.dev0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
60
|
+
warn_scraper-1.2.153.dev0.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
|
|
61
|
+
warn_scraper-1.2.153.dev0.dist-info/top_level.txt,sha256=dZfms6N3kqVXufiPOo7YqOrAcUtYfNH_oyGvYUk9FB4,5
|
|
62
|
+
warn_scraper-1.2.153.dev0.dist-info/RECORD,,
|
|
File without changes
|
{warn_scraper-1.2.151.dev0.dist-info → warn_scraper-1.2.153.dev0.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{warn_scraper-1.2.151.dev0.dist-info → warn_scraper-1.2.153.dev0.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|