warn_scraper-1.2.72-py3-none-any.whl → warn_scraper-1.2.73-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warn/scrapers/tn.py +36 -98
- {warn_scraper-1.2.72.dist-info → warn_scraper-1.2.73.dist-info}/METADATA +1 -1
- {warn_scraper-1.2.72.dist-info → warn_scraper-1.2.73.dist-info}/RECORD +7 -7
- {warn_scraper-1.2.72.dist-info → warn_scraper-1.2.73.dist-info}/WHEEL +1 -1
- {warn_scraper-1.2.72.dist-info → warn_scraper-1.2.73.dist-info}/LICENSE +0 -0
- {warn_scraper-1.2.72.dist-info → warn_scraper-1.2.73.dist-info}/entry_points.txt +0 -0
- {warn_scraper-1.2.72.dist-info → warn_scraper-1.2.73.dist-info}/top_level.txt +0 -0
warn/scrapers/tn.py
CHANGED
@@ -1,14 +1,14 @@
+import csv
 import typing
 from pathlib import Path
 
-import pdfplumber
 from bs4 import BeautifulSoup
 
 from .. import utils
 from ..cache import Cache
 
-__authors__ = ["anikasikka"]
-__tags__ = ["html", "pdf"]
+__authors__ = ["anikasikka", "stucka"]
+__tags__ = ["html"]
 __source__ = {
     "name": "Tennessee Department of Labor and Workforce Development",
     "url": "https://www.tn.gov/workforce/general-resources/major-publications0/major-publications-redirect/reports.html",
@@ -37,13 +37,11 @@ def scrape(
     )
     html = page.text
     cache.write("tn/source.html", html)
+    soup = BeautifulSoup(html, "html5lib")
+    tables = soup.find_all(attrs={"class": "tn-datatable"})
+    rows = BeautifulSoup(str(tables), "html5lib").find_all("tr")
 
-
-    pdf_url = "https://www.tn.gov/content/dam/tn/workforce/documents/majorpublications/reports/WarnReportByMonth.pdf"
-    pdf_file = cache.download("tn/pdffile.pdf", pdf_url)
-
-    # Set the headers we'll use for both sources
-    tn_headers = [
+    dataheaders: typing.List = [
         "Notice Date",
         "Effective Date",
         "Received Date",
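The rewrite drops the PDF source entirely: instead of downloading WarnReportByMonth.pdf and mining it with pdfplumber, the scraper now pulls rows straight out of the cached HTML. Here is a minimal sketch of that parsing step, assuming a locally saved copy of the source page; the file path is hypothetical, while the tn-datatable class and the html5lib parser come from the hunk above.

```python
from bs4 import BeautifulSoup

# Hypothetical local copy of the page the scraper caches as tn/source.html
with open("source.html", encoding="utf-8") as infile:
    html = infile.read()

soup = BeautifulSoup(html, "html5lib")

# Grab every element tagged with the tn-datatable class, then re-parse the
# concatenated markup to collect all <tr> rows, as the new scraper does
tables = soup.find_all(attrs={"class": "tn-datatable"})
rows = BeautifulSoup(str(tables), "html5lib").find_all("tr")

for row in rows:
    cells = [td.text.strip() for td in row.find_all("td")]
    if len(cells) == 6:  # the scraper treats six-cell rows as data rows
        print(cells)
```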
@@ -53,102 +51,42 @@ def scrape(
         "No. Of Employees",
         "Layoff/Closure",
         "Notice ID",
+        # "Notice URL",
     ]
-    cleaned_data: typing.List[typing.Any] = [tn_headers]
 
-
-
+    staginglist: typing.List = []
+    for row in reversed(rows):
+        cells = row.find_all("td")
+        if len(cells) == 6:  # Filter for potentially valid rows
+            line: typing.Dict = {}
+            for item in dataheaders:  # Build an ordered dictionary with null values
+                line[item] = None
+            line["Notice Date"] = cells[0].text.strip()
+            line["Effective Date"] = cells[4].text.strip()
+            line["Company"] = cells[1].text.strip()
+            line["County"] = cells[2].text.strip()
+            line["No. Of Employees"] = cells[3].text.strip()
+            line["Notice ID"] = cells[5].text.strip()
+            # line['Notice URL'] = cells[1].find("a")['href']
+            staginglist.append(line)
+
+    # Bring in historical data
+    historical_file = cache_dir / "tn/tn_historical.csv"
+    historical_url = (
+        "https://storage.googleapis.com/bln-data-public/warn-layoffs/tn_historical.csv"
+    )
+    utils.fetch_if_not_cached(historical_file, historical_url)
+    historical_str = cache.read("tn/tn_historical.csv")
 
-
-
-
-
-    for data in data_list[1:]:
-        # splitting the data on its delimiter
-        items = str(data).split("|")
-
-        # making sure that the last item in the list is the data value of interest
-        # splitting based on last character of each text-html data sequence
-        raw_data = []
-        for item in items:
-            value_html = item.split(":")[-1]
-            value_soup = BeautifulSoup(value_html, "html5lib")
-            string_list = list(value_soup.stripped_strings)
-            if len(string_list) > 0:
-                value = string_list[-1]
-            else:
-                continue
-            raw_data.append(value)
-
-        # If there aren't six entries it's junk
-        if len(raw_data) != 6:
-            continue
-
-        # Pluck out the values we want
-        nice_data = [
-            raw_data[0],  # Notice Date
-            raw_data[4],  # Effective Date
-            "",  # Received Date
-            raw_data[1],  # Company
-            "",  # City
-            raw_data[2],  # County
-            raw_data[3],  # Number of employees
-            "",  # Layoff/Closure
-            raw_data[5],  # Notice ID
-        ]
-
-        # Add them to the master list
-        cleaned_data.append(nice_data)
-
-    # The PDF header blacklist of rows to toss
-    pdf_header_blacklist = [
-        "Notice Date",
-        "Total",
-    ]
+    historicallist = list(csv.DictReader(historical_str.splitlines()))
+
+    # Combine fresh and historical
+    staginglist.extend(historicallist)
 
-    # Open the PDF
-    with pdfplumber.open(pdf_file) as pdf:
-        # Loop through all the pages
-        for i, my_page in enumerate(pdf.pages):
-            # All even pages have data, odd pages don't have the data
-            if i % 2 != 0:
-                continue
-
-            # Pull out the table and loop through the rows
-            table = my_page.extract_table()
-            if not table:
-                continue
-
-            # Cut empty rows
-            row_list = [r for r in table if any(r)]
-            if not row_list:
-                continue
-
-            # If this is a summary table, skip it
-            first_cell = row_list[0][0]
-            assert first_cell
-            if first_cell.lower().strip() == "summary by month":
-                continue
-
-            # Loop through all the rows ...
-            for row in row_list:
-                # Skip redundant headers
-                if row[0] in pdf_header_blacklist:
-                    continue
-
-                # Toss in an empty Notice ID since it isn't in the PDF
-                row.append("")
-
-                # Add the data to our output
-                cleaned_data.append(row)
-
-    # Set the path to the final CSV
     output_csv = data_dir / "tn.csv"
 
-
-    utils.write_rows_to_csv(output_csv, cleaned_data)
+    utils.write_dict_rows_to_csv(output_csv, dataheaders, staginglist)
 
-    # Return the path to the final CSV
     return output_csv
 
 
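After staging each live row as a dictionary keyed by dataheaders, the scraper appends the historical records hosted by Big Local News and writes everything through utils.write_dict_rows_to_csv. Below is a stdlib-only sketch of that merge-and-write step, with csv.DictWriter standing in for the helper (whose exact behavior isn't shown in this diff) and invented sample rows.

```python
import csv

dataheaders = [
    "Notice Date", "Effective Date", "Received Date", "Company",
    "City", "County", "No. Of Employees", "Layoff/Closure", "Notice ID",
]

# Freshly scraped rows (invented examples); missing keys are filled on write
staginglist = [
    {"Notice Date": "1/2/2024", "Company": "Example Co", "County": "Davidson"},
]

# The historical file is read back as one string, so DictReader is fed an
# iterable of lines via splitlines(), exactly as the new scraper code does
historical_str = "Notice Date,Company,County\n6/1/2016,Old Plant LLC,Shelby\n"
staginglist.extend(csv.DictReader(historical_str.splitlines()))

with open("tn.csv", "w", newline="", encoding="utf-8") as outfile:
    # restval="" writes empty strings for columns the source never supplies,
    # e.g. Received Date, mirroring the dict-of-None staging pattern above
    writer = csv.DictWriter(outfile, fieldnames=dataheaders, restval="")
    writer.writeheader()
    writer.writerows(staginglist)
```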
{warn_scraper-1.2.72.dist-info → warn_scraper-1.2.73.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: warn-scraper
-Version: 1.2.72
+Version: 1.2.73
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
{warn_scraper-1.2.72.dist-info → warn_scraper-1.2.73.dist-info}/RECORD
CHANGED
@@ -58,16 +58,16 @@ warn/scrapers/or.py,sha256=0PjyrW3CHdxtHhqEo3Ob-9B6YckACoBD3K0c4FPQUcg,5208
 warn/scrapers/ri.py,sha256=vBbXFP5ClvqlOc_srR8sHsA8lpi7eLuMYm7ydUY5Fxo,4163
 warn/scrapers/sc.py,sha256=p3kscSNSW9C8C5QaSUbCAo6XibgB7G2iH6zaMH7Mnsc,4819
 warn/scrapers/sd.py,sha256=_4R19Ybzsyx1PvcWV3_laJmJ3etrwVGfhNEQm6njwoA,1904
-warn/scrapers/tn.py,sha256=…
+warn/scrapers/tn.py,sha256=i1H7c09Ea3CDrTXqqRMLBMPT_34QtGA0-x7T8rm_j5Q,2945
 warn/scrapers/tx.py,sha256=7lRIA13CyU1taYdxDA-t6uRn5q13Cr3oR1SNaEe3Dlg,4329
 warn/scrapers/ut.py,sha256=iUh38YIjbvv5MyyKacsiZNe8KjfdBeDaOf-qMQEF_kc,2245
 warn/scrapers/va.py,sha256=13lhkQrSkPGHEiWUuf1qiS890PWYE5gV-TgISpoiQnc,1711
 warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
 warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
 warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
-warn_scraper-1.2.72.dist-info/LICENSE,sha256=…
-warn_scraper-1.2.72.dist-info/METADATA,sha256=…
-warn_scraper-1.2.72.dist-info/WHEEL,sha256=…
-warn_scraper-1.2.72.dist-info/entry_points.txt,sha256=…
-warn_scraper-1.2.72.dist-info/top_level.txt,sha256=…
-warn_scraper-1.2.72.dist-info/RECORD,,
+warn_scraper-1.2.73.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+warn_scraper-1.2.73.dist-info/METADATA,sha256=5FQltUNKR1LZmPu4Yqz8aqogBoQNNhSyISPC0SQ1sdg,2025
+warn_scraper-1.2.73.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+warn_scraper-1.2.73.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
+warn_scraper-1.2.73.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
+warn_scraper-1.2.73.dist-info/RECORD,,
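Each RECORD entry has the form path,sha256=digest,size, where the digest is the file's SHA-256 hash encoded as urlsafe base64 with the trailing "=" padding stripped, per the wheel spec. A small sketch of how one of these lines is derived:

```python
import base64
import hashlib
from pathlib import Path

def record_line(path: str) -> str:
    # Hash the file bytes, then encode the digest the way RECORD expects:
    # urlsafe base64 with "=" padding removed, followed by the byte count
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
    return f"{path},sha256={digest.rstrip(b'=').decode('ascii')},{len(data)}"

# Run against the unpacked 1.2.73 wheel, this should reproduce the new
# "warn/scrapers/tn.py,sha256=i1H7c09Ea3...,2945" entry shown above
print(record_line("warn/scrapers/tn.py"))
```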
{warn_scraper-1.2.72.dist-info → warn_scraper-1.2.73.dist-info}/LICENSE
File without changes
{warn_scraper-1.2.72.dist-info → warn_scraper-1.2.73.dist-info}/entry_points.txt
File without changes
{warn_scraper-1.2.72.dist-info → warn_scraper-1.2.73.dist-info}/top_level.txt
File without changes