warn-scraper 1.2.72__py3-none-any.whl → 1.2.74__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
warn/scrapers/ca.py CHANGED
@@ -224,20 +224,25 @@ def _extract_pdf_data(pdf_path):
             if "summary" in first_cell:
                 continue
             for row in rows:
-                data_row = {}
-                for i, value in enumerate(row):
-                    this_raw_header = raw_header[i]
-                    this_clean_header = header_crosswalk[this_raw_header]
-                    data_row[this_clean_header] = value
-                # Data clean-ups
-                data_row.update(
-                    {
-                        "effective_date": data_row["effective_date"].replace(" ", ""),
-                        "received_date": data_row["received_date"].replace(" ", ""),
-                        "source_file": str(pdf_path).split("/")[-1],
-                    }
-                )
-                data.append(data_row)
+                # Summary rows have an extra field, and the above code does not
+                # block the summary table from being parsed if it jumps onto another page.
+                if len(row) != len(raw_header) + 1:
+                    data_row = {}
+                    for i, value in enumerate(row):
+                        this_raw_header = raw_header[i]
+                        this_clean_header = header_crosswalk[this_raw_header]
+                        data_row[this_clean_header] = value
+                    # Data clean-ups
+                    data_row.update(
+                        {
+                            "effective_date": data_row["effective_date"].replace(
+                                " ", ""
+                            ),
+                            "received_date": data_row["received_date"].replace(" ", ""),
+                            "source_file": str(pdf_path).split("/")[-1],
+                        }
+                    )
+                    data.append(data_row)
 
     return data
 
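The ca.py change guards against summary tables that spill onto later pages: a summary row carries one more field than the data header, so any row whose length equals len(raw_header) + 1 is dropped before the header crosswalk is applied. Below is a minimal, self-contained sketch of that check; the header, crosswalk, and row values are made up for illustration rather than taken from the scraper's real pdfplumber output.

# Illustrative sketch only: raw_header, header_crosswalk and the rows below
# are hypothetical; the real scraper builds them from the PDF tables.
raw_header = ["Company", "County", "Received Date", "Effective Date"]
header_crosswalk = {
    "Company": "company",
    "County": "county",
    "Received Date": "received_date",
    "Effective Date": "effective_date",
}
rows = [
    ["Acme Corp", "Fresno", "01/02/ 2023", "02/01 /2023"],  # ordinary data row
    ["January", "3", "120", "0", "123"],  # summary row: one extra field
]

data = []
for row in rows:
    # Summary rows have one more field than the header, so skip them.
    if len(row) != len(raw_header) + 1:
        data_row = {header_crosswalk[h]: v for h, v in zip(raw_header, row)}
        # Strip stray spaces that table extraction leaves inside dates.
        data_row["received_date"] = data_row["received_date"].replace(" ", "")
        data_row["effective_date"] = data_row["effective_date"].replace(" ", "")
        data.append(data_row)

print(data)  # only the Acme Corp row survives the length check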
warn/scrapers/tn.py CHANGED
@@ -1,14 +1,14 @@
+import csv
 import typing
 from pathlib import Path
 
-import pdfplumber
 from bs4 import BeautifulSoup
 
 from .. import utils
 from ..cache import Cache
 
-__authors__ = ["anikasikka"]
-__tags__ = ["html", "pdf"]
+__authors__ = ["anikasikka", "stucka"]
+__tags__ = ["html"]
 __source__ = {
     "name": "Tennessee Department of Labor and Workforce Development",
     "url": "https://www.tn.gov/workforce/general-resources/major-publications0/major-publications-redirect/reports.html",
@@ -37,13 +37,11 @@ def scrape(
     )
     html = page.text
     cache.write("tn/source.html", html)
+    soup = BeautifulSoup(html, "html5lib")
+    tables = soup.find_all(attrs={"class": "tn-datatable"})
+    rows = BeautifulSoup(str(tables), "html5lib").find_all("tr")
 
-    # Grab the PDF with the archived historial data
-    pdf_url = "https://www.tn.gov/content/dam/tn/workforce/documents/majorpublications/reports/WarnReportByMonth.pdf"
-    pdf_file = cache.download("tn/pdffile.pdf", pdf_url)
-
-    # Set the headers we'll use for both sources
-    tn_headers = [
+    dataheaders: typing.List = [
         "Notice Date",
         "Effective Date",
         "Received Date",
@@ -53,102 +51,42 @@ def scrape(
         "No. Of Employees",
         "Layoff/Closure",
         "Notice ID",
+        # "Notice URL",
     ]
-    cleaned_data: typing.List[typing.Any] = [tn_headers]
 
-    # Parse the latest HTML file and convert to a list of rows, with a header in the first row.
-    soup = BeautifulSoup(html, "html5lib")
+    staginglist: typing.List = []
+    for row in reversed(rows):
+        cells = row.find_all("td")
+        if len(cells) == 6:  # Filter for potentially valid rows
+            line: typing.Dict = {}
+            for item in dataheaders:  # Build an ordered dictionary with null values
+                line[item] = None
+            line["Notice Date"] = cells[0].text.strip()
+            line["Effective Date"] = cells[4].text.strip()
+            line["Company"] = cells[1].text.strip()
+            line["County"] = cells[2].text.strip()
+            line["No. Of Employees"] = cells[3].text.strip()
+            line["Notice ID"] = cells[5].text.strip()
+            # line['Notice URL'] = cells[1].find("a")['href']
+            staginglist.append(line)
+
+    # Bring in historical data
+    historical_file = cache_dir / "tn/tn_historical.csv"
+    historical_url = (
+        "https://storage.googleapis.com/bln-data-public/warn-layoffs/tn_historical.csv"
+    )
+    utils.fetch_if_not_cached(historical_file, historical_url)
+    historical_str = cache.read("tn/tn_historical.csv")
 
-    # Grab all the list items on the page
-    data_list = soup.find_all("p")
-
-    # Loop through them all, skipping the first item, which is a header
-    for data in data_list[1:]:
-        # splitting the data on its delimiter
-        items = str(data).split("|")
-
-        # making sure that the last item in the list is the data value of interest
-        # splitting based on last character of each text-html data sequence
-        raw_data = []
-        for item in items:
-            value_html = item.split(":")[-1]
-            value_soup = BeautifulSoup(value_html, "html5lib")
-            string_list = list(value_soup.stripped_strings)
-            if len(string_list) > 0:
-                value = string_list[-1]
-            else:
-                continue
-            raw_data.append(value)
-
-        # If there aren't six entries it's junk
-        if len(raw_data) != 6:
-            continue
-
-        # Pluck out the values we want
-        nice_data = [
-            raw_data[0],  # Notice Date
-            raw_data[4],  # Effective Date
-            "",  # Received Date
-            raw_data[1],  # Company
-            "",  # City
-            raw_data[2],  # County
-            raw_data[3],  # Number of employees
-            "",  # Layoff/Closure
-            raw_data[5],  # Notice ID
-        ]
-
-        # Add them to the master list
-        cleaned_data.append(nice_data)
-
-    # The PDF header blacklist of rows to toss
-    pdf_header_blacklist = [
-        "Notice Date",
-        "Total",
-    ]
+    historicallist = list(csv.DictReader(historical_str.splitlines()))
+
+    # Combine fresh and historical
+    staginglist.extend(historicallist)
 
-    # Open the PDF
-    with pdfplumber.open(pdf_file) as pdf:
-        # Loop through all the pages
-        for i, my_page in enumerate(pdf.pages):
-            # Sll even pages have data, odd pages don't have the data
-            if i % 2 != 0:
-                continue
-
-            # Pull out the table and loop through the rows
-            table = my_page.extract_table()
-            if not table:
-                continue
-
-            # Cut empty rows
-            row_list = [r for r in table if any(r)]
-            if not row_list:
-                continue
-
-            # If this is a summary table, skip it
-            first_cell = row_list[0][0]
-            assert first_cell
-            if first_cell.lower().strip() == "summary by month":
-                continue
-
-            # Loop through all the rows ...
-            for row in row_list:
-                # Skip remove redundant headers
-                if row[0] in pdf_header_blacklist:
-                    continue
-
-                # Toss in an empty Notice ID since it isn't in the PDF
-                row.append("")
-
-                # Add the data to our output
-                cleaned_data.append(row)
-
-    # Set the path to the final CSV
     output_csv = data_dir / "tn.csv"
 
-    # Write out the rows to the export directory
-    utils.write_rows_to_csv(output_csv, cleaned_data)
+    utils.write_dict_rows_to_csv(output_csv, dataheaders, staginglist)
 
-    # Return the path to the final CSV
     return output_csv
 
 
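Taken together, the rewritten tn.py drops the pdfplumber pass over the archived PDF: current notices are read from the page's tn-datatable rows, the backlog comes from a historical CSV hosted by Big Local News, and both are merged into one list of dictionaries before utils.write_dict_rows_to_csv writes the output. The sketch below is a rough illustration of that flow run against inline stand-in HTML and CSV rather than the live page and download; the sample values are hypothetical.

import csv

from bs4 import BeautifulSoup

# Stand-in markup mimicking one tn-datatable row; the real scraper parses the live page.
html = """
<table class="tn-datatable">
  <tr><td>5/1/2023</td><td>Acme Corp</td><td>Shelby</td><td>150</td><td>7/1/2023</td><td>2023-123</td></tr>
</table>
"""

dataheaders = [
    "Notice Date", "Effective Date", "Received Date", "Company", "City",
    "County", "No. Of Employees", "Layoff/Closure", "Notice ID",
]

rows = BeautifulSoup(html, "html5lib").find_all("tr")

staginglist = []
for row in reversed(rows):
    cells = row.find_all("td")
    if len(cells) == 6:  # only rows with the expected six cells are kept
        line = {item: None for item in dataheaders}
        line["Notice Date"] = cells[0].text.strip()
        line["Company"] = cells[1].text.strip()
        line["County"] = cells[2].text.strip()
        line["No. Of Employees"] = cells[3].text.strip()
        line["Effective Date"] = cells[4].text.strip()
        line["Notice ID"] = cells[5].text.strip()
        staginglist.append(line)

# The historical backlog arrives as a CSV download; an inline string stands in for it here.
historical_csv = "Notice Date,Company,County\n1/15/2019,Old Mill,Knox\n"
staginglist.extend(csv.DictReader(historical_csv.splitlines()))

print(staginglist)  # one fresh row plus one historical row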
warn_scraper-1.2.72.dist-info/METADATA → warn_scraper-1.2.74.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: warn-scraper
-Version: 1.2.72
+Version: 1.2.74
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
warn_scraper-1.2.72.dist-info/RECORD → warn_scraper-1.2.74.dist-info/RECORD RENAMED
@@ -28,7 +28,7 @@ warn/scrapers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 warn/scrapers/ak.py,sha256=h7BYMTV0whwWAPhbzVDVKMMoVCFphKly70aiTHabPq4,1847
 warn/scrapers/al.py,sha256=D0rT9GQ0vwfkRuveVAt-Po-T6b2TI1EPGeLOBy2m3_M,2240
 warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
-warn/scrapers/ca.py,sha256=_LvkIci1nTUKBt5KC-wEcazWG7zoeUeadxj4D0XD97k,8170
+warn/scrapers/ca.py,sha256=rBTB-6LmNIlbGCqrCtI3O-w2e_0kcVSFxvjvh4EHBlk,8511
 warn/scrapers/co.py,sha256=g076Zqe8XA8tbW03HP6-03mJV8fft1niHfa5Sy6me9A,7388
 warn/scrapers/ct.py,sha256=HLMmBSFhT5Y3vZQUwRyCTxiG5BMQXTfG3SEj5rkQEL4,4771
 warn/scrapers/dc.py,sha256=_sHLnVqK_W90QqJb_W88yDlgPjoMl63LYZP3CJfdN9g,4484
@@ -58,16 +58,16 @@ warn/scrapers/or.py,sha256=0PjyrW3CHdxtHhqEo3Ob-9B6YckACoBD3K0c4FPQUcg,5208
 warn/scrapers/ri.py,sha256=vBbXFP5ClvqlOc_srR8sHsA8lpi7eLuMYm7ydUY5Fxo,4163
 warn/scrapers/sc.py,sha256=p3kscSNSW9C8C5QaSUbCAo6XibgB7G2iH6zaMH7Mnsc,4819
 warn/scrapers/sd.py,sha256=_4R19Ybzsyx1PvcWV3_laJmJ3etrwVGfhNEQm6njwoA,1904
-warn/scrapers/tn.py,sha256=NBGAjNZ1_-a08Qdu0Y8PnxIjcqAozqdZAvXSAbPae-A,4770
+warn/scrapers/tn.py,sha256=i1H7c09Ea3CDrTXqqRMLBMPT_34QtGA0-x7T8rm_j5Q,2945
 warn/scrapers/tx.py,sha256=7lRIA13CyU1taYdxDA-t6uRn5q13Cr3oR1SNaEe3Dlg,4329
 warn/scrapers/ut.py,sha256=iUh38YIjbvv5MyyKacsiZNe8KjfdBeDaOf-qMQEF_kc,2245
 warn/scrapers/va.py,sha256=13lhkQrSkPGHEiWUuf1qiS890PWYE5gV-TgISpoiQnc,1711
 warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
 warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
 warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
-warn_scraper-1.2.72.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-warn_scraper-1.2.72.dist-info/METADATA,sha256=3SjdPSlNFIixBPUEqU3LKNIm6FnBkIc9ZYkYOJsRK9Y,2025
-warn_scraper-1.2.72.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-warn_scraper-1.2.72.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
-warn_scraper-1.2.72.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
-warn_scraper-1.2.72.dist-info/RECORD,,
+warn_scraper-1.2.74.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+warn_scraper-1.2.74.dist-info/METADATA,sha256=qUTAC44XVu9ARQFObvm3h9aJe0dAduopZX4-bP18i3k,2025
+warn_scraper-1.2.74.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+warn_scraper-1.2.74.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
+warn_scraper-1.2.74.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
+warn_scraper-1.2.74.dist-info/RECORD,,
warn_scraper-1.2.72.dist-info/WHEEL → warn_scraper-1.2.74.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.42.0)
+Generator: bdist_wheel (0.43.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 