warn-scraper 1.2.104__py3-none-any.whl → 1.2.106__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
warn/scrapers/dc.py CHANGED
@@ -85,6 +85,11 @@ def scrape(
85
85
  r.encoding = "utf-8"
86
86
  html = r.text
87
87
 
88
+ # A June 2025 entry includes a weird table inside a table cell.
89
+ # This is an ugly patch.
90
+ weirdtable = r"(\s+<table>\s+<tbody>\s+<tr>\s+<td>)(.*?)(</td>\s+</tr>\s+</tbody>\s+</table>\s+)"
91
+ html = re.sub(weirdtable, r"\2", html)
92
+
88
93
  # Save it to the cache
89
94
  cache_key = uuid.uuid5(uuid.NAMESPACE_URL, href)
90
95
  cache.write(f"dc/{cache_key}.html", html)
warn/scrapers/id.py CHANGED
@@ -69,6 +69,11 @@ def scrape(
69
69
  with pdfplumber.open(pdf_file) as pdf:
70
70
  for index, page in enumerate(pdf.pages):
71
71
  rows = page.extract_table()
72
+ if rows[0][0] in ["Date of\nLetter", "Date of Letter"] and index > 1:
73
+ rows = rows[
74
+ 1:
75
+ ] # Drop inside header rows that _clean_table will mangle if merged cells span pages
76
+ # logger.debug(f"\n\nRows for page {page}: {rows}")
72
77
  output_rows += _clean_table(rows, index)
73
78
 
74
79
  # Write out the data to a CSV
@@ -118,8 +123,9 @@ def _clean_table(rows, page_index) -> list:
118
123
  # logger.debug(f"Dropping faulty row with {len(output_row)} elements: {output_row}")
119
124
 
120
125
  # Only include the header on the first page
121
- if page_index != 0:
122
- return output_rows[1:]
126
+ # No, this needed to be filtered earlier
127
+ # if page_index != 0:
128
+ # return output_rows[1:]
123
129
 
124
130
  return output_rows
125
131
 
@@ -132,18 +138,26 @@ def filter_garbage_rows(incoming: list):
132
138
 
133
139
  Returns: List of lists that have a minimum number of elements.
134
140
  """
135
- badrows: int = 0
141
+ shortrows: int = 0
142
+ mixedrows: int = 0
136
143
  outgoing: list = []
137
- for row in incoming:
138
- if len(row) >= 4:
144
+ for rowindex, row in enumerate(incoming):
145
+ error = False
146
+ if len(row) < 5:
147
+ error = True
148
+ logger.debug(f"Dropping short row: {row}")
149
+ shortrows += 1
150
+ if row[0] == "Date of Letter" and rowindex != 0: # Keep the header row
151
+ error = True
152
+ logger.debug(f"Dropping partial row: {row}")
153
+ mixedrows += 1
154
+ if not error:
139
155
  outgoing.append(row)
140
- else:
141
- badrows += 1
142
- if badrows == 0:
156
+ if shortrows == 0 and mixedrows == 0:
143
157
  logger.debug("No bad rows found.")
144
158
  else:
145
159
  logger.debug(
146
- f"!!!!! {badrows:,} bad rows dropped from the data set with insufficient number of fields."
160
+ f"!!!! Dropped {shortrows} rows with insufficient number of fields, and dropped {mixedrows} rows scrambled with header info"
147
161
  )
148
162
  return outgoing
149
163
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warn-scraper
3
- Version: 1.2.104
3
+ Version: 1.2.106
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -31,13 +31,13 @@ warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
31
31
  warn/scrapers/ca.py,sha256=VQOfjHXPCc-jYwh-EPGVVfnzvXB7pdmCt2uJ6QnMPRM,8600
32
32
  warn/scrapers/co.py,sha256=hUfqrzlhXQBkP4vxewVRrMZrgInoLer5S2MZlyYIQE4,7878
33
33
  warn/scrapers/ct.py,sha256=PKeZtlB0-z2wCmYmGl_WYoVo2gzwKV36upZcJVaJxjM,4852
34
- warn/scrapers/dc.py,sha256=_sHLnVqK_W90QqJb_W88yDlgPjoMl63LYZP3CJfdN9g,4484
34
+ warn/scrapers/dc.py,sha256=NNTdjgUzC3jR-8F64U7dA77MzKo0dTTo64XjjpBu4cc,4743
35
35
  warn/scrapers/de.py,sha256=GyM92A-lFwZAfRxgbO-sIWhRfmBEKirzchaPIv-u0o4,1364
36
36
  warn/scrapers/fl.py,sha256=YJ6Qt-jJZ7_iUKlHDaQuaV2gRmae8AJKS5dwwChadBE,9563
37
37
  warn/scrapers/ga.py,sha256=o_OF4zPQ3vJM8USQPD7l_ThyRWAzUZkwrwLHCvWmHMI,7429
38
38
  warn/scrapers/hi.py,sha256=pSplAP15_ZBfQtcywyErmvNcrk7u55TjZj_F0Nqw9L8,5660
39
39
  warn/scrapers/ia.py,sha256=zOncaA9M0d6paT4pB7UU_4D_yxUgeUiGRcnpKi9DsRA,1999
40
- warn/scrapers/id.py,sha256=vl9r7ItpLUnUaugMihf74DKQ321FAY7A7LGpzYAusZ8,5926
40
+ warn/scrapers/id.py,sha256=qJLcLgCgAfKzLpuwW32JqNwXn9NxZRZvQ50nZZKUhmE,6674
41
41
  warn/scrapers/il.py,sha256=sygdvsNuB_Gvu3o_HidtpSP4FLz0szKb1zEHqGxVtlI,1563
42
42
  warn/scrapers/in.py,sha256=dAT40ROhhKiwLcwa_YJ6EyhsYBLe0IX2rOWXmNa6JMs,2026
43
43
  warn/scrapers/ks.py,sha256=F_3biEMF7zgCX2XVuUACR74Vyzapta4SaM9SY3EuZCU,1266
@@ -65,9 +65,9 @@ warn/scrapers/va.py,sha256=hOPuiAjnTmtXCOdnBM_jAJuz9_u6oCxtbm2F-9m3ot0,10732
65
65
  warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
66
66
  warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
67
67
  warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
68
- warn_scraper-1.2.104.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
69
- warn_scraper-1.2.104.dist-info/METADATA,sha256=j9eq6IQ5GXReevRBLWI0Cv-_D7EtKIRCCRXI6I-U_4g,2385
70
- warn_scraper-1.2.104.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
71
- warn_scraper-1.2.104.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
- warn_scraper-1.2.104.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
- warn_scraper-1.2.104.dist-info/RECORD,,
68
+ warn_scraper-1.2.106.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
69
+ warn_scraper-1.2.106.dist-info/METADATA,sha256=eGgAOPnHhNXmKPGssgfGwKdtkZfn5QPz2qa2dP0kBQU,2385
70
+ warn_scraper-1.2.106.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
71
+ warn_scraper-1.2.106.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
+ warn_scraper-1.2.106.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
+ warn_scraper-1.2.106.dist-info/RECORD,,