warn-scraper 1.2.104__py3-none-any.whl → 1.2.105__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
warn/scrapers/id.py CHANGED
@@ -69,6 +69,11 @@ def scrape(
69
69
  with pdfplumber.open(pdf_file) as pdf:
70
70
  for index, page in enumerate(pdf.pages):
71
71
  rows = page.extract_table()
72
+ if rows[0][0] in ["Date of\nLetter", "Date of Letter"] and index > 1:
73
+ rows = rows[
74
+ 1:
75
+ ] # Drop inside header rows that _clean_table will mangle if merged cells span pages
76
+ # logger.debug(f"\n\nRows for page {page}: {rows}")
72
77
  output_rows += _clean_table(rows, index)
73
78
 
74
79
  # Write out the data to a CSV
@@ -118,8 +123,9 @@ def _clean_table(rows, page_index) -> list:
118
123
  # logger.debug(f"Dropping faulty row with {len(output_row)} elements: {output_row}")
119
124
 
120
125
  # Only include the header on the first page
121
- if page_index != 0:
122
- return output_rows[1:]
126
+ # No, this needed to be filtered earlier
127
+ # if page_index != 0:
128
+ # return output_rows[1:]
123
129
 
124
130
  return output_rows
125
131
 
@@ -132,18 +138,26 @@ def filter_garbage_rows(incoming: list):
132
138
 
133
139
  Returns: List of lists that have a minimum number of elements.
134
140
  """
135
- badrows: int = 0
141
+ shortrows: int = 0
142
+ mixedrows: int = 0
136
143
  outgoing: list = []
137
- for row in incoming:
138
- if len(row) >= 4:
144
+ for rowindex, row in enumerate(incoming):
145
+ error = False
146
+ if len(row) < 5:
147
+ error = True
148
+ logger.debug(f"Dropping short row: {row}")
149
+ shortrows += 1
150
+ if row[0] == "Date of Letter" and rowindex != 0: # Keep the header row
151
+ error = True
152
+ logger.debug(f"Dropping partial row: {row}")
153
+ mixedrows += 1
154
+ if not error:
139
155
  outgoing.append(row)
140
- else:
141
- badrows += 1
142
- if badrows == 0:
156
+ if shortrows == 0 and mixedrows == 0:
143
157
  logger.debug("No bad rows found.")
144
158
  else:
145
159
  logger.debug(
146
- f"!!!!! {badrows:,} bad rows dropped from the data set with insufficient number of fields."
160
+ f"!!!! Dropped {shortrows} rows with insufficient number of fields, and dropped {mixedrows} rows scrambled with header info"
147
161
  )
148
162
  return outgoing
149
163
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warn-scraper
3
- Version: 1.2.104
3
+ Version: 1.2.105
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -37,7 +37,7 @@ warn/scrapers/fl.py,sha256=YJ6Qt-jJZ7_iUKlHDaQuaV2gRmae8AJKS5dwwChadBE,9563
37
37
  warn/scrapers/ga.py,sha256=o_OF4zPQ3vJM8USQPD7l_ThyRWAzUZkwrwLHCvWmHMI,7429
38
38
  warn/scrapers/hi.py,sha256=pSplAP15_ZBfQtcywyErmvNcrk7u55TjZj_F0Nqw9L8,5660
39
39
  warn/scrapers/ia.py,sha256=zOncaA9M0d6paT4pB7UU_4D_yxUgeUiGRcnpKi9DsRA,1999
40
- warn/scrapers/id.py,sha256=vl9r7ItpLUnUaugMihf74DKQ321FAY7A7LGpzYAusZ8,5926
40
+ warn/scrapers/id.py,sha256=qJLcLgCgAfKzLpuwW32JqNwXn9NxZRZvQ50nZZKUhmE,6674
41
41
  warn/scrapers/il.py,sha256=sygdvsNuB_Gvu3o_HidtpSP4FLz0szKb1zEHqGxVtlI,1563
42
42
  warn/scrapers/in.py,sha256=dAT40ROhhKiwLcwa_YJ6EyhsYBLe0IX2rOWXmNa6JMs,2026
43
43
  warn/scrapers/ks.py,sha256=F_3biEMF7zgCX2XVuUACR74Vyzapta4SaM9SY3EuZCU,1266
@@ -65,9 +65,9 @@ warn/scrapers/va.py,sha256=hOPuiAjnTmtXCOdnBM_jAJuz9_u6oCxtbm2F-9m3ot0,10732
65
65
  warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
66
66
  warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
67
67
  warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
68
- warn_scraper-1.2.104.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
69
- warn_scraper-1.2.104.dist-info/METADATA,sha256=j9eq6IQ5GXReevRBLWI0Cv-_D7EtKIRCCRXI6I-U_4g,2385
70
- warn_scraper-1.2.104.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
71
- warn_scraper-1.2.104.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
- warn_scraper-1.2.104.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
- warn_scraper-1.2.104.dist-info/RECORD,,
68
+ warn_scraper-1.2.105.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
69
+ warn_scraper-1.2.105.dist-info/METADATA,sha256=vXzxe3HPaiZ9IjDDSA__ROWblz2T4jWKgjP_50v6XYQ,2385
70
+ warn_scraper-1.2.105.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
71
+ warn_scraper-1.2.105.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
+ warn_scraper-1.2.105.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
+ warn_scraper-1.2.105.dist-info/RECORD,,