warn-scraper 1.2.106__py3-none-any.whl → 1.2.107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
warn/scrapers/dc.py CHANGED
@@ -48,13 +48,15 @@ def scrape(
48
48
  url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year - 1}"
49
49
  success, content = utils.save_if_good_url(targetfile, url)
50
50
 
51
- root_html = content
52
- # r = utils.get_url(url)
53
- # r.encoding = "utf-8"
54
- # root_html = r.text
51
+ root_html = cache.read(targetfile) # Explicitly re-read as text for regex to work
55
52
 
56
- # Save it to the cache
57
- # cache.write(targetfile, root_html)
53
+ # A June 2025 entry includes a weird table inside a table cell.
54
+ # This is an ugly patch.
55
+ weirdtable = r"(\s+<table>\s+<tbody>\s+<tr>\s+<td>)(.*)(</td>\s+</tr>\s+</tbody>\s+</table>\s+)"
56
+ after = re.subn(weirdtable, r"\2", root_html) #
57
+ if after[1] > 0:
58
+ logger.debug(f"{after[1]} changes made to {url}")
59
+ root_html = after[0]
58
60
 
59
61
  # Parse the list of links
60
62
  soup = BeautifulSoup(root_html, "html5lib")
@@ -84,16 +86,18 @@ def scrape(
84
86
  r = utils.get_url(href)
85
87
  r.encoding = "utf-8"
86
88
  html = r.text
87
-
88
- # A June 2025 entry includes a weird table inside a table cell.
89
- # This is an ugly patch.
90
- weirdtable = r"(\s+<table>\s+<tbody>\s+<tr>\s+<td>)(.*?)(</td>\s+</tr>\s+</tbody>\s+</table>\s+)"
91
- html = re.sub(weirdtable, r"\2", html)
92
-
93
89
  # Save it to the cache
94
90
  cache_key = uuid.uuid5(uuid.NAMESPACE_URL, href)
95
91
  cache.write(f"dc/{cache_key}.html", html)
96
92
 
93
+ # A June 2025 entry includes a weird table inside a table cell.
94
+ # This is an ugly patch.
95
+ weirdtable = r"(\s+<table>\s+<tbody>\s+<tr>\s+<td>)(.*)(</td>\s+</tr>\s+</tbody>\s+</table>\s+)"
96
+ after = re.subn(weirdtable, r"\2", html)
97
+ if after[1] > 0:
98
+ logger.debug(f"{after[1]} changes made to {href}")
99
+ html = after[0]
100
+
97
101
  # Add it to the list
98
102
  html_list.append(html)
99
103
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warn-scraper
3
- Version: 1.2.106
3
+ Version: 1.2.107
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -31,7 +31,7 @@ warn/scrapers/az.py,sha256=elGbue01Gjf_DQ66Wy9qqGIOJsiY-KIKJOVeft8pCXg,1447
31
31
  warn/scrapers/ca.py,sha256=VQOfjHXPCc-jYwh-EPGVVfnzvXB7pdmCt2uJ6QnMPRM,8600
32
32
  warn/scrapers/co.py,sha256=hUfqrzlhXQBkP4vxewVRrMZrgInoLer5S2MZlyYIQE4,7878
33
33
  warn/scrapers/ct.py,sha256=PKeZtlB0-z2wCmYmGl_WYoVo2gzwKV36upZcJVaJxjM,4852
34
- warn/scrapers/dc.py,sha256=NNTdjgUzC3jR-8F64U7dA77MzKo0dTTo64XjjpBu4cc,4743
34
+ warn/scrapers/dc.py,sha256=C0JwgGX7A4JMxlahTrfzbFKpmyPxF7y6wsnUf-sE3OU,5120
35
35
  warn/scrapers/de.py,sha256=GyM92A-lFwZAfRxgbO-sIWhRfmBEKirzchaPIv-u0o4,1364
36
36
  warn/scrapers/fl.py,sha256=YJ6Qt-jJZ7_iUKlHDaQuaV2gRmae8AJKS5dwwChadBE,9563
37
37
  warn/scrapers/ga.py,sha256=o_OF4zPQ3vJM8USQPD7l_ThyRWAzUZkwrwLHCvWmHMI,7429
@@ -65,9 +65,9 @@ warn/scrapers/va.py,sha256=hOPuiAjnTmtXCOdnBM_jAJuz9_u6oCxtbm2F-9m3ot0,10732
65
65
  warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
66
66
  warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
67
67
  warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
68
- warn_scraper-1.2.106.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
69
- warn_scraper-1.2.106.dist-info/METADATA,sha256=eGgAOPnHhNXmKPGssgfGwKdtkZfn5QPz2qa2dP0kBQU,2385
70
- warn_scraper-1.2.106.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
71
- warn_scraper-1.2.106.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
- warn_scraper-1.2.106.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
- warn_scraper-1.2.106.dist-info/RECORD,,
68
+ warn_scraper-1.2.107.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
69
+ warn_scraper-1.2.107.dist-info/METADATA,sha256=NYe0Bakge_0cvILQQr7jVI7xnpeHzZ2dDRYhJAMOK4Q,2385
70
+ warn_scraper-1.2.107.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
71
+ warn_scraper-1.2.107.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
+ warn_scraper-1.2.107.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
+ warn_scraper-1.2.107.dist-info/RECORD,,