warn-scraper 1.2.82__py3-none-any.whl → 1.2.84__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
warn/scrapers/md.py CHANGED
@@ -36,9 +36,13 @@ def scrape(
36
36
  # Set the cache
37
37
  cache = Cache(cache_dir)
38
38
 
39
+ # In November 2024 Maryland began throwing out many failed connection messages. These two things helped.
40
+ request_headers = {"User-Agent": "BigLocalNews.org"}
41
+ request_verify = False
42
+
39
43
  # Get the page
40
44
  url = "https://www.dllr.state.md.us/employment/warn.shtml"
41
- r = utils.get_url(url)
45
+ r = utils.get_url(url, headers=request_headers, verify=request_verify)
42
46
  r.encoding = "utf-8"
43
47
  html = r.text
44
48
 
@@ -56,17 +60,41 @@ def scrape(
56
60
  html_list = []
57
61
  html_list.append(html) # Save the source HTML for parsing also
58
62
 
63
+ old_pages = [
64
+ "warn2023.shtml",
65
+ "warn2022.shtml",
66
+ "warn2021.shtml",
67
+ "warn2020.shtml",
68
+ "warn2019.shtml",
69
+ "warn2018.shtml",
70
+ "warn2017.shtml",
71
+ "warn2016.shtml",
72
+ "warn2015.shtml",
73
+ "warn2014.shtml",
74
+ "warn2013.shtml",
75
+ "warn2012.shtml",
76
+ "warn2011.shtml",
77
+ "warn2010.shtml",
78
+ ]
79
+
59
80
  for href in href_list:
60
81
  # Request the HTML
61
82
  url = f"https://www.dllr.state.md.us/employment/{href}"
62
- r = utils.get_url(url)
63
- r.encoding = "utf-8"
64
- html = r.text
65
-
66
- # Save it to the cache
67
- cache.write(f"md/{href}.html", html)
68
-
69
- sleep(naptime) # Try to stop blocked connections by being less aggressive
83
+ filename = cache_dir / f"md/{href}.html"
84
+
85
+ if href not in old_pages:
86
+ sleep(naptime) # Try to stop blocked connections by being less aggressive
87
+ r = utils.get_url(url, headers=request_headers, verify=request_verify)
88
+ r.encoding = "utf-8"
89
+ html = r.text
90
+
91
+ # Save it to the cache
92
+ cache.write(filename, html)
93
+ else:
94
+ r = utils.fetch_if_not_cached(
95
+ filename, url, headers=request_headers, verify=request_verify
96
+ )
97
+ html = cache.read(filename)
70
98
 
71
99
  # Add it to the list
72
100
  html_list.append(html)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: warn-scraper
3
- Version: 1.2.82
3
+ Version: 1.2.84
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -23,7 +23,7 @@ Classifier: Programming Language :: Python :: 3.10
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE
25
25
  Requires-Dist: click
26
- Requires-Dist: bs4
26
+ Requires-Dist: beautifulsoup4
27
27
  Requires-Dist: html5lib
28
28
  Requires-Dist: pdfplumber
29
29
  Requires-Dist: requests
@@ -43,7 +43,7 @@ warn/scrapers/in.py,sha256=dAT40ROhhKiwLcwa_YJ6EyhsYBLe0IX2rOWXmNa6JMs,2026
43
43
  warn/scrapers/ks.py,sha256=F_3biEMF7zgCX2XVuUACR74Vyzapta4SaM9SY3EuZCU,1266
44
44
  warn/scrapers/ky.py,sha256=XjIojMpaoKbypa7l23IybP02jBijBCJG5UGqfO-EYjg,4365
45
45
  warn/scrapers/la.py,sha256=60z-4LZY5xp6aX8r6HGGW3FaOVEGnxlG2Mfgpt4G2WE,12877
46
- warn/scrapers/md.py,sha256=e-tiiKwr9dNEemtk7SWMv317Nv-qEDf5xPNcMU8AZDQ,3045
46
+ warn/scrapers/md.py,sha256=qRnmYsSFTQWeFUN6RJislnrKJ6ky2tqJUHJrmin9C-s,4011
47
47
  warn/scrapers/me.py,sha256=q36F4yJ7hvZsLayA3uBS1romo4X3Qf-sEi2Y7LAQCi8,1172
48
48
  warn/scrapers/mi.py,sha256=9clZ9mATEJwdVLzDo_h66rK0aV5Zc7GGQ7AauutS6Wo,3591
49
49
  warn/scrapers/mo.py,sha256=wnnwQAiVPwuheMqptMXZpyQdiKNghhKwTO-Bnh9oXoU,3492
@@ -65,9 +65,9 @@ warn/scrapers/va.py,sha256=13lhkQrSkPGHEiWUuf1qiS890PWYE5gV-TgISpoiQnc,1711
65
65
  warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
66
66
  warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
67
67
  warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
68
- warn_scraper-1.2.82.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
69
- warn_scraper-1.2.82.dist-info/METADATA,sha256=kxvv_1p1AQwjc40wHLkQddflASZc9I4_GxtcySjdNIo,2025
70
- warn_scraper-1.2.82.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
71
- warn_scraper-1.2.82.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
- warn_scraper-1.2.82.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
- warn_scraper-1.2.82.dist-info/RECORD,,
68
+ warn_scraper-1.2.84.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
69
+ warn_scraper-1.2.84.dist-info/METADATA,sha256=Ij4-XcqKM3KulM8QABn24sKO1k23yxuOft-G738JBdY,2036
70
+ warn_scraper-1.2.84.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
71
+ warn_scraper-1.2.84.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
+ warn_scraper-1.2.84.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
+ warn_scraper-1.2.84.dist-info/RECORD,,