warn-scraper 1.2.88__py3-none-any.whl → 1.2.89__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warn/cache.py +1 -1
- warn/scrapers/va.py +124 -27
- warn/utils.py +2 -2
- {warn_scraper-1.2.88.dist-info → warn_scraper-1.2.89.dist-info}/METADATA +5 -3
- {warn_scraper-1.2.88.dist-info → warn_scraper-1.2.89.dist-info}/RECORD +9 -9
- {warn_scraper-1.2.88.dist-info → warn_scraper-1.2.89.dist-info}/LICENSE +0 -0
- {warn_scraper-1.2.88.dist-info → warn_scraper-1.2.89.dist-info}/WHEEL +0 -0
- {warn_scraper-1.2.88.dist-info → warn_scraper-1.2.89.dist-info}/entry_points.txt +0 -0
- {warn_scraper-1.2.88.dist-info → warn_scraper-1.2.89.dist-info}/top_level.txt +0 -0
warn/cache.py
CHANGED
warn/scrapers/va.py
CHANGED
@@ -1,13 +1,20 @@
|
|
1
|
+
import datetime
|
1
2
|
import logging
|
3
|
+
import os
|
4
|
+
from glob import glob
|
2
5
|
from pathlib import Path
|
6
|
+
from shutil import copyfile
|
7
|
+
from time import sleep
|
8
|
+
|
9
|
+
from selenium import webdriver
|
10
|
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
11
|
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
12
|
+
from webdriver_manager.chrome import ChromeDriverManager
|
3
13
|
|
4
14
|
from .. import utils
|
5
15
|
from ..cache import Cache
|
6
16
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
__authors__ = ["zstumgoren", "Dilcia19", "shallotly"]
|
17
|
+
__authors__ = ["zstumgoren", "Dilcia19", "shallotly", "stucka"]
|
11
18
|
__tags__ = ["html", "csv"]
|
12
19
|
__source__ = {
|
13
20
|
"name": "Virginia Employment Commission",
|
@@ -30,37 +37,127 @@ def scrape(
|
|
30
37
|
|
31
38
|
Returns: the Path where the file is written
|
32
39
|
"""
|
33
|
-
|
34
|
-
|
40
|
+
cache = Cache(cache_dir)
|
41
|
+
csv_url = "https://vec.virginia.gov/warn-notices-csv.csv"
|
42
|
+
|
43
|
+
"""
|
44
|
+
This scraper originally tried to parse HTML to find a CSV download link.
|
45
|
+
The HTML scraping portion broke in early December 2024. The code had
|
46
|
+
also been downloading an incomplete slice of the data.
|
47
|
+
|
48
|
+
In late December 2024, everything broke because Virginia decided to begin
|
49
|
+
testing for Javascript-aware browsers. This code is the way it is because
|
50
|
+
every alternative considered was somehow worse. Not helping? Losing about
|
51
|
+
four hours of work including the extensive documentation on the
|
52
|
+
alternatives sought.
|
53
|
+
|
54
|
+
Virginia's protections require a JS-aware browser to evaluate some
|
55
|
+
obscured, frequently changing code to set some short-lived cookies.
|
56
|
+
Without those cookies, no code. And even headless browsers get blocked
|
57
|
+
by a video test. Really unfun. So a ... headed? ... JS-aware browser
|
58
|
+
is required.
|
59
|
+
|
60
|
+
Some things evaluated included, off memory:
|
61
|
+
|
62
|
+
-- Using Playwright instead. This looked like a reasonable approach but
|
63
|
+
was awful resource-wise. Playwright itself had significant overhead,
|
64
|
+
partially from requiring its own version of browsers to be installed.
|
65
|
+
There's apparently some way with YAML to try to get Github Actions,
|
66
|
+
where this project is in production, to install only for particular
|
67
|
+
branches. Without that code, this'd be pending a couple minutes
|
68
|
+
several times a day on each of about 40 different branches of code.
|
69
|
+
-- Using Selenium. This is where it ultimately landed. It's not great,
|
70
|
+
but after trying about a dozen alternatives it's the best we got.
|
71
|
+
-- Installation code for Chrome's driver started acting flaky between
|
72
|
+
platforms.
|
73
|
+
-- PhantomJS couldn't even get past the first brush with the protection.
|
74
|
+
-- The optimal file is the CSV created by the state with well-defined
|
75
|
+
fields. Unfortunately, hitting the link once approved by the
|
76
|
+
Javascript results in an immediate download. There's no regular way
|
77
|
+
to get the file path through Javascript. Backdoor efforts like trying
|
78
|
+
to go through the Download menu also failed, because Chrome puts
|
79
|
+
them into a Shadow DOM. Several hunks of code to try to access the
|
80
|
+
Shadow DOM and get at the local filename are no longer functional
|
81
|
+
in Chrome. Building an extension to track some of this ... is not
|
82
|
+
an option, and loading it the first time would require human
|
83
|
+
intervention rather than automation. There might be a way to mess
|
84
|
+
with the Shadow DOM through CSS manipulation, but that looked too
|
85
|
+
weird to bother trying especially given other more reasonable measures
|
86
|
+
that no longer worked.
|
87
|
+
-- Also, efforts to get at the CSV through view-source failed.
|
88
|
+
-- And it's possible to scrape the HTML and try to parse it back out for
|
89
|
+
what warn-scraper needs, but that seemed even more fraught than trying
|
90
|
+
to get the CSV.
|
91
|
+
-- So if the filename isn't obtainable through Chrome, where do we get it?
|
92
|
+
There's a multiplatform way to get at a user's home directory. For
|
93
|
+
many people Downloads is off there, at ... ~/Downloads, capital D,
|
94
|
+
plural. Except people can configure that differently. And most
|
95
|
+
languages won't call it Downloads. And Chrome of course lets people
|
96
|
+
set a default download location that can be anywhere else, or select
|
97
|
+
a per-file location ("Ask me where to save this" or some such).
|
98
|
+
After going down even more rabbit holes, ... ~/Downloads is all that
|
99
|
+
gets implemented here.
|
100
|
+
-- I tried to see if Firefox might be a little less grumpy. One Python
|
101
|
+
driver-finder got one day of commits. A fork has Issues turned off
|
102
|
+
somehow. The third one I looked at was the one that was grumpy for
|
103
|
+
Chrome, and its maintainer is apparently trying to protect his
|
104
|
+
homeland with FPV drones. So ... back to Chrome.
|
105
|
+
|
106
|
+
So, yes, this is a weird implementation. It's a terrible model. It's
|
107
|
+
even got a hard-coded wait. At least as of late December 2024, however,
|
108
|
+
it does work. ... in late December 2024.
|
109
|
+
"""
|
35
110
|
|
36
|
-
#
|
37
|
-
|
111
|
+
# driver = webdriver.Chrome(options=chromeoptionsholder, service=Service(ChromeDriverManager().install()))
|
112
|
+
logger.debug("Attempting to launch Chrome")
|
113
|
+
chromeoptionsholder = ChromeOptions()
|
114
|
+
chrome_install = ChromeDriverManager().install()
|
38
115
|
|
39
|
-
#
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
116
|
+
# Weird error with finding the driver name in Windows. Sometimes.
|
117
|
+
if chrome_install.endswith("THIRD_PARTY_NOTICES.chromedriver"):
|
118
|
+
chrome_install = chrome_install.replace(
|
119
|
+
"THIRD_PARTY_NOTICES.chromedriver", "chromedriver.exe"
|
120
|
+
)
|
121
|
+
logger.debug(f"Chrome install variable is {chrome_install}")
|
122
|
+
# folder = os.path.dirname(chrome_install)
|
123
|
+
# chromedriver_path = folder # os.path.join(folder, "chromedriver.exe")
|
124
|
+
# service = ChromeService(chromedriver_path)
|
125
|
+
service = ChromeService(chrome_install)
|
126
|
+
driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
|
127
|
+
logger.debug(f"Attempting to fetch {csv_url}")
|
128
|
+
driver.get(csv_url)
|
44
129
|
|
45
|
-
#
|
46
|
-
|
47
|
-
|
130
|
+
sleep(30) # Give it plenty of time to evaluate Javascript
|
131
|
+
|
132
|
+
download_dir = os.path.expanduser("~") + "/Downloads"
|
133
|
+
|
134
|
+
if not os.path.isdir(download_dir):
|
135
|
+
logger.error(f"The download directory is not {download_dir}.")
|
136
|
+
|
137
|
+
# get the list of files
|
138
|
+
list_of_files = glob(download_dir + "/warn-notices-csv*.csv")
|
139
|
+
if len(list_of_files) == 0:
|
140
|
+
logger.error(f"No matching files found in {download_dir}.")
|
141
|
+
|
142
|
+
# get the latest file name
|
143
|
+
latest_file = max(list_of_files, key=os.path.getctime)
|
144
|
+
latest_file_time = datetime.datetime.fromtimestamp(os.path.getctime(latest_file))
|
145
|
+
|
146
|
+
# print the latest file name
|
147
|
+
logger.debug(f"CSV saved to {latest_file}, saved at {latest_file_time}")
|
148
|
+
|
149
|
+
target_filename = cache_dir / "va" / "source.csv"
|
150
|
+
|
151
|
+
utils.create_directory(path=cache_dir / "va", is_file=False)
|
48
152
|
|
49
|
-
|
50
|
-
# soup = BeautifulSoup(html, "html.parser")
|
51
|
-
# csv_link = soup.find("a", text="Download")
|
52
|
-
# if isinstance(csv_link, Tag):
|
53
|
-
# csv_href = csv_link["href"]
|
54
|
-
# else:
|
55
|
-
# raise ValueError("Could not find CSV link")
|
153
|
+
logger.debug(f"Saving file to {target_filename}")
|
56
154
|
|
57
|
-
|
58
|
-
# csv_url = f"https://www.vec.virginia.gov{csv_href}"
|
155
|
+
copyfile(latest_file, target_filename)
|
59
156
|
|
60
|
-
|
157
|
+
driver.quit()
|
61
158
|
|
62
159
|
# Download it to the cache
|
63
|
-
cache.download("va/source.csv", csv_url, verify=True)
|
160
|
+
# cache.download("va/source.csv", csv_url, verify=True)
|
64
161
|
|
65
162
|
# Open it up as a list of rows
|
66
163
|
csv_rows = cache.read_csv("va/source.csv")
|
warn/utils.py
CHANGED
@@ -86,7 +86,7 @@ def save_if_good_url(filename, url, **kwargs):
|
|
86
86
|
success_flag = False
|
87
87
|
content = False
|
88
88
|
else:
|
89
|
-
with open(filename, "wb") as outfile:
|
89
|
+
with open(filename, "wb", encoding="utf-8") as outfile:
|
90
90
|
outfile.write(response.content)
|
91
91
|
success_flag = True
|
92
92
|
content = response.content
|
@@ -104,7 +104,7 @@ def write_rows_to_csv(output_path: Path, rows: list, mode="w"):
|
|
104
104
|
"""
|
105
105
|
create_directory(output_path, is_file=True)
|
106
106
|
logger.debug(f"Writing {len(rows)} rows to {output_path}")
|
107
|
-
with open(output_path, mode, newline="") as f:
|
107
|
+
with open(output_path, mode, newline="", encoding="utf-8") as f:
|
108
108
|
writer = csv.writer(f)
|
109
109
|
writer.writerows(rows)
|
110
110
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: warn-scraper
|
3
|
-
Version: 1.2.
|
3
|
+
Version: 1.2.89
|
4
4
|
Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
|
5
5
|
Home-page: https://github.com/biglocalnews/warn-scraper
|
6
6
|
Author: Big Local News
|
@@ -28,9 +28,11 @@ Requires-Dist: html5lib
|
|
28
28
|
Requires-Dist: pdfplumber
|
29
29
|
Requires-Dist: requests
|
30
30
|
Requires-Dist: openpyxl
|
31
|
-
Requires-Dist: xlrd
|
32
|
-
Requires-Dist: tenacity
|
33
31
|
Requires-Dist: retry
|
32
|
+
Requires-Dist: selenium
|
33
|
+
Requires-Dist: tenacity
|
34
|
+
Requires-Dist: xlrd
|
35
|
+
Requires-Dist: webdriver-manager
|
34
36
|
|
35
37
|
## Links
|
36
38
|
|
@@ -14,10 +14,10 @@ tests/cassettes/test_scrape_integration.yaml,sha256=5JfS-nscabP0rDhUeBIXMIFVSS5q
|
|
14
14
|
tests/fixtures/2021_page_1.html,sha256=ZIPBhPE2BTcJX-mm5_4M4pgQcnrQWDBuGrEJDonj2QE,34
|
15
15
|
tests/fixtures/2021_page_2.html,sha256=qm6lX8LwFRNT2WIIW2U29ku3wEGzvECzQJCWBtcwSbg,34
|
16
16
|
warn/__init__.py,sha256=A07JFY1TyaPtVIndBa7IvTk13DETqIkLgRdk0A-MCoE,85
|
17
|
-
warn/cache.py,sha256=
|
17
|
+
warn/cache.py,sha256=hyta04_G-ALGwcKl4xNc7EgHS_xklyVD5d8SXNrJekY,5520
|
18
18
|
warn/cli.py,sha256=ZqyJwICdHFkn2hEgbArj_upbElR9-TSDlYDqyEGeexE,2019
|
19
19
|
warn/runner.py,sha256=oeGRybGwpnkQKlPzRMlKxhsDt1GN4PZoX-vUwrsPgos,1894
|
20
|
-
warn/utils.py,sha256=
|
20
|
+
warn/utils.py,sha256=V1JQD-bPwNiZ8kpl_YsonfjtaF1a8M8jlBNbdwGXcq4,7062
|
21
21
|
warn/platforms/__init__.py,sha256=wIZRDf4tbTuC8oKM4ZrTAtwNgbtMQGzPXMwDYCFyrog,81
|
22
22
|
warn/platforms/job_center/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
23
|
warn/platforms/job_center/cache.py,sha256=yhA3sE46lNFg8vEewSoRYVByi0YSlkBiKm7qoSUiTdM,1868
|
@@ -61,13 +61,13 @@ warn/scrapers/sd.py,sha256=_4R19Ybzsyx1PvcWV3_laJmJ3etrwVGfhNEQm6njwoA,1904
|
|
61
61
|
warn/scrapers/tn.py,sha256=i1H7c09Ea3CDrTXqqRMLBMPT_34QtGA0-x7T8rm_j5Q,2945
|
62
62
|
warn/scrapers/tx.py,sha256=watfR1gyN9w7nluiAOnnIghEmoq3eShNUzYSZ8SkZy4,4438
|
63
63
|
warn/scrapers/ut.py,sha256=iUh38YIjbvv5MyyKacsiZNe8KjfdBeDaOf-qMQEF_kc,2245
|
64
|
-
warn/scrapers/va.py,sha256
|
64
|
+
warn/scrapers/va.py,sha256=jp5G9Z73s5j9A3-1IybFV0rmZSBWH73vNrQpC7XLSSU,7573
|
65
65
|
warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
|
66
66
|
warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
|
67
67
|
warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
|
68
|
-
warn_scraper-1.2.
|
69
|
-
warn_scraper-1.2.
|
70
|
-
warn_scraper-1.2.
|
71
|
-
warn_scraper-1.2.
|
72
|
-
warn_scraper-1.2.
|
73
|
-
warn_scraper-1.2.
|
68
|
+
warn_scraper-1.2.89.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
69
|
+
warn_scraper-1.2.89.dist-info/METADATA,sha256=5iDNnLM7c0Z6kpCF4-QB-6dbZWCkmPUnEw7FG6npfPo,2093
|
70
|
+
warn_scraper-1.2.89.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
|
71
|
+
warn_scraper-1.2.89.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
|
72
|
+
warn_scraper-1.2.89.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
|
73
|
+
warn_scraper-1.2.89.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|