warn-scraper 1.2.98__py3-none-any.whl → 1.2.100__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warn/scrapers/va.py +58 -6
- {warn_scraper-1.2.98.dist-info → warn_scraper-1.2.100.dist-info}/METADATA +2 -1
- {warn_scraper-1.2.98.dist-info → warn_scraper-1.2.100.dist-info}/RECORD +7 -7
- {warn_scraper-1.2.98.dist-info → warn_scraper-1.2.100.dist-info}/LICENSE +0 -0
- {warn_scraper-1.2.98.dist-info → warn_scraper-1.2.100.dist-info}/WHEEL +0 -0
- {warn_scraper-1.2.98.dist-info → warn_scraper-1.2.100.dist-info}/entry_points.txt +0 -0
- {warn_scraper-1.2.98.dist-info → warn_scraper-1.2.100.dist-info}/top_level.txt +0 -0
warn/scrapers/va.py
CHANGED
@@ -2,14 +2,19 @@ import datetime
|
|
2
2
|
import logging
|
3
3
|
import os
|
4
4
|
import platform
|
5
|
+
|
6
|
+
# import subprocess
|
5
7
|
from glob import glob
|
6
8
|
from pathlib import Path
|
9
|
+
from random import random
|
7
10
|
from shutil import copyfile
|
8
11
|
from time import sleep
|
9
12
|
|
10
13
|
from selenium import webdriver
|
11
14
|
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
12
15
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
16
|
+
from selenium.webdriver.common.by import By
|
17
|
+
from stealthenium import stealth
|
13
18
|
from webdriver_manager.chrome import ChromeDriverManager
|
14
19
|
|
15
20
|
from .. import utils
|
@@ -49,7 +54,9 @@ def scrape(
|
|
49
54
|
Returns: the Path where the file is written
|
50
55
|
"""
|
51
56
|
cache = Cache(cache_dir)
|
52
|
-
csv_url = "https://vec.virginia.gov/warn-notices-csv.csv"
|
57
|
+
# csv_url = "https://vec.virginia.gov/warn-notices-csv.csv"
|
58
|
+
start_page = "https://www.virginiaworks.gov/warn-notices/"
|
59
|
+
csv_url = "https://vec.virginia.gov/warn_notices.csv"
|
53
60
|
|
54
61
|
"""
|
55
62
|
This scraper originally tried to parse HTML to find a CSV download link.
|
@@ -117,6 +124,10 @@ def scrape(
|
|
117
124
|
So, yes, this is a weird implementation. It's a terrible model. It's
|
118
125
|
even got a hard-coded wait. At least as of late December 2024, however,
|
119
126
|
it does work. ... in late December 2024.
|
127
|
+
|
128
|
+
And then it broke in early January 2025! But it's not an IP block.
|
129
|
+
They may have started blocking direct calls to the CSV. Code patched
|
130
|
+
in late January 2025 to use the Download button.
|
120
131
|
"""
|
121
132
|
|
122
133
|
# driver = webdriver.Chrome(options=chromeoptionsholder, service=Service(ChromeDriverManager().install()))
|
@@ -126,6 +137,12 @@ def scrape(
|
|
126
137
|
chromeoptionsholder.add_argument("--disable-dev-shm-usage")
|
127
138
|
chromeoptionsholder.add_argument("--remote-debugging-pipe")
|
128
139
|
chromeoptionsholder.add_argument("--verbose")
|
140
|
+
chromeoptionsholder.add_argument("start-maximized")
|
141
|
+
chromeoptionsholder.add_experimental_option(
|
142
|
+
"excludeSwitches", ["enable-automation"]
|
143
|
+
)
|
144
|
+
chromeoptionsholder.add_experimental_option("useAutomationExtension", False)
|
145
|
+
chromeoptionsholder.add_argument("--disable-blink-features=AutomationControlled")
|
129
146
|
|
130
147
|
if "CHROMEWEBDRIVER" in os.environ:
|
131
148
|
chrome_install = os.environ["CHROMEWEBDRIVER"] + "/chromedriver"
|
@@ -139,13 +156,50 @@ def scrape(
|
|
139
156
|
)
|
140
157
|
logger.debug(f"Chrome install variable is {chrome_install}")
|
141
158
|
|
159
|
+
# Hack on chromedriver itself, to try to be sneakier
|
160
|
+
# So many bad ideas coming together here
|
161
|
+
# perlstr = f"perl -pi -e 's/cdc_/ugh_/g' {chrome_install}"
|
162
|
+
# logger.debug(perlstr)
|
163
|
+
# process = subprocess.run(perlstr.split(), capture_output=True, text=True)
|
164
|
+
# logger.debug(f"process stdout: {process.stdout}")
|
165
|
+
# logger.debug(f"process stderr: {process.stderr}")
|
166
|
+
|
142
167
|
# Launch X Windows emulator, then launch Chrome to run with it
|
143
168
|
with Xvfb() as xvfb: # noqa: F841
|
144
169
|
service = ChromeService(chrome_install, service_args=["--verbose"])
|
170
|
+
# driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
|
171
|
+
# driver = webdriver.Remote(options=chromeoptionsholder, service=service)
|
172
|
+
# capabilities = DesiredCapabilities.CHROME.copy()
|
173
|
+
# driver = webdriver.Remote(options=chromeoptionsholder, desired_capapabilities=capabilities, command_executor="http://localhost:4444/wd/hub")
|
174
|
+
# driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
|
175
|
+
service = ChromeService(chrome_install, service_args=["--verbose"], port=5600)
|
145
176
|
driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
|
177
|
+
driver.command_executor._url = "http://localhost:5600"
|
178
|
+
# driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
|
179
|
+
|
180
|
+
stealth(
|
181
|
+
driver,
|
182
|
+
languages=["en-US", "en"],
|
183
|
+
vendor="Google Inc.",
|
184
|
+
platform="Win32",
|
185
|
+
webgl_vendor="Intel Inc.",
|
186
|
+
renderer="Intel Iris OpenGL Engine",
|
187
|
+
fix_hairline=True,
|
188
|
+
)
|
189
|
+
|
190
|
+
logger.debug(f"Attempting to fetch {start_page}")
|
191
|
+
driver.get(start_page)
|
192
|
+
sleep((4 * random()) + 3)
|
193
|
+
driver.find_element(By.ID, "warn-notice-well").find_element(
|
194
|
+
By.PARTIAL_LINK_TEXT, "Download"
|
195
|
+
).click()
|
196
|
+
|
146
197
|
logger.debug(f"Attempting to fetch {csv_url}")
|
147
|
-
driver.get(csv_url)
|
148
|
-
sleep(
|
198
|
+
# driver.get(csv_url)
|
199
|
+
sleep(45) # Give it plenty of time to evaluate Javascript
|
200
|
+
# driver.get(csv_url)
|
201
|
+
# sleep(10)
|
202
|
+
driver.quit()
|
149
203
|
|
150
204
|
download_dir = os.path.expanduser("~") + "/Downloads"
|
151
205
|
|
@@ -153,7 +207,7 @@ def scrape(
|
|
153
207
|
logger.error(f"The download directory is not {download_dir}.")
|
154
208
|
|
155
209
|
# get the list of files
|
156
|
-
list_of_files = glob(download_dir + "/
|
210
|
+
list_of_files = glob(download_dir + "/warn_notices*.csv")
|
157
211
|
if len(list_of_files) == 0:
|
158
212
|
logger.error(f"No matching files found in {download_dir}.")
|
159
213
|
|
@@ -172,8 +226,6 @@ def scrape(
|
|
172
226
|
|
173
227
|
copyfile(latest_file, target_filename)
|
174
228
|
|
175
|
-
driver.quit()
|
176
|
-
|
177
229
|
# Download it to the cache
|
178
230
|
# cache.download("va/source.csv", csv_url, verify=True)
|
179
231
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: warn-scraper
|
3
|
-
Version: 1.2.98
|
3
|
+
Version: 1.2.100
|
4
4
|
Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
|
5
5
|
Home-page: https://github.com/biglocalnews/warn-scraper
|
6
6
|
Author: Big Local News
|
@@ -31,6 +31,7 @@ Requires-Dist: openpyxl
|
|
31
31
|
Requires-Dist: pyopenssl
|
32
32
|
Requires-Dist: retry
|
33
33
|
Requires-Dist: selenium
|
34
|
+
Requires-Dist: stealthenium
|
34
35
|
Requires-Dist: tenacity
|
35
36
|
Requires-Dist: xlrd
|
36
37
|
Requires-Dist: xvfbwrapper
|
@@ -61,13 +61,13 @@ warn/scrapers/sd.py,sha256=_4R19Ybzsyx1PvcWV3_laJmJ3etrwVGfhNEQm6njwoA,1904
|
|
61
61
|
warn/scrapers/tn.py,sha256=i1H7c09Ea3CDrTXqqRMLBMPT_34QtGA0-x7T8rm_j5Q,2945
|
62
62
|
warn/scrapers/tx.py,sha256=watfR1gyN9w7nluiAOnnIghEmoq3eShNUzYSZ8SkZy4,4438
|
63
63
|
warn/scrapers/ut.py,sha256=iUh38YIjbvv5MyyKacsiZNe8KjfdBeDaOf-qMQEF_kc,2245
|
64
|
-
warn/scrapers/va.py,sha256=
|
64
|
+
warn/scrapers/va.py,sha256=hOPuiAjnTmtXCOdnBM_jAJuz9_u6oCxtbm2F-9m3ot0,10732
|
65
65
|
warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
|
66
66
|
warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
|
67
67
|
warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
|
68
|
-
warn_scraper-1.2.98.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
69
|
-
warn_scraper-1.2.
|
70
|
-
warn_scraper-1.2.98.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
|
71
|
-
warn_scraper-1.2.98.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
|
72
|
-
warn_scraper-1.2.98.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
|
73
|
-
warn_scraper-1.2.98.dist-info/RECORD,,
|
68
|
+
warn_scraper-1.2.100.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
69
|
+
warn_scraper-1.2.100.dist-info/METADATA,sha256=BpOnyGYpiaTU1rBJgJUSx4S0Qxv--QMfSpfxeJBYRFA,2174
|
70
|
+
warn_scraper-1.2.100.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
|
71
|
+
warn_scraper-1.2.100.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
|
72
|
+
warn_scraper-1.2.100.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
|
73
|
+
warn_scraper-1.2.100.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|