warn-scraper 1.2.99__py3-none-any.whl → 1.2.101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warn/scrapers/va.py +53 -3
- {warn_scraper-1.2.99.dist-info → warn_scraper-1.2.101.dist-info}/METADATA +14 -3
- {warn_scraper-1.2.99.dist-info → warn_scraper-1.2.101.dist-info}/RECORD +7 -7
- {warn_scraper-1.2.99.dist-info → warn_scraper-1.2.101.dist-info}/WHEEL +1 -1
- {warn_scraper-1.2.99.dist-info → warn_scraper-1.2.101.dist-info}/entry_points.txt +0 -0
- {warn_scraper-1.2.99.dist-info → warn_scraper-1.2.101.dist-info/licenses}/LICENSE +0 -0
- {warn_scraper-1.2.99.dist-info → warn_scraper-1.2.101.dist-info}/top_level.txt +0 -0
warn/scrapers/va.py
CHANGED
@@ -2,14 +2,19 @@ import datetime
|
|
2
2
|
import logging
|
3
3
|
import os
|
4
4
|
import platform
|
5
|
+
|
6
|
+
# import subprocess
|
5
7
|
from glob import glob
|
6
8
|
from pathlib import Path
|
9
|
+
from random import random
|
7
10
|
from shutil import copyfile
|
8
11
|
from time import sleep
|
9
12
|
|
10
13
|
from selenium import webdriver
|
11
14
|
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
12
15
|
from selenium.webdriver.chrome.service import Service as ChromeService
|
16
|
+
from selenium.webdriver.common.by import By
|
17
|
+
from stealthenium import stealth
|
13
18
|
from webdriver_manager.chrome import ChromeDriverManager
|
14
19
|
|
15
20
|
from .. import utils
|
@@ -50,6 +55,7 @@ def scrape(
|
|
50
55
|
"""
|
51
56
|
cache = Cache(cache_dir)
|
52
57
|
# csv_url = "https://vec.virginia.gov/warn-notices-csv.csv"
|
58
|
+
start_page = "https://www.virginiaworks.gov/warn-notices/"
|
53
59
|
csv_url = "https://vec.virginia.gov/warn_notices.csv"
|
54
60
|
|
55
61
|
"""
|
@@ -118,6 +124,10 @@ def scrape(
|
|
118
124
|
So, yes, this is a weird implementation. It's a terrible model. It's
|
119
125
|
even got a hard-coded wait. At least as of late December 2024, however,
|
120
126
|
it does work. ... in late December 2024.
|
127
|
+
|
128
|
+
And then it broke in early January 2025! But it's not an IP block.
|
129
|
+
They may have started blocking direct calls to the CSV. Code patched
|
130
|
+
in late January 2025 to use the Download button.
|
121
131
|
"""
|
122
132
|
|
123
133
|
# driver = webdriver.Chrome(options=chromeoptionsholder, service=Service(ChromeDriverManager().install()))
|
@@ -127,6 +137,12 @@ def scrape(
|
|
127
137
|
chromeoptionsholder.add_argument("--disable-dev-shm-usage")
|
128
138
|
chromeoptionsholder.add_argument("--remote-debugging-pipe")
|
129
139
|
chromeoptionsholder.add_argument("--verbose")
|
140
|
+
chromeoptionsholder.add_argument("start-maximized")
|
141
|
+
chromeoptionsholder.add_experimental_option(
|
142
|
+
"excludeSwitches", ["enable-automation"]
|
143
|
+
)
|
144
|
+
chromeoptionsholder.add_experimental_option("useAutomationExtension", False)
|
145
|
+
chromeoptionsholder.add_argument("--disable-blink-features=AutomationControlled")
|
130
146
|
|
131
147
|
if "CHROMEWEBDRIVER" in os.environ:
|
132
148
|
chrome_install = os.environ["CHROMEWEBDRIVER"] + "/chromedriver"
|
@@ -140,15 +156,49 @@ def scrape(
|
|
140
156
|
)
|
141
157
|
logger.debug(f"Chrome install variable is {chrome_install}")
|
142
158
|
|
159
|
+
# Hack on chromedriver itself, to try to be sneakier
|
160
|
+
# So many bad ideas coming together here
|
161
|
+
# perlstr = f"perl -pi -e 's/cdc_/ugh_/g' {chrome_install}"
|
162
|
+
# logger.debug(perlstr)
|
163
|
+
# process = subprocess.run(perlstr.split(), capture_output=True, text=True)
|
164
|
+
# logger.debug(f"process stdout: {process.stdout}")
|
165
|
+
# logger.debug(f"process stderr: {process.stderr}")
|
166
|
+
|
143
167
|
# Launch X Windows emulator, then launch Chrome to run with it
|
144
168
|
with Xvfb() as xvfb: # noqa: F841
|
145
169
|
service = ChromeService(chrome_install, service_args=["--verbose"])
|
170
|
+
# driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
|
171
|
+
# driver = webdriver.Remote(options=chromeoptionsholder, service=service)
|
172
|
+
# capabilities = DesiredCapabilities.CHROME.copy()
|
173
|
+
# driver = webdriver.Remote(options=chromeoptionsholder, desired_capapabilities=capabilities, command_executor="http://localhost:4444/wd/hub")
|
174
|
+
# driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
|
175
|
+
service = ChromeService(chrome_install, service_args=["--verbose"], port=5600)
|
146
176
|
driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
|
177
|
+
driver.command_executor._url = "http://localhost:5600"
|
178
|
+
# driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
|
179
|
+
|
180
|
+
stealth(
|
181
|
+
driver,
|
182
|
+
languages=["en-US", "en"],
|
183
|
+
vendor="Google Inc.",
|
184
|
+
platform="Win32",
|
185
|
+
webgl_vendor="Intel Inc.",
|
186
|
+
renderer="Intel Iris OpenGL Engine",
|
187
|
+
fix_hairline=True,
|
188
|
+
)
|
189
|
+
|
190
|
+
logger.debug(f"Attempting to fetch {start_page}")
|
191
|
+
driver.get(start_page)
|
192
|
+
sleep((4 * random()) + 3)
|
193
|
+
driver.find_element(By.ID, "warn-notice-well").find_element(
|
194
|
+
By.PARTIAL_LINK_TEXT, "Download"
|
195
|
+
).click()
|
196
|
+
|
147
197
|
logger.debug(f"Attempting to fetch {csv_url}")
|
148
|
-
driver.get(csv_url)
|
198
|
+
# driver.get(csv_url)
|
149
199
|
sleep(45) # Give it plenty of time to evaluate Javascript
|
150
|
-
driver.get(csv_url)
|
151
|
-
sleep(10)
|
200
|
+
# driver.get(csv_url)
|
201
|
+
# sleep(10)
|
152
202
|
driver.quit()
|
153
203
|
|
154
204
|
download_dir = os.path.expanduser("~") + "/Downloads"
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: warn-scraper
|
3
|
-
Version: 1.2.99
|
3
|
+
Version: 1.2.101
|
4
4
|
Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
|
5
5
|
Home-page: https://github.com/biglocalnews/warn-scraper
|
6
6
|
Author: Big Local News
|
@@ -29,12 +29,23 @@ Requires-Dist: pdfplumber
|
|
29
29
|
Requires-Dist: requests
|
30
30
|
Requires-Dist: openpyxl
|
31
31
|
Requires-Dist: pyopenssl
|
32
|
-
Requires-Dist:
|
32
|
+
Requires-Dist: retry2
|
33
33
|
Requires-Dist: selenium
|
34
|
+
Requires-Dist: stealthenium
|
34
35
|
Requires-Dist: tenacity
|
35
36
|
Requires-Dist: xlrd
|
36
37
|
Requires-Dist: xvfbwrapper
|
37
38
|
Requires-Dist: webdriver-manager
|
39
|
+
Dynamic: author
|
40
|
+
Dynamic: classifier
|
41
|
+
Dynamic: description
|
42
|
+
Dynamic: description-content-type
|
43
|
+
Dynamic: home-page
|
44
|
+
Dynamic: license
|
45
|
+
Dynamic: license-file
|
46
|
+
Dynamic: project-url
|
47
|
+
Dynamic: requires-dist
|
48
|
+
Dynamic: summary
|
38
49
|
|
39
50
|
## Links
|
40
51
|
|
@@ -61,13 +61,13 @@ warn/scrapers/sd.py,sha256=_4R19Ybzsyx1PvcWV3_laJmJ3etrwVGfhNEQm6njwoA,1904
|
|
61
61
|
warn/scrapers/tn.py,sha256=i1H7c09Ea3CDrTXqqRMLBMPT_34QtGA0-x7T8rm_j5Q,2945
|
62
62
|
warn/scrapers/tx.py,sha256=watfR1gyN9w7nluiAOnnIghEmoq3eShNUzYSZ8SkZy4,4438
|
63
63
|
warn/scrapers/ut.py,sha256=iUh38YIjbvv5MyyKacsiZNe8KjfdBeDaOf-qMQEF_kc,2245
|
64
|
-
warn/scrapers/va.py,sha256=
|
64
|
+
warn/scrapers/va.py,sha256=hOPuiAjnTmtXCOdnBM_jAJuz9_u6oCxtbm2F-9m3ot0,10732
|
65
65
|
warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
|
66
66
|
warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
|
67
67
|
warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
|
68
|
-
warn_scraper-1.2.
|
69
|
-
warn_scraper-1.2.
|
70
|
-
warn_scraper-1.2.
|
71
|
-
warn_scraper-1.2.
|
72
|
-
warn_scraper-1.2.
|
73
|
-
warn_scraper-1.2.
|
68
|
+
warn_scraper-1.2.101.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
69
|
+
warn_scraper-1.2.101.dist-info/METADATA,sha256=sgTsL26CgS1htJ6S-r0m62ljnet7Ilg4-zWjBSiJmdA,2385
|
70
|
+
warn_scraper-1.2.101.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
|
71
|
+
warn_scraper-1.2.101.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
|
72
|
+
warn_scraper-1.2.101.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
|
73
|
+
warn_scraper-1.2.101.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|