warn-scraper 1.2.98__py3-none-any.whl → 1.2.100__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
warn/scrapers/va.py CHANGED
@@ -2,14 +2,19 @@ import datetime
2
2
  import logging
3
3
  import os
4
4
  import platform
5
+
6
+ # import subprocess
5
7
  from glob import glob
6
8
  from pathlib import Path
9
+ from random import random
7
10
  from shutil import copyfile
8
11
  from time import sleep
9
12
 
10
13
  from selenium import webdriver
11
14
  from selenium.webdriver.chrome.options import Options as ChromeOptions
12
15
  from selenium.webdriver.chrome.service import Service as ChromeService
16
+ from selenium.webdriver.common.by import By
17
+ from stealthenium import stealth
13
18
  from webdriver_manager.chrome import ChromeDriverManager
14
19
 
15
20
  from .. import utils
@@ -49,7 +54,9 @@ def scrape(
49
54
  Returns: the Path where the file is written
50
55
  """
51
56
  cache = Cache(cache_dir)
52
- csv_url = "https://vec.virginia.gov/warn-notices-csv.csv"
57
+ # csv_url = "https://vec.virginia.gov/warn-notices-csv.csv"
58
+ start_page = "https://www.virginiaworks.gov/warn-notices/"
59
+ csv_url = "https://vec.virginia.gov/warn_notices.csv"
53
60
 
54
61
  """
55
62
  This scraper originally tried to parse HTML to find a CSV download link.
@@ -117,6 +124,10 @@ def scrape(
117
124
  So, yes, this is a weird implementation. It's a terrible model. It's
118
125
  even got a hard-coded wait. At least as of late December 2024, however,
119
126
  it does work. ... in late December 2024.
127
+
128
+ And then it broke in early January 2025! But it's not an IP block.
129
+ They may have started blocking direct calls to the CSV. Code patched
130
+ in late January 2025 to use the Download button.
120
131
  """
121
132
 
122
133
  # driver = webdriver.Chrome(options=chromeoptionsholder, service=Service(ChromeDriverManager().install()))
@@ -126,6 +137,12 @@ def scrape(
126
137
  chromeoptionsholder.add_argument("--disable-dev-shm-usage")
127
138
  chromeoptionsholder.add_argument("--remote-debugging-pipe")
128
139
  chromeoptionsholder.add_argument("--verbose")
140
+ chromeoptionsholder.add_argument("start-maximized")
141
+ chromeoptionsholder.add_experimental_option(
142
+ "excludeSwitches", ["enable-automation"]
143
+ )
144
+ chromeoptionsholder.add_experimental_option("useAutomationExtension", False)
145
+ chromeoptionsholder.add_argument("--disable-blink-features=AutomationControlled")
129
146
 
130
147
  if "CHROMEWEBDRIVER" in os.environ:
131
148
  chrome_install = os.environ["CHROMEWEBDRIVER"] + "/chromedriver"
@@ -139,13 +156,50 @@ def scrape(
139
156
  )
140
157
  logger.debug(f"Chrome install variable is {chrome_install}")
141
158
 
159
+ # Hack on chromedriver itself, to try to be sneakier
160
+ # So many bad ideas coming together here
161
+ # perlstr = f"perl -pi -e 's/cdc_/ugh_/g' {chrome_install}"
162
+ # logger.debug(perlstr)
163
+ # process = subprocess.run(perlstr.split(), capture_output=True, text=True)
164
+ # logger.debug(f"process stdout: {process.stdout}")
165
+ # logger.debug(f"process stderr: {process.stderr}")
166
+
142
167
  # Launch X Windows emulator, then launch Chrome to run with it
143
168
  with Xvfb() as xvfb: # noqa: F841
144
169
  service = ChromeService(chrome_install, service_args=["--verbose"])
170
+ # driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
171
+ # driver = webdriver.Remote(options=chromeoptionsholder, service=service)
172
+ # capabilities = DesiredCapabilities.CHROME.copy()
173
+ # driver = webdriver.Remote(options=chromeoptionsholder, desired_capabilities=capabilities, command_executor="http://localhost:4444/wd/hub")
174
+ # driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
175
+ service = ChromeService(chrome_install, service_args=["--verbose"], port=5600)
145
176
  driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
177
+ driver.command_executor._url = "http://localhost:5600"
178
+ # driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
179
+
180
+ stealth(
181
+ driver,
182
+ languages=["en-US", "en"],
183
+ vendor="Google Inc.",
184
+ platform="Win32",
185
+ webgl_vendor="Intel Inc.",
186
+ renderer="Intel Iris OpenGL Engine",
187
+ fix_hairline=True,
188
+ )
189
+
190
+ logger.debug(f"Attempting to fetch {start_page}")
191
+ driver.get(start_page)
192
+ sleep((4 * random()) + 3)
193
+ driver.find_element(By.ID, "warn-notice-well").find_element(
194
+ By.PARTIAL_LINK_TEXT, "Download"
195
+ ).click()
196
+
146
197
  logger.debug(f"Attempting to fetch {csv_url}")
147
- driver.get(csv_url)
148
- sleep(30) # Give it plenty of time to evaluate Javascript
198
+ # driver.get(csv_url)
199
+ sleep(45) # Give it plenty of time to evaluate Javascript
200
+ # driver.get(csv_url)
201
+ # sleep(10)
202
+ driver.quit()
149
203
 
150
204
  download_dir = os.path.expanduser("~") + "/Downloads"
151
205
 
@@ -153,7 +207,7 @@ def scrape(
153
207
  logger.error(f"The download directory is not {download_dir}.")
154
208
 
155
209
  # get the list of files
156
- list_of_files = glob(download_dir + "/warn-notices-csv*.csv")
210
+ list_of_files = glob(download_dir + "/warn_notices*.csv")
157
211
  if len(list_of_files) == 0:
158
212
  logger.error(f"No matching files found in {download_dir}.")
159
213
 
@@ -172,8 +226,6 @@ def scrape(
172
226
 
173
227
  copyfile(latest_file, target_filename)
174
228
 
175
- driver.quit()
176
-
177
229
  # Download it to the cache
178
230
  # cache.download("va/source.csv", csv_url, verify=True)
179
231
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: warn-scraper
3
- Version: 1.2.98
3
+ Version: 1.2.100
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -31,6 +31,7 @@ Requires-Dist: openpyxl
31
31
  Requires-Dist: pyopenssl
32
32
  Requires-Dist: retry
33
33
  Requires-Dist: selenium
34
+ Requires-Dist: stealthenium
34
35
  Requires-Dist: tenacity
35
36
  Requires-Dist: xlrd
36
37
  Requires-Dist: xvfbwrapper
@@ -61,13 +61,13 @@ warn/scrapers/sd.py,sha256=_4R19Ybzsyx1PvcWV3_laJmJ3etrwVGfhNEQm6njwoA,1904
61
61
  warn/scrapers/tn.py,sha256=i1H7c09Ea3CDrTXqqRMLBMPT_34QtGA0-x7T8rm_j5Q,2945
62
62
  warn/scrapers/tx.py,sha256=watfR1gyN9w7nluiAOnnIghEmoq3eShNUzYSZ8SkZy4,4438
63
63
  warn/scrapers/ut.py,sha256=iUh38YIjbvv5MyyKacsiZNe8KjfdBeDaOf-qMQEF_kc,2245
64
- warn/scrapers/va.py,sha256=SBL0lHEvL9DxjvMS6Rim-o9Ow1j4QHR7CWB7eMbbeec,8284
64
+ warn/scrapers/va.py,sha256=hOPuiAjnTmtXCOdnBM_jAJuz9_u6oCxtbm2F-9m3ot0,10732
65
65
  warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
66
66
  warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
67
67
  warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
68
- warn_scraper-1.2.98.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
69
- warn_scraper-1.2.98.dist-info/METADATA,sha256=tMwNHjkc4F4yEa1eYr_kwSYbGsKDRhyD_tOpjv6ciMI,2145
70
- warn_scraper-1.2.98.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
71
- warn_scraper-1.2.98.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
- warn_scraper-1.2.98.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
- warn_scraper-1.2.98.dist-info/RECORD,,
68
+ warn_scraper-1.2.100.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
69
+ warn_scraper-1.2.100.dist-info/METADATA,sha256=BpOnyGYpiaTU1rBJgJUSx4S0Qxv--QMfSpfxeJBYRFA,2174
70
+ warn_scraper-1.2.100.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
71
+ warn_scraper-1.2.100.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
+ warn_scraper-1.2.100.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
+ warn_scraper-1.2.100.dist-info/RECORD,,