warn-scraper 1.2.99__py3-none-any.whl → 1.2.101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
warn/scrapers/va.py CHANGED
@@ -2,14 +2,19 @@ import datetime
2
2
  import logging
3
3
  import os
4
4
  import platform
5
+
6
+ # import subprocess
5
7
  from glob import glob
6
8
  from pathlib import Path
9
+ from random import random
7
10
  from shutil import copyfile
8
11
  from time import sleep
9
12
 
10
13
  from selenium import webdriver
11
14
  from selenium.webdriver.chrome.options import Options as ChromeOptions
12
15
  from selenium.webdriver.chrome.service import Service as ChromeService
16
+ from selenium.webdriver.common.by import By
17
+ from stealthenium import stealth
13
18
  from webdriver_manager.chrome import ChromeDriverManager
14
19
 
15
20
  from .. import utils
@@ -50,6 +55,7 @@ def scrape(
50
55
  """
51
56
  cache = Cache(cache_dir)
52
57
  # csv_url = "https://vec.virginia.gov/warn-notices-csv.csv"
58
+ start_page = "https://www.virginiaworks.gov/warn-notices/"
53
59
  csv_url = "https://vec.virginia.gov/warn_notices.csv"
54
60
 
55
61
  """
@@ -118,6 +124,10 @@ def scrape(
118
124
  So, yes, this is a weird implementation. It's a terrible model. It's
119
125
  even got a hard-coded wait. At least as of late December 2024, however,
120
126
  it does work. ... in late December 2024.
127
+
128
+ And then it broke in early January 2025! But it's not an IP block.
129
+ They may have started blocking direct calls to the CSV. Code patched
130
+ in late January 2025 to use the Download button.
121
131
  """
122
132
 
123
133
  # driver = webdriver.Chrome(options=chromeoptionsholder, service=Service(ChromeDriverManager().install()))
@@ -127,6 +137,12 @@ def scrape(
127
137
  chromeoptionsholder.add_argument("--disable-dev-shm-usage")
128
138
  chromeoptionsholder.add_argument("--remote-debugging-pipe")
129
139
  chromeoptionsholder.add_argument("--verbose")
140
+ chromeoptionsholder.add_argument("start-maximized")
141
+ chromeoptionsholder.add_experimental_option(
142
+ "excludeSwitches", ["enable-automation"]
143
+ )
144
+ chromeoptionsholder.add_experimental_option("useAutomationExtension", False)
145
+ chromeoptionsholder.add_argument("--disable-blink-features=AutomationControlled")
130
146
 
131
147
  if "CHROMEWEBDRIVER" in os.environ:
132
148
  chrome_install = os.environ["CHROMEWEBDRIVER"] + "/chromedriver"
@@ -140,15 +156,49 @@ def scrape(
140
156
  )
141
157
  logger.debug(f"Chrome install variable is {chrome_install}")
142
158
 
159
+ # Hack on chromedriver itself, to try to be sneakier
160
+ # So many bad ideas coming together here
161
+ # perlstr = f"perl -pi -e 's/cdc_/ugh_/g' {chrome_install}"
162
+ # logger.debug(perlstr)
163
+ # process = subprocess.run(perlstr.split(), capture_output=True, text=True)
164
+ # logger.debug(f"process stdout: {process.stdout}")
165
+ # logger.debug(f"process stderr: {process.stderr}")
166
+
143
167
  # Launch X Windows emulator, then launch Chrome to run with it
144
168
  with Xvfb() as xvfb: # noqa: F841
145
169
  service = ChromeService(chrome_install, service_args=["--verbose"])
170
+ # driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
171
+ # driver = webdriver.Remote(options=chromeoptionsholder, service=service)
172
+ # capabilities = DesiredCapabilities.CHROME.copy()
173
+ # driver = webdriver.Remote(options=chromeoptionsholder, desired_capabilities=capabilities, command_executor="http://localhost:4444/wd/hub")
174
+ # driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
175
+ service = ChromeService(chrome_install, service_args=["--verbose"], port=5600)
146
176
  driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
177
+ driver.command_executor._url = "http://localhost:5600"
178
+ # driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
179
+
180
+ stealth(
181
+ driver,
182
+ languages=["en-US", "en"],
183
+ vendor="Google Inc.",
184
+ platform="Win32",
185
+ webgl_vendor="Intel Inc.",
186
+ renderer="Intel Iris OpenGL Engine",
187
+ fix_hairline=True,
188
+ )
189
+
190
+ logger.debug(f"Attempting to fetch {start_page}")
191
+ driver.get(start_page)
192
+ sleep((4 * random()) + 3)
193
+ driver.find_element(By.ID, "warn-notice-well").find_element(
194
+ By.PARTIAL_LINK_TEXT, "Download"
195
+ ).click()
196
+
147
197
  logger.debug(f"Attempting to fetch {csv_url}")
148
- driver.get(csv_url)
198
+ # driver.get(csv_url)
149
199
  sleep(45) # Give it plenty of time to evaluate Javascript
150
- driver.get(csv_url)
151
- sleep(10)
200
+ # driver.get(csv_url)
201
+ # sleep(10)
152
202
  driver.quit()
153
203
 
154
204
  download_dir = os.path.expanduser("~") + "/Downloads"
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: warn-scraper
3
- Version: 1.2.99
3
+ Version: 1.2.101
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -29,12 +29,23 @@ Requires-Dist: pdfplumber
29
29
  Requires-Dist: requests
30
30
  Requires-Dist: openpyxl
31
31
  Requires-Dist: pyopenssl
32
- Requires-Dist: retry
32
+ Requires-Dist: retry2
33
33
  Requires-Dist: selenium
34
+ Requires-Dist: stealthenium
34
35
  Requires-Dist: tenacity
35
36
  Requires-Dist: xlrd
36
37
  Requires-Dist: xvfbwrapper
37
38
  Requires-Dist: webdriver-manager
39
+ Dynamic: author
40
+ Dynamic: classifier
41
+ Dynamic: description
42
+ Dynamic: description-content-type
43
+ Dynamic: home-page
44
+ Dynamic: license
45
+ Dynamic: license-file
46
+ Dynamic: project-url
47
+ Dynamic: requires-dist
48
+ Dynamic: summary
38
49
 
39
50
  ## Links
40
51
 
@@ -61,13 +61,13 @@ warn/scrapers/sd.py,sha256=_4R19Ybzsyx1PvcWV3_laJmJ3etrwVGfhNEQm6njwoA,1904
61
61
  warn/scrapers/tn.py,sha256=i1H7c09Ea3CDrTXqqRMLBMPT_34QtGA0-x7T8rm_j5Q,2945
62
62
  warn/scrapers/tx.py,sha256=watfR1gyN9w7nluiAOnnIghEmoq3eShNUzYSZ8SkZy4,4438
63
63
  warn/scrapers/ut.py,sha256=iUh38YIjbvv5MyyKacsiZNe8KjfdBeDaOf-qMQEF_kc,2245
64
- warn/scrapers/va.py,sha256=AXcj3VpNfprhwVqVyc7hYzLamWtsf1_yOvpXSitpZeM,8389
64
+ warn/scrapers/va.py,sha256=hOPuiAjnTmtXCOdnBM_jAJuz9_u6oCxtbm2F-9m3ot0,10732
65
65
  warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
66
66
  warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
67
67
  warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
68
- warn_scraper-1.2.99.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
69
- warn_scraper-1.2.99.dist-info/METADATA,sha256=UQ6xOoUHiV0l3bZONrApFK92uoWhZnoVbSvTb4QXVlg,2145
70
- warn_scraper-1.2.99.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
71
- warn_scraper-1.2.99.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
- warn_scraper-1.2.99.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
- warn_scraper-1.2.99.dist-info/RECORD,,
68
+ warn_scraper-1.2.101.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
69
+ warn_scraper-1.2.101.dist-info/METADATA,sha256=sgTsL26CgS1htJ6S-r0m62ljnet7Ilg4-zWjBSiJmdA,2385
70
+ warn_scraper-1.2.101.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
71
+ warn_scraper-1.2.101.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
+ warn_scraper-1.2.101.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
+ warn_scraper-1.2.101.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (71.1.0)
2
+ Generator: setuptools (79.0.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5