warn-scraper 1.2.111-py3-none-any.whl → 1.2.112-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
warn/scrapers/la.py CHANGED
@@ -1,16 +1,19 @@
1
1
  import logging
2
2
  import os
3
3
  import re
4
+ import sys
5
+ from base64 import b64decode
4
6
  from datetime import datetime
5
7
  from pathlib import Path
6
8
 
7
9
  import pdfplumber
10
+ import requests
8
11
  from bs4 import BeautifulSoup
9
12
 
10
13
  from .. import utils
11
14
  from ..cache import Cache
12
15
 
13
- __authors__ = ["chriszs"]
16
+ __authors__ = ["chriszs", "stucka"]
14
17
  __tags__ = ["html", "pdf"]
15
18
  __source__ = {
16
19
  "name": "Louisiana Workforce Commission",
@@ -33,6 +36,14 @@ def scrape(
33
36
 
34
37
  Returns: the Path where the file is written
35
38
  """
39
+ try:
40
+ zyte_api_key = os.environ["ZYTE_API_KEY"]
41
+ except KeyError:
42
+ logger.error(
43
+ "No ZYTE_API_KEY variable found in environment. Please get an API key from Zyte and export it."
44
+ )
45
+ sys.exit(1)
46
+
36
47
  # Fire up the cache
37
48
  cache = Cache(cache_dir)
38
49
 
@@ -43,10 +54,22 @@ def scrape(
43
54
 
44
55
  # Download the root page
45
56
  url = f"{base_url}Downloads/{file_base}.asp"
46
- html = utils.get_url(url).text
57
+ api_response = requests.post(
58
+ "https://api.zyte.com/v1/extract",
59
+ auth=(zyte_api_key, ""),
60
+ json={
61
+ "url": url,
62
+ "httpResponseBody": True,
63
+ "followRedirect": True,
64
+ },
65
+ )
66
+ html_bytes: bytes = b64decode(api_response.json()["httpResponseBody"])
67
+ # html = utils.get_url(url).text
68
+ html = html_bytes.decode("utf-8", errors="backslashreplace")
47
69
 
48
70
  # Save it to the cache
49
- cache_key = f"{state_code}/{file_base}.html"
71
+ cache_key = cache_dir / f"{state_code}/{file_base}.html"
72
+ utils.create_directory(Path(cache_key), is_file=True)
50
73
  cache.write(cache_key, html)
51
74
 
52
75
  # Parse out the links to WARN notice PDFs
@@ -59,9 +82,28 @@ def scrape(
59
82
  if "WARN Notices" in link.text:
60
83
  # Download the PDF
61
84
  pdf_url = f"{base_url}{link['href']}"
62
- pdf_path = _read_or_download(cache, state_code, pdf_url)
85
+ logger.debug(pdf_url)
86
+ api_response = requests.post(
87
+ "https://api.zyte.com/v1/extract",
88
+ auth=(zyte_api_key, ""),
89
+ json={
90
+ "url": pdf_url,
91
+ "httpResponseBody": True,
92
+ "followRedirect": True,
93
+ },
94
+ )
95
+ http_response_body: bytes = b64decode(
96
+ api_response.json()["httpResponseBody"]
97
+ )
98
+ pdf_path = cache_dir / f"{state_code}/{os.path.basename(pdf_url)}"
99
+
100
+ with open(pdf_path, "wb") as fp:
101
+ fp.write(http_response_body)
102
+
103
+ # pdf_path = _read_or_download(cache, state_code, pdf_url)
63
104
 
64
105
  # Process the PDF
106
+ logger.debug(f"Attempting to parse {pdf_path}")
65
107
  rows = _process_pdf(pdf_path)
66
108
  all_rows.extend(rows)
67
109
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warn-scraper
3
- Version: 1.2.111
3
+ Version: 1.2.112
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -42,7 +42,7 @@ warn/scrapers/il.py,sha256=sygdvsNuB_Gvu3o_HidtpSP4FLz0szKb1zEHqGxVtlI,1563
42
42
  warn/scrapers/in.py,sha256=dAT40ROhhKiwLcwa_YJ6EyhsYBLe0IX2rOWXmNa6JMs,2026
43
43
  warn/scrapers/ks.py,sha256=F_3biEMF7zgCX2XVuUACR74Vyzapta4SaM9SY3EuZCU,1266
44
44
  warn/scrapers/ky.py,sha256=XjIojMpaoKbypa7l23IybP02jBijBCJG5UGqfO-EYjg,4365
45
- warn/scrapers/la.py,sha256=60z-4LZY5xp6aX8r6HGGW3FaOVEGnxlG2Mfgpt4G2WE,12877
45
+ warn/scrapers/la.py,sha256=z2dCJ0obnKy47ZL21MrtKTo5AdexRDj0BBxyV2uIY_8,14340
46
46
  warn/scrapers/md.py,sha256=hwgxXQnhyBWm8qF1dvxIThAX1MkrZbXLwRI9inO5t8g,4060
47
47
  warn/scrapers/me.py,sha256=q36F4yJ7hvZsLayA3uBS1romo4X3Qf-sEi2Y7LAQCi8,1172
48
48
  warn/scrapers/mi.py,sha256=9clZ9mATEJwdVLzDo_h66rK0aV5Zc7GGQ7AauutS6Wo,3591
@@ -65,9 +65,9 @@ warn/scrapers/va.py,sha256=7Nle7qL0VNPiE653XyaP9HQqSfuJFDRr2kEkjOqLvFM,11269
65
65
  warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
66
66
  warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
67
67
  warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
68
- warn_scraper-1.2.111.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
69
- warn_scraper-1.2.111.dist-info/METADATA,sha256=DW2-aOg2XJoXTqAN9fgs-tkMf7JjoijXxr93sUpEVfA,2385
70
- warn_scraper-1.2.111.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
71
- warn_scraper-1.2.111.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
- warn_scraper-1.2.111.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
- warn_scraper-1.2.111.dist-info/RECORD,,
68
+ warn_scraper-1.2.112.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
69
+ warn_scraper-1.2.112.dist-info/METADATA,sha256=dHhKPqTbOMGVMZ_eDRMnQ1TEjaoBPvzdqHGPwyWDXsU,2385
70
+ warn_scraper-1.2.112.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
71
+ warn_scraper-1.2.112.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
72
+ warn_scraper-1.2.112.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
73
+ warn_scraper-1.2.112.dist-info/RECORD,,