warn-scraper 1.2.111__py3-none-any.whl → 1.2.112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warn/scrapers/la.py +46 -4
- {warn_scraper-1.2.111.dist-info → warn_scraper-1.2.112.dist-info}/METADATA +1 -1
- {warn_scraper-1.2.111.dist-info → warn_scraper-1.2.112.dist-info}/RECORD +7 -7
- {warn_scraper-1.2.111.dist-info → warn_scraper-1.2.112.dist-info}/WHEEL +0 -0
- {warn_scraper-1.2.111.dist-info → warn_scraper-1.2.112.dist-info}/entry_points.txt +0 -0
- {warn_scraper-1.2.111.dist-info → warn_scraper-1.2.112.dist-info}/licenses/LICENSE +0 -0
- {warn_scraper-1.2.111.dist-info → warn_scraper-1.2.112.dist-info}/top_level.txt +0 -0
warn/scrapers/la.py
CHANGED
@@ -1,16 +1,19 @@
|
|
1
1
|
import logging
|
2
2
|
import os
|
3
3
|
import re
|
4
|
+
import sys
|
5
|
+
from base64 import b64decode
|
4
6
|
from datetime import datetime
|
5
7
|
from pathlib import Path
|
6
8
|
|
7
9
|
import pdfplumber
|
10
|
+
import requests
|
8
11
|
from bs4 import BeautifulSoup
|
9
12
|
|
10
13
|
from .. import utils
|
11
14
|
from ..cache import Cache
|
12
15
|
|
13
|
-
__authors__ = ["chriszs"]
|
16
|
+
__authors__ = ["chriszs", "stucka"]
|
14
17
|
__tags__ = ["html", "pdf"]
|
15
18
|
__source__ = {
|
16
19
|
"name": "Louisiana Workforce Commission",
|
@@ -33,6 +36,14 @@ def scrape(
|
|
33
36
|
|
34
37
|
Returns: the Path where the file is written
|
35
38
|
"""
|
39
|
+
try:
|
40
|
+
zyte_api_key = os.environ["ZYTE_API_KEY"]
|
41
|
+
except KeyError:
|
42
|
+
logger.error(
|
43
|
+
"No ZYTE_API_KEY variable found in environment. Please get an API key from Zyte and export it."
|
44
|
+
)
|
45
|
+
sys.exit(1)
|
46
|
+
|
36
47
|
# Fire up the cache
|
37
48
|
cache = Cache(cache_dir)
|
38
49
|
|
@@ -43,10 +54,22 @@ def scrape(
|
|
43
54
|
|
44
55
|
# Download the root page
|
45
56
|
url = f"{base_url}Downloads/{file_base}.asp"
|
46
|
-
|
57
|
+
api_response = requests.post(
|
58
|
+
"https://api.zyte.com/v1/extract",
|
59
|
+
auth=(zyte_api_key, ""),
|
60
|
+
json={
|
61
|
+
"url": url,
|
62
|
+
"httpResponseBody": True,
|
63
|
+
"followRedirect": True,
|
64
|
+
},
|
65
|
+
)
|
66
|
+
html_bytes: bytes = b64decode(api_response.json()["httpResponseBody"])
|
67
|
+
# html = utils.get_url(url).text
|
68
|
+
html = html_bytes.decode("utf-8", errors="backslashreplace")
|
47
69
|
|
48
70
|
# Save it to the cache
|
49
|
-
cache_key = f"{state_code}/{file_base}.html"
|
71
|
+
cache_key = cache_dir / f"{state_code}/{file_base}.html"
|
72
|
+
utils.create_directory(Path(cache_key), is_file=True)
|
50
73
|
cache.write(cache_key, html)
|
51
74
|
|
52
75
|
# Parse out the links to WARN notice PDFs
|
@@ -59,9 +82,28 @@ def scrape(
|
|
59
82
|
if "WARN Notices" in link.text:
|
60
83
|
# Download the PDF
|
61
84
|
pdf_url = f"{base_url}{link['href']}"
|
62
|
-
|
85
|
+
logger.debug(pdf_url)
|
86
|
+
api_response = requests.post(
|
87
|
+
"https://api.zyte.com/v1/extract",
|
88
|
+
auth=(zyte_api_key, ""),
|
89
|
+
json={
|
90
|
+
"url": pdf_url,
|
91
|
+
"httpResponseBody": True,
|
92
|
+
"followRedirect": True,
|
93
|
+
},
|
94
|
+
)
|
95
|
+
http_response_body: bytes = b64decode(
|
96
|
+
api_response.json()["httpResponseBody"]
|
97
|
+
)
|
98
|
+
pdf_path = cache_dir / f"{state_code}/{os.path.basename(pdf_url)}"
|
99
|
+
|
100
|
+
with open(pdf_path, "wb") as fp:
|
101
|
+
fp.write(http_response_body)
|
102
|
+
|
103
|
+
# pdf_path = _read_or_download(cache, state_code, pdf_url)
|
63
104
|
|
64
105
|
# Process the PDF
|
106
|
+
logger.debug(f"Attempting to parse {pdf_path}")
|
65
107
|
rows = _process_pdf(pdf_path)
|
66
108
|
all_rows.extend(rows)
|
67
109
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: warn-scraper
|
3
|
-
Version: 1.2.111
|
3
|
+
Version: 1.2.112
|
4
4
|
Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
|
5
5
|
Home-page: https://github.com/biglocalnews/warn-scraper
|
6
6
|
Author: Big Local News
|
@@ -42,7 +42,7 @@ warn/scrapers/il.py,sha256=sygdvsNuB_Gvu3o_HidtpSP4FLz0szKb1zEHqGxVtlI,1563
|
|
42
42
|
warn/scrapers/in.py,sha256=dAT40ROhhKiwLcwa_YJ6EyhsYBLe0IX2rOWXmNa6JMs,2026
|
43
43
|
warn/scrapers/ks.py,sha256=F_3biEMF7zgCX2XVuUACR74Vyzapta4SaM9SY3EuZCU,1266
|
44
44
|
warn/scrapers/ky.py,sha256=XjIojMpaoKbypa7l23IybP02jBijBCJG5UGqfO-EYjg,4365
|
45
|
-
warn/scrapers/la.py,sha256=
|
45
|
+
warn/scrapers/la.py,sha256=z2dCJ0obnKy47ZL21MrtKTo5AdexRDj0BBxyV2uIY_8,14340
|
46
46
|
warn/scrapers/md.py,sha256=hwgxXQnhyBWm8qF1dvxIThAX1MkrZbXLwRI9inO5t8g,4060
|
47
47
|
warn/scrapers/me.py,sha256=q36F4yJ7hvZsLayA3uBS1romo4X3Qf-sEi2Y7LAQCi8,1172
|
48
48
|
warn/scrapers/mi.py,sha256=9clZ9mATEJwdVLzDo_h66rK0aV5Zc7GGQ7AauutS6Wo,3591
|
@@ -65,9 +65,9 @@ warn/scrapers/va.py,sha256=7Nle7qL0VNPiE653XyaP9HQqSfuJFDRr2kEkjOqLvFM,11269
|
|
65
65
|
warn/scrapers/vt.py,sha256=d-bo4WK2hkrk4BhCCmLpEovcoZltlvdIUB6O0uaMx5A,1186
|
66
66
|
warn/scrapers/wa.py,sha256=UXdVtHZo_a-XfoiyOooTRfTb9W3PErSZdKca6SRORgs,4282
|
67
67
|
warn/scrapers/wi.py,sha256=ClEzXkwZbop0W4fkQgsb5oHAPUrb4luUPGV-jOKwkcg,4855
|
68
|
-
warn_scraper-1.2.111.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
|
69
|
-
warn_scraper-1.2.
|
70
|
-
warn_scraper-1.2.111.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
|
71
|
-
warn_scraper-1.2.111.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
|
72
|
-
warn_scraper-1.2.111.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
|
73
|
-
warn_scraper-1.2.111.dist-info/RECORD,,
|
68
|
+
warn_scraper-1.2.112.dist-info/licenses/LICENSE,sha256=ZV-QHyqPwyMuwuj0lI05JeSjV1NyzVEk8Yeu7FPtYS0,585
|
69
|
+
warn_scraper-1.2.112.dist-info/METADATA,sha256=dHhKPqTbOMGVMZ_eDRMnQ1TEjaoBPvzdqHGPwyWDXsU,2385
|
70
|
+
warn_scraper-1.2.112.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
|
71
|
+
warn_scraper-1.2.112.dist-info/entry_points.txt,sha256=poh_oSweObGlBSs1_2qZmnTodlOYD0KfO7-h7W2UQIw,47
|
72
|
+
warn_scraper-1.2.112.dist-info/top_level.txt,sha256=gOhHgNEkrUvajlzoKkVOo-TlQht9MoXnKOErjzqLGHo,11
|
73
|
+
warn_scraper-1.2.112.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|