warn-scraper 1.2.75__tar.gz → 1.2.77__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {warn-scraper-1.2.75/warn_scraper.egg-info → warn-scraper-1.2.77}/PKG-INFO +1 -1
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/id.py +19 -4
- {warn-scraper-1.2.75 → warn-scraper-1.2.77/warn_scraper.egg-info}/PKG-INFO +1 -1
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/.devcontainer/devcontainer.json +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/.github/dependabot.yml.disabled +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/.github/workflows/continuous-deployment.yml +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/.gitignore +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/.pre-commit-config.yaml +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/LICENSE +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/MANIFEST.in +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/Makefile +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/Pipfile +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/Pipfile.lock +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/README.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/Makefile +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/R42693.pdf +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/gao-03-1003.pdf +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/releasing-actions-finished.png +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/releasing-actions-start.png +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/releasing-changelog-button.png +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/releasing-changelog-entered.png +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/releasing-draft-button.png +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/releasing-name-release.png +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/releasing-name-tag.png +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/releasing-publish-button.png +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/releasing-pypi.png +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/releasing-release-published.png +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/releasing-releases-button.png +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_static/releasing-tag-button.png +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/_templates/sources.md.tmpl +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/conf.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/contributing.rst +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/index.rst +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/make.bat +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/reference.rst +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/releasing.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/requirements.txt +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/al.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/az.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/ca.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/co.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/dc.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/de.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/ia.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/in.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/job_center.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/ks.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/md.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/me.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/mo.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/ny.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/ok.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/or.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/sc.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/tx.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/ut.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/va.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/vt.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/scrapers/wi.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/sources.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/docs/usage.md +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/setup.cfg +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/setup.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/__init__.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/cassettes/test_cached_detail_pages.yaml +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/cassettes/test_cached_search_results.yaml +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/cassettes/test_missing_detail_page_values.yaml +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/cassettes/test_no_results.yaml +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/cassettes/test_paged_results.yaml +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/cassettes/test_scrape_integration.yaml +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/conftest.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/fixtures/2021_page_1.html +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/fixtures/2021_page_2.html +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/test_cache.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/test_delete.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/test_job_center.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/test_job_center_cache.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/test_openpyxl.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/__init__.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/cache.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/cli.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/platforms/__init__.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/platforms/job_center/__init__.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/platforms/job_center/cache.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/platforms/job_center/site.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/platforms/job_center/urls.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/platforms/job_center/utils.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/runner.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/__init__.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/ak.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/al.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/az.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/ca.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/co.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/ct.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/dc.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/de.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/fl.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/ga.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/hi.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/ia.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/il.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/in.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/ks.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/ky.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/la.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/md.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/me.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/mi.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/mo.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/mt.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/ne.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/nj.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/nm.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/ny.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/oh.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/ok.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/or.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/ri.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/sc.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/sd.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/tn.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/tx.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/ut.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/va.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/vt.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/wa.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/scrapers/wi.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn/utils.py +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn_scraper.egg-info/SOURCES.txt +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn_scraper.egg-info/dependency_links.txt +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn_scraper.egg-info/entry_points.txt +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn_scraper.egg-info/not-zip-safe +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn_scraper.egg-info/requires.txt +0 -0
- {warn-scraper-1.2.75 → warn-scraper-1.2.77}/warn_scraper.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: warn-scraper
|
3
|
-
Version: 1.2.75
|
3
|
+
Version: 1.2.77
|
4
4
|
Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
|
5
5
|
Home-page: https://github.com/biglocalnews/warn-scraper
|
6
6
|
Author: Big Local News
|
@@ -4,6 +4,8 @@ import re
|
|
4
4
|
from pathlib import Path
|
5
5
|
|
6
6
|
import pdfplumber
|
7
|
+
import requests
|
8
|
+
from bs4 import BeautifulSoup
|
7
9
|
|
8
10
|
from .. import utils
|
9
11
|
from ..cache import Cache
|
@@ -12,7 +14,7 @@ __authors__ = ["chriszs", "stucka"]
|
|
12
14
|
__tags__ = ["pdf"]
|
13
15
|
__source__ = {
|
14
16
|
"name": "Idaho Department of Labor",
|
15
|
-
"url": "https://www.labor.idaho.gov/
|
17
|
+
"url": "https://www.labor.idaho.gov/businesss/layoff-assistance/",
|
16
18
|
}
|
17
19
|
|
18
20
|
logger = logging.getLogger(__name__)
|
@@ -32,7 +34,8 @@ def scrape(
|
|
32
34
|
Returns: the Path where the file is written
|
33
35
|
"""
|
34
36
|
# Create the URL of the source PDF
|
35
|
-
base_url = "https://www.labor.idaho.gov
|
37
|
+
base_url = "https://www.labor.idaho.gov"
|
38
|
+
start_url = "https://www.labor.idaho.gov/businesss/layoff-assistance/"
|
36
39
|
file_name = "WARNNotice.pdf"
|
37
40
|
# There's a numeric parameter called v on this PDF URL that updates
|
38
41
|
# from time to time. Suspect this is a cache-buster. We're using a
|
@@ -40,12 +43,23 @@ def scrape(
|
|
40
43
|
min_cache_buster = 0
|
41
44
|
max_cache_buster = 10000000000
|
42
45
|
cache_buster = random.randrange(min_cache_buster, max_cache_buster)
|
43
|
-
|
46
|
+
page_url = f"{start_url}?v={cache_buster}"
|
44
47
|
|
45
48
|
cache = Cache(cache_dir)
|
46
49
|
state_code = "id"
|
50
|
+
logger.debug(f"Trying to fetch page at {page_url}")
|
51
|
+
r = requests.get(page_url)
|
52
|
+
|
53
|
+
# Start finding the link before "Who to contact"
|
54
|
+
html = r.text
|
55
|
+
localizedhtml = html.split("<h2>Who to contact")[0]
|
56
|
+
soup = BeautifulSoup(localizedhtml, features="html5lib")
|
57
|
+
last_url = soup.find_all("a")[-1]["href"]
|
58
|
+
pdf_url = f"{base_url}{last_url}"
|
59
|
+
|
60
|
+
logger.debug(f"Trying to fetch PDF at {pdf_url}")
|
47
61
|
cache_key = f"{state_code}/{file_name}"
|
48
|
-
pdf_file = cache.download(cache_key,
|
62
|
+
pdf_file = cache.download(cache_key, pdf_url, verify=True)
|
49
63
|
|
50
64
|
# Loop through the PDF pages and scrape out the data
|
51
65
|
output_rows: list = []
|
@@ -124,6 +138,7 @@ def filter_garbage_rows(incoming: list):
|
|
124
138
|
badrows += 1
|
125
139
|
if badrows == 0:
|
126
140
|
logger.debug("No bad rows found.")
|
141
|
+
else:
|
127
142
|
logger.debug(
|
128
143
|
f"!!!!! {badrows:,} bad rows dropped from the data set with insufficient number of fields."
|
129
144
|
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: warn-scraper
|
3
|
-
Version: 1.2.75
|
3
|
+
Version: 1.2.77
|
4
4
|
Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
|
5
5
|
Home-page: https://github.com/biglocalnews/warn-scraper
|
6
6
|
Author: Big Local News
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{warn-scraper-1.2.75 → warn-scraper-1.2.77}/tests/cassettes/test_missing_detail_page_values.yaml
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|