warn-scraper 1.2.107.tar.gz → 1.2.109.tar.gz
This diff shows the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- {warn_scraper-1.2.107/warn_scraper.egg-info → warn_scraper-1.2.109}/PKG-INFO +1 -1
- warn_scraper-1.2.109/warn/scrapers/co.py +365 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/dc.py +3 -1
- {warn_scraper-1.2.107 → warn_scraper-1.2.109/warn_scraper.egg-info}/PKG-INFO +1 -1
- warn_scraper-1.2.107/warn/scrapers/co.py +0 -238
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/.devcontainer/devcontainer.json +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/.github/dependabot.yml.disabled-for-sanity +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/.github/workflows/continuous-deployment.yml +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/.github/workflows/continuous-deployment.yml.broken-tests +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/.gitignore +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/.pre-commit-config.yaml +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/LICENSE +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/MANIFEST.in +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/Makefile +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/Pipfile +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/Pipfile.lock +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/README.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/Makefile +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/R42693.pdf +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/gao-03-1003.pdf +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/releasing-actions-finished.png +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/releasing-actions-start.png +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/releasing-changelog-button.png +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/releasing-changelog-entered.png +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/releasing-draft-button.png +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/releasing-name-release.png +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/releasing-name-tag.png +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/releasing-publish-button.png +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/releasing-pypi.png +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/releasing-release-published.png +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/releasing-releases-button.png +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_static/releasing-tag-button.png +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/_templates/sources.md.tmpl +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/conf.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/contributing.rst +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/index.rst +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/make.bat +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/reference.rst +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/releasing.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/requirements.txt +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/al.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/az.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/ca.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/co.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/dc.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/de.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/ia.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/in.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/job_center.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/ks.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/md.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/me.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/mo.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/ny.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/ok.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/or.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/sc.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/tx.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/ut.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/va.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/vt.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/scrapers/wi.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/sources.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/docs/usage.md +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/setup.cfg +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/setup.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/__init__.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/cassettes/test_cached_detail_pages.yaml +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/cassettes/test_cached_search_results.yaml +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/cassettes/test_missing_detail_page_values.yaml +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/cassettes/test_no_results.yaml +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/cassettes/test_paged_results.yaml +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/cassettes/test_scrape_integration.yaml +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/conftest.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/fixtures/2021_page_1.html +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/fixtures/2021_page_2.html +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/test_cache.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/test_delete.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/test_job_center.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/test_job_center_cache.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/tests/test_openpyxl.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/__init__.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/cache.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/cli.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/platforms/__init__.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/platforms/job_center/__init__.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/platforms/job_center/cache.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/platforms/job_center/site.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/platforms/job_center/urls.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/platforms/job_center/utils.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/runner.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/__init__.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/ak.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/al.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/az.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/ca.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/ct.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/de.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/fl.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/ga.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/hi.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/ia.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/id.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/il.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/in.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/ks.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/ky.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/la.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/md.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/me.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/mi.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/mo.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/mt.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/ne.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/nj.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/nm.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/ny.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/oh.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/ok.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/or.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/ri.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/sc.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/sd.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/tn.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/tx.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/ut.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/va.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/vt.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/wa.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/scrapers/wi.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn/utils.py +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn_scraper.egg-info/SOURCES.txt +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn_scraper.egg-info/dependency_links.txt +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn_scraper.egg-info/entry_points.txt +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn_scraper.egg-info/not-zip-safe +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn_scraper.egg-info/requires.txt +0 -0
- {warn_scraper-1.2.107 → warn_scraper-1.2.109}/warn_scraper.egg-info/top_level.txt +0 -0
--- warn_scraper-1.2.107/warn_scraper.egg-info/PKG-INFO
+++ warn_scraper-1.2.109/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warn-scraper
-Version: 1.2.107
+Version: 1.2.109
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
--- /dev/null
+++ warn_scraper-1.2.109/warn/scrapers/co.py
@@ -0,0 +1,365 @@
+import logging
+from pathlib import Path
+
+from bs4 import BeautifulSoup, Tag
+
+from .. import utils
+from ..cache import Cache
+
+__authors__ = ["anikasikka", "stucka"]
+__tags__ = ["html"]
+__source__ = {
+    "name": "Colorado Department of Labor and Employment",
+    "url": "https://cdle.colorado.gov/employers/layoff-separations/layoff-warn-list",
+}
+
+logger = logging.getLogger(__name__)
+
+
+def scrape(
+    data_dir: Path = utils.WARN_DATA_DIR,
+    cache_dir: Path = utils.WARN_CACHE_DIR,
+) -> Path:
+    """
+    Scrape data from Colorado.
+
+    Keyword arguments:
+    data_dir -- the Path were the result will be saved (default WARN_DATA_DIR)
+    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
+
+    Returns: the Path where the file is written
+    """
+    # Grab the page
+    page = utils.get_url(
+        "https://cdle.colorado.gov/employers/layoff-separations/layoff-warn-list"
+    )
+    html = page.text
+
+    # Write the raw file to the cache
+    cache = Cache(cache_dir)
+    cache.write("co/main/source.html", html)
+
+    # Parse the page
+    soup = BeautifulSoup(html, "html5lib")
+
+    # Get the link to the Google Sheet that's on the page
+    content_region = soup.find(class_="region-content")
+    if isinstance(content_region, Tag):
+        current_link = content_region.find("a", class_="btn-dark-blue")
+    else:
+        raise ValueError("Could not find content region")
+    if isinstance(current_link, Tag):
+        current_href = current_link["href"]
+    else:
+        raise ValueError("Could not find Google Sheet link")
+
+    # Scraper had been working off partially loaded impression of the HTML in the dOM.
+    # This keyboard is not helping.
+    # Anyway, instead of trying to get a partially complete version and parse the HTML there,
+    # let's try to get the actual HTML export of the page.
+    # 2016 has a different filename schema we need to account for.
+
+    if "/edit" in current_href:
+        better_link = current_href.split("/edit")[0] + "/gviz/tq?tqx=out:html"  # type: ignore
+    elif "drive.google.com/open?id=" in current_href:  # Work from the ID
+        better_link = "https://docs.google.com/spreadsheets/d/"
+        better_link += current_href.split("open?id=")[-1]  # type: ignore
+        better_link += "/gviz/tq?tqx=out:html"
+    else:
+        raise ValueError(f"Could not adapt {current_href} to find HTML export.")
+
+    # Open the Google Sheet
+    current_page = utils.get_url(better_link)
+    current_html = current_page.text
+
+    # Parse the Google Sheet
+    soup_current = BeautifulSoup(current_html, "html5lib")
+    # table = soup_current.find(class_="waffle")
+    table = soup_current.find("table")
+    cleaned_data = scrape_google_sheets(table)
+
+    # Goes through the accordion links to get past data
+    content_region = soup.find(class_="region-content")
+    if isinstance(content_region, Tag):
+        accordion_list = content_region.find_all("dl")
+    else:
+        raise ValueError("Could not find content region")
+
+    # Make sure there's only one
+    assert len(accordion_list) == 1
+
+    # Grab the first one from the list
+    accordion = accordion_list[0]
+
+    link_list = [a for a in accordion.find_all("a") if "feedback" not in a.text]
+    logger.debug(f"Requesting {len(link_list)} discovered links")
+    for link in link_list:
+        bad_url = link["href"]
+        # Scraper had been working off partially loaded impression of the HTML in the dOM.
+        # This keyboard is not helping.
+        # Anyway, instead of trying to get a partially complete version and parse the HTML there,
+        # let's try to get the actual HTML export of the page.
+        # 2016 has a different filename schema we need to account for.
+
+        if "/edit" in bad_url:
+            better_link = bad_url.split("/edit")[0] + "/gviz/tq?tqx=out:html"
+        elif "drive.google.com/open?id=" in bad_url:
+            better_link = "https://docs.google.com/spreadsheets/d/"
+            better_link += bad_url.split("open?id=")[-1]  # Get just the Id
+            better_link += "/gviz/tq?tqx=out:html"
+        else:
+            raise ValueError(f"Could not adapt {bad_url} to find HTML export.")
+
+        page = utils.get_url(better_link)
+
+        soup = BeautifulSoup(page.text, "html5lib")
+        table = soup.find("table")
+        if "2017" in link.text:
+            header_list = [
+                "Company",
+                "Layoff Total",
+                "Workforce Region",
+                "WARN Date",
+                "Reason for Layoff",
+            ]
+        elif "2019" in link.text:
+            header_list = [
+                "Company Name",
+                "Layoff Total",
+                "Workforce Local Area",
+                "WARN Date",
+                "Reason for Layoff",
+                "Occupations",
+                "Layoff Date(s)",
+            ]
+        else:
+            header_list = []
+        cleaned_data += scrape_google_sheets(table, header_list)
+
+    # Clean up the headers
+    header_crosswalk = {
+        "Company Name": "company",
+        "Company": "company",
+        "Name": "company",
+        "WARN Date": "notice_date",
+        "Total Layoffs": "jobs",
+        "NAICS": "naics",
+        "Workforce Area": "workforce_area",
+        "# Perm": "permanent_job_losses",
+        "#Temp": "temporary_job_losses",
+        "Reduced Hours": "reduced_hours",
+        "#Furloughs": "furloughs",
+        "Begin Date": "begin_date",
+        "End Date": "end_date",
+        "Reason for Layoffs": "reason",
+        "Reason for Layoff": "reason",
+        "WARN Letter": "letter",
+        "Occupations Impacted": "occupations",
+        "Occupations": "occupations",
+        "Select the workforce area": "workforce_area",
+        "Total CO": "jobs",
+        "CO Layoffs": "jobs",
+        "Total number of permanent layoffs": "permanent_job_losses",
+        "# permanent": "permanent_job_losses",
+        "# Permanent": "permanent_job_losses",
+        "Total number of temporary layoffs": "temporary_job_losses",
+        "Total number of furloughs": "furloughs",
+        "Begin date of layoffs": "begin_date",
+        "End date of layoffs": "end_date",
+        "Layoff Total": "jobs",
+        "Local Area": "workforce_area",
+        "Layoff Date(s)": "begin_date",
+        "Temp Layoffs": "temporary_job_losses",
+        "Perm Layoffs": "permanent_job_losses",
+        "Furloughs": "furloughs",
+        "Workforce Local Area": "workforce_area",
+        "Workforce Region": "workforce_region",
+        "Contact Name": "contact",
+        "Contact Phone": "phone",
+        "Contact Email": "email",
+        "FEIN": "fein",
+        "Location Address": "location",
+        "Total number of employees at the location": "at_the_location",
+        "Sector 33 (6414) Guided Missle & Space Vehicle": "naics",
+        "@dropdown": "dropdown",
+        "Received": "received_date",
+        "Notes": "notes",
+        # Only add new matches above here, not below here.
+    }
+
+    header_garbage = {
+        # And then it got ugly with some columns getting unhidden.
+        "Timestamp": "timestamp",
+        "Email Address": "email_address",
+        "Is this a NEW WARN or a REVISION?": "is_this_a_new_warn_or_a_revision",
+        "Total number of employees with reduced hours": "total_number_of_employees_with_reduced_hours",
+        "Include the total number of employees on or expected to be on a Workshare plan.": "include_the_total_number_of_employees_on_or_expected_to_be_on_a_workshare_plan",
+        "Expected date of second job losses at location 1": "expected_date_of_second_job_losses_at_location_1",
+        "Expected end date of second job losses at location 1": "expected_end_date_of_second_job_losses_at_location_1",
+        "Expected date of third job losses at location 1": "expected_date_of_third_job_losses_at_location_1",
+        "Expected end date of third job losses at location 1": "expected_end_date_of_third_job_losses_at_location_1",
+        "Do the employees have bumping rights?": "do_the_employees_have_bumping_rights",
+        "Are the employees represented by a union?": "are_the_employees_represented_by_a_union",
+        "If you selected Rural Consortium for the workforce area, please choose a subarea using the map.": "if_you_selected_rural_consortium_for_the_workforce_area_please_choose_a_subarea_using_the_map",
+        "Name of union(s)": "name_of_unions",
+        "Contact phone number for union representative(s)": "contact_phone_number_for_union_representatives",
+        "Email address for union representative(s)": "email_address_for_union_representatives",
+        "Address, City, ZIP for Union 1": "address_city_zip_for_union_1",
+        "Has a second location been impacted?": "has_a_second_location_been_impacted",
+        "Location 2 Address": "location_2_address",
+        "Total number of employees at location 2": "total_number_of_employees_at_location_2",
+        "Total number of permanent layoffs at location 2": "total_number_of_permanent_layoffs_at_location_2",
+        "Total number of temporary layoffs at location 2": "total_number_of_temporary_layoffs_at_location_2",
+        "Total number of furloughs at location 2": "total_number_of_furloughs_at_location_2",
+        "Total number of employees with reduced hours at location 2": "total_number_of_employees_with_reduced_hours_at_location_2",
+        "Total number of employees on workshare plan at location 2": "total_number_of_employees_on_workshare_plan_at_location_2",
+        "Occupations Impacted at location 2": "occupations_impacted_at_location_2",
+        "Expected date of first job losses at location 2": "expected_date_of_first_job_losses_at_location_2",
+        "Contact name(s) for union representative(s)": "contact_names_for_union_representatives",
+        "Expected end date of first job losses at location 2": "expected_end_date_of_first_job_losses_at_location_2",
+        "Expected date of second job losses at location 2": "expected_date_of_second_job_losses_at_location_2",
+        "Expected end date of second job losses at location 2": "expected_end_date_of_second_job_losses_at_location_2",
+        "Expected date of third job losses at location 2": "expected_date_of_third_job_losses_at_location_2",
+        "Expected end date of third job losses at location 2": "expected_end_date_of_third_job_losses_at_location_2",
+        "Reason for Layoffs at location 2": "reason_for_layoffs_at_location_2",
+        "Do employees at location 2 having bumping rights?": "do_employees_at_location_2_having_bumping_rights",
+        "Are employees at location 2 represented by a union?": "are_employees_at_location_2_represented_by_a_union",
+        "Select the workforce area for location 2": "select_the_workforce_area_for_location_2",
+        "If you selected Other/Sub-Area, please choose a location from the following dropdown menu:": "if_you_selected_othersub_area_please_choose_a_location_from_the_following_dropdown_menu",
+        "Name of Union 2": "name_of_union_2",
+        "Contact name for Union 2": "contact_name_for_union_2",
+        "Contact phone number for Union 2": "contact_phone_number_for_union_2",
+        "Email address for Union 2": "email_address_for_union_2",
+        "Address, City, ZIP for Union 2": "address_city_zip_for_union_2",
+        "Has a third location been impacted?": "has_a_third_location_been_impacted",
+        "Location 3 Address": "location_3_address",
+        "Total number of employees at location 3": "total_number_of_employees_at_location_3",
+        "Total number of permanent layoffs at location 3": "total_number_of_permanent_layoffs_at_location_3",
+        "Total number of temporary layoffs at location 3": "total_number_of_temporary_layoffs_at_location_3",
+        "Total number of furloughs at location 3": "total_number_of_furloughs_at_location_3",
+        "Total number of employees with reduced hours at location 3": "total_number_of_employees_with_reduced_hours_at_location_3",
+        "Total number of employees on workshare plan at location 3": "total_number_of_employees_on_workshare_plan_at_location_3",
+        "Occupations Impacted at location 3": "occupations_impacted_at_location_3",
+        "Expected date of first job losses at location 3": "expected_date_of_first_job_losses_at_location_3",
+        "Expected end date of first job losses at location 3": "expected_end_date_of_first_job_losses_at_location_3",
+        "Expected date of second job losses at location 3": "expected_date_of_second_job_losses_at_location_3",
+        "Expected end date of second job losses at location 3": "expected_end_date_of_second_job_losses_at_location_3",
+        "Expected date of third job losses at location 3": "expected_date_of_third_job_losses_at_location_3",
+        "Expected end date of third job losses at location 3": "expected_end_date_of_third_job_losses_at_location_3",
+        "Reason for Layoffs at location 3": "reason_for_layoffs_at_location_3",
+        "Do employees at location 3 having bumping rights?": "do_employees_at_location_3_having_bumping_rights",
+        "Are employees at location 3 represented by a union?": "are_employees_at_location_3_represented_by_a_union",
+        "Select the workforce area for location 3": "select_the_workforce_area_for_location_3",
+        "Name of Union 3": "name_of_union_3",
+        "Contact name for Union 3": "contact_name_for_union_3",
+        "Contact phone number for Union 3": "contact_phone_number_for_union_3",
+        "Email address for Union 3": "email_address_for_union_3",
+        "Address, City, ZIP for Union 3": "address_city_zip_for_union_3",
+        "Include here any comments or additional details": "include_here_any_comments_or_additional_details",
+        # This is for garbage, not legit crosswalk. You probably do not want to add here.
+    }
+
+    standardized_data = []
+    for row in cleaned_data:
+        row_dict = {}
+        mangled = []
+        for key in row:
+            if (
+                key not in header_crosswalk and key not in header_garbage
+            ):  # Get all missing keys at once
+                mangled.append(key)
+        if len(mangled) > 0:
+            logger.warning(f"Missing a bunch of keys: {'|'.join(mangled)}")
+
+        for key, value in row.items():
+            if (
+                key not in header_crosswalk and key not in header_garbage
+            ):  # If we've never seen this before
+                logger.warning(f"Could not find {key} in header_crosswalk")
+                logger.warning(row)
+            if key not in header_garbage:  # if it's in the crosswalk, if it's legit
+                standardized_key = header_crosswalk[key]
+                row_dict[standardized_key] = value
+        if len(row_dict["company"]) < 3 and row_dict["letter"] == "Avis Budget Group":
+            row_dict["company"] = "Avis Budget Group"
+        if len(row_dict["company"]) < 3:  # or len(row_dict['naics']) <5:
+            logger.debug(f"Dropping row of questionable quality: {row_dict}")
+        elif "begin_date" in row_dict and row_dict["begin_date"] == "Layoff Date(s)":
+            logger.debug(f"Dropping row of questionable quality: {row_dict}")
+        else:
+            standardized_data.append(row_dict)
+
+    # Set the path to the final CSV
+    output_csv = data_dir / "co.csv"
+
+    # Write out the rows to the export directory
+    # headers = list(cleaned_data[0].keys())
+    utils.write_dict_rows_to_csv(
+        output_csv, set(header_crosswalk.values()), standardized_data
+    )
+
+    # Return the path to the final CSV
+    return output_csv
+
+
+def scrape_google_sheets(table, header_list=None):
+    """
+    Scrapes data out of a Google Sheet.
+
+    Keyword arguments:
+    table -- A Google Sheet table pulled into BeautifulSoup
+    header_list -- A list of header to use. Provide this when the source spreadsheet doesn't have a proper header row.
+
+    Returns: The parsed data as a list of dictionaries
+    """
+    # logger.debug(table)
+    # If a header list isn't provided, pull one out automatically
+    if not header_list:
+        # Pull out the header row
+        # header_soup = table.find_all("tr")[1]
+        header_soup = table.find_all("tr")[0]
+        # Parse the header row into a list,
+        # preserving its order in the sheet
+        header_list = []
+        for cellindex, cell in enumerate(header_soup.find_all("td")):
+            cell_text = cell.text.strip()
+            # Skip empty headers
+            if cell_text:
+                header_list.append(cell_text)
+            if not cell_text and cellindex == 0:
+                header_list.append("Company Name")
+
+    # Loop through all the data rows, which start
+    # after the header and the little bar
+    tr_list = table.find_all("tr")[1:]
+    logger.debug(f"Parsing {len(tr_list)} rows")
+    row_list = []
+    for row in tr_list:
+        # Only pull out the cells that have headers
+        cell_list = row.find_all("td")[: len(header_list)]
+
+        # Loop through the cells and key them into a dictionary using the header
+        row_dict = {}
+        for i, cell in enumerate(cell_list):
+            row_dict[header_list[i]] = cell.text.strip()
+
+        # Get values list for examination
+        value_list = list(row_dict.values())
+
+        # Skip empty rows
+        if not any(value_list):
+            continue
+
+        # Skip header rows
+        if "WARN Date" in value_list:
+            continue
+
+        # Keep whatever is left
+        row_list.append(row_dict)
+
+    # Return what we got
+    return row_list
+
+
+if __name__ == "__main__":
+    scrape()
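For orientation, the new module's single public entry point is `scrape(data_dir, cache_dir)`, which returns the path of the CSV it writes. Below is a minimal sketch of calling it directly, mirroring the module's own `__main__` block; the `./exports` and `./cache` directories are hypothetical (and assumed to exist), and omitting both arguments falls back to the package defaults `utils.WARN_DATA_DIR` and `utils.WARN_CACHE_DIR`.

```python
from pathlib import Path

from warn.scrapers import co  # the module added in 1.2.109

# Hypothetical local directories; omit the arguments to use the package
# defaults (utils.WARN_DATA_DIR and utils.WARN_CACHE_DIR).
output_csv = co.scrape(data_dir=Path("./exports"), cache_dir=Path("./cache"))
print(f"Colorado WARN notices written to {output_csv}")
```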
--- warn_scraper-1.2.107/warn/scrapers/dc.py
+++ warn_scraper-1.2.109/warn/scrapers/dc.py
@@ -48,7 +48,9 @@ def scrape(
     url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year - 1}"
     success, content = utils.save_if_good_url(targetfile, url)

-    root_html = cache.read(
+    root_html = cache.read(
+        "/".join(str(targetfile).split("/")[-2:])
+    )  # Explicitly re-read as text for regex to work

     # A June 2025 entry includes a weird table inside a table cell.
     # This is an ugly patch.
--- warn_scraper-1.2.107/PKG-INFO
+++ warn_scraper-1.2.109/warn_scraper.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warn-scraper
-Version: 1.2.107
+Version: 1.2.109
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
--- warn_scraper-1.2.107/warn/scrapers/co.py
+++ /dev/null
@@ -1,238 +0,0 @@
-import logging
-from pathlib import Path
-
-from bs4 import BeautifulSoup, Tag
-
-from .. import utils
-from ..cache import Cache
-
-__authors__ = ["anikasikka"]
-__tags__ = ["html"]
-__source__ = {
-    "name": "Colorado Department of Labor and Employment",
-    "url": "https://cdle.colorado.gov/employers/layoff-separations/layoff-warn-list",
-}
-
-logger = logging.getLogger(__name__)
-
-
-def scrape(
-    data_dir: Path = utils.WARN_DATA_DIR,
-    cache_dir: Path = utils.WARN_CACHE_DIR,
-) -> Path:
-    """
-    Scrape data from Colorado.
-
-    Keyword arguments:
-    data_dir -- the Path were the result will be saved (default WARN_DATA_DIR)
-    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
-
-    Returns: the Path where the file is written
-    """
-    # Grab the page
-    page = utils.get_url(
-        "https://cdle.colorado.gov/employers/layoff-separations/layoff-warn-list"
-    )
-    html = page.text
-
-    # Write the raw file to the cache
-    cache = Cache(cache_dir)
-    cache.write("co/main/source.html", html)
-
-    # Parse the page
-    soup = BeautifulSoup(html, "html5lib")
-
-    # Get the link to the Google Sheet that's on the page
-    content_region = soup.find(class_="region-content")
-    if isinstance(content_region, Tag):
-        current_link = content_region.find("a", class_="btn-dark-blue")
-    else:
-        raise ValueError("Could not find content region")
-    if isinstance(current_link, Tag):
-        current_href = current_link["href"]
-    else:
-        raise ValueError("Could not find Google Sheet link")
-
-    # Open the Google Sheet
-    current_page = utils.get_url(current_href)
-    current_html = current_page.text
-
-    # Parse the Google Sheet
-    soup_current = BeautifulSoup(current_html, "html5lib")
-    table = soup_current.find(class_="waffle")
-    cleaned_data = scrape_google_sheets(table)
-
-    # Goes through the accordion links to get past data
-    content_region = soup.find(class_="region-content")
-    if isinstance(content_region, Tag):
-        accordion_list = content_region.find_all("dl")
-    else:
-        raise ValueError("Could not find content region")
-
-    # Make sure there's only one
-    assert len(accordion_list) == 1
-
-    # Grab the first one from the list
-    accordion = accordion_list[0]
-
-    link_list = [a for a in accordion.find_all("a") if "feedback" not in a.text]
-    logger.debug(f"Requesting {len(link_list)} discovered links")
-    for link in link_list:
-        page = utils.get_url(link["href"])
-        soup = BeautifulSoup(page.text, "html5lib")
-        table = soup.find(class_="waffle")
-        if "2017" in link.text:
-            header_list = [
-                "Company",
-                "Layoff Total",
-                "Workforce Region",
-                "WARN Date",
-                "Reason for Layoff",
-            ]
-        elif "2019" in link.text:
-            header_list = [
-                "Company Name",
-                "Layoff Total",
-                "Workforce Local Area",
-                "WARN Date",
-                "Reason for Layoff",
-                "Occupations",
-                "Layoff Date(s)",
-            ]
-        else:
-            header_list = []
-        cleaned_data += scrape_google_sheets(table, header_list)
-
-    # Clean up the headers
-    header_crosswalk = {
-        "Name": "company",
-        "Company Name": "company",
-        "Company": "company",
-        "WARN Date": "notice_date",
-        "Total Layoffs": "jobs",
-        "NAICS": "naics",
-        "Workforce Area": "workforce_area",
-        "# Perm": "permanent_job_losses",
-        "#Temp": "temporary_job_losses",
-        "Reduced Hours": "reduced_hours",
-        "#Furloughs": "furloughs",
-        "Begin Date": "begin_date",
-        "End Date": "end_date",
-        "Reason for Layoffs": "reason",
-        "Reason for Layoff": "reason",
-        "WARN Letter": "letter",
-        "Occupations Impacted": "occupations",
-        "Occupations": "occupations",
-        "Select the workforce area": "workforce_area",
-        "Total CO": "jobs",
-        "CO Layoffs": "jobs",
-        "Total number of permanent layoffs": "permanent_job_losses",
-        "# permanent": "permanent_job_losses",
-        "# Permanent": "permanent_job_losses",
-        "Total number of temporary layoffs": "temporary_job_losses",
-        "Total number of furloughs": "furloughs",
-        "Begin date of layoffs": "begin_date",
-        "End date of layoffs": "end_date",
-        "Layoff Total": "jobs",
-        "Local Area": "workforce_area",
-        "Layoff Date(s)": "begin_date",
-        "Temp Layoffs": "temporary_job_losses",
-        "Perm Layoffs": "permanent_job_losses",
-        "Furloughs": "furloughs",
-        "Workforce Local Area": "workforce_area",
-        "Workforce Region": "workforce_region",
-        "Contact Name": "contact",
-        "Contact Phone": "phone",
-        "Contact Email": "email",
-        "FEIN": "fein",
-        "Location Address": "location",
-        "Total number of employees at the location": "at_the_location",
-        "Sector 33 (6414) Guided Missle & Space Vehicle": "naics",
-        "@dropdown": "dropdown",
-        "Received": "received_date",
-        "Notes": "notes",
-    }
-    standardized_data = []
-    for row in cleaned_data:
-        row_dict = {}
-        for key, value in row.items():
-            standardized_key = header_crosswalk[key]
-            row_dict[standardized_key] = value
-        if len(row_dict["company"]) < 3 and row_dict["letter"] == "Avis Budget Group":
-            row_dict["company"] = "Avis Budget Group"
-        if len(row_dict["company"]) < 3:  # or len(row_dict['naics']) <5:
-            logger.debug(f"Dropping row of questionable quality: {row_dict}")
-        else:
-            standardized_data.append(row_dict)
-
-    # Set the path to the final CSV
-    output_csv = data_dir / "co.csv"
-
-    # Write out the rows to the export directory
-    # headers = list(cleaned_data[0].keys())
-    utils.write_dict_rows_to_csv(
-        output_csv, set(header_crosswalk.values()), standardized_data
-    )
-
-    # Return the path to the final CSV
-    return output_csv
-
-
-def scrape_google_sheets(table, header_list=None):
-    """
-    Scrapes data out of a Google Sheet.
-
-    Keyword arguments:
-    table -- A Google Sheet table pulled into BeautifulSoup
-    header_list -- A list of header to use. Provide this when the source spreadsheet doesn't have a proper header row.
-
-    Returns: The parsed data as a list of dictionaries
-    """
-    # If a header list isn't provided, pull one out automatically
-    if not header_list:
-        # Pull out the header row
-        header_soup = table.find_all("tr")[1]
-
-        # Parse the header row into a list,
-        # preserving its order in the sheet
-        header_list = []
-        for cell in header_soup.find_all("td"):
-            cell_text = cell.text.strip()
-            # Skip empty headers
-            if cell_text:
-                header_list.append(cell_text)
-
-    # Loop through all the data rows, which start
-    # after the header and the little bar
-    tr_list = table.find_all("tr")[3:]
-    logger.debug(f"Parsing {len(tr_list)} rows")
-    row_list = []
-    for row in tr_list:
-        # Only pull out the cells that have headers
-        cell_list = row.find_all("td")[: len(header_list)]
-
-        # Loop through the cells and key them into a dictionary using the header
-        row_dict = {}
-        for i, cell in enumerate(cell_list):
-            row_dict[header_list[i]] = cell.text.strip()
-
-        # Get values list for examination
-        value_list = list(row_dict.values())
-
-        # Skip empty rows
-        if not any(value_list):
-            continue
-
-        # Skip header rows
-        if "WARN Date" in value_list:
-            continue
-
-        # Keep whatever is left
-        row_list.append(row_dict)
-
-    # Return what we got
-    return row_list
-
-
-if __name__ == "__main__":
-    scrape()