warn-scraper 1.2.104__tar.gz → 1.2.106__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {warn_scraper-1.2.104/warn_scraper.egg-info → warn_scraper-1.2.106}/PKG-INFO +1 -1
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/dc.py +5 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/id.py +23 -9
- {warn_scraper-1.2.104 → warn_scraper-1.2.106/warn_scraper.egg-info}/PKG-INFO +1 -1
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/.devcontainer/devcontainer.json +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/.github/dependabot.yml.disabled-for-sanity +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/.github/workflows/continuous-deployment.yml +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/.github/workflows/continuous-deployment.yml.broken-tests +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/.gitignore +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/.pre-commit-config.yaml +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/LICENSE +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/MANIFEST.in +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/Makefile +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/Pipfile +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/Pipfile.lock +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/README.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/Makefile +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/R42693.pdf +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/gao-03-1003.pdf +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-actions-finished.png +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-actions-start.png +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-changelog-button.png +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-changelog-entered.png +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-draft-button.png +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-name-release.png +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-name-tag.png +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-publish-button.png +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-pypi.png +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-release-published.png +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-releases-button.png +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-tag-button.png +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_templates/sources.md.tmpl +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/conf.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/contributing.rst +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/index.rst +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/make.bat +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/reference.rst +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/releasing.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/requirements.txt +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/al.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/az.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/ca.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/co.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/dc.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/de.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/ia.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/in.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/job_center.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/ks.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/md.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/me.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/mo.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/ny.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/ok.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/or.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/sc.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/tx.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/ut.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/va.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/vt.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/wi.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/sources.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/usage.md +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/setup.cfg +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/setup.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/__init__.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_cached_detail_pages.yaml +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_cached_search_results.yaml +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_missing_detail_page_values.yaml +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_no_results.yaml +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_paged_results.yaml +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_scrape_integration.yaml +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/conftest.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/fixtures/2021_page_1.html +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/fixtures/2021_page_2.html +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/test_cache.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/test_delete.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/test_job_center.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/test_job_center_cache.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/test_openpyxl.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/__init__.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/cache.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/cli.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/platforms/__init__.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/platforms/job_center/__init__.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/platforms/job_center/cache.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/platforms/job_center/site.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/platforms/job_center/urls.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/platforms/job_center/utils.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/runner.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/__init__.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ak.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/al.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/az.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ca.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/co.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ct.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/de.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/fl.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ga.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/hi.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ia.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/il.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/in.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ks.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ky.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/la.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/md.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/me.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/mi.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/mo.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/mt.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ne.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/nj.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/nm.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ny.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/oh.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ok.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/or.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ri.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/sc.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/sd.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/tn.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/tx.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ut.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/va.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/vt.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/wa.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/wi.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/utils.py +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn_scraper.egg-info/SOURCES.txt +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn_scraper.egg-info/dependency_links.txt +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn_scraper.egg-info/entry_points.txt +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn_scraper.egg-info/not-zip-safe +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn_scraper.egg-info/requires.txt +0 -0
- {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn_scraper.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: warn-scraper
|
3
|
-
Version: 1.2.104
|
3
|
+
Version: 1.2.106
|
4
4
|
Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
|
5
5
|
Home-page: https://github.com/biglocalnews/warn-scraper
|
6
6
|
Author: Big Local News
|
@@ -85,6 +85,11 @@ def scrape(
|
|
85
85
|
r.encoding = "utf-8"
|
86
86
|
html = r.text
|
87
87
|
|
88
|
+
# A June 2025 entry includes a weird table inside a table cell.
|
89
|
+
# This is an ugly patch.
|
90
|
+
weirdtable = r"(\s+<table>\s+<tbody>\s+<tr>\s+<td>)(.*?)(</td>\s+</tr>\s+</tbody>\s+</table>\s+)"
|
91
|
+
html = re.sub(weirdtable, r"\2", html)
|
92
|
+
|
88
93
|
# Save it to the cache
|
89
94
|
cache_key = uuid.uuid5(uuid.NAMESPACE_URL, href)
|
90
95
|
cache.write(f"dc/{cache_key}.html", html)
|
@@ -69,6 +69,11 @@ def scrape(
|
|
69
69
|
with pdfplumber.open(pdf_file) as pdf:
|
70
70
|
for index, page in enumerate(pdf.pages):
|
71
71
|
rows = page.extract_table()
|
72
|
+
if rows[0][0] in ["Date of\nLetter", "Date of Letter"] and index > 1:
|
73
|
+
rows = rows[
|
74
|
+
1:
|
75
|
+
] # Drop inside header rows that _clean_table will mangle if merged cells span pages
|
76
|
+
# logger.debug(f"\n\nRows for page {page}: {rows}")
|
72
77
|
output_rows += _clean_table(rows, index)
|
73
78
|
|
74
79
|
# Write out the data to a CSV
|
@@ -118,8 +123,9 @@ def _clean_table(rows, page_index) -> list:
|
|
118
123
|
# logger.debug(f"Dropping faulty row with {len(output_row)} elements: {output_row}")
|
119
124
|
|
120
125
|
# Only include the header on the first page
|
121
|
-
|
122
|
-
|
126
|
+
# No, this needed to be filtered earlier
|
127
|
+
# if page_index != 0:
|
128
|
+
# return output_rows[1:]
|
123
129
|
|
124
130
|
return output_rows
|
125
131
|
|
@@ -132,18 +138,26 @@ def filter_garbage_rows(incoming: list):
|
|
132
138
|
|
133
139
|
Returns: List of lists that have a minimum number of elements.
|
134
140
|
"""
|
135
|
-
|
141
|
+
shortrows: int = 0
|
142
|
+
mixedrows: int = 0
|
136
143
|
outgoing: list = []
|
137
|
-
for row in incoming:
|
138
|
-
|
144
|
+
for rowindex, row in enumerate(incoming):
|
145
|
+
error = False
|
146
|
+
if len(row) < 5:
|
147
|
+
error = True
|
148
|
+
logger.debug(f"Dropping short row: {row}")
|
149
|
+
shortrows += 1
|
150
|
+
if row[0] == "Date of Letter" and rowindex != 0: # Keep the header row
|
151
|
+
error = True
|
152
|
+
logger.debug(f"Dropping partial row: {row}")
|
153
|
+
mixedrows += 1
|
154
|
+
if not error:
|
139
155
|
outgoing.append(row)
|
140
|
-
|
141
|
-
badrows += 1
|
142
|
-
if badrows == 0:
|
156
|
+
if shortrows == 0 and mixedrows == 0:
|
143
157
|
logger.debug("No bad rows found.")
|
144
158
|
else:
|
145
159
|
logger.debug(
|
146
|
-
f"
|
160
|
+
f"!!!! Dropped {shortrows} rows with insufficient number of fields, and dropped {mixedrows} rows scrambled with header info"
|
147
161
|
)
|
148
162
|
return outgoing
|
149
163
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: warn-scraper
|
3
|
-
Version: 1.2.104
|
3
|
+
Version: 1.2.106
|
4
4
|
Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
|
5
5
|
Home-page: https://github.com/biglocalnews/warn-scraper
|
6
6
|
Author: Big Local News
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_cached_search_results.yaml
RENAMED
File without changes
|
{warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_missing_detail_page_values.yaml
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|