warn-scraper 1.2.110__tar.gz → 1.2.112__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. {warn_scraper-1.2.110/warn_scraper.egg-info → warn_scraper-1.2.112}/PKG-INFO +1 -1
  2. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/ia.py +2 -0
  3. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/la.py +46 -4
  4. {warn_scraper-1.2.110 → warn_scraper-1.2.112/warn_scraper.egg-info}/PKG-INFO +1 -1
  5. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/.devcontainer/devcontainer.json +0 -0
  6. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/.github/dependabot.yml.disabled-for-sanity +0 -0
  7. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/.github/workflows/continuous-deployment.yml +0 -0
  8. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/.github/workflows/continuous-deployment.yml.broken-tests +0 -0
  9. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/.gitignore +0 -0
  10. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/.pre-commit-config.yaml +0 -0
  11. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/LICENSE +0 -0
  12. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/MANIFEST.in +0 -0
  13. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/Makefile +0 -0
  14. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/Pipfile +0 -0
  15. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/Pipfile.lock +0 -0
  16. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/README.md +0 -0
  17. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/Makefile +0 -0
  18. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/R42693.pdf +0 -0
  19. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/gao-03-1003.pdf +0 -0
  20. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/releasing-actions-finished.png +0 -0
  21. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/releasing-actions-start.png +0 -0
  22. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/releasing-changelog-button.png +0 -0
  23. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/releasing-changelog-entered.png +0 -0
  24. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/releasing-draft-button.png +0 -0
  25. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/releasing-name-release.png +0 -0
  26. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/releasing-name-tag.png +0 -0
  27. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/releasing-publish-button.png +0 -0
  28. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/releasing-pypi.png +0 -0
  29. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/releasing-release-published.png +0 -0
  30. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/releasing-releases-button.png +0 -0
  31. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_static/releasing-tag-button.png +0 -0
  32. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/_templates/sources.md.tmpl +0 -0
  33. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/conf.py +0 -0
  34. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/contributing.rst +0 -0
  35. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/index.rst +0 -0
  36. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/make.bat +0 -0
  37. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/reference.rst +0 -0
  38. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/releasing.md +0 -0
  39. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/requirements.txt +0 -0
  40. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/al.md +0 -0
  41. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/az.md +0 -0
  42. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/ca.md +0 -0
  43. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/co.md +0 -0
  44. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/dc.md +0 -0
  45. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/de.md +0 -0
  46. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/ia.md +0 -0
  47. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/in.md +0 -0
  48. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/job_center.md +0 -0
  49. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/ks.md +0 -0
  50. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/md.md +0 -0
  51. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/me.md +0 -0
  52. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/mo.md +0 -0
  53. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/ny.md +0 -0
  54. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/ok.md +0 -0
  55. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/or.md +0 -0
  56. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/sc.md +0 -0
  57. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/tx.md +0 -0
  58. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/ut.md +0 -0
  59. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/va.md +0 -0
  60. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/vt.md +0 -0
  61. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/scrapers/wi.md +0 -0
  62. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/sources.md +0 -0
  63. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/docs/usage.md +0 -0
  64. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/setup.cfg +0 -0
  65. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/setup.py +0 -0
  66. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/__init__.py +0 -0
  67. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/cassettes/test_cached_detail_pages.yaml +0 -0
  68. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/cassettes/test_cached_search_results.yaml +0 -0
  69. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/cassettes/test_missing_detail_page_values.yaml +0 -0
  70. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/cassettes/test_no_results.yaml +0 -0
  71. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/cassettes/test_paged_results.yaml +0 -0
  72. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/cassettes/test_scrape_integration.yaml +0 -0
  73. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/conftest.py +0 -0
  74. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/fixtures/2021_page_1.html +0 -0
  75. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/fixtures/2021_page_2.html +0 -0
  76. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/test_cache.py +0 -0
  77. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/test_delete.py +0 -0
  78. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/test_job_center.py +0 -0
  79. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/test_job_center_cache.py +0 -0
  80. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/tests/test_openpyxl.py +0 -0
  81. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/__init__.py +0 -0
  82. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/cache.py +0 -0
  83. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/cli.py +0 -0
  84. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/platforms/__init__.py +0 -0
  85. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/platforms/job_center/__init__.py +0 -0
  86. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/platforms/job_center/cache.py +0 -0
  87. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/platforms/job_center/site.py +0 -0
  88. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/platforms/job_center/urls.py +0 -0
  89. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/platforms/job_center/utils.py +0 -0
  90. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/runner.py +0 -0
  91. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/__init__.py +0 -0
  92. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/ak.py +0 -0
  93. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/al.py +0 -0
  94. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/az.py +0 -0
  95. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/ca.py +0 -0
  96. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/co.py +0 -0
  97. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/ct.py +0 -0
  98. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/dc.py +0 -0
  99. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/de.py +0 -0
  100. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/fl.py +0 -0
  101. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/ga.py +0 -0
  102. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/hi.py +0 -0
  103. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/id.py +0 -0
  104. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/il.py +0 -0
  105. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/in.py +0 -0
  106. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/ks.py +0 -0
  107. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/ky.py +0 -0
  108. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/md.py +0 -0
  109. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/me.py +0 -0
  110. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/mi.py +0 -0
  111. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/mo.py +0 -0
  112. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/mt.py +0 -0
  113. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/ne.py +0 -0
  114. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/nj.py +0 -0
  115. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/nm.py +0 -0
  116. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/ny.py +0 -0
  117. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/oh.py +0 -0
  118. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/ok.py +0 -0
  119. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/or.py +0 -0
  120. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/ri.py +0 -0
  121. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/sc.py +0 -0
  122. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/sd.py +0 -0
  123. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/tn.py +0 -0
  124. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/tx.py +0 -0
  125. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/ut.py +0 -0
  126. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/va.py +0 -0
  127. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/vt.py +0 -0
  128. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/wa.py +0 -0
  129. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/scrapers/wi.py +0 -0
  130. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn/utils.py +0 -0
  131. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn_scraper.egg-info/SOURCES.txt +0 -0
  132. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn_scraper.egg-info/dependency_links.txt +0 -0
  133. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn_scraper.egg-info/entry_points.txt +0 -0
  134. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn_scraper.egg-info/not-zip-safe +0 -0
  135. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn_scraper.egg-info/requires.txt +0 -0
  136. {warn_scraper-1.2.110 → warn_scraper-1.2.112}/warn_scraper.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warn-scraper
3
- Version: 1.2.110
3
+ Version: 1.2.112
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -58,6 +58,8 @@ def scrape(
58
58
 
59
59
  # Parse it, minus the header
60
60
  row_list += utils.parse_excel(historic_excel_path, keep_header=False)
61
+ if "Iowa WARN Log" in row_list[0]:
62
+ del row_list[0]
61
63
 
62
64
  # Set the export path
63
65
  data_path = data_dir / "ia.csv"
@@ -1,16 +1,19 @@
1
1
  import logging
2
2
  import os
3
3
  import re
4
+ import sys
5
+ from base64 import b64decode
4
6
  from datetime import datetime
5
7
  from pathlib import Path
6
8
 
7
9
  import pdfplumber
10
+ import requests
8
11
  from bs4 import BeautifulSoup
9
12
 
10
13
  from .. import utils
11
14
  from ..cache import Cache
12
15
 
13
- __authors__ = ["chriszs"]
16
+ __authors__ = ["chriszs", "stucka"]
14
17
  __tags__ = ["html", "pdf"]
15
18
  __source__ = {
16
19
  "name": "Louisiana Workforce Commission",
@@ -33,6 +36,14 @@ def scrape(
33
36
 
34
37
  Returns: the Path where the file is written
35
38
  """
39
+ try:
40
+ zyte_api_key = os.environ["ZYTE_API_KEY"]
41
+ except KeyError:
42
+ logger.error(
43
+ "No ZYTE_API_KEY variable found in environment. Please get an API key from Zyte and export it."
44
+ )
45
+ sys.exit(1)
46
+
36
47
  # Fire up the cache
37
48
  cache = Cache(cache_dir)
38
49
 
@@ -43,10 +54,22 @@ def scrape(
43
54
 
44
55
  # Download the root page
45
56
  url = f"{base_url}Downloads/{file_base}.asp"
46
- html = utils.get_url(url).text
57
+ api_response = requests.post(
58
+ "https://api.zyte.com/v1/extract",
59
+ auth=(zyte_api_key, ""),
60
+ json={
61
+ "url": url,
62
+ "httpResponseBody": True,
63
+ "followRedirect": True,
64
+ },
65
+ )
66
+ html_bytes: bytes = b64decode(api_response.json()["httpResponseBody"])
67
+ # html = utils.get_url(url).text
68
+ html = html_bytes.decode("utf-8", errors="backslashreplace")
47
69
 
48
70
  # Save it to the cache
49
- cache_key = f"{state_code}/{file_base}.html"
71
+ cache_key = cache_dir / f"{state_code}/{file_base}.html"
72
+ utils.create_directory(Path(cache_key), is_file=True)
50
73
  cache.write(cache_key, html)
51
74
 
52
75
  # Parse out the links to WARN notice PDFs
@@ -59,9 +82,28 @@ def scrape(
59
82
  if "WARN Notices" in link.text:
60
83
  # Download the PDF
61
84
  pdf_url = f"{base_url}{link['href']}"
62
- pdf_path = _read_or_download(cache, state_code, pdf_url)
85
+ logger.debug(pdf_url)
86
+ api_response = requests.post(
87
+ "https://api.zyte.com/v1/extract",
88
+ auth=(zyte_api_key, ""),
89
+ json={
90
+ "url": pdf_url,
91
+ "httpResponseBody": True,
92
+ "followRedirect": True,
93
+ },
94
+ )
95
+ http_response_body: bytes = b64decode(
96
+ api_response.json()["httpResponseBody"]
97
+ )
98
+ pdf_path = cache_dir / f"{state_code}/{os.path.basename(pdf_url)}"
99
+
100
+ with open(pdf_path, "wb") as fp:
101
+ fp.write(http_response_body)
102
+
103
+ # pdf_path = _read_or_download(cache, state_code, pdf_url)
63
104
 
64
105
  # Process the PDF
106
+ logger.debug(f"Attempting to parse {pdf_path}")
65
107
  rows = _process_pdf(pdf_path)
66
108
  all_rows.extend(rows)
67
109
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warn-scraper
3
- Version: 1.2.110
3
+ Version: 1.2.112
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes