warn-scraper 1.2.74__tar.gz → 1.2.76__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
Files changed (135)
  1. {warn-scraper-1.2.74/warn_scraper.egg-info → warn-scraper-1.2.76}/PKG-INFO +1 -1
  2. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/id.py +22 -9
  3. {warn-scraper-1.2.74 → warn-scraper-1.2.76/warn_scraper.egg-info}/PKG-INFO +1 -1
  4. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/.devcontainer/devcontainer.json +0 -0
  5. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/.github/dependabot.yml.disabled +0 -0
  6. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/.github/workflows/continuous-deployment.yml +0 -0
  7. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/.gitignore +0 -0
  8. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/.pre-commit-config.yaml +0 -0
  9. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/LICENSE +0 -0
  10. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/MANIFEST.in +0 -0
  11. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/Makefile +0 -0
  12. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/Pipfile +0 -0
  13. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/Pipfile.lock +0 -0
  14. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/README.md +0 -0
  15. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/Makefile +0 -0
  16. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/R42693.pdf +0 -0
  17. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/gao-03-1003.pdf +0 -0
  18. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/releasing-actions-finished.png +0 -0
  19. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/releasing-actions-start.png +0 -0
  20. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/releasing-changelog-button.png +0 -0
  21. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/releasing-changelog-entered.png +0 -0
  22. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/releasing-draft-button.png +0 -0
  23. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/releasing-name-release.png +0 -0
  24. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/releasing-name-tag.png +0 -0
  25. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/releasing-publish-button.png +0 -0
  26. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/releasing-pypi.png +0 -0
  27. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/releasing-release-published.png +0 -0
  28. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/releasing-releases-button.png +0 -0
  29. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_static/releasing-tag-button.png +0 -0
  30. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/_templates/sources.md.tmpl +0 -0
  31. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/conf.py +0 -0
  32. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/contributing.rst +0 -0
  33. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/index.rst +0 -0
  34. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/make.bat +0 -0
  35. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/reference.rst +0 -0
  36. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/releasing.md +0 -0
  37. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/requirements.txt +0 -0
  38. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/al.md +0 -0
  39. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/az.md +0 -0
  40. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/ca.md +0 -0
  41. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/co.md +0 -0
  42. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/dc.md +0 -0
  43. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/de.md +0 -0
  44. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/ia.md +0 -0
  45. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/in.md +0 -0
  46. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/job_center.md +0 -0
  47. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/ks.md +0 -0
  48. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/md.md +0 -0
  49. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/me.md +0 -0
  50. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/mo.md +0 -0
  51. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/ny.md +0 -0
  52. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/ok.md +0 -0
  53. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/or.md +0 -0
  54. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/sc.md +0 -0
  55. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/tx.md +0 -0
  56. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/ut.md +0 -0
  57. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/va.md +0 -0
  58. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/vt.md +0 -0
  59. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/scrapers/wi.md +0 -0
  60. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/sources.md +0 -0
  61. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/docs/usage.md +0 -0
  62. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/setup.cfg +0 -0
  63. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/setup.py +0 -0
  64. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/__init__.py +0 -0
  65. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/cassettes/test_cached_detail_pages.yaml +0 -0
  66. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/cassettes/test_cached_search_results.yaml +0 -0
  67. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/cassettes/test_missing_detail_page_values.yaml +0 -0
  68. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/cassettes/test_no_results.yaml +0 -0
  69. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/cassettes/test_paged_results.yaml +0 -0
  70. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/cassettes/test_scrape_integration.yaml +0 -0
  71. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/conftest.py +0 -0
  72. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/fixtures/2021_page_1.html +0 -0
  73. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/fixtures/2021_page_2.html +0 -0
  74. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/test_cache.py +0 -0
  75. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/test_delete.py +0 -0
  76. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/test_job_center.py +0 -0
  77. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/test_job_center_cache.py +0 -0
  78. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/tests/test_openpyxl.py +0 -0
  79. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/__init__.py +0 -0
  80. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/cache.py +0 -0
  81. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/cli.py +0 -0
  82. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/platforms/__init__.py +0 -0
  83. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/platforms/job_center/__init__.py +0 -0
  84. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/platforms/job_center/cache.py +0 -0
  85. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/platforms/job_center/site.py +0 -0
  86. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/platforms/job_center/urls.py +0 -0
  87. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/platforms/job_center/utils.py +0 -0
  88. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/runner.py +0 -0
  89. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/__init__.py +0 -0
  90. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/ak.py +0 -0
  91. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/al.py +0 -0
  92. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/az.py +0 -0
  93. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/ca.py +0 -0
  94. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/co.py +0 -0
  95. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/ct.py +0 -0
  96. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/dc.py +0 -0
  97. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/de.py +0 -0
  98. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/fl.py +0 -0
  99. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/ga.py +0 -0
  100. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/hi.py +0 -0
  101. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/ia.py +0 -0
  102. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/il.py +0 -0
  103. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/in.py +0 -0
  104. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/ks.py +0 -0
  105. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/ky.py +0 -0
  106. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/la.py +0 -0
  107. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/md.py +0 -0
  108. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/me.py +0 -0
  109. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/mi.py +0 -0
  110. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/mo.py +0 -0
  111. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/mt.py +0 -0
  112. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/ne.py +0 -0
  113. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/nj.py +0 -0
  114. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/nm.py +0 -0
  115. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/ny.py +0 -0
  116. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/oh.py +0 -0
  117. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/ok.py +0 -0
  118. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/or.py +0 -0
  119. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/ri.py +0 -0
  120. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/sc.py +0 -0
  121. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/sd.py +0 -0
  122. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/tn.py +0 -0
  123. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/tx.py +0 -0
  124. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/ut.py +0 -0
  125. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/va.py +0 -0
  126. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/vt.py +0 -0
  127. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/wa.py +0 -0
  128. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/scrapers/wi.py +0 -0
  129. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn/utils.py +0 -0
  130. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn_scraper.egg-info/SOURCES.txt +0 -0
  131. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn_scraper.egg-info/dependency_links.txt +0 -0
  132. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn_scraper.egg-info/entry_points.txt +0 -0
  133. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn_scraper.egg-info/not-zip-safe +0 -0
  134. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn_scraper.egg-info/requires.txt +0 -0
  135. {warn-scraper-1.2.74 → warn-scraper-1.2.76}/warn_scraper.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: warn-scraper
3
- Version: 1.2.74
3
+ Version: 1.2.76
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -4,6 +4,8 @@ import re
4
4
  from pathlib import Path
5
5
 
6
6
  import pdfplumber
7
+ import requests
8
+ from bs4 import BeautifulSoup
7
9
 
8
10
  from .. import utils
9
11
  from ..cache import Cache
@@ -12,7 +14,7 @@ __authors__ = ["chriszs", "stucka"]
12
14
  __tags__ = ["pdf"]
13
15
  __source__ = {
14
16
  "name": "Idaho Department of Labor",
15
- "url": "https://www.labor.idaho.gov/dnn/Businesses/Layoff-Assistance#2",
17
+ "url": "https://www.labor.idaho.gov/businesss/layoff-assistance/",
16
18
  }
17
19
 
18
20
  logger = logging.getLogger(__name__)
@@ -32,7 +34,8 @@ def scrape(
32
34
  Returns: the Path where the file is written
33
35
  """
34
36
  # Create the URL of the source PDF
35
- base_url = "https://www.labor.idaho.gov/dnn/Portals/0/Publications/"
37
+ base_url = "https://www.labor.idaho.gov"
38
+ start_url = "https://www.labor.idaho.gov/businesss/layoff-assistance/"
36
39
  file_name = "WARNNotice.pdf"
37
40
  # There's a numeric parameter called v on this PDF URL that updates
38
41
  # from time to time. Suspect this is a cache-buster. We're using a
@@ -40,14 +43,23 @@ def scrape(
40
43
  min_cache_buster = 0
41
44
  max_cache_buster = 10000000000
42
45
  cache_buster = random.randrange(min_cache_buster, max_cache_buster)
43
- url = f"{base_url}{file_name}?v={cache_buster}"
46
+ page_url = f"{start_url}?v={cache_buster}"
44
47
 
45
- # Download the PDF with verify=False because
46
- # there's a persistent cert error we're working around.
47
48
  cache = Cache(cache_dir)
48
49
  state_code = "id"
50
+ logger.debug(f"Trying to fetch page at {page_url}")
51
+ r = requests.get(page_url)
52
+
53
+ # Start finding the link before "Who to contact"
54
+ html = r.text
55
+ localizedhtml = html.split("<h2>Who to contact")[0]
56
+ soup = BeautifulSoup(localizedhtml, features="lxml")
57
+ last_url = soup.find_all("a")[-1]["href"]
58
+ pdf_url = f"{base_url}{last_url}"
59
+
60
+ logger.debug(f"Trying to fetch PDF at {pdf_url}")
49
61
  cache_key = f"{state_code}/{file_name}"
50
- pdf_file = cache.download(cache_key, url, verify=True)
62
+ pdf_file = cache.download(cache_key, pdf_url, verify=True)
51
63
 
52
64
  # Loop through the PDF pages and scrape out the data
53
65
  output_rows: list = []
@@ -126,9 +138,10 @@ def filter_garbage_rows(incoming: list):
126
138
  badrows += 1
127
139
  if badrows == 0:
128
140
  logger.debug("No bad rows found.")
129
- logger.debug(
130
- f"!!!!! {badrows:,} bad rows dropped from the data set with insufficient number of fields."
131
- )
141
+ else:
142
+ logger.debug(
143
+ f"!!!!! {badrows:,} bad rows dropped from the data set with insufficient number of fields."
144
+ )
132
145
  return outgoing
133
146
 
134
147
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: warn-scraper
3
- Version: 1.2.74
3
+ Version: 1.2.76
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes