warn-scraper 1.2.55__tar.gz → 1.2.57__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. {warn-scraper-1.2.55/warn_scraper.egg-info → warn-scraper-1.2.57}/PKG-INFO +1 -1
  2. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/ct.py +15 -3
  3. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/dc.py +12 -3
  4. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/hi.py +9 -6
  5. {warn-scraper-1.2.55 → warn-scraper-1.2.57/warn_scraper.egg-info}/PKG-INFO +1 -1
  6. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/.devcontainer/devcontainer.json +0 -0
  7. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/.github/dependabot.yml.disabled +0 -0
  8. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/.github/workflows/continuous-deployment.yml +0 -0
  9. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/.gitignore +0 -0
  10. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/.pre-commit-config.yaml +0 -0
  11. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/LICENSE +0 -0
  12. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/MANIFEST.in +0 -0
  13. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/Makefile +0 -0
  14. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/Pipfile +0 -0
  15. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/Pipfile.lock +0 -0
  16. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/README.md +0 -0
  17. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/Makefile +0 -0
  18. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/R42693.pdf +0 -0
  19. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/gao-03-1003.pdf +0 -0
  20. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/releasing-actions-finished.png +0 -0
  21. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/releasing-actions-start.png +0 -0
  22. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/releasing-changelog-button.png +0 -0
  23. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/releasing-changelog-entered.png +0 -0
  24. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/releasing-draft-button.png +0 -0
  25. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/releasing-name-release.png +0 -0
  26. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/releasing-name-tag.png +0 -0
  27. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/releasing-publish-button.png +0 -0
  28. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/releasing-pypi.png +0 -0
  29. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/releasing-release-published.png +0 -0
  30. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/releasing-releases-button.png +0 -0
  31. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_static/releasing-tag-button.png +0 -0
  32. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/_templates/sources.md.tmpl +0 -0
  33. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/conf.py +0 -0
  34. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/contributing.rst +0 -0
  35. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/index.rst +0 -0
  36. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/make.bat +0 -0
  37. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/reference.rst +0 -0
  38. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/releasing.md +0 -0
  39. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/requirements.txt +0 -0
  40. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/al.md +0 -0
  41. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/az.md +0 -0
  42. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/ca.md +0 -0
  43. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/co.md +0 -0
  44. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/dc.md +0 -0
  45. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/de.md +0 -0
  46. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/ia.md +0 -0
  47. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/in.md +0 -0
  48. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/job_center.md +0 -0
  49. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/ks.md +0 -0
  50. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/md.md +0 -0
  51. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/me.md +0 -0
  52. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/mo.md +0 -0
  53. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/ny.md +0 -0
  54. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/ok.md +0 -0
  55. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/or.md +0 -0
  56. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/sc.md +0 -0
  57. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/tx.md +0 -0
  58. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/ut.md +0 -0
  59. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/va.md +0 -0
  60. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/vt.md +0 -0
  61. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/scrapers/wi.md +0 -0
  62. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/sources.md +0 -0
  63. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/docs/usage.md +0 -0
  64. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/setup.cfg +0 -0
  65. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/setup.py +0 -0
  66. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/__init__.py +0 -0
  67. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/cassettes/test_cached_detail_pages.yaml +0 -0
  68. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/cassettes/test_cached_search_results.yaml +0 -0
  69. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/cassettes/test_delete.yaml +0 -0
  70. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/cassettes/test_missing_detail_page_values.yaml +0 -0
  71. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/cassettes/test_no_results.yaml +0 -0
  72. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/cassettes/test_paged_results.yaml +0 -0
  73. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/cassettes/test_scrape_integration.yaml +0 -0
  74. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/conftest.py +0 -0
  75. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/fixtures/2021_page_1.html +0 -0
  76. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/fixtures/2021_page_2.html +0 -0
  77. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/test_cache.py +0 -0
  78. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/test_delete.py +0 -0
  79. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/test_job_center.py +0 -0
  80. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/test_job_center_cache.py +0 -0
  81. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/tests/test_openpyxl.py +0 -0
  82. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/__init__.py +0 -0
  83. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/cache.py +0 -0
  84. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/cli.py +0 -0
  85. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/platforms/__init__.py +0 -0
  86. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/platforms/job_center/__init__.py +0 -0
  87. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/platforms/job_center/cache.py +0 -0
  88. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/platforms/job_center/site.py +0 -0
  89. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/platforms/job_center/urls.py +0 -0
  90. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/platforms/job_center/utils.py +0 -0
  91. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/runner.py +0 -0
  92. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/__init__.py +0 -0
  93. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/ak.py +0 -0
  94. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/al.py +0 -0
  95. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/az.py +0 -0
  96. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/ca.py +0 -0
  97. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/co.py +0 -0
  98. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/de.py +0 -0
  99. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/fl.py +0 -0
  100. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/ga.py +0 -0
  101. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/ia.py +0 -0
  102. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/id.py +0 -0
  103. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/il.py +0 -0
  104. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/in.py +0 -0
  105. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/ks.py +0 -0
  106. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/ky.py +0 -0
  107. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/la.py +0 -0
  108. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/md.py +0 -0
  109. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/me.py +0 -0
  110. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/mi.py +0 -0
  111. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/mo.py +0 -0
  112. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/mt.py +0 -0
  113. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/ne.py +0 -0
  114. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/nj.py +0 -0
  115. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/nm.py +0 -0
  116. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/ny.py +0 -0
  117. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/oh.py +0 -0
  118. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/ok.py +0 -0
  119. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/or.py +0 -0
  120. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/ri.py +0 -0
  121. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/sc.py +0 -0
  122. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/sd.py +0 -0
  123. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/tn.py +0 -0
  124. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/tx.py +0 -0
  125. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/ut.py +0 -0
  126. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/va.py +0 -0
  127. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/vt.py +0 -0
  128. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/wa.py +0 -0
  129. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/scrapers/wi.py +0 -0
  130. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn/utils.py +0 -0
  131. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn_scraper.egg-info/SOURCES.txt +0 -0
  132. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn_scraper.egg-info/dependency_links.txt +0 -0
  133. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn_scraper.egg-info/entry_points.txt +0 -0
  134. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn_scraper.egg-info/not-zip-safe +0 -0
  135. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn_scraper.egg-info/requires.txt +0 -0
  136. {warn-scraper-1.2.55 → warn-scraper-1.2.57}/warn_scraper.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: warn-scraper
3
- Version: 1.2.55
3
+ Version: 1.2.57
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -3,6 +3,7 @@ from datetime import datetime
3
3
  from pathlib import Path
4
4
 
5
5
  from bs4 import BeautifulSoup
6
+ import requests
6
7
 
7
8
  from .. import utils
8
9
  from ..cache import Cache
@@ -40,8 +41,20 @@ def scrape(
40
41
  # We start in 2015
41
42
  current_year = datetime.now().year
42
43
 
43
- # Get the full range of years
44
- year_range = range(2015, current_year + 1)
44
+ if cache.exists(f"ct/{current_year}.html"):
45
+ # Get the full range of years
46
+ year_range = range(2015, current_year + 1)
47
+ else:
48
+ url = f"https://www.ctdol.state.ct.us/progsupt/bussrvce/warnreports/warn{current_year}.htm"
49
+ r = requests.head(url)
50
+ if r.ok:
51
+ logger.debug(f"Found first entry for {current_year}")
52
+ year_range = range(2015, current_year + 1)
53
+ else:
54
+ logger.debug(
55
+ f"No data for {current_year} found at {url}. Dropping back a year."
56
+ )
57
+ year_range = range(2015, current_year + 0)
45
58
 
46
59
  output_rows = []
47
60
  for year in year_range:
@@ -100,7 +113,6 @@ def _scrape_table(table) -> list:
100
113
  row_list = []
101
114
  # loop over table to process each row, skipping the header
102
115
  for table_row in table[0].find_all("tr")[1:]:
103
-
104
116
  # Get all the cells
105
117
  table_cells = table_row.find_all("td")
106
118
 
@@ -5,6 +5,7 @@ from datetime import datetime
5
5
  from pathlib import Path
6
6
 
7
7
  from bs4 import BeautifulSoup
8
+ import requests
8
9
 
9
10
  from .. import utils
10
11
  from ..cache import Cache
@@ -38,13 +39,22 @@ def scrape(
38
39
  # Get the root page
39
40
  today = datetime.today()
40
41
  current_year = today.year
41
- url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
42
+ targetfile = f"dc/{current_year}.html"
43
+ if not cache.exists(targetfile): # Check if we have an entry for the latest year
44
+ url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
45
+ r = requests.head(url)
46
+ if not r.ok:
47
+ logger.debug(f"Still no data found for {current_year}. Falling back.")
48
+ current_year = today.year - 1
49
+ targetfile = f"dc/{current_year}.html"
50
+ url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
51
+
42
52
  r = utils.get_url(url)
43
53
  r.encoding = "utf-8"
44
54
  root_html = r.text
45
55
 
46
56
  # Save it to the cache
47
- cache.write(f"dc/{current_year}.html", root_html)
57
+ cache.write(targetfile, root_html)
48
58
 
49
59
  # Parse the list of links
50
60
  soup = BeautifulSoup(root_html, "html5lib")
@@ -70,7 +80,6 @@ def scrape(
70
80
  root_html,
71
81
  ]
72
82
  for href in link_lookup.values():
73
-
74
83
  # Request the HTML
75
84
  r = utils.get_url(href)
76
85
  r.encoding = "utf-8"
@@ -1,13 +1,14 @@
1
1
  import datetime
2
2
  import logging
3
3
  from pathlib import Path
4
+ from urllib.parse import quote
4
5
 
5
6
  from bs4 import BeautifulSoup
6
7
 
7
8
  from .. import utils
8
9
 
9
10
  __authors__ = ["Ash1R", "stucka"]
10
- __tags__ = ["html"]
11
+ __tags__ = ["html", "pdf"]
11
12
  __source__ = {
12
13
  "name": "Workforce Development Hawaii",
13
14
  "url": "https://labor.hawaii.gov/wdc/real-time-warn-updates/",
@@ -28,15 +29,17 @@ def scrape(
28
29
  cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
29
30
  Returns: the Path where the file is written
30
31
  """
31
- firstpage = utils.get_url("https://labor.hawaii.gov/wdc/real-time-warn-updates/")
32
+ cacheprefix = "https://webcache.googleusercontent.com/search?q=cache%3A" # Use Google Cache, per #600
33
+
34
+ firstpage = utils.get_url(cacheprefix + quote("https://labor.hawaii.gov/wdc/real-time-warn-updates/"))
32
35
  soup = BeautifulSoup(firstpage.text, features="html5lib")
33
36
  pagesection = soup.select("div.primary-content")[0]
34
37
  subpageurls = []
35
38
  for atag in pagesection.find_all("a"):
36
39
  href = atag["href"]
37
40
  if href.endswith("/"):
38
- href = href[:-1]
39
- subpageurls.append(href)
41
+ href = href # [:-1]
42
+ subpageurls.append(cacheprefix + quote(href))
40
43
 
41
44
  headers = ["Company", "Date", "PDF url", "location", "jobs"]
42
45
  data = [headers]
@@ -85,8 +88,8 @@ def scrape(
85
88
  row.append(dates[i])
86
89
 
87
90
  row.append(url)
88
- row.append(None) # location
89
- row.append(None) # jobs
91
+ row.append(None) # location
92
+ row.append(None) # jobs
90
93
  data.append(row)
91
94
 
92
95
  output_csv = data_dir / "hi.csv"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: warn-scraper
3
- Version: 1.2.55
3
+ Version: 1.2.57
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes