warn-scraper 1.2.108.tar.gz → 1.2.109.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. {warn_scraper-1.2.108/warn_scraper.egg-info → warn_scraper-1.2.109}/PKG-INFO +1 -1
  2. warn_scraper-1.2.109/warn/scrapers/co.py +365 -0
  3. {warn_scraper-1.2.108 → warn_scraper-1.2.109/warn_scraper.egg-info}/PKG-INFO +1 -1
  4. warn_scraper-1.2.108/warn/scrapers/co.py +0 -238
  5. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/.devcontainer/devcontainer.json +0 -0
  6. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/.github/dependabot.yml.disabled-for-sanity +0 -0
  7. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/.github/workflows/continuous-deployment.yml +0 -0
  8. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/.github/workflows/continuous-deployment.yml.broken-tests +0 -0
  9. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/.gitignore +0 -0
  10. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/.pre-commit-config.yaml +0 -0
  11. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/LICENSE +0 -0
  12. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/MANIFEST.in +0 -0
  13. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/Makefile +0 -0
  14. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/Pipfile +0 -0
  15. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/Pipfile.lock +0 -0
  16. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/README.md +0 -0
  17. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/Makefile +0 -0
  18. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/R42693.pdf +0 -0
  19. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/gao-03-1003.pdf +0 -0
  20. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/releasing-actions-finished.png +0 -0
  21. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/releasing-actions-start.png +0 -0
  22. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/releasing-changelog-button.png +0 -0
  23. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/releasing-changelog-entered.png +0 -0
  24. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/releasing-draft-button.png +0 -0
  25. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/releasing-name-release.png +0 -0
  26. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/releasing-name-tag.png +0 -0
  27. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/releasing-publish-button.png +0 -0
  28. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/releasing-pypi.png +0 -0
  29. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/releasing-release-published.png +0 -0
  30. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/releasing-releases-button.png +0 -0
  31. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_static/releasing-tag-button.png +0 -0
  32. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/_templates/sources.md.tmpl +0 -0
  33. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/conf.py +0 -0
  34. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/contributing.rst +0 -0
  35. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/index.rst +0 -0
  36. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/make.bat +0 -0
  37. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/reference.rst +0 -0
  38. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/releasing.md +0 -0
  39. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/requirements.txt +0 -0
  40. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/al.md +0 -0
  41. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/az.md +0 -0
  42. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/ca.md +0 -0
  43. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/co.md +0 -0
  44. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/dc.md +0 -0
  45. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/de.md +0 -0
  46. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/ia.md +0 -0
  47. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/in.md +0 -0
  48. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/job_center.md +0 -0
  49. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/ks.md +0 -0
  50. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/md.md +0 -0
  51. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/me.md +0 -0
  52. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/mo.md +0 -0
  53. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/ny.md +0 -0
  54. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/ok.md +0 -0
  55. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/or.md +0 -0
  56. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/sc.md +0 -0
  57. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/tx.md +0 -0
  58. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/ut.md +0 -0
  59. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/va.md +0 -0
  60. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/vt.md +0 -0
  61. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/scrapers/wi.md +0 -0
  62. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/sources.md +0 -0
  63. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/docs/usage.md +0 -0
  64. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/setup.cfg +0 -0
  65. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/setup.py +0 -0
  66. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/__init__.py +0 -0
  67. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/cassettes/test_cached_detail_pages.yaml +0 -0
  68. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/cassettes/test_cached_search_results.yaml +0 -0
  69. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/cassettes/test_missing_detail_page_values.yaml +0 -0
  70. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/cassettes/test_no_results.yaml +0 -0
  71. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/cassettes/test_paged_results.yaml +0 -0
  72. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/cassettes/test_scrape_integration.yaml +0 -0
  73. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/conftest.py +0 -0
  74. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/fixtures/2021_page_1.html +0 -0
  75. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/fixtures/2021_page_2.html +0 -0
  76. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/test_cache.py +0 -0
  77. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/test_delete.py +0 -0
  78. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/test_job_center.py +0 -0
  79. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/test_job_center_cache.py +0 -0
  80. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/tests/test_openpyxl.py +0 -0
  81. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/__init__.py +0 -0
  82. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/cache.py +0 -0
  83. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/cli.py +0 -0
  84. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/platforms/__init__.py +0 -0
  85. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/platforms/job_center/__init__.py +0 -0
  86. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/platforms/job_center/cache.py +0 -0
  87. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/platforms/job_center/site.py +0 -0
  88. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/platforms/job_center/urls.py +0 -0
  89. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/platforms/job_center/utils.py +0 -0
  90. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/runner.py +0 -0
  91. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/__init__.py +0 -0
  92. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/ak.py +0 -0
  93. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/al.py +0 -0
  94. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/az.py +0 -0
  95. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/ca.py +0 -0
  96. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/ct.py +0 -0
  97. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/dc.py +0 -0
  98. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/de.py +0 -0
  99. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/fl.py +0 -0
  100. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/ga.py +0 -0
  101. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/hi.py +0 -0
  102. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/ia.py +0 -0
  103. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/id.py +0 -0
  104. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/il.py +0 -0
  105. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/in.py +0 -0
  106. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/ks.py +0 -0
  107. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/ky.py +0 -0
  108. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/la.py +0 -0
  109. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/md.py +0 -0
  110. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/me.py +0 -0
  111. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/mi.py +0 -0
  112. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/mo.py +0 -0
  113. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/mt.py +0 -0
  114. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/ne.py +0 -0
  115. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/nj.py +0 -0
  116. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/nm.py +0 -0
  117. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/ny.py +0 -0
  118. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/oh.py +0 -0
  119. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/ok.py +0 -0
  120. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/or.py +0 -0
  121. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/ri.py +0 -0
  122. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/sc.py +0 -0
  123. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/sd.py +0 -0
  124. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/tn.py +0 -0
  125. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/tx.py +0 -0
  126. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/ut.py +0 -0
  127. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/va.py +0 -0
  128. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/vt.py +0 -0
  129. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/wa.py +0 -0
  130. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/scrapers/wi.py +0 -0
  131. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn/utils.py +0 -0
  132. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn_scraper.egg-info/SOURCES.txt +0 -0
  133. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn_scraper.egg-info/dependency_links.txt +0 -0
  134. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn_scraper.egg-info/entry_points.txt +0 -0
  135. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn_scraper.egg-info/not-zip-safe +0 -0
  136. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn_scraper.egg-info/requires.txt +0 -0
  137. {warn_scraper-1.2.108 → warn_scraper-1.2.109}/warn_scraper.egg-info/top_level.txt +0 -0
{warn_scraper-1.2.108/warn_scraper.egg-info → warn_scraper-1.2.109}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: warn-scraper
- Version: 1.2.108
+ Version: 1.2.109
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
  Home-page: https://github.com/biglocalnews/warn-scraper
  Author: Big Local News
warn_scraper-1.2.109/warn/scrapers/co.py
@@ -0,0 +1,365 @@
+ import logging
+ from pathlib import Path
+
+ from bs4 import BeautifulSoup, Tag
+
+ from .. import utils
+ from ..cache import Cache
+
+ __authors__ = ["anikasikka", "stucka"]
+ __tags__ = ["html"]
+ __source__ = {
+     "name": "Colorado Department of Labor and Employment",
+     "url": "https://cdle.colorado.gov/employers/layoff-separations/layoff-warn-list",
+ }
+
+ logger = logging.getLogger(__name__)
+
+
+ def scrape(
+     data_dir: Path = utils.WARN_DATA_DIR,
+     cache_dir: Path = utils.WARN_CACHE_DIR,
+ ) -> Path:
+     """
+     Scrape data from Colorado.
+
+     Keyword arguments:
+     data_dir -- the Path where the result will be saved (default WARN_DATA_DIR)
+     cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
+
+     Returns: the Path where the file is written
+     """
+     # Grab the page
+     page = utils.get_url(
+         "https://cdle.colorado.gov/employers/layoff-separations/layoff-warn-list"
+     )
+     html = page.text
+
+     # Write the raw file to the cache
+     cache = Cache(cache_dir)
+     cache.write("co/main/source.html", html)
+
+     # Parse the page
+     soup = BeautifulSoup(html, "html5lib")
+
+     # Get the link to the Google Sheet that's on the page
+     content_region = soup.find(class_="region-content")
+     if isinstance(content_region, Tag):
+         current_link = content_region.find("a", class_="btn-dark-blue")
+     else:
+         raise ValueError("Could not find content region")
+     if isinstance(current_link, Tag):
+         current_href = current_link["href"]
+     else:
+         raise ValueError("Could not find Google Sheet link")
+
+     # The scraper had been working off a partially loaded impression of the HTML in the DOM.
+     # This keyboard is not helping.
+     # Anyway, instead of trying to get a partially complete version and parse the HTML there,
+     # let's try to get the actual HTML export of the page.
+     # 2016 has a different filename schema we need to account for.
+
+     if "/edit" in current_href:
+         better_link = current_href.split("/edit")[0] + "/gviz/tq?tqx=out:html"  # type: ignore
+     elif "drive.google.com/open?id=" in current_href:  # Work from the ID
+         better_link = "https://docs.google.com/spreadsheets/d/"
+         better_link += current_href.split("open?id=")[-1]  # type: ignore
+         better_link += "/gviz/tq?tqx=out:html"
+     else:
+         raise ValueError(f"Could not adapt {current_href} to find HTML export.")
+
+     # Open the Google Sheet
+     current_page = utils.get_url(better_link)
+     current_html = current_page.text
+
+     # Parse the Google Sheet
+     soup_current = BeautifulSoup(current_html, "html5lib")
+     # table = soup_current.find(class_="waffle")
+     table = soup_current.find("table")
+     cleaned_data = scrape_google_sheets(table)
+
+     # Goes through the accordion links to get past data
+     content_region = soup.find(class_="region-content")
+     if isinstance(content_region, Tag):
+         accordion_list = content_region.find_all("dl")
+     else:
+         raise ValueError("Could not find content region")
+
+     # Make sure there's only one
+     assert len(accordion_list) == 1
+
+     # Grab the first one from the list
+     accordion = accordion_list[0]
+
+     link_list = [a for a in accordion.find_all("a") if "feedback" not in a.text]
+     logger.debug(f"Requesting {len(link_list)} discovered links")
+     for link in link_list:
+         bad_url = link["href"]
+         # The scraper had been working off a partially loaded impression of the HTML in the DOM.
+         # This keyboard is not helping.
+         # Anyway, instead of trying to get a partially complete version and parse the HTML there,
+         # let's try to get the actual HTML export of the page.
+         # 2016 has a different filename schema we need to account for.
+
+         if "/edit" in bad_url:
+             better_link = bad_url.split("/edit")[0] + "/gviz/tq?tqx=out:html"
+         elif "drive.google.com/open?id=" in bad_url:
+             better_link = "https://docs.google.com/spreadsheets/d/"
+             better_link += bad_url.split("open?id=")[-1]  # Get just the Id
+             better_link += "/gviz/tq?tqx=out:html"
+         else:
+             raise ValueError(f"Could not adapt {bad_url} to find HTML export.")
+
+         page = utils.get_url(better_link)
+
+         soup = BeautifulSoup(page.text, "html5lib")
+         table = soup.find("table")
+         if "2017" in link.text:
+             header_list = [
+                 "Company",
+                 "Layoff Total",
+                 "Workforce Region",
+                 "WARN Date",
+                 "Reason for Layoff",
+             ]
+         elif "2019" in link.text:
+             header_list = [
+                 "Company Name",
+                 "Layoff Total",
+                 "Workforce Local Area",
+                 "WARN Date",
+                 "Reason for Layoff",
+                 "Occupations",
+                 "Layoff Date(s)",
+             ]
+         else:
+             header_list = []
+         cleaned_data += scrape_google_sheets(table, header_list)
+
+     # Clean up the headers
+     header_crosswalk = {
+         "Company Name": "company",
+         "Company": "company",
+         "Name": "company",
+         "WARN Date": "notice_date",
+         "Total Layoffs": "jobs",
+         "NAICS": "naics",
+         "Workforce Area": "workforce_area",
+         "# Perm": "permanent_job_losses",
+         "#Temp": "temporary_job_losses",
+         "Reduced Hours": "reduced_hours",
+         "#Furloughs": "furloughs",
+         "Begin Date": "begin_date",
+         "End Date": "end_date",
+         "Reason for Layoffs": "reason",
+         "Reason for Layoff": "reason",
+         "WARN Letter": "letter",
+         "Occupations Impacted": "occupations",
+         "Occupations": "occupations",
+         "Select the workforce area": "workforce_area",
+         "Total CO": "jobs",
+         "CO Layoffs": "jobs",
+         "Total number of permanent layoffs": "permanent_job_losses",
+         "# permanent": "permanent_job_losses",
+         "# Permanent": "permanent_job_losses",
+         "Total number of temporary layoffs": "temporary_job_losses",
+         "Total number of furloughs": "furloughs",
+         "Begin date of layoffs": "begin_date",
+         "End date of layoffs": "end_date",
+         "Layoff Total": "jobs",
+         "Local Area": "workforce_area",
+         "Layoff Date(s)": "begin_date",
+         "Temp Layoffs": "temporary_job_losses",
+         "Perm Layoffs": "permanent_job_losses",
+         "Furloughs": "furloughs",
+         "Workforce Local Area": "workforce_area",
+         "Workforce Region": "workforce_region",
+         "Contact Name": "contact",
+         "Contact Phone": "phone",
+         "Contact Email": "email",
+         "FEIN": "fein",
+         "Location Address": "location",
+         "Total number of employees at the location": "at_the_location",
+         "Sector 33 (6414) Guided Missle & Space Vehicle": "naics",
+         "@dropdown": "dropdown",
+         "Received": "received_date",
+         "Notes": "notes",
+         # Only add new matches above here, not below here.
+     }
+
+     header_garbage = {
+         # And then it got ugly with some columns getting unhidden.
+         "Timestamp": "timestamp",
+         "Email Address": "email_address",
+         "Is this a NEW WARN or a REVISION?": "is_this_a_new_warn_or_a_revision",
+         "Total number of employees with reduced hours": "total_number_of_employees_with_reduced_hours",
+         "Include the total number of employees on or expected to be on a Workshare plan.": "include_the_total_number_of_employees_on_or_expected_to_be_on_a_workshare_plan",
+         "Expected date of second job losses at location 1": "expected_date_of_second_job_losses_at_location_1",
+         "Expected end date of second job losses at location 1": "expected_end_date_of_second_job_losses_at_location_1",
+         "Expected date of third job losses at location 1": "expected_date_of_third_job_losses_at_location_1",
+         "Expected end date of third job losses at location 1": "expected_end_date_of_third_job_losses_at_location_1",
+         "Do the employees have bumping rights?": "do_the_employees_have_bumping_rights",
+         "Are the employees represented by a union?": "are_the_employees_represented_by_a_union",
+         "If you selected Rural Consortium for the workforce area, please choose a subarea using the map.": "if_you_selected_rural_consortium_for_the_workforce_area_please_choose_a_subarea_using_the_map",
+         "Name of union(s)": "name_of_unions",
+         "Contact phone number for union representative(s)": "contact_phone_number_for_union_representatives",
+         "Email address for union representative(s)": "email_address_for_union_representatives",
+         "Address, City, ZIP for Union 1": "address_city_zip_for_union_1",
+         "Has a second location been impacted?": "has_a_second_location_been_impacted",
+         "Location 2 Address": "location_2_address",
+         "Total number of employees at location 2": "total_number_of_employees_at_location_2",
+         "Total number of permanent layoffs at location 2": "total_number_of_permanent_layoffs_at_location_2",
+         "Total number of temporary layoffs at location 2": "total_number_of_temporary_layoffs_at_location_2",
+         "Total number of furloughs at location 2": "total_number_of_furloughs_at_location_2",
+         "Total number of employees with reduced hours at location 2": "total_number_of_employees_with_reduced_hours_at_location_2",
+         "Total number of employees on workshare plan at location 2": "total_number_of_employees_on_workshare_plan_at_location_2",
+         "Occupations Impacted at location 2": "occupations_impacted_at_location_2",
+         "Expected date of first job losses at location 2": "expected_date_of_first_job_losses_at_location_2",
+         "Contact name(s) for union representative(s)": "contact_names_for_union_representatives",
+         "Expected end date of first job losses at location 2": "expected_end_date_of_first_job_losses_at_location_2",
+         "Expected date of second job losses at location 2": "expected_date_of_second_job_losses_at_location_2",
+         "Expected end date of second job losses at location 2": "expected_end_date_of_second_job_losses_at_location_2",
+         "Expected date of third job losses at location 2": "expected_date_of_third_job_losses_at_location_2",
+         "Expected end date of third job losses at location 2": "expected_end_date_of_third_job_losses_at_location_2",
+         "Reason for Layoffs at location 2": "reason_for_layoffs_at_location_2",
+         "Do employees at location 2 having bumping rights?": "do_employees_at_location_2_having_bumping_rights",
+         "Are employees at location 2 represented by a union?": "are_employees_at_location_2_represented_by_a_union",
+         "Select the workforce area for location 2": "select_the_workforce_area_for_location_2",
+         "If you selected Other/Sub-Area, please choose a location from the following dropdown menu:": "if_you_selected_othersub_area_please_choose_a_location_from_the_following_dropdown_menu",
+         "Name of Union 2": "name_of_union_2",
+         "Contact name for Union 2": "contact_name_for_union_2",
+         "Contact phone number for Union 2": "contact_phone_number_for_union_2",
+         "Email address for Union 2": "email_address_for_union_2",
+         "Address, City, ZIP for Union 2": "address_city_zip_for_union_2",
+         "Has a third location been impacted?": "has_a_third_location_been_impacted",
+         "Location 3 Address": "location_3_address",
+         "Total number of employees at location 3": "total_number_of_employees_at_location_3",
+         "Total number of permanent layoffs at location 3": "total_number_of_permanent_layoffs_at_location_3",
+         "Total number of temporary layoffs at location 3": "total_number_of_temporary_layoffs_at_location_3",
+         "Total number of furloughs at location 3": "total_number_of_furloughs_at_location_3",
+         "Total number of employees with reduced hours at location 3": "total_number_of_employees_with_reduced_hours_at_location_3",
+         "Total number of employees on workshare plan at location 3": "total_number_of_employees_on_workshare_plan_at_location_3",
+         "Occupations Impacted at location 3": "occupations_impacted_at_location_3",
+         "Expected date of first job losses at location 3": "expected_date_of_first_job_losses_at_location_3",
+         "Expected end date of first job losses at location 3": "expected_end_date_of_first_job_losses_at_location_3",
+         "Expected date of second job losses at location 3": "expected_date_of_second_job_losses_at_location_3",
+         "Expected end date of second job losses at location 3": "expected_end_date_of_second_job_losses_at_location_3",
+         "Expected date of third job losses at location 3": "expected_date_of_third_job_losses_at_location_3",
+         "Expected end date of third job losses at location 3": "expected_end_date_of_third_job_losses_at_location_3",
+         "Reason for Layoffs at location 3": "reason_for_layoffs_at_location_3",
+         "Do employees at location 3 having bumping rights?": "do_employees_at_location_3_having_bumping_rights",
+         "Are employees at location 3 represented by a union?": "are_employees_at_location_3_represented_by_a_union",
+         "Select the workforce area for location 3": "select_the_workforce_area_for_location_3",
+         "Name of Union 3": "name_of_union_3",
+         "Contact name for Union 3": "contact_name_for_union_3",
+         "Contact phone number for Union 3": "contact_phone_number_for_union_3",
+         "Email address for Union 3": "email_address_for_union_3",
+         "Address, City, ZIP for Union 3": "address_city_zip_for_union_3",
+         "Include here any comments or additional details": "include_here_any_comments_or_additional_details",
+         # This is for garbage, not legit crosswalk. You probably do not want to add here.
+     }
+
+     standardized_data = []
+     for row in cleaned_data:
+         row_dict = {}
+         mangled = []
+         for key in row:
+             if (
+                 key not in header_crosswalk and key not in header_garbage
+             ):  # Get all missing keys at once
+                 mangled.append(key)
+         if len(mangled) > 0:
+             logger.warning(f"Missing a bunch of keys: {'|'.join(mangled)}")
+
+         for key, value in row.items():
+             if (
+                 key not in header_crosswalk and key not in header_garbage
+             ):  # If we've never seen this before
+                 logger.warning(f"Could not find {key} in header_crosswalk")
+                 logger.warning(row)
+             if key not in header_garbage:  # if it's in the crosswalk, if it's legit
+                 standardized_key = header_crosswalk[key]
+                 row_dict[standardized_key] = value
+         if len(row_dict["company"]) < 3 and row_dict["letter"] == "Avis Budget Group":
+             row_dict["company"] = "Avis Budget Group"
+         if len(row_dict["company"]) < 3:  # or len(row_dict['naics']) <5:
+             logger.debug(f"Dropping row of questionable quality: {row_dict}")
+         elif "begin_date" in row_dict and row_dict["begin_date"] == "Layoff Date(s)":
+             logger.debug(f"Dropping row of questionable quality: {row_dict}")
+         else:
+             standardized_data.append(row_dict)
+
+     # Set the path to the final CSV
+     output_csv = data_dir / "co.csv"
+
+     # Write out the rows to the export directory
+     # headers = list(cleaned_data[0].keys())
+     utils.write_dict_rows_to_csv(
+         output_csv, set(header_crosswalk.values()), standardized_data
+     )
+
+     # Return the path to the final CSV
+     return output_csv
+
+
+ def scrape_google_sheets(table, header_list=None):
+     """
+     Scrapes data out of a Google Sheet.
+
+     Keyword arguments:
+     table -- A Google Sheet table pulled into BeautifulSoup
+     header_list -- A list of headers to use. Provide this when the source spreadsheet doesn't have a proper header row.
+
+     Returns: The parsed data as a list of dictionaries
+     """
+     # logger.debug(table)
+     # If a header list isn't provided, pull one out automatically
+     if not header_list:
+         # Pull out the header row
+         # header_soup = table.find_all("tr")[1]
+         header_soup = table.find_all("tr")[0]
+         # Parse the header row into a list,
+         # preserving its order in the sheet
+         header_list = []
+         for cellindex, cell in enumerate(header_soup.find_all("td")):
+             cell_text = cell.text.strip()
+             # Skip empty headers
+             if cell_text:
+                 header_list.append(cell_text)
+             if not cell_text and cellindex == 0:
+                 header_list.append("Company Name")
+
+     # Loop through all the data rows, which start
+     # after the header and the little bar
+     tr_list = table.find_all("tr")[1:]
+     logger.debug(f"Parsing {len(tr_list)} rows")
+     row_list = []
+     for row in tr_list:
+         # Only pull out the cells that have headers
+         cell_list = row.find_all("td")[: len(header_list)]
+
+         # Loop through the cells and key them into a dictionary using the header
+         row_dict = {}
+         for i, cell in enumerate(cell_list):
+             row_dict[header_list[i]] = cell.text.strip()
+
+         # Get values list for examination
+         value_list = list(row_dict.values())
+
+         # Skip empty rows
+         if not any(value_list):
+             continue
+
+         # Skip header rows
+         if "WARN Date" in value_list:
+             continue
+
+         # Keep whatever is left
+         row_list.append(row_dict)
+
+     # Return what we got
+     return row_list
+
+
+ if __name__ == "__main__":
+     scrape()
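
The substantive change in this release is in warn/scrapers/co.py: instead of fetching each linked Google Sheet's published page and parsing its "waffle" table, the scraper now rewrites every sheet link into the sheet's gviz HTML export and parses the first table there. A minimal standalone sketch of that URL adaptation, mirroring the logic in the hunk above (the adapt_sheet_url name and the example sheet ID are illustrative, not part of the package's API):

def adapt_sheet_url(href: str) -> str:
    """Convert a Google Sheets link into its plain-HTML gviz export URL (sketch only)."""
    if "/edit" in href:
        # e.g. https://docs.google.com/spreadsheets/d/<sheet id>/edit#gid=0
        return href.split("/edit")[0] + "/gviz/tq?tqx=out:html"
    if "drive.google.com/open?id=" in href:
        # e.g. https://drive.google.com/open?id=<sheet id>
        sheet_id = href.split("open?id=")[-1]
        return f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:html"
    raise ValueError(f"Could not adapt {href} to find HTML export.")

# Example with a hypothetical sheet ID:
# adapt_sheet_url("https://docs.google.com/spreadsheets/d/abc123/edit#gid=0")
# -> "https://docs.google.com/spreadsheets/d/abc123/gviz/tq?tqx=out:html"
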
{warn_scraper-1.2.108 → warn_scraper-1.2.109/warn_scraper.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: warn-scraper
- Version: 1.2.108
+ Version: 1.2.109
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
  Home-page: https://github.com/biglocalnews/warn-scraper
  Author: Big Local News
warn_scraper-1.2.108/warn/scrapers/co.py
@@ -1,238 +0,0 @@
- import logging
- from pathlib import Path
-
- from bs4 import BeautifulSoup, Tag
-
- from .. import utils
- from ..cache import Cache
-
- __authors__ = ["anikasikka"]
- __tags__ = ["html"]
- __source__ = {
-     "name": "Colorado Department of Labor and Employment",
-     "url": "https://cdle.colorado.gov/employers/layoff-separations/layoff-warn-list",
- }
-
- logger = logging.getLogger(__name__)
-
-
- def scrape(
-     data_dir: Path = utils.WARN_DATA_DIR,
-     cache_dir: Path = utils.WARN_CACHE_DIR,
- ) -> Path:
-     """
-     Scrape data from Colorado.
-
-     Keyword arguments:
-     data_dir -- the Path were the result will be saved (default WARN_DATA_DIR)
-     cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
-
-     Returns: the Path where the file is written
-     """
-     # Grab the page
-     page = utils.get_url(
-         "https://cdle.colorado.gov/employers/layoff-separations/layoff-warn-list"
-     )
-     html = page.text
-
-     # Write the raw file to the cache
-     cache = Cache(cache_dir)
-     cache.write("co/main/source.html", html)
-
-     # Parse the page
-     soup = BeautifulSoup(html, "html5lib")
-
-     # Get the link to the Google Sheet that's on the page
-     content_region = soup.find(class_="region-content")
-     if isinstance(content_region, Tag):
-         current_link = content_region.find("a", class_="btn-dark-blue")
-     else:
-         raise ValueError("Could not find content region")
-     if isinstance(current_link, Tag):
-         current_href = current_link["href"]
-     else:
-         raise ValueError("Could not find Google Sheet link")
-
-     # Open the Google Sheet
-     current_page = utils.get_url(current_href)
-     current_html = current_page.text
-
-     # Parse the Google Sheet
-     soup_current = BeautifulSoup(current_html, "html5lib")
-     table = soup_current.find(class_="waffle")
-     cleaned_data = scrape_google_sheets(table)
-
-     # Goes through the accordion links to get past data
-     content_region = soup.find(class_="region-content")
-     if isinstance(content_region, Tag):
-         accordion_list = content_region.find_all("dl")
-     else:
-         raise ValueError("Could not find content region")
-
-     # Make sure there's only one
-     assert len(accordion_list) == 1
-
-     # Grab the first one from the list
-     accordion = accordion_list[0]
-
-     link_list = [a for a in accordion.find_all("a") if "feedback" not in a.text]
-     logger.debug(f"Requesting {len(link_list)} discovered links")
-     for link in link_list:
-         page = utils.get_url(link["href"])
-         soup = BeautifulSoup(page.text, "html5lib")
-         table = soup.find(class_="waffle")
-         if "2017" in link.text:
-             header_list = [
-                 "Company",
-                 "Layoff Total",
-                 "Workforce Region",
-                 "WARN Date",
-                 "Reason for Layoff",
-             ]
-         elif "2019" in link.text:
-             header_list = [
-                 "Company Name",
-                 "Layoff Total",
-                 "Workforce Local Area",
-                 "WARN Date",
-                 "Reason for Layoff",
-                 "Occupations",
-                 "Layoff Date(s)",
-             ]
-         else:
-             header_list = []
-         cleaned_data += scrape_google_sheets(table, header_list)
-
-     # Clean up the headers
-     header_crosswalk = {
-         "Name": "company",
-         "Company Name": "company",
-         "Company": "company",
-         "WARN Date": "notice_date",
-         "Total Layoffs": "jobs",
-         "NAICS": "naics",
-         "Workforce Area": "workforce_area",
-         "# Perm": "permanent_job_losses",
-         "#Temp": "temporary_job_losses",
-         "Reduced Hours": "reduced_hours",
-         "#Furloughs": "furloughs",
-         "Begin Date": "begin_date",
-         "End Date": "end_date",
-         "Reason for Layoffs": "reason",
-         "Reason for Layoff": "reason",
-         "WARN Letter": "letter",
-         "Occupations Impacted": "occupations",
-         "Occupations": "occupations",
-         "Select the workforce area": "workforce_area",
-         "Total CO": "jobs",
-         "CO Layoffs": "jobs",
-         "Total number of permanent layoffs": "permanent_job_losses",
-         "# permanent": "permanent_job_losses",
-         "# Permanent": "permanent_job_losses",
-         "Total number of temporary layoffs": "temporary_job_losses",
-         "Total number of furloughs": "furloughs",
-         "Begin date of layoffs": "begin_date",
-         "End date of layoffs": "end_date",
-         "Layoff Total": "jobs",
-         "Local Area": "workforce_area",
-         "Layoff Date(s)": "begin_date",
-         "Temp Layoffs": "temporary_job_losses",
-         "Perm Layoffs": "permanent_job_losses",
-         "Furloughs": "furloughs",
-         "Workforce Local Area": "workforce_area",
-         "Workforce Region": "workforce_region",
-         "Contact Name": "contact",
-         "Contact Phone": "phone",
-         "Contact Email": "email",
-         "FEIN": "fein",
-         "Location Address": "location",
-         "Total number of employees at the location": "at_the_location",
-         "Sector 33 (6414) Guided Missle & Space Vehicle": "naics",
-         "@dropdown": "dropdown",
-         "Received": "received_date",
-         "Notes": "notes",
-     }
-     standardized_data = []
-     for row in cleaned_data:
-         row_dict = {}
-         for key, value in row.items():
-             standardized_key = header_crosswalk[key]
-             row_dict[standardized_key] = value
-         if len(row_dict["company"]) < 3 and row_dict["letter"] == "Avis Budget Group":
-             row_dict["company"] = "Avis Budget Group"
-         if len(row_dict["company"]) < 3:  # or len(row_dict['naics']) <5:
-             logger.debug(f"Dropping row of questionable quality: {row_dict}")
-         else:
-             standardized_data.append(row_dict)
-
-     # Set the path to the final CSV
-     output_csv = data_dir / "co.csv"
-
-     # Write out the rows to the export directory
-     # headers = list(cleaned_data[0].keys())
-     utils.write_dict_rows_to_csv(
-         output_csv, set(header_crosswalk.values()), standardized_data
-     )
-
-     # Return the path to the final CSV
-     return output_csv
-
-
- def scrape_google_sheets(table, header_list=None):
-     """
-     Scrapes data out of a Google Sheet.
-
-     Keyword arguments:
-     table -- A Google Sheet table pulled into BeautifulSoup
-     header_list -- A list of header to use. Provide this when the source spreadsheet doesn't have a proper header row.
-
-     Returns: The parsed data as a list of dictionaries
-     """
-     # If a header list isn't provided, pull one out automatically
-     if not header_list:
-         # Pull out the header row
-         header_soup = table.find_all("tr")[1]
-
-         # Parse the header row into a list,
-         # preserving its order in the sheet
-         header_list = []
-         for cell in header_soup.find_all("td"):
-             cell_text = cell.text.strip()
-             # Skip empty headers
-             if cell_text:
-                 header_list.append(cell_text)
-
-     # Loop through all the data rows, which start
-     # after the header and the little bar
-     tr_list = table.find_all("tr")[3:]
-     logger.debug(f"Parsing {len(tr_list)} rows")
-     row_list = []
-     for row in tr_list:
-         # Only pull out the cells that have headers
-         cell_list = row.find_all("td")[: len(header_list)]
-
-         # Loop through the cells and key them into a dictionary using the header
-         row_dict = {}
-         for i, cell in enumerate(cell_list):
-             row_dict[header_list[i]] = cell.text.strip()
-
-         # Get values list for examination
-         value_list = list(row_dict.values())
-
-         # Skip empty rows
-         if not any(value_list):
-             continue
-
-         # Skip header rows
-         if "WARN Date" in value_list:
-             continue
-
-         # Keep whatever is left
-         row_list.append(row_dict)
-
-     # Return what we got
-     return row_list
-
-
- if __name__ == "__main__":
-     scrape()