warn-scraper 1.2.112.tar.gz → 1.2.114.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/.pre-commit-config.yaml +1 -1
  2. {warn_scraper-1.2.112/warn_scraper.egg-info → warn_scraper-1.2.114}/PKG-INFO +1 -1
  3. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/Pipfile +1 -0
  4. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/Pipfile.lock +10 -1
  5. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/platforms/job_center/site.py +1 -1
  6. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/la.py +3 -39
  7. warn_scraper-1.2.114/warn/scrapers/ok.py +117 -0
  8. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/utils.py +103 -6
  9. {warn_scraper-1.2.112 → warn_scraper-1.2.114/warn_scraper.egg-info}/PKG-INFO +1 -1
  10. warn_scraper-1.2.112/warn/scrapers/ok.py +0 -42
  11. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/.devcontainer/devcontainer.json +0 -0
  12. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/.github/dependabot.yml.disabled-for-sanity +0 -0
  13. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/.github/workflows/continuous-deployment.yml +0 -0
  14. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/.github/workflows/continuous-deployment.yml.broken-tests +0 -0
  15. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/.gitignore +0 -0
  16. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/LICENSE +0 -0
  17. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/MANIFEST.in +0 -0
  18. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/Makefile +0 -0
  19. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/README.md +0 -0
  20. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/Makefile +0 -0
  21. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/R42693.pdf +0 -0
  22. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/gao-03-1003.pdf +0 -0
  23. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/releasing-actions-finished.png +0 -0
  24. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/releasing-actions-start.png +0 -0
  25. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/releasing-changelog-button.png +0 -0
  26. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/releasing-changelog-entered.png +0 -0
  27. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/releasing-draft-button.png +0 -0
  28. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/releasing-name-release.png +0 -0
  29. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/releasing-name-tag.png +0 -0
  30. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/releasing-publish-button.png +0 -0
  31. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/releasing-pypi.png +0 -0
  32. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/releasing-release-published.png +0 -0
  33. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/releasing-releases-button.png +0 -0
  34. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_static/releasing-tag-button.png +0 -0
  35. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/_templates/sources.md.tmpl +0 -0
  36. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/conf.py +0 -0
  37. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/contributing.rst +0 -0
  38. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/index.rst +0 -0
  39. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/make.bat +0 -0
  40. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/reference.rst +0 -0
  41. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/releasing.md +0 -0
  42. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/requirements.txt +0 -0
  43. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/al.md +0 -0
  44. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/az.md +0 -0
  45. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/ca.md +0 -0
  46. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/co.md +0 -0
  47. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/dc.md +0 -0
  48. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/de.md +0 -0
  49. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/ia.md +0 -0
  50. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/in.md +0 -0
  51. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/job_center.md +0 -0
  52. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/ks.md +0 -0
  53. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/md.md +0 -0
  54. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/me.md +0 -0
  55. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/mo.md +0 -0
  56. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/ny.md +0 -0
  57. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/ok.md +0 -0
  58. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/or.md +0 -0
  59. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/sc.md +0 -0
  60. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/tx.md +0 -0
  61. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/ut.md +0 -0
  62. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/va.md +0 -0
  63. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/vt.md +0 -0
  64. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/scrapers/wi.md +0 -0
  65. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/sources.md +0 -0
  66. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/docs/usage.md +0 -0
  67. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/setup.cfg +0 -0
  68. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/setup.py +0 -0
  69. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/__init__.py +0 -0
  70. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/cassettes/test_cached_detail_pages.yaml +0 -0
  71. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/cassettes/test_cached_search_results.yaml +0 -0
  72. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/cassettes/test_missing_detail_page_values.yaml +0 -0
  73. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/cassettes/test_no_results.yaml +0 -0
  74. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/cassettes/test_paged_results.yaml +0 -0
  75. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/cassettes/test_scrape_integration.yaml +0 -0
  76. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/conftest.py +0 -0
  77. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/fixtures/2021_page_1.html +0 -0
  78. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/fixtures/2021_page_2.html +0 -0
  79. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/test_cache.py +0 -0
  80. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/test_delete.py +0 -0
  81. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/test_job_center.py +0 -0
  82. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/test_job_center_cache.py +0 -0
  83. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/tests/test_openpyxl.py +0 -0
  84. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/__init__.py +0 -0
  85. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/cache.py +0 -0
  86. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/cli.py +0 -0
  87. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/platforms/__init__.py +0 -0
  88. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/platforms/job_center/__init__.py +0 -0
  89. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/platforms/job_center/cache.py +0 -0
  90. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/platforms/job_center/urls.py +0 -0
  91. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/platforms/job_center/utils.py +0 -0
  92. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/runner.py +0 -0
  93. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/__init__.py +0 -0
  94. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/ak.py +0 -0
  95. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/al.py +0 -0
  96. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/az.py +0 -0
  97. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/ca.py +0 -0
  98. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/co.py +0 -0
  99. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/ct.py +0 -0
  100. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/dc.py +0 -0
  101. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/de.py +0 -0
  102. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/fl.py +0 -0
  103. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/ga.py +0 -0
  104. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/hi.py +0 -0
  105. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/ia.py +0 -0
  106. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/id.py +0 -0
  107. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/il.py +0 -0
  108. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/in.py +0 -0
  109. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/ks.py +0 -0
  110. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/ky.py +0 -0
  111. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/md.py +0 -0
  112. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/me.py +0 -0
  113. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/mi.py +0 -0
  114. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/mo.py +0 -0
  115. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/mt.py +0 -0
  116. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/ne.py +0 -0
  117. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/nj.py +0 -0
  118. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/nm.py +0 -0
  119. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/ny.py +0 -0
  120. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/oh.py +0 -0
  121. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/or.py +0 -0
  122. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/ri.py +0 -0
  123. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/sc.py +0 -0
  124. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/sd.py +0 -0
  125. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/tn.py +0 -0
  126. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/tx.py +0 -0
  127. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/ut.py +0 -0
  128. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/va.py +0 -0
  129. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/vt.py +0 -0
  130. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/wa.py +0 -0
  131. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/wi.py +0 -0
  132. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn_scraper.egg-info/SOURCES.txt +0 -0
  133. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn_scraper.egg-info/dependency_links.txt +0 -0
  134. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn_scraper.egg-info/entry_points.txt +0 -0
  135. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn_scraper.egg-info/not-zip-safe +0 -0
  136. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn_scraper.egg-info/requires.txt +0 -0
  137. {warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn_scraper.egg-info/top_level.txt +0 -0

{warn_scraper-1.2.112 → warn_scraper-1.2.114}/.pre-commit-config.yaml
@@ -26,7 +26,7 @@ repos:
         additional_dependencies: [black]
 
   - repo: https://github.com/timothycrosley/isort
-    rev: 5.13.2
+    rev: 6.0.1
     hooks:
       - id: isort
         args: ["--profile", "black"]

{warn_scraper-1.2.112/warn_scraper.egg-info → warn_scraper-1.2.114}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warn-scraper
-Version: 1.2.112
+Version: 1.2.114
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News

{warn_scraper-1.2.112 → warn_scraper-1.2.114}/Pipfile
@@ -32,6 +32,7 @@ sphinxcontrib-serializinghtml = "1.1.5"
 setuptools = "*"
 jinja2 = "*"
 pytest = "==8.3.5"
+isort = "==6.0.1"
 
 [packages]
 beautifulsoup4 = "*"

{warn_scraper-1.2.112 → warn_scraper-1.2.114}/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "aac81610731435f6fe712525f4e5c72e3d7598da79d5a3a2f7580d7c013008a2"
+            "sha256": "b1cc4814822457ec04972c1a07991d1f983c946e8cdcfc315cbe1b6d9d84b2cd"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -1067,6 +1067,15 @@
             "markers": "python_version >= '3.8'",
             "version": "==2.1.0"
         },
+        "isort": {
+            "hashes": [
+                "sha256:1cb5df28dfbc742e490c5e41bad6da41b805b0a8be7bc93cd0fb2a8a890ac450",
+                "sha256:2dc5d7f65c9678d94c88dfc29161a320eec67328bc97aad576874cb4be1e9615"
+            ],
+            "index": "pypi",
+            "markers": "python_full_version >= '3.9.0'",
+            "version": "==6.0.1"
+        },
         "jaraco.classes": {
             "hashes": [
                 "sha256:47a024b51d0239c0dd8c8540c6c7f484be3b8fcf0b2d85c13825780d3b3f3acd",

{warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/platforms/job_center/site.py
@@ -112,7 +112,7 @@ class Site:
             logger.debug("Fetching from cache")
             return self.cache.fetch(url, params)
         else:
-            logger.debug("Pulling from the web")
+            logger.debug(f"Pulling from the web: {url} with params {params}")
             response = requests.get(url, params=params, verify=self.verify)
             logger.debug(f"Response code: {response.status_code}")
             html = response.text

{warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/scrapers/la.py
@@ -1,13 +1,10 @@
 import logging
 import os
 import re
-import sys
-from base64 import b64decode
 from datetime import datetime
 from pathlib import Path
 
 import pdfplumber
-import requests
 from bs4 import BeautifulSoup
 
 from .. import utils
@@ -36,14 +33,6 @@ def scrape(
 
     Returns: the Path where the file is written
     """
-    try:
-        zyte_api_key = os.environ["ZYTE_API_KEY"]
-    except KeyError:
-        logger.error(
-            "No ZYTE_API_KEY variable found in environment. Please get an API key from Zyte and export it."
-        )
-        sys.exit(1)
-
     # Fire up the cache
     cache = Cache(cache_dir)
 
@@ -54,18 +43,7 @@
 
     # Download the root page
     url = f"{base_url}Downloads/{file_base}.asp"
-    api_response = requests.post(
-        "https://api.zyte.com/v1/extract",
-        auth=(zyte_api_key, ""),
-        json={
-            "url": url,
-            "httpResponseBody": True,
-            "followRedirect": True,
-        },
-    )
-    html_bytes: bytes = b64decode(api_response.json()["httpResponseBody"])
-    # html = utils.get_url(url).text
-    html = html_bytes.decode("utf-8", errors="backslashreplace")
+    htmlbin, html = utils.get_with_zyte(url)
 
     # Save it to the cache
     cache_key = cache_dir / f"{state_code}/{file_base}.html"
@@ -82,25 +60,11 @@
         if "WARN Notices" in link.text:
             # Download the PDF
             pdf_url = f"{base_url}{link['href']}"
-            logger.debug(pdf_url)
-            api_response = requests.post(
-                "https://api.zyte.com/v1/extract",
-                auth=(zyte_api_key, ""),
-                json={
-                    "url": pdf_url,
-                    "httpResponseBody": True,
-                    "followRedirect": True,
-                },
-            )
-            http_response_body: bytes = b64decode(
-                api_response.json()["httpResponseBody"]
-            )
+            rawbin, rawtext = utils.get_with_zyte(pdf_url)
             pdf_path = cache_dir / f"{state_code}/{os.path.basename(pdf_url)}"
 
             with open(pdf_path, "wb") as fp:
-                fp.write(http_response_body)
-
-            # pdf_path = _read_or_download(cache, state_code, pdf_url)
+                fp.write(rawbin)
 
             # Process the PDF
             logger.debug(f"Attempting to parse {pdf_path}")

warn_scraper-1.2.114/warn/scrapers/ok.py
@@ -0,0 +1,117 @@
+import logging
+from pathlib import Path
+
+import requests
+
+from .. import utils
+
+__authors__ = ["zstumgoren", "Dilcia19", "stucka"]
+__tags__ = [""]
+__source__ = {
+    "name": "Oklahoma Office of Workforces Development",
+    "url": "https://www.employoklahoma.gov/Participants/s/warnnotices",
+}
+
+logger = logging.getLogger(__name__)
+
+
+def scrape(
+    data_dir: Path = utils.WARN_DATA_DIR,
+    cache_dir: Path = utils.WARN_CACHE_DIR,
+    use_cache: bool = True,
+) -> Path:
+    """
+    Scrape data from Oklahoma.
+
+    Keyword arguments:
+    data_dir -- the Path were the result will be saved (default WARN_DATA_DIR)
+    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
+    use_cache -- a Boolean indicating whether the cache should be used (default True)
+
+    Returns: the Path where the file is written
+    """
+    output_csv = data_dir / "ok.csv"
+    # search_url = "https://okjobmatch.com/search/warn_lookups"
+    # search_url = "https://www.employoklahoma.gov/Participants/s/warnnotices"
+    posturl = "https://www.employoklahoma.gov/Participants/s/sfsites/aura?r=2&aura.ApexAction.execute=6"
+
+    # There are a bunch of hard-coded values in here that seem to work for at least a day.
+    # Undetermined:
+    # -- Will this continue working in the short- or medium-term?
+    # -- What is the signficance of each variable?
+    # -- How do we refresh these?
+
+    headers = {
+        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0",
+        "Accept": "*/*",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Accept-Encoding": "gzip, deflate, br, zstd",
+        "Referer": "https://www.employoklahoma.gov/Participants/s/warnnotices",
+        "X-SFDC-LDS-Endpoints": "ApexActionController.execute:ConfigurableLoginAndMaintenanceMessages.hasDocument, ApexActionController.execute:ConfigurableLoginAndMaintenanceMessages.checkJobExpiry, ApexActionController.execute:ConfigurableLoginAndMaintenanceMessages.checkResumeExpiry, ApexActionController.execute:ConfigurableLoginAndMaintenanceMessages.checkUIRegistered, ApexActionController.execute:ConfigurableLoginAndMaintenanceMessages.getLoginMaintenanceMessage, ApexActionController.execute:OESC_JS_getWARNLayoffNotices.getListofLayoffAccService",
+        "X-SFDC-Page-Scope-Id": "9c659a19-8020-41b0-a81c-36335e22801a",
+        "X-SFDC-Request-Id": "16140000007a08bd2f",
+        "X-SFDC-Page-Cache": "9439898463d86806",
+        "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
+        "X-B3-TraceId": "856a2236ba7d283e",
+        "X-B3-SpanId": "b79b2da3a7dc4544",
+        "X-B3-Sampled": "0",
+        "Origin": "https://www.employoklahoma.gov",
+        "Connection": "keep-alive",
+        "Cookie": "renderCtx=%7B%22pageId%22%3A%223823bba2-3b00-4db7-aca6-5ca0eb67fc63%22%2C%22schema%22%3A%22Published%22%2C%22viewType%22%3A%22Published%22%2C%22brandingSetId%22%3A%22fa0b6362-0214-44b9-947d-2543eaab22c7%22%2C%22audienceIds%22%3A%22%22%7D; CookieConsentPolicy=0:1; LSKey-c$CookieConsentPolicy=0:1; pctrk=f3070d0c-7078-4062-96bb-de9e82cbb1db",
+        "Sec-Fetch-Dest": "empty",
+        "Sec-Fetch-Mode": "cors",
+        "Sec-Fetch-Site": "same-origin",
+    }
+
+    payload = "message=%7B%22actions%22%3A%5B%7B%22id%22%3A%22156%3Ba%22%2C%22descriptor%22%3A%22aura%3A%2F%2FApexActionController%2FACTION%24execute%22%2C%22callingDescriptor%22%3A%22UNKNOWN%22%2C%22params%22%3A%7B%22namespace%22%3A%22%22%2C%22classname%22%3A%22ConfigurableLoginAndMaintenanceMessages%22%2C%22method%22%3A%22hasDocument%22%2C%22params%22%3A%7B%7D%2C%22cacheable%22%3Afalse%2C%22isContinuation%22%3Afalse%7D%7D%2C%7B%22id%22%3A%22157%3Ba%22%2C%22descriptor%22%3A%22aura%3A%2F%2FApexActionController%2FACTION%24execute%22%2C%22callingDescriptor%22%3A%22UNKNOWN%22%2C%22params%22%3A%7B%22namespace%22%3A%22%22%2C%22classname%22%3A%22ConfigurableLoginAndMaintenanceMessages%22%2C%22method%22%3A%22checkJobExpiry%22%2C%22params%22%3A%7B%7D%2C%22cacheable%22%3Afalse%2C%22isContinuation%22%3Afalse%7D%7D%2C%7B%22id%22%3A%22158%3Ba%22%2C%22descriptor%22%3A%22aura%3A%2F%2FApexActionController%2FACTION%24execute%22%2C%22callingDescriptor%22%3A%22UNKNOWN%22%2C%22params%22%3A%7B%22namespace%22%3A%22%22%2C%22classname%22%3A%22ConfigurableLoginAndMaintenanceMessages%22%2C%22method%22%3A%22checkResumeExpiry%22%2C%22params%22%3A%7B%7D%2C%22cacheable%22%3Afalse%2C%22isContinuation%22%3Afalse%7D%7D%2C%7B%22id%22%3A%22159%3Ba%22%2C%22descriptor%22%3A%22aura%3A%2F%2FApexActionController%2FACTION%24execute%22%2C%22callingDescriptor%22%3A%22UNKNOWN%22%2C%22params%22%3A%7B%22namespace%22%3A%22%22%2C%22classname%22%3A%22ConfigurableLoginAndMaintenanceMessages%22%2C%22method%22%3A%22checkUIRegistered%22%2C%22params%22%3A%7B%7D%2C%22cacheable%22%3Afalse%2C%22isContinuation%22%3Afalse%7D%7D%2C%7B%22id%22%3A%22160%3Ba%22%2C%22descriptor%22%3A%22aura%3A%2F%2FApexActionController%2FACTION%24execute%22%2C%22callingDescriptor%22%3A%22UNKNOWN%22%2C%22params%22%3A%7B%22namespace%22%3A%22%22%2C%22classname%22%3A%22ConfigurableLoginAndMaintenanceMessages%22%2C%22method%22%3A%22getLoginMaintenanceMessage%22%2C%22params%22%3A%7B%22displayTo%22%3A%22Job%20Seekers%22%2C%22messageType%22%3A%22Portal%20Login%20Messages%22%7D%2C%22cacheable%22%3Afalse%2C%22isContinuation%22%3Afalse%7D%7D%2C%7B%22id%22%3A%22161%3Ba%22%2C%22descriptor%22%3A%22aura%3A%2F%2FApexActionController%2FACTION%24execute%22%2C%22callingDescriptor%22%3A%22UNKNOWN%22%2C%22params%22%3A%7B%22namespace%22%3A%22%22%2C%22classname%22%3A%22OESC_JS_getWARNLayoffNotices%22%2C%22method%22%3A%22getListofLayoffAccService%22%2C%22cacheable%22%3Afalse%2C%22isContinuation%22%3Afalse%7D%7D%5D%7D&aura.context=%7B%22mode%22%3A%22PROD%22%2C%22fwuid%22%3A%22eE5UbjZPdVlRT3M0d0xtOXc5MzVOQWg5TGxiTHU3MEQ5RnBMM0VzVXc1cmcxMi42MjkxNDU2LjE2Nzc3MjE2%22%2C%22app%22%3A%22siteforce%3AcommunityApp%22%2C%22loaded%22%3A%7B%22APPLICATION%40markup%3A%2F%2Fsiteforce%3AcommunityApp%22%3A%221305_7pTC6grCTP7M16KdvDQ-Xw%22%7D%2C%22dn%22%3A%5B%5D%2C%22globals%22%3A%7B%7D%2C%22uad%22%3Atrue%7D&aura.pageURI=%2FParticipants%2Fs%2Fwarnnotices&aura.token=null"
+
+    logger.debug(f"Attempting to send hard-coded data to {posturl}")
+    r = requests.post(posturl, headers=headers, data=payload)
+    rawdata = r.json()
+
+    for entry in rawdata["actions"]:
+        if (
+            entry["id"] == "161;a"
+        ):  # What is this value? Will this change? Also no idea.
+            cleanerdata = entry["returnValue"]["returnValue"]
+    """
+    fields = set()
+    for entry in cleanerdata:
+        for field in entry:
+            fields.add(field)
+    {'Id',
+    'Launchpad__Layoff_Closure_Type__c',
+    'Launchpad__Notice_Date__c',
+    'OESC_Employer_City__c',
+    'OESC_Employer_Name__c',
+    'OESC_Employer_Zip_Code__c',
+    'RecordTypeId',
+    'Select_Local_Workforce_Board__c'}
+    """
+    fields = {
+        "Id": "id",
+        "Launchpad__Layoff_Closure_Type__c": "closure_type",
+        "Launchpad__Notice_Date__c": "notice_date",
+        "OESC_Employer_City__c": "city",
+        "OESC_Employer_Name__c": "company name",
+        "OESC_Employer_Zip_Code__c": "zip_code",
+        "RecordTypeId": "record_type_id",
+        "Select_Local_Workforce_Board__c": "workforce_board",
+    }
+
+    masterlist = []
+    for entry in cleanerdata:
+        line = {}
+        for item in fields:
+            if item in entry:
+                line[fields[item]] = entry[item]
+            else:
+                line[fields[item]] = None
+        masterlist.append(line)
+
+    utils.write_dict_rows_to_csv(output_csv, list(fields.values()), masterlist)
+    return output_csv
+
+
+if __name__ == "__main__":
+    scrape()
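
The rewritten Oklahoma scraper above is self-contained and, like the other state modules, exposes a single scrape() entry point. A minimal sketch of running it (assuming the package and its dependencies are installed, and that the hard-coded Salesforce headers and payload are still accepted by the endpoint):

    from warn.scrapers import ok

    # scrape() POSTs the hard-coded Aura request, maps the Launchpad/OESC field
    # names to friendlier column headers, and writes the rows out as CSV.
    csv_path = ok.scrape()  # defaults to utils.WARN_DATA_DIR / "ok.csv"
    print(csv_path)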

{warn_scraper-1.2.112 → warn_scraper-1.2.114}/warn/utils.py
@@ -1,7 +1,9 @@
 import csv
+import json
 import logging
 import os
 import typing
+from base64 import b64decode, b64encode
 from pathlib import Path
 from time import sleep
 
@@ -94,6 +96,103 @@ def save_if_good_url(filename, url, **kwargs):
     return success_flag, content
 
 
+def get_with_zyte(url):
+    """Use Zyte as a proxy server to retrieve data not available without it.
+
+    Args:
+        url (str): URL to retrieve
+    Returns:
+        returnbin (bin): raw binary representation of returned data object
+        returntext (str): utf-8 conversion of returned data object, e.g., HTML
+    Failures:
+        Returns (None, None) if it encounters a problem and logs an error.
+    Requires:
+        ZYTE_API_KEY to be set in environment
+    """
+    logger.debug(f"Seeking to fetch {url} with Zyte")
+    try:
+        zyte_api_key = os.environ["ZYTE_API_KEY"]
+    except KeyError:
+        logger.error(
+            "No ZYTE_API_KEY variable found in environment. Please get an API key from Zyte and export it."
+        )
+        return (None, None)
+
+    api_response = requests.post(
+        "https://api.zyte.com/v1/extract",
+        auth=(zyte_api_key, ""),
+        json={
+            "url": url,
+            "httpResponseBody": True,
+            "followRedirect": True,
+        },
+    )
+
+    if not api_response.ok:
+        logger.error(
+            f"Error downloading {url} with get_with_zyte. Repsonse code: {api_response.status_code}"
+        )
+        return (None, None)
+    returnbin: bytes = b64decode(api_response.json()["httpResponseBody"])
+    returntext: str = returnbin.decode("utf-8", errors="backslashreplace")
+    logger.debug(f"Fetched {url}")
+    return (returnbin, returntext)
+
+
+def post_with_zyte(url, payload):
+    """Use Zyte as a proxy server to retrieve data not available without it.
+
+    Args:
+        url (str): URL to retrieve
+        payload: (dict, str or binary): POST body.
+            If type dict: Convert to utf-8 text then:
+            If type str: Convert to b64encoded
+    Returns:
+        returnbin (bin): raw binary representation of returned data object
+        returntext (str): utf-8 conversion of returned data object, e.g., HTML
+    Failures:
+        Returns (None, None) if it encounters a problem and logs an error.
+    Requires:
+        ZYTE_API_KEY to be set in environment
+    """
+    logger.debug(f"Seeking to fetch {url} with Zyte")
+    try:
+        zyte_api_key = os.environ["ZYTE_API_KEY"]
+    except KeyError:
+        logger.error(
+            "No ZYTE_API_KEY variable found in environment. Please get an API key from Zyte and export it."
+        )
+        return (None, None)
+
+    if isinstance(payload, dict):
+        payload = json.dumps(payload)
+
+    if isinstance(payload, str):
+        payload = b64encode(payload.encode("utf-8"))
+
+    api_response = requests.post(
+        "https://api.zyte.com/v1/extract",
+        auth=(zyte_api_key, ""),
+        json={
+            "url": url,
+            "httpRequestMethod": "POST",
+            "httpRequestBody": payload,
+            "httpResponseBody": True,
+            "followRedirect": True,
+        },
+    )
+
+    if not api_response.ok:
+        logger.error(
+            f"Error downloading {url} with post_with_zyte. Repsonse code: {api_response.status_code}. Reponse: {api_response.json()}"
+        )
+        return (None, None)
+    returnbin: bytes = b64decode(api_response.json()["httpResponseBody"])
+    returntext: str = returnbin.decode("utf-8", errors="backslashreplace")
+    logger.debug(f"Fetched {url}")
+    return (returnbin, returntext)
+
+
 def write_rows_to_csv(output_path: Path, rows: list, mode="w"):
     """Write the provided list to the provided path as comma-separated values.
 
@@ -109,21 +208,19 @@ def write_rows_to_csv(output_path: Path, rows: list, mode="w"):
         writer.writerows(rows)
 
 
-def write_dict_rows_to_csv(
-    output_path, headers, rows, mode="w", extrasaction="raise", encoding="utf-8"
-):
-    """Write the provided dictionary to the provided path as comma-separated values.
+def write_dict_rows_to_csv(output_path, headers, rows, mode="w", extrasaction="raise"):
+    """Write the provided list of dictionaries to the provided path as comma-separated values.
 
     Args:
         output_path (Path): the Path were the result will be saved
         headers (list): a list of the headers for the output file
-        rows (list): the dict to be saved
+        rows (list): the list of dictionaries to be saved
         mode (str): the mode to be used when opening the file (default 'w')
         extrasaction (str): what to do if the if a field isn't in the headers (default 'raise')
     """
     create_directory(output_path, is_file=True)
     logger.debug(f"Writing {len(rows)} rows to {output_path}")
-    with open(output_path, mode, newline="", encoding=encoding) as f:
+    with open(output_path, mode, newline="") as f:
         # Create the writer object
         writer = csv.DictWriter(f, fieldnames=headers, extrasaction=extrasaction)
         # If we are writing a new row ...
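
Both new Zyte helpers return a (raw bytes, decoded text) tuple and fall back to (None, None) after logging an error, so callers should guard against the failure case. A minimal usage sketch (placeholder URLs; ZYTE_API_KEY must be exported in the environment):

    from warn import utils

    # GET through the Zyte proxy; returns (raw bytes, utf-8 text) or (None, None) on failure.
    rawbin, html = utils.get_with_zyte("https://example.com/warn-notices")
    if html is None:
        raise RuntimeError("Zyte fetch failed; check ZYTE_API_KEY and the error log")

    # POST variant; dict payloads are JSON-encoded and strings are base64-encoded before sending.
    rawbin, text = utils.post_with_zyte("https://example.com/api/search", {"query": "warn"})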

{warn_scraper-1.2.112 → warn_scraper-1.2.114/warn_scraper.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warn-scraper
-Version: 1.2.112
+Version: 1.2.114
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News

warn_scraper-1.2.112/warn/scrapers/ok.py
@@ -1,42 +0,0 @@
-from pathlib import Path
-
-from warn.platforms.job_center.utils import scrape_state
-
-from .. import utils
-
-__authors__ = ["zstumgoren", "Dilcia19"]
-__tags__ = ["jobcenter"]
-__source__ = {
-    "name": "Oklahoma Office of Workforces Development",
-    "url": "https://okjobmatch.com/search/warn_lookups/new",
-}
-
-
-def scrape(
-    data_dir: Path = utils.WARN_DATA_DIR,
-    cache_dir: Path = utils.WARN_CACHE_DIR,
-    use_cache: bool = True,
-) -> Path:
-    """
-    Scrape data from Oklahoma.
-
-    Keyword arguments:
-    data_dir -- the Path were the result will be saved (default WARN_DATA_DIR)
-    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
-    use_cache -- a Boolean indicating whether the cache should be used (default True)
-
-    Returns: the Path where the file is written
-    """
-    output_csv = data_dir / "ok.csv"
-    search_url = "https://okjobmatch.com/search/warn_lookups"
-    # Date chosen based on manual research
-    stop_year = 1999
-    # Use cache for years before current and prior year
-    scrape_state(
-        "OK", search_url, output_csv, stop_year, cache_dir, use_cache=use_cache
-    )
-    return output_csv
-
-
-if __name__ == "__main__":
-    scrape()