warn-scraper 1.2.116__tar.gz → 1.2.118__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. {warn_scraper-1.2.116/warn_scraper.egg-info → warn_scraper-1.2.118}/PKG-INFO +1 -1
  2. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/co.py +1 -0
  3. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/ky.py +65 -36
  4. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/utils.py +35 -0
  5. {warn_scraper-1.2.116 → warn_scraper-1.2.118/warn_scraper.egg-info}/PKG-INFO +1 -1
  6. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/.devcontainer/devcontainer.json +0 -0
  7. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/.github/dependabot.yml.disabled-for-sanity +0 -0
  8. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/.github/workflows/continuous-deployment.yml +0 -0
  9. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/.github/workflows/continuous-deployment.yml.broken-tests +0 -0
  10. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/.gitignore +0 -0
  11. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/.pre-commit-config.yaml +0 -0
  12. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/LICENSE +0 -0
  13. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/MANIFEST.in +0 -0
  14. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/Makefile +0 -0
  15. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/Pipfile +0 -0
  16. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/Pipfile.lock +0 -0
  17. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/README.md +0 -0
  18. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/Makefile +0 -0
  19. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/R42693.pdf +0 -0
  20. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/gao-03-1003.pdf +0 -0
  21. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/releasing-actions-finished.png +0 -0
  22. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/releasing-actions-start.png +0 -0
  23. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/releasing-changelog-button.png +0 -0
  24. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/releasing-changelog-entered.png +0 -0
  25. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/releasing-draft-button.png +0 -0
  26. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/releasing-name-release.png +0 -0
  27. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/releasing-name-tag.png +0 -0
  28. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/releasing-publish-button.png +0 -0
  29. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/releasing-pypi.png +0 -0
  30. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/releasing-release-published.png +0 -0
  31. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/releasing-releases-button.png +0 -0
  32. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_static/releasing-tag-button.png +0 -0
  33. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/_templates/sources.md.tmpl +0 -0
  34. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/conf.py +0 -0
  35. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/contributing.rst +0 -0
  36. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/index.rst +0 -0
  37. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/make.bat +0 -0
  38. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/reference.rst +0 -0
  39. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/releasing.md +0 -0
  40. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/requirements.txt +0 -0
  41. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/al.md +0 -0
  42. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/az.md +0 -0
  43. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/ca.md +0 -0
  44. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/co.md +0 -0
  45. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/dc.md +0 -0
  46. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/de.md +0 -0
  47. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/ia.md +0 -0
  48. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/in.md +0 -0
  49. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/job_center.md +0 -0
  50. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/ks.md +0 -0
  51. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/md.md +0 -0
  52. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/me.md +0 -0
  53. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/mo.md +0 -0
  54. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/ny.md +0 -0
  55. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/ok.md +0 -0
  56. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/or.md +0 -0
  57. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/sc.md +0 -0
  58. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/tx.md +0 -0
  59. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/ut.md +0 -0
  60. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/va.md +0 -0
  61. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/vt.md +0 -0
  62. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/scrapers/wi.md +0 -0
  63. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/sources.md +0 -0
  64. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/docs/usage.md +0 -0
  65. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/setup.cfg +0 -0
  66. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/setup.py +0 -0
  67. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/__init__.py +0 -0
  68. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/cassettes/test_cached_detail_pages.yaml +0 -0
  69. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/cassettes/test_cached_search_results.yaml +0 -0
  70. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/cassettes/test_missing_detail_page_values.yaml +0 -0
  71. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/cassettes/test_no_results.yaml +0 -0
  72. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/cassettes/test_paged_results.yaml +0 -0
  73. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/cassettes/test_scrape_integration.yaml +0 -0
  74. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/conftest.py +0 -0
  75. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/fixtures/2021_page_1.html +0 -0
  76. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/fixtures/2021_page_2.html +0 -0
  77. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/test_cache.py +0 -0
  78. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/test_delete.py +0 -0
  79. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/test_job_center.py +0 -0
  80. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/test_job_center_cache.py +0 -0
  81. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/tests/test_openpyxl.py +0 -0
  82. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/__init__.py +0 -0
  83. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/cache.py +0 -0
  84. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/cli.py +0 -0
  85. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/platforms/__init__.py +0 -0
  86. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/platforms/job_center/__init__.py +0 -0
  87. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/platforms/job_center/cache.py +0 -0
  88. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/platforms/job_center/site.py +0 -0
  89. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/platforms/job_center/urls.py +0 -0
  90. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/platforms/job_center/utils.py +0 -0
  91. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/runner.py +0 -0
  92. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/__init__.py +0 -0
  93. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/ak.py +0 -0
  94. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/al.py +0 -0
  95. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/az.py +0 -0
  96. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/ca.py +0 -0
  97. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/ct.py +0 -0
  98. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/dc.py +0 -0
  99. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/de.py +0 -0
  100. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/fl.py +0 -0
  101. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/ga.py +0 -0
  102. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/hi.py +0 -0
  103. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/ia.py +0 -0
  104. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/id.py +0 -0
  105. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/il.py +0 -0
  106. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/in.py +0 -0
  107. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/ks.py +0 -0
  108. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/la.py +0 -0
  109. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/md.py +0 -0
  110. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/me.py +0 -0
  111. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/mi.py +0 -0
  112. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/mo.py +0 -0
  113. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/mt.py +0 -0
  114. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/ne.py +0 -0
  115. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/nj.py +0 -0
  116. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/nm.py +0 -0
  117. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/ny.py +0 -0
  118. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/oh.py +0 -0
  119. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/ok.py +0 -0
  120. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/or.py +0 -0
  121. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/ri.py +0 -0
  122. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/sc.py +0 -0
  123. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/sd.py +0 -0
  124. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/tn.py +0 -0
  125. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/tx.py +0 -0
  126. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/ut.py +0 -0
  127. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/va.py +0 -0
  128. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/vt.py +0 -0
  129. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/wa.py +0 -0
  130. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn/scrapers/wi.py +0 -0
  131. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn_scraper.egg-info/SOURCES.txt +0 -0
  132. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn_scraper.egg-info/dependency_links.txt +0 -0
  133. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn_scraper.egg-info/entry_points.txt +0 -0
  134. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn_scraper.egg-info/not-zip-safe +0 -0
  135. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn_scraper.egg-info/requires.txt +0 -0
  136. {warn_scraper-1.2.116 → warn_scraper-1.2.118}/warn_scraper.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warn-scraper
3
- Version: 1.2.116
3
+ Version: 1.2.118
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -184,6 +184,7 @@ def scrape(
184
184
  "@dropdown": "dropdown",
185
185
  "Received": "received_date",
186
186
  "Notes": "notes",
187
+ "12/1/25`": "company",
187
188
  # Only add new matches above here, not below here.
188
189
  }
189
190
 
@@ -60,34 +60,62 @@ def scrape(
60
60
  # Open it up
61
61
  workbook = load_workbook(filename=latest_path)
62
62
 
63
- dirty_list: list = []
63
+ crosswalk = {
64
+ "Closure or Layoff?": "closure_or_layoff",
65
+ "Company Name": "company",
66
+ "Company: Company Name": "company",
67
+ "County": "county",
68
+ "Date Received": "date_received",
69
+ "Employees": "employees",
70
+ "NAICS": "NAICS",
71
+ "NAICS Code": "NAICS",
72
+ "Notice Link": "notice_url",
73
+ "Notice Type": "source",
74
+ "Notice URL": "notice_url",
75
+ "Notice: Notice Number": "notice_number",
76
+ "Number of Employees Affected": "employees",
77
+ "Projected Date": "date_effective",
78
+ "Region": "region",
79
+ "Trade": "trade",
80
+ "Type of Employees Affected": "union_affected",
81
+ "Workforce Board": "region",
82
+ "address": "address",
83
+ "comments": "comments",
84
+ "congressional": "congressional",
85
+ "contact": "contact",
86
+ "industry": "industry",
87
+ "neg": "neg", # Rarely seen, only in historical data, maybe all with "N"
88
+ "occupations": "industry",
89
+ "source": "source",
90
+ "union": "union", # Unclear if different than types of employees affected/union_affected
91
+ }
92
+
93
+ masterlist: list = []
64
94
  for sheet in workbook.worksheets:
65
95
  localrows = parse_xlsx(sheet)
66
- dirty_list.extend(localrows)
67
96
 
68
- headers = dirty_list[0]
69
- row_list = []
70
- for rowindex, row in enumerate(dirty_list):
71
- if (
72
- row != headers
73
- ): # Filter out headers, but also double-check when headers may change
74
- if row[0] == "Date Received":
75
- logger.debug(
76
- f"Dropping dirty row that doesn't quite match headers in row {rowindex}"
77
- )
78
- logger.debug(f"Want: {headers}")
79
- logger.debug(f"Got : {row}")
97
+ # Traverse each tab. Assume the first line is a header. Check if the second line is bogus.
98
+ # Build a list of dicts.
99
+ localheadersraw: list = localrows[0]
100
+ localheaders: list = []
101
+ for entry in localheadersraw:
102
+ if entry not in crosswalk:
103
+ logger.error(f"Potential header {entry} not found in crosswalk.")
80
104
  else:
81
- line = {}
82
- for i, fieldname in enumerate(headers):
105
+ localheaders.append(crosswalk[entry])
106
+ for row in localrows[1:]: # Skip the header row
107
+ if row[0] != "Date Received": # Check for fake second header
108
+ line: dict = {}
109
+ for i, fieldname in enumerate(localheaders):
83
110
  line[fieldname] = row[i]
84
- row_list.append(line)
85
- # dirty_list = None
86
- logger.debug(
87
- f"Successfully merged {len(row_list)-1:,} records from new spreadsheet."
88
- )
111
+ if isinstance(row[i], str):
112
+ line[fieldname] = row[i].strip()
113
+ masterlist.append(line)
89
114
 
90
- # Need to double-check this archived file code, and make sure headers match
115
+ logger.debug(f"Successfully merged {len(masterlist)} records from new spreadsheet.")
116
+
117
+ # Earlier versions of this code needed the archived data to match the new data.
118
+ # We can no longer expect that since October 2025 data revisions.
91
119
 
92
120
  archive_url = "https://storage.googleapis.com/bln-data-public/warn-layoffs/ky-historical-normalized.csv"
93
121
 
@@ -96,24 +124,25 @@ def scrape(
96
124
 
97
125
  reader = list(csv.reader(r.text.splitlines()))
98
126
 
99
- headerlength = len(headers)
100
-
101
- assert reader[0][:headerlength] == headers
102
- logger.debug(
103
- f"Historical data matches current headers. Merging {len(reader)-1:,} records."
104
- )
105
-
127
+ localheadersraw = reader[0]
128
+ localheaders: list = [] # type: ignore
129
+ for entry in localheadersraw:
130
+ if entry not in crosswalk:
131
+ logger.error(f"Cannot match possible header value of {entry} to crosswalk.")
132
+ else:
133
+ localheaders.append(crosswalk[entry])
106
134
  for row in reader[1:]: # Skip header row
107
- line = {}
108
- for i, item in enumerate(headers):
109
- line[item] = row[
110
- i
111
- ] # Make this a list of dictionaries to match earlier input
112
- row_list.append(line)
135
+ line: dict = {} # type: ignore
136
+ for i, fieldname in enumerate(localheaders):
137
+ line[fieldname] = row[i]
138
+ if isinstance(row[i], str):
139
+ line[fieldname] = row[i].strip()
140
+ masterlist.append(line)
141
+ logger.debug("Historical records folded in.")
113
142
 
114
143
  # Write out the results
115
144
  data_path = data_dir / "ky.csv"
116
- utils.write_dict_rows_to_csv(data_path, headers, row_list, extrasaction="ignore")
145
+ utils.write_disparate_dict_rows_to_csv(data_path, masterlist)
117
146
 
118
147
  # Pass it out
119
148
  return data_path
@@ -232,6 +232,41 @@ def write_dict_rows_to_csv(output_path, headers, rows, mode="w", extrasaction="r
232
232
  writer.writerow(row)
233
233
 
234
234
 
235
+ def write_disparate_dict_rows_to_csv(output_path, rows, mode="w"):
236
+ """Write the provided list of dictionaries to the provided path as comma-separated values, while determining a header.
237
+
238
+ Args:
239
+ output_path (Path): the Path where the result will be saved
240
+ rows (list): the list of dictionaries to be saved; can have disparate dict keys
241
+ mode (str): the mode to be used when opening the file (default 'w')
242
+ """
243
+ create_directory(output_path, is_file=True)
244
+ headers: set = set() # Get all the potential header names
245
+ for row in rows:
246
+ for item in row:
247
+ headers.add(item)
248
+ headers = list(sorted(headers))
249
+ logger.debug(f"Found {len(headers):,} header entries in list of dicts.")
250
+ logger.debug(f"Writing {len(rows)} rows to {output_path}")
251
+ with open(output_path, mode, newline="") as outfile:
252
+ # Create the writer object
253
+ writer = csv.writer(outfile)
254
+ # If we are writing a new file ...
255
+ if mode == "w":
256
+ # ... drop in the headers
257
+ writer.writerow(headers)
258
+ # Loop through the dicts and write them in one by one.
259
+ for row in rows:
260
+ line = {}
261
+ for item in headers:
262
+ if item in row:
263
+ line[item] = row[item]
264
+ else:
265
+ line[item] = None
266
+ writer.writerow(list(line.values()))
267
+ return
268
+
269
+
235
270
  def get_all_scrapers():
236
271
  """Get all the states and territories that have scrapers.
237
272
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warn-scraper
3
- Version: 1.2.116
3
+ Version: 1.2.118
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes