warn-scraper 1.2.104__tar.gz → 1.2.106__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (136)
  1. {warn_scraper-1.2.104/warn_scraper.egg-info → warn_scraper-1.2.106}/PKG-INFO +1 -1
  2. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/dc.py +5 -0
  3. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/id.py +23 -9
  4. {warn_scraper-1.2.104 → warn_scraper-1.2.106/warn_scraper.egg-info}/PKG-INFO +1 -1
  5. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/.devcontainer/devcontainer.json +0 -0
  6. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/.github/dependabot.yml.disabled-for-sanity +0 -0
  7. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/.github/workflows/continuous-deployment.yml +0 -0
  8. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/.github/workflows/continuous-deployment.yml.broken-tests +0 -0
  9. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/.gitignore +0 -0
  10. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/.pre-commit-config.yaml +0 -0
  11. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/LICENSE +0 -0
  12. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/MANIFEST.in +0 -0
  13. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/Makefile +0 -0
  14. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/Pipfile +0 -0
  15. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/Pipfile.lock +0 -0
  16. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/README.md +0 -0
  17. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/Makefile +0 -0
  18. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/R42693.pdf +0 -0
  19. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/gao-03-1003.pdf +0 -0
  20. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-actions-finished.png +0 -0
  21. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-actions-start.png +0 -0
  22. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-changelog-button.png +0 -0
  23. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-changelog-entered.png +0 -0
  24. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-draft-button.png +0 -0
  25. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-name-release.png +0 -0
  26. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-name-tag.png +0 -0
  27. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-publish-button.png +0 -0
  28. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-pypi.png +0 -0
  29. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-release-published.png +0 -0
  30. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-releases-button.png +0 -0
  31. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_static/releasing-tag-button.png +0 -0
  32. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/_templates/sources.md.tmpl +0 -0
  33. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/conf.py +0 -0
  34. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/contributing.rst +0 -0
  35. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/index.rst +0 -0
  36. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/make.bat +0 -0
  37. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/reference.rst +0 -0
  38. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/releasing.md +0 -0
  39. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/requirements.txt +0 -0
  40. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/al.md +0 -0
  41. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/az.md +0 -0
  42. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/ca.md +0 -0
  43. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/co.md +0 -0
  44. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/dc.md +0 -0
  45. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/de.md +0 -0
  46. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/ia.md +0 -0
  47. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/in.md +0 -0
  48. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/job_center.md +0 -0
  49. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/ks.md +0 -0
  50. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/md.md +0 -0
  51. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/me.md +0 -0
  52. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/mo.md +0 -0
  53. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/ny.md +0 -0
  54. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/ok.md +0 -0
  55. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/or.md +0 -0
  56. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/sc.md +0 -0
  57. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/tx.md +0 -0
  58. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/ut.md +0 -0
  59. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/va.md +0 -0
  60. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/vt.md +0 -0
  61. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/scrapers/wi.md +0 -0
  62. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/sources.md +0 -0
  63. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/docs/usage.md +0 -0
  64. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/setup.cfg +0 -0
  65. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/setup.py +0 -0
  66. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/__init__.py +0 -0
  67. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_cached_detail_pages.yaml +0 -0
  68. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_cached_search_results.yaml +0 -0
  69. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_missing_detail_page_values.yaml +0 -0
  70. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_no_results.yaml +0 -0
  71. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_paged_results.yaml +0 -0
  72. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/cassettes/test_scrape_integration.yaml +0 -0
  73. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/conftest.py +0 -0
  74. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/fixtures/2021_page_1.html +0 -0
  75. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/fixtures/2021_page_2.html +0 -0
  76. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/test_cache.py +0 -0
  77. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/test_delete.py +0 -0
  78. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/test_job_center.py +0 -0
  79. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/test_job_center_cache.py +0 -0
  80. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/tests/test_openpyxl.py +0 -0
  81. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/__init__.py +0 -0
  82. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/cache.py +0 -0
  83. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/cli.py +0 -0
  84. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/platforms/__init__.py +0 -0
  85. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/platforms/job_center/__init__.py +0 -0
  86. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/platforms/job_center/cache.py +0 -0
  87. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/platforms/job_center/site.py +0 -0
  88. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/platforms/job_center/urls.py +0 -0
  89. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/platforms/job_center/utils.py +0 -0
  90. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/runner.py +0 -0
  91. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/__init__.py +0 -0
  92. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ak.py +0 -0
  93. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/al.py +0 -0
  94. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/az.py +0 -0
  95. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ca.py +0 -0
  96. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/co.py +0 -0
  97. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ct.py +0 -0
  98. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/de.py +0 -0
  99. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/fl.py +0 -0
  100. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ga.py +0 -0
  101. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/hi.py +0 -0
  102. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ia.py +0 -0
  103. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/il.py +0 -0
  104. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/in.py +0 -0
  105. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ks.py +0 -0
  106. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ky.py +0 -0
  107. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/la.py +0 -0
  108. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/md.py +0 -0
  109. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/me.py +0 -0
  110. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/mi.py +0 -0
  111. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/mo.py +0 -0
  112. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/mt.py +0 -0
  113. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ne.py +0 -0
  114. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/nj.py +0 -0
  115. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/nm.py +0 -0
  116. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ny.py +0 -0
  117. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/oh.py +0 -0
  118. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ok.py +0 -0
  119. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/or.py +0 -0
  120. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ri.py +0 -0
  121. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/sc.py +0 -0
  122. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/sd.py +0 -0
  123. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/tn.py +0 -0
  124. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/tx.py +0 -0
  125. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/ut.py +0 -0
  126. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/va.py +0 -0
  127. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/vt.py +0 -0
  128. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/wa.py +0 -0
  129. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/scrapers/wi.py +0 -0
  130. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn/utils.py +0 -0
  131. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn_scraper.egg-info/SOURCES.txt +0 -0
  132. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn_scraper.egg-info/dependency_links.txt +0 -0
  133. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn_scraper.egg-info/entry_points.txt +0 -0
  134. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn_scraper.egg-info/not-zip-safe +0 -0
  135. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn_scraper.egg-info/requires.txt +0 -0
  136. {warn_scraper-1.2.104 → warn_scraper-1.2.106}/warn_scraper.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warn-scraper
3
- Version: 1.2.104
3
+ Version: 1.2.106
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
@@ -85,6 +85,11 @@ def scrape(
85
85
  r.encoding = "utf-8"
86
86
  html = r.text
87
87
 
88
+ # A June 2025 entry includes a weird table inside a table cell.
89
+ # This is an ugly patch.
90
+ weirdtable = r"(\s+<table>\s+<tbody>\s+<tr>\s+<td>)(.*?)(</td>\s+</tr>\s+</tbody>\s+</table>\s+)"
91
+ html = re.sub(weirdtable, r"\2", html)
92
+
88
93
  # Save it to the cache
89
94
  cache_key = uuid.uuid5(uuid.NAMESPACE_URL, href)
90
95
  cache.write(f"dc/{cache_key}.html", html)
@@ -69,6 +69,11 @@ def scrape(
69
69
  with pdfplumber.open(pdf_file) as pdf:
70
70
  for index, page in enumerate(pdf.pages):
71
71
  rows = page.extract_table()
72
+ if rows[0][0] in ["Date of\nLetter", "Date of Letter"] and index > 1:
73
+ rows = rows[
74
+ 1:
75
+ ] # Drop inside header rows that _clean_table will mangle if merged cells span pages
76
+ # logger.debug(f"\n\nRows for page {page}: {rows}")
72
77
  output_rows += _clean_table(rows, index)
73
78
 
74
79
  # Write out the data to a CSV
@@ -118,8 +123,9 @@ def _clean_table(rows, page_index) -> list:
118
123
  # logger.debug(f"Dropping faulty row with {len(output_row)} elements: {output_row}")
119
124
 
120
125
  # Only include the header on the first page
121
- if page_index != 0:
122
- return output_rows[1:]
126
+ # No, this needed to be filtered earlier
127
+ # if page_index != 0:
128
+ # return output_rows[1:]
123
129
 
124
130
  return output_rows
125
131
 
@@ -132,18 +138,26 @@ def filter_garbage_rows(incoming: list):
132
138
 
133
139
  Returns: List of lists that have a minimum number of elements.
134
140
  """
135
- badrows: int = 0
141
+ shortrows: int = 0
142
+ mixedrows: int = 0
136
143
  outgoing: list = []
137
- for row in incoming:
138
- if len(row) >= 4:
144
+ for rowindex, row in enumerate(incoming):
145
+ error = False
146
+ if len(row) < 5:
147
+ error = True
148
+ logger.debug(f"Dropping short row: {row}")
149
+ shortrows += 1
150
+ if row[0] == "Date of Letter" and rowindex != 0: # Keep the header row
151
+ error = True
152
+ logger.debug(f"Dropping partial row: {row}")
153
+ mixedrows += 1
154
+ if not error:
139
155
  outgoing.append(row)
140
- else:
141
- badrows += 1
142
- if badrows == 0:
156
+ if shortrows == 0 and mixedrows == 0:
143
157
  logger.debug("No bad rows found.")
144
158
  else:
145
159
  logger.debug(
146
- f"!!!!! {badrows:,} bad rows dropped from the data set with insufficient number of fields."
160
+ f"!!!! Dropped {shortrows} rows with insufficient number of fields, and dropped {mixedrows} rows scrambled with header info"
147
161
  )
148
162
  return outgoing
149
163
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warn-scraper
3
- Version: 1.2.104
3
+ Version: 1.2.106
4
4
  Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
5
5
  Home-page: https://github.com/biglocalnews/warn-scraper
6
6
  Author: Big Local News
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes