warn-scraper 1.2.72__tar.gz → 1.2.73__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. {warn-scraper-1.2.72/warn_scraper.egg-info → warn-scraper-1.2.73}/PKG-INFO +1 -1
  2. warn-scraper-1.2.73/warn/scrapers/tn.py +94 -0
  3. {warn-scraper-1.2.72 → warn-scraper-1.2.73/warn_scraper.egg-info}/PKG-INFO +1 -1
  4. warn-scraper-1.2.72/warn/scrapers/tn.py +0 -156
  5. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/.devcontainer/devcontainer.json +0 -0
  6. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/.github/dependabot.yml.disabled +0 -0
  7. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/.github/workflows/continuous-deployment.yml +0 -0
  8. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/.gitignore +0 -0
  9. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/.pre-commit-config.yaml +0 -0
  10. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/LICENSE +0 -0
  11. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/MANIFEST.in +0 -0
  12. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/Makefile +0 -0
  13. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/Pipfile +0 -0
  14. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/Pipfile.lock +0 -0
  15. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/README.md +0 -0
  16. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/Makefile +0 -0
  17. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/R42693.pdf +0 -0
  18. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/gao-03-1003.pdf +0 -0
  19. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/releasing-actions-finished.png +0 -0
  20. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/releasing-actions-start.png +0 -0
  21. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/releasing-changelog-button.png +0 -0
  22. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/releasing-changelog-entered.png +0 -0
  23. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/releasing-draft-button.png +0 -0
  24. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/releasing-name-release.png +0 -0
  25. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/releasing-name-tag.png +0 -0
  26. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/releasing-publish-button.png +0 -0
  27. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/releasing-pypi.png +0 -0
  28. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/releasing-release-published.png +0 -0
  29. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/releasing-releases-button.png +0 -0
  30. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_static/releasing-tag-button.png +0 -0
  31. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/_templates/sources.md.tmpl +0 -0
  32. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/conf.py +0 -0
  33. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/contributing.rst +0 -0
  34. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/index.rst +0 -0
  35. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/make.bat +0 -0
  36. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/reference.rst +0 -0
  37. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/releasing.md +0 -0
  38. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/requirements.txt +0 -0
  39. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/al.md +0 -0
  40. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/az.md +0 -0
  41. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/ca.md +0 -0
  42. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/co.md +0 -0
  43. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/dc.md +0 -0
  44. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/de.md +0 -0
  45. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/ia.md +0 -0
  46. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/in.md +0 -0
  47. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/job_center.md +0 -0
  48. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/ks.md +0 -0
  49. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/md.md +0 -0
  50. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/me.md +0 -0
  51. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/mo.md +0 -0
  52. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/ny.md +0 -0
  53. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/ok.md +0 -0
  54. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/or.md +0 -0
  55. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/sc.md +0 -0
  56. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/tx.md +0 -0
  57. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/ut.md +0 -0
  58. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/va.md +0 -0
  59. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/vt.md +0 -0
  60. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/scrapers/wi.md +0 -0
  61. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/sources.md +0 -0
  62. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/docs/usage.md +0 -0
  63. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/setup.cfg +0 -0
  64. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/setup.py +0 -0
  65. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/__init__.py +0 -0
  66. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/cassettes/test_cached_detail_pages.yaml +0 -0
  67. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/cassettes/test_cached_search_results.yaml +0 -0
  68. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/cassettes/test_missing_detail_page_values.yaml +0 -0
  69. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/cassettes/test_no_results.yaml +0 -0
  70. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/cassettes/test_paged_results.yaml +0 -0
  71. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/cassettes/test_scrape_integration.yaml +0 -0
  72. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/conftest.py +0 -0
  73. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/fixtures/2021_page_1.html +0 -0
  74. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/fixtures/2021_page_2.html +0 -0
  75. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/test_cache.py +0 -0
  76. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/test_delete.py +0 -0
  77. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/test_job_center.py +0 -0
  78. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/test_job_center_cache.py +0 -0
  79. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/tests/test_openpyxl.py +0 -0
  80. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/__init__.py +0 -0
  81. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/cache.py +0 -0
  82. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/cli.py +0 -0
  83. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/platforms/__init__.py +0 -0
  84. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/platforms/job_center/__init__.py +0 -0
  85. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/platforms/job_center/cache.py +0 -0
  86. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/platforms/job_center/site.py +0 -0
  87. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/platforms/job_center/urls.py +0 -0
  88. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/platforms/job_center/utils.py +0 -0
  89. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/runner.py +0 -0
  90. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/__init__.py +0 -0
  91. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/ak.py +0 -0
  92. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/al.py +0 -0
  93. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/az.py +0 -0
  94. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/ca.py +0 -0
  95. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/co.py +0 -0
  96. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/ct.py +0 -0
  97. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/dc.py +0 -0
  98. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/de.py +0 -0
  99. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/fl.py +0 -0
  100. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/ga.py +0 -0
  101. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/hi.py +0 -0
  102. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/ia.py +0 -0
  103. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/id.py +0 -0
  104. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/il.py +0 -0
  105. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/in.py +0 -0
  106. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/ks.py +0 -0
  107. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/ky.py +0 -0
  108. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/la.py +0 -0
  109. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/md.py +0 -0
  110. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/me.py +0 -0
  111. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/mi.py +0 -0
  112. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/mo.py +0 -0
  113. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/mt.py +0 -0
  114. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/ne.py +0 -0
  115. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/nj.py +0 -0
  116. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/nm.py +0 -0
  117. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/ny.py +0 -0
  118. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/oh.py +0 -0
  119. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/ok.py +0 -0
  120. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/or.py +0 -0
  121. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/ri.py +0 -0
  122. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/sc.py +0 -0
  123. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/sd.py +0 -0
  124. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/tx.py +0 -0
  125. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/ut.py +0 -0
  126. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/va.py +0 -0
  127. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/vt.py +0 -0
  128. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/wa.py +0 -0
  129. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/scrapers/wi.py +0 -0
  130. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn/utils.py +0 -0
  131. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn_scraper.egg-info/SOURCES.txt +0 -0
  132. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn_scraper.egg-info/dependency_links.txt +0 -0
  133. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn_scraper.egg-info/entry_points.txt +0 -0
  134. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn_scraper.egg-info/not-zip-safe +0 -0
  135. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn_scraper.egg-info/requires.txt +0 -0
  136. {warn-scraper-1.2.72 → warn-scraper-1.2.73}/warn_scraper.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: warn-scraper
-Version: 1.2.72
+Version: 1.2.73
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
warn/scrapers/tn.py (added in 1.2.73)
@@ -0,0 +1,94 @@
+import csv
+import typing
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+
+from .. import utils
+from ..cache import Cache
+
+__authors__ = ["anikasikka", "stucka"]
+__tags__ = ["html"]
+__source__ = {
+    "name": "Tennessee Department of Labor and Workforce Development",
+    "url": "https://www.tn.gov/workforce/general-resources/major-publications0/major-publications-redirect/reports.html",
+}
+
+
+def scrape(
+    data_dir: Path = utils.WARN_DATA_DIR,
+    cache_dir: Path = utils.WARN_CACHE_DIR,
+) -> Path:
+    """
+    Scrape data from Tennessee.
+
+    Keyword arguments:
+    data_dir -- the Path were the result will be saved (default WARN_DATA_DIR)
+    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
+
+    Returns: the Path where the file is written
+    """
+    # Initialize the cache
+    cache = Cache(cache_dir)
+
+    # Grab the HTML page with the latest years of data
+    page = utils.get_url(
+        "https://www.tn.gov/workforce/general-resources/major-publications0/major-publications-redirect/reports.html"
+    )
+    html = page.text
+    cache.write("tn/source.html", html)
+    soup = BeautifulSoup(html, "html5lib")
+    tables = soup.find_all(attrs={"class": "tn-datatable"})
+    rows = BeautifulSoup(str(tables), "html5lib").find_all("tr")
+
+    dataheaders: typing.List = [
+        "Notice Date",
+        "Effective Date",
+        "Received Date",
+        "Company",
+        "City",
+        "County",
+        "No. Of Employees",
+        "Layoff/Closure",
+        "Notice ID",
+        # "Notice URL",
+    ]
+
+    staginglist: typing.List = []
+    for row in reversed(rows):
+        cells = row.find_all("td")
+        if len(cells) == 6:  # Filter for potentially valid rows
+            line: typing.Dict = {}
+            for item in dataheaders:  # Build an ordered dictionary with null values
+                line[item] = None
+            line["Notice Date"] = cells[0].text.strip()
+            line["Effective Date"] = cells[4].text.strip()
+            line["Company"] = cells[1].text.strip()
+            line["County"] = cells[2].text.strip()
+            line["No. Of Employees"] = cells[3].text.strip()
+            line["Notice ID"] = cells[5].text.strip()
+            # line['Notice URL'] = cells[1].find("a")['href']
+            staginglist.append(line)
+
+    # Bring in historical data
+    historical_file = cache_dir / "tn/tn_historical.csv"
+    historical_url = (
+        "https://storage.googleapis.com/bln-data-public/warn-layoffs/tn_historical.csv"
+    )
+    utils.fetch_if_not_cached(historical_file, historical_url)
+    historical_str = cache.read("tn/tn_historical.csv")
+
+    historicallist = list(csv.DictReader(historical_str.splitlines()))
+
+    # Combine fresh and historical
+    staginglist.extend(historicallist)
+
+    output_csv = data_dir / "tn.csv"
+
+    utils.write_dict_rows_to_csv(output_csv, dataheaders, staginglist)
+
+    return output_csv
+
+
+if __name__ == "__main__":
+    scrape()
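
The rewritten scraper keeps the same public interface: scrape() fetches the state's HTML data table, merges it with a pre-built historical CSV, and writes the combined rows to tn.csv. A minimal usage sketch, assuming the package is installed; the two local directory paths are hypothetical examples, and both arguments default to utils.WARN_DATA_DIR and utils.WARN_CACHE_DIR when omitted:

    from pathlib import Path

    from warn.scrapers import tn

    # Run the Tennessee scraper against local directories (hypothetical paths).
    # tn.csv is written under data_dir; the source HTML and the historical
    # CSV are cached under cache_dir.
    output = tn.scrape(
        data_dir=Path("./warn-data"),
        cache_dir=Path("./warn-cache"),
    )
    print(output)  # e.g. warn-data/tn.csv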
warn_scraper.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: warn-scraper
-Version: 1.2.72
+Version: 1.2.73
 Summary: Command-line interface for downloading WARN Act notices of qualified plant closings and mass layoffs from state government websites
 Home-page: https://github.com/biglocalnews/warn-scraper
 Author: Big Local News
warn/scrapers/tn.py (removed from 1.2.72)
@@ -1,156 +0,0 @@
-import typing
-from pathlib import Path
-
-import pdfplumber
-from bs4 import BeautifulSoup
-
-from .. import utils
-from ..cache import Cache
-
-__authors__ = ["anikasikka"]
-__tags__ = ["html", "pdf"]
-__source__ = {
-    "name": "Tennessee Department of Labor and Workforce Development",
-    "url": "https://www.tn.gov/workforce/general-resources/major-publications0/major-publications-redirect/reports.html",
-}
-
-
-def scrape(
-    data_dir: Path = utils.WARN_DATA_DIR,
-    cache_dir: Path = utils.WARN_CACHE_DIR,
-) -> Path:
-    """
-    Scrape data from Tennessee.
-
-    Keyword arguments:
-    data_dir -- the Path were the result will be saved (default WARN_DATA_DIR)
-    cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
-
-    Returns: the Path where the file is written
-    """
-    # Initialize the cache
-    cache = Cache(cache_dir)
-
-    # Grab the HTML page with the latest years of data
-    page = utils.get_url(
-        "https://www.tn.gov/workforce/general-resources/major-publications0/major-publications-redirect/reports.html"
-    )
-    html = page.text
-    cache.write("tn/source.html", html)
-
-    # Grab the PDF with the archived historial data
-    pdf_url = "https://www.tn.gov/content/dam/tn/workforce/documents/majorpublications/reports/WarnReportByMonth.pdf"
-    pdf_file = cache.download("tn/pdffile.pdf", pdf_url)
-
-    # Set the headers we'll use for both sources
-    tn_headers = [
-        "Notice Date",
-        "Effective Date",
-        "Received Date",
-        "Company",
-        "City",
-        "County",
-        "No. Of Employees",
-        "Layoff/Closure",
-        "Notice ID",
-    ]
-    cleaned_data: typing.List[typing.Any] = [tn_headers]
-
-    # Parse the latest HTML file and convert to a list of rows, with a header in the first row.
-    soup = BeautifulSoup(html, "html5lib")
-
-    # Grab all the list items on the page
-    data_list = soup.find_all("p")
-
-    # Loop through them all, skipping the first item, which is a header
-    for data in data_list[1:]:
-        # splitting the data on its delimiter
-        items = str(data).split("|")
-
-        # making sure that the last item in the list is the data value of interest
-        # splitting based on last character of each text-html data sequence
-        raw_data = []
-        for item in items:
-            value_html = item.split(":")[-1]
-            value_soup = BeautifulSoup(value_html, "html5lib")
-            string_list = list(value_soup.stripped_strings)
-            if len(string_list) > 0:
-                value = string_list[-1]
-            else:
-                continue
-            raw_data.append(value)
-
-        # If there aren't six entries it's junk
-        if len(raw_data) != 6:
-            continue
-
-        # Pluck out the values we want
-        nice_data = [
-            raw_data[0],  # Notice Date
-            raw_data[4],  # Effective Date
-            "",  # Received Date
-            raw_data[1],  # Company
-            "",  # City
-            raw_data[2],  # County
-            raw_data[3],  # Number of employees
-            "",  # Layoff/Closure
-            raw_data[5],  # Notice ID
-        ]
-
-        # Add them to the master list
-        cleaned_data.append(nice_data)
-
-    # The PDF header blacklist of rows to toss
-    pdf_header_blacklist = [
-        "Notice Date",
-        "Total",
-    ]
-
-    # Open the PDF
-    with pdfplumber.open(pdf_file) as pdf:
-        # Loop through all the pages
-        for i, my_page in enumerate(pdf.pages):
-            # Sll even pages have data, odd pages don't have the data
-            if i % 2 != 0:
-                continue
-
-            # Pull out the table and loop through the rows
-            table = my_page.extract_table()
-            if not table:
-                continue
-
-            # Cut empty rows
-            row_list = [r for r in table if any(r)]
-            if not row_list:
-                continue
-
-            # If this is a summary table, skip it
-            first_cell = row_list[0][0]
-            assert first_cell
-            if first_cell.lower().strip() == "summary by month":
-                continue
-
-            # Loop through all the rows ...
-            for row in row_list:
-                # Skip remove redundant headers
-                if row[0] in pdf_header_blacklist:
-                    continue
-
-                # Toss in an empty Notice ID since it isn't in the PDF
-                row.append("")
-
-                # Add the data to our output
-                cleaned_data.append(row)
-
-    # Set the path to the final CSV
-    output_csv = data_dir / "tn.csv"
-
-    # Write out the rows to the export directory
-    utils.write_rows_to_csv(output_csv, cleaned_data)
-
-    # Return the path to the final CSV
-    return output_csv
-
-
-if __name__ == "__main__":
-    scrape()
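
The net effect of this release: the pdfplumber parsing of the archived PDF shown above is gone, replaced by a one-time download of a pre-built historical CSV via utils.fetch_if_not_cached. That helper's implementation is not part of this diff, so the sketch below is only an assumption about how a fetch-if-missing helper of that name typically behaves, not the package's actual code:

    from pathlib import Path

    import requests

    # Assumed behavior of a fetch-if-not-cached helper: download the file
    # only when it is not already present on disk, so repeated scraper runs
    # reuse the cached copy.
    def fetch_if_not_cached(path: Path, url: str) -> Path:
        if not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            path.write_bytes(response.content)
        return path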