apoc-data 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
1
+ Metadata-Version: 2.1
2
+ Name: apoc-data
3
+ Version: 0.1.0
4
+ Summary: Data from the Alaska Public Offices Commission.
5
+ Author-Email: Nick Crews <nicholas.b.crews@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+
10
+ # APOC Data
11
+
12
+ Data from the [Alaska Public Offices Commission](https://aws.state.ak.us/ApocReports/Campaign/).
13
+
14
+ This scrapes the CSV files from the APOC website once a day and uploads them to
15
+ [this repo's releases](https://github.com/NickCrews/apoc-data/releases).
16
+
17
+ ## Download
18
+
19
+ You can download these CSVs using the direct URLs from the releases page
20
+ using curl, pandas, ibis, whatever!
21
+
22
+ ```bash
23
+ curl -L https://github.com/NickCrews/apoc-data/releases/download/20240716-025636/candidate_registration.csv > candidate_registration.csv
24
+ ```
25
+
26
+ Or use our tiny Python script, which makes this a little nicer, e.g. to get the latest
27
+ release, choose the download directory, etc. Read the script for more info.
28
+
29
+ ```bash
30
+ curl -s https://raw.githubusercontent.com/NickCrews/apoc-data/main/src/apoc_data/download.py | python - --release latest
31
+ ```
32
+
33
+ ## Dev Notes
34
+
35
+ ```shell
36
+ pdm install
37
+ playwright install chromium
38
+ ```
39
+
40
+ scrape:
41
+
42
+ ```shell
43
+ python -m apoc_data.scrape --directory downloads --no-headless
44
+ ```
@@ -0,0 +1,35 @@
1
+ # APOC Data
2
+
3
+ Data from the [Alaska Public Offices Commission](https://aws.state.ak.us/ApocReports/Campaign/).
4
+
5
+ This scrapes the CSV files from the APOC website once a day and uploads them to
6
+ [this repo's releases](https://github.com/NickCrews/apoc-data/releases).
7
+
8
+ ## Download
9
+
10
+ You can download these CSVs using the direct URLs from the releases page
11
+ using curl, pandas, ibis, whatever!
12
+
13
+ ```bash
14
+ curl -L https://github.com/NickCrews/apoc-data/releases/download/20240716-025636/candidate_registration.csv > candidate_registration.csv
15
+ ```
16
+
17
+ Or use our tiny Python script, which makes this a little nicer, e.g. to get the latest
18
+ release, choose the download directory, etc. Read the script for more info.
19
+
20
+ ```bash
21
+ curl -s https://raw.githubusercontent.com/NickCrews/apoc-data/main/src/apoc_data/download.py | python - --release latest
22
+ ```
23
+
24
+ ## Dev Notes
25
+
26
+ ```shell
27
+ pdm install
28
+ playwright install chromium
29
+ ```
30
+
31
+ scrape:
32
+
33
+ ```shell
34
+ python -m apoc_data.scrape --directory downloads --no-headless
35
+ ```
@@ -0,0 +1,28 @@
1
+ [project]
2
+ name = "apoc-data"
3
+ version = "0.1.0"
4
+ description = "Data from the Alaska Public Offices Commission."
5
+ authors = [
6
+ { name = "Nick Crews", email = "nicholas.b.crews@gmail.com" },
7
+ ]
8
+ dependencies = []
9
+ requires-python = ">=3.9"
10
+ readme = "README.md"
11
+
12
+ [project.license]
13
+ text = "MIT"
14
+
15
+ [tool.pdm]
16
+ distribution = true
17
+
18
+ [tool.pdm.dev-dependencies]
19
+ dev = [
20
+ "playwright",
21
+ "ruff",
22
+ ]
23
+
24
+ [build-system]
25
+ requires = [
26
+ "pdm-backend",
27
+ ]
28
+ build-backend = "pdm.backend"
@@ -0,0 +1,3 @@
1
+ """Data from the Alaska Public Offices Commission."""
2
+
3
+ from apoc_data import scrape as scrape
@@ -0,0 +1,97 @@
1
+ """Download CSV(s) of APOC data from https://github.com/NickCrews/apoc-data/releases
2
+
3
+ A no-install way to use this script is to download it from GitHub with curl and pipe it to python:
4
+
5
+ ```shell
6
+ curl -s https://raw.githubusercontent.com/NickCrews/apoc-data/main/src/apoc_data/download.py | python - --release latest
7
+ ```
8
+ """
9
+
10
+ import argparse
11
+ import json
12
+ from pathlib import Path
13
+ from urllib.request import urlopen
14
+
15
+
16
def download(
    *,
    release: str = "latest",
    # NOTE: these union annotations are written as strings on purpose. This
    # module has no `from __future__ import annotations`, so a bare
    # `str | None` would be evaluated at definition time and raise TypeError
    # on Python 3.9, which the package declares as supported.
    filename: "str | None" = None,
    destination: "str | Path" = "downloads/",
) -> None:
    """Download CSV(s) of APOC data from https://github.com/NickCrews/apoc-data/releases.

    Parameters
    ----------
    release : str, optional
        The name of the release to download.
        Default is "latest", which means the latest release.
    filename : str, optional
        The name of the file to download.
        Default is None, which downloads all files.
    destination : str or Path, optional
        Where to save the file(s).
        If this looks like a file (the final path segment contains a `.`),
        then we can only download a single file, and it will be saved to that location.
        Otherwise, the file(s) will be saved underneath there.

    Raises
    ------
    ValueError
        If `filename` is given but the release has no asset with that name,
        or if `filename` is None and `destination` looks like a single file.
    """
    destination = Path(destination)
    # The resolved tag name replaces e.g. "latest" so error messages are specific.
    release, assets = _get_release_info(release)

    if filename is None:
        if _is_file(destination):
            raise ValueError("Can't download all files to a single file")
        for name, url in assets.items():
            _download_asset(url, destination / name)
        return

    if filename not in assets:
        raise ValueError(f"Release {release} does not have a file named {filename}")
    if not _is_file(destination):
        # A directory-like destination: keep the asset's own file name.
        destination = destination / filename
    _download_asset(assets[filename], destination)
51
+
52
+
53
def _is_file(destination: Path) -> bool:
    """Return True if `destination` looks like a single file rather than a directory.

    The heuristic is that the final path segment contains a `.`, with two
    exceptions that are always treated as directories:

    - the parent-directory segment `..`
    - any path that already exists as a directory (e.g. a folder named `v1.0`)
    """
    if destination.name == ".." or destination.is_dir():
        return False
    return "." in destination.name
55
+
56
+
57
def _get_release_info(release: str) -> tuple[str, dict[str, str]]:
    """Look up a release of NickCrews/apoc-data via the GitHub REST API.

    Parameters
    ----------
    release : str
        Either "latest", a numeric release id, or a release tag name
        such as "20240716-025636".

    Returns
    -------
    tuple of (str, dict)
        The release's tag name, and a mapping of asset file name to its
        browser download URL.
    """
    from urllib.parse import quote

    base = "https://api.github.com/repos/NickCrews/apoc-data/releases"
    if release == "latest" or release.isdigit():
        # `/releases/latest` and `/releases/{release_id}` are addressed directly.
        url = f"{base}/{release}"
    else:
        # Tag names must go through `/releases/tags/{tag}`; the bare
        # `/releases/{...}` route only accepts a numeric release id.
        url = f"{base}/tags/{quote(release, safe='')}"
    with urlopen(url) as response:
        info = json.loads(response.read())
    assets = {asset["name"]: asset["browser_download_url"] for asset in info["assets"]}
    return info["tag_name"], assets
63
+
64
+
65
def _download_asset(url: str, destination: Path) -> None:
    """Download `url` and write its bytes to `destination`.

    Missing parent directories of `destination` are created first. The
    response is streamed to disk in chunks so a large CSV is never held
    in memory all at once.
    """
    import shutil

    destination.parent.mkdir(parents=True, exist_ok=True)
    with urlopen(url) as response, open(destination, "wb") as file:
        shutil.copyfileobj(response, file)
69
+
70
+
71
def cli():
    """Command-line entry point: parse the options and hand them to `download`."""
    parser = argparse.ArgumentParser(
        description="Download data from the Alaska Public Offices Commission"
    )
    # Every option is a plain string; only the default and help text differ.
    string_options = {
        "--release": {
            "default": "latest",
            "help": "The name of the release to download",
        },
        "--filename": {
            "help": "The name of the file to download",
        },
        "--destination": {
            "default": "downloads/",
            "help": "Where to save the file(s)",
        },
    }
    for flag, extra in string_options.items():
        parser.add_argument(flag, type=str, **extra)

    options = parser.parse_args()
    download(
        release=options.release,
        filename=options.filename,
        destination=options.destination,
    )


if __name__ == "__main__":
    cli()
@@ -0,0 +1,14 @@
1
+ from apoc_data.scrape._scraper import (
2
+ CandidateRegistrationScraper as CandidateRegistrationScraper,
3
+ )
4
+ from apoc_data.scrape._scraper import (
5
+ EntityRegistrationScraper as EntityRegistrationScraper,
6
+ )
7
+ from apoc_data.scrape._scraper import (
8
+ GroupRegistrationScraper as GroupRegistrationScraper,
9
+ )
10
+ from apoc_data.scrape._scraper import IncomeScraper as IncomeScraper
11
+ from apoc_data.scrape._scraper import LetterOfIntentScraper as LetterOfIntentScraper
12
+ from apoc_data.scrape._scraper import PScraper as PScraper
13
+ from apoc_data.scrape._scraper import run_scrapers as run_scrapers
14
+ from apoc_data.scrape._scraper import scrape_all as scrape_all
@@ -0,0 +1,47 @@
1
+ import argparse
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ from apoc_data.scrape import scrape_all
6
+ from apoc_data.scrape._scraper import DEFAULT_DIRECTORY
7
+
8
+
9
def main():
    """Command-line entry point for scraping all APOC data into a directory.

    Raises
    ------
    ValueError
        If the requested directory already exists as a regular file.
    """
    parser = argparse.ArgumentParser(
        description="Download data from the Alaska Public Offices Commission"
    )
    parser.add_argument(
        "--directory",
        type=str,
        default=DEFAULT_DIRECTORY,
        help="The directory to save the data to",
    )
    # One action serves both spellings; it inspects which flag was used.
    parser.add_argument(
        "--headless",
        "--no-headless",
        dest="headless",
        default=True,
        action=_BooleanAction,
        help="Run the browser in headless mode",
    )
    options = parser.parse_args()

    target = Path(options.directory).absolute()
    if target.is_file():
        raise ValueError("The directory can't be a file")

    logging.basicConfig(level=logging.INFO)
    scrape_all(target, headless=options.headless)
33
+
34
+
35
# from https://thisdataguy.com/2017/07/03/no-options-with-argparse-and-python/
class _BooleanAction(argparse.Action):
    """argparse action for a paired `--flag` / `--no-flag` boolean option.

    The option consumes no values. The stored result is False when the flag
    used on the command line starts with `--no`, and True otherwise.
    """

    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        # Whatever `nargs` the caller passes is overridden: the flag takes no value.
        super().__init__(option_strings, dest, nargs=0, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        enabled = not option_string.startswith("--no")
        setattr(namespace, self.dest, enabled)


if __name__ == "__main__":
    main()
@@ -0,0 +1,73 @@
1
+ from enum import Enum
2
+ from typing import NamedTuple
3
+
4
+
5
class YearEnum(str, Enum):
    """Every `Year` option offered by the filter dropdown at
    https://aws.state.ak.us/ApocReports/CampaignDisclosure/CDIncome.aspx

    Members can be looked up by their string value, or by an int
    (e.g. ``YearEnum(2020)``), which is converted to a string first.
    """

    any = "Any"
    _2008 = "2008"
    _2009 = "2009"
    _2010 = "2010"
    _2011 = "2011"
    _2012 = "2012"
    _2013 = "2013"
    _2014 = "2014"
    _2015 = "2015"
    _2016 = "2016"
    _2017 = "2017"
    _2018 = "2018"
    _2019 = "2019"
    _2020 = "2020"
    _2021 = "2021"
    _2022 = "2022"
    _2023 = "2023"
    _2024 = "2024"
    _2025 = "2025"
    _2026 = "2026"

    @classmethod
    def _missing_(cls, value):
        """Retry an int lookup with the int's string form."""
        # https://stackoverflow.com/a/55052069/5156887
        if not isinstance(value, int):
            return super()._missing_(value)
        return cls(str(value))

    def __repr__(self) -> str:
        # Show just the quoted value instead of the default <YearEnum...> form.
        return "'{}'".format(self.value)
40
+
41
+
42
class StatusEnum(str, Enum):
    """Every `Status` option offered by the filter dropdown at
    https://aws.state.ak.us/ApocReports/CampaignDisclosure/CDIncome.aspx"""

    complete_not_amended = "Complete, Not Amended"
    all_complete_forms = "All Complete Forms"
    non_amended_only = "Non-Amended Only"
    amended_only = "Amended Only"
    non_amendments_only = "Non-Amendments Only"
    amendments_only = "Amendments Only"

    def __repr__(self) -> str:
        # Show just the quoted value instead of the default <StatusEnum...> form.
        return "'{}'".format(self.value)
55
+
56
+
57
# Subclassing a functional NamedTuple allows a __new__ with coercion:
# https://github.com/python/typing/issues/526#issuecomment-665750135
class ScrapeFilters(
    NamedTuple(
        "_ScrapeFiltersBase",
        [("report_year", YearEnum), ("status", StatusEnum)],
    )
):
    """Which filters to run in the APOC UI.

    See https://aws.state.ak.us/ApocReports/CampaignDisclosure/CDIncome.aspx
    for an example.

    Parameters
    ----------
    report_year : YearEnum, int or str, optional
        The report year to filter by; coerced with ``YearEnum(...)``.
        Default is ``YearEnum.any``.
    status : StatusEnum or str, optional
        The form status to filter by; coerced with ``StatusEnum(...)``.
        Default is ``StatusEnum.complete_not_amended``.

    Raises
    ------
    ValueError
        If either argument is not a valid member or value of its enum.
    """

    def __new__(
        cls,
        # NOTE: string annotations on purpose. This module has no
        # `from __future__ import annotations`, and `YearEnum | int | str`
        # evaluated eagerly raises TypeError on Python 3.9.
        report_year: "YearEnum | int | str" = YearEnum.any,
        status: "StatusEnum | str" = StatusEnum.complete_not_amended,
    ):
        return super().__new__(cls, YearEnum(report_year), StatusEnum(status))
@@ -0,0 +1,237 @@
1
+ """Scrape the Alaska Public Offices Commission website for campaign finance data.
2
+
3
+ Uses Playwright to emulate me going to the page and clicking the buttons.
4
+ The site appears to be built with ASP.NET, so it's not easy to scrape
5
+ using requests and BeautifulSoup. All the state is stored in the session,
6
+ so you can't just make a GET request to the export URL.
7
+ You need to actually have a browser session that has gone through the
8
+ proper steps to get the data.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import logging
15
+ from contextlib import asynccontextmanager
16
+ from pathlib import Path
17
+ from typing import TYPE_CHECKING, AsyncIterable, ClassVar, Coroutine, Iterable, Protocol
18
+
19
+ from playwright.async_api import BrowserContext, async_playwright, expect
20
+
21
+ from ._filters import ScrapeFilters, YearEnum
22
+
23
+ if TYPE_CHECKING:
24
+ from playwright.async_api import BrowserContext, Download, Page
25
+
26
+ _logger = logging.getLogger(__name__)
27
+
28
+ DEFAULT_DIRECTORY = "scraped/"
29
+
30
+
31
@asynccontextmanager
async def make_browser_async(headless: bool = True) -> AsyncIterable[BrowserContext]:
    """Async context manager yielding a fresh Chromium browser context.

    Parameters
    ----------
    headless : bool, optional
        Passed to the Chromium launcher. Default is True.

    Yields
    ------
    BrowserContext
        A new context configured with Playwright's "Desktop Chrome"
        device settings.
    """
    async with async_playwright() as playwright:
        # Passing slow_mo=200 to launch() sometimes avoids race conditions?
        chromium = await playwright.chromium.launch(headless=headless)
        desktop_chrome = playwright.devices["Desktop Chrome"]
        context = await chromium.new_context(**desktop_chrome)
        yield context
42
+
43
+
44
async def _run_scrape_flow(page: Page, url: str, filters: ScrapeFilters) -> Download:
    """Drive one APOC report page through filter -> Search -> Export -> .CSV.

    Parameters
    ----------
    page : Page
        The browser page to drive.
    url : str
        The report page to load.
    filters : ScrapeFilters
        The Status and Report Year values to select before searching.

    Returns
    -------
    Download
        The download triggered by clicking the ".CSV" export link.
    """
    settle_ms = 100  # short manual pause used between UI steps

    # Always navigate afresh so no state from an earlier run lingers.
    await page.goto(url)

    # The dropdowns seem to need a moment after page load before they are usable.
    await page.wait_for_timeout(settle_ms)
    status_dropdown = "select:below(:text('Status:'))"
    year_dropdown = "select:below(:text('Report Year:'))"
    await page.select_option(status_dropdown, filters.status.value)
    await page.select_option(year_dropdown, filters.report_year.value)
    # A manual wait still appears to be needed here.
    await page.wait_for_timeout(settle_ms)

    await page.click("//input[@value='Search']")
    await page.wait_for_timeout(settle_ms)
    # Exporting too early yields no data, so wait until the idle prompt is
    # gone, i.e. either results or the "no results" message has appeared.
    idle_prompt = page.get_by_text("Press 'Search' to Load Results.")
    await expect(idle_prompt).to_be_hidden()

    await page.click("//input[@value='Export']")
    # The server has to actually start the download, which can take a long
    # time when it is busy, hence the generous timeout.
    async with page.expect_download(timeout=120_000) as download_info:
        # First ".CSV" link underneath the "Export All Pages:" label.
        await page.click("a:text('.CSV'):below(:text('Export All Pages:'))")

    await page.click("//input[@value='Close']")
    return await download_info.value
73
+
74
+
75
class PScraper(Protocol):
    """Structural type for a scraper accepted by `run_scrapers`.

    A scraper is an async callable that takes a browser context.
    `run_scrapers` awaits the result of each call, so `__call__` is declared
    `async` here to match.
    """

    async def __call__(self, browser_context: BrowserContext) -> None: ...
77
+
78
+
79
async def run_scrapers(
    scrapers: Iterable[PScraper],
    *,
    browser_context: BrowserContext
    | Coroutine[None, None, BrowserContext]
    | None = None,
) -> None:
    """Run each scraper, one after another, in a single browser context.

    Parameters
    ----------
    scrapers : iterable of PScraper
        The scrapers to run. Each one is called with the browser context
        and awaited before the next one starts.
    browser_context : BrowserContext, coroutine, or None, optional
        The context to run in. A coroutine is awaited to obtain the context.
        If None (the default), a temporary context is created with
        `make_browser_async` for the duration of the run.

    Raises
    ------
    TypeError
        If `browser_context` is none of the supported kinds. Previously such
        a value made the call silently do nothing.
    """
    if browser_context is None:
        async with make_browser_async() as ctx:
            await run_scrapers(scrapers, browser_context=ctx)
        return

    if isinstance(browser_context, Coroutine):
        # Resolve the coroutine, then dispatch again on what it produced.
        await run_scrapers(scrapers, browser_context=await browser_context)
        return

    if not isinstance(browser_context, BrowserContext):
        raise TypeError(
            "browser_context must be a BrowserContext, a coroutine resolving "
            f"to one, or None; got {type(browser_context).__name__}"
        )

    for scraper in scrapers:
        await scraper(browser_context)
95
+
96
+
97
class _ScraperBase:
    """Shared implementation for scrapers that export one CSV from an APOC page.

    Subclasses set `_HOME_URL` (the page to drive) and `name` (used in log
    messages). An instance is an async callable taking a browser context.
    """

    _HOME_URL: ClassVar[str]
    name: ClassVar[str]

    def __init__(
        self,
        *,
        filters: ScrapeFilters | None = None,
        destination: str | Path,
    ):
        """Store where to save the CSV and which filters to apply.

        When `filters` is omitted, a default `ScrapeFilters()` is used.
        """
        self.destination = Path(destination)
        self.filters = filters or ScrapeFilters()

    async def __call__(self, browser_context: BrowserContext) -> None:
        """Run the export flow and save the result to `self.destination`.

        An empty download is not saved.
        """
        # Reuse the context's first page when one is already open.
        open_pages = browser_context.pages
        page = open_pages[0] if open_pages else await browser_context.new_page()

        _logger.info(
            f"Downloading {self.name} to {self.destination} using {self.filters}"
        )
        download = await _run_scrape_flow(page, self._HOME_URL, self.filters)
        _logger.info("Download started")

        downloaded = await download.path()
        if downloaded.stat().st_size > 0:
            await download.save_as(self.destination)
            _logger.info(f"Downloaded {self.destination}")
            return

        # With no results we get an empty file rather than a CSV holding just
        # a header row. Downstream, importing data with *.csv chokes on such
        # a file, so it is left out. The download could also be aborted
        # further upstream if we wanted.
        _logger.info(f"No results. Not writing to {self.destination}")

    def run(
        self,
        browser_context: BrowserContext
        | Coroutine[None, None, BrowserContext]
        | None = None,
    ) -> None:
        """Synchronously run this scraper in the given browser context."""
        asyncio.run(run_scrapers([self], browser_context=browser_context))
140
+
141
+
142
class CandidateRegistrationScraper(_ScraperBase):
    """Scraper for the Candidate Registration forms page (CRForms.aspx)."""

    _HOME_URL = (
        "https://aws.state.ak.us/ApocReports/Registration"
        "/CandidateRegistration/CRForms.aspx"
    )
    name = "candidate_registration"


class LetterOfIntentScraper(_ScraperBase):
    """Scraper for the Letter of Intent forms page (LOIForms.aspx)."""

    _HOME_URL = (
        "https://aws.state.ak.us/ApocReports/Registration"
        "/LetterOfIntent/LOIForms.aspx"
    )
    name = "letter_of_intent"


class GroupRegistrationScraper(_ScraperBase):
    """Scraper for the Group Registration forms page (GRForms.aspx)."""

    _HOME_URL = (
        "https://aws.state.ak.us/ApocReports/Registration"
        "/GroupRegistration/GRForms.aspx"
    )
    name = "group_registration"


class EntityRegistrationScraper(_ScraperBase):
    """Scraper for the Entity Registration forms page (ERForms.aspx)."""

    _HOME_URL = (
        "https://aws.state.ak.us/ApocReports/Registration"
        "/EntityRegistration/ERForms.aspx"
    )
    name = "entity_registration"


class DebtScraper(_ScraperBase):
    """Scraper for the Campaign Disclosure debt page (CDDebt.aspx)."""

    _HOME_URL = (
        "https://aws.state.ak.us/ApocReports/CampaignDisclosure"
        "/CDDebt.aspx"
    )
    name = "debt"


class ExpenditureScraper(_ScraperBase):
    """Scraper for the Campaign Disclosure expenditures page (CDExpenditures.aspx)."""

    _HOME_URL = (
        "https://aws.state.ak.us/ApocReports/CampaignDisclosure"
        "/CDExpenditures.aspx"
    )
    name = "expenditures"
174
+
175
+
176
class IncomeScraper(_ScraperBase):
    """Scraper for the Campaign Disclosure income page (CDIncome.aspx).

    Unlike the other scrapers, `filters` is required and must name a
    specific report year.
    """

    _HOME_URL = "https://aws.state.ak.us/ApocReports/CampaignDisclosure/CDIncome.aspx"
    name = "income"

    def __init__(self, *, filters: ScrapeFilters, destination: str | Path):
        """Store the filters and destination.

        Raises
        ------
        ValueError
            If `filters.report_year` is `YearEnum.any`.
        """
        super().__init__(filters=filters, destination=destination)
        year_is_unrestricted = self.filters.report_year == YearEnum.any
        if year_is_unrestricted:
            raise ValueError("For Receipts, can't use report_year=Any")
184
+
185
+
186
def scrape_all(
    directory: str | Path = DEFAULT_DIRECTORY,
    *,
    headless: bool = True,
) -> None:
    """Scrape .CSVs from https://aws.state.ak.us/ApocReports/Campaign/

    This will download the following files:
    - candidate_registration.csv
    - letter_of_intent.csv
    - group_registration.csv
    - entity_registration.csv
    - debt.csv
    - expenditures.csv
    - income_{year}.csv for each year where there is data

    A file is only written when its export is non-empty.

    Parameters
    ----------
    directory : str or Path
        The directory to save the files to.
    headless : bool, optional
        Whether to run the browser in headless mode. Default is True.
    """
    directory = Path(directory)
    # These scrapers all run with the default filters.
    classes: list[type[_ScraperBase]] = [
        CandidateRegistrationScraper,
        LetterOfIntentScraper,
        GroupRegistrationScraper,
        EntityRegistrationScraper,
        DebtScraper,
        ExpenditureScraper,
    ]
    scrapers = [cls(destination=directory / f"{cls.name}.csv") for cls in classes] + [
        # Income can't be scraped with report_year=Any, so fetch it one
        # year at a time.
        IncomeScraper(
            filters=ScrapeFilters(report_year=year),
            destination=directory / f"{IncomeScraper.name}_{year.value}.csv",
        )
        for year in YearEnum
        if year != YearEnum.any
    ]

    async def run():
        # A single browser context is shared by every scraper.
        async with make_browser_async(headless=headless) as browser_context:
            await run_scrapers(scrapers, browser_context=browser_context)

    asyncio.run(run())


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    scrape_all()