PyPI - apoc-data - Versions diffs - 0.1.0__tar.gz - Mend

apoc-data 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

apoc_data-0.1.0/PKG-INFO +44 -0
apoc_data-0.1.0/README.md +35 -0
apoc_data-0.1.0/pyproject.toml +28 -0
apoc_data-0.1.0/src/apoc_data/__init__.py +3 -0
apoc_data-0.1.0/src/apoc_data/download.py +97 -0
apoc_data-0.1.0/src/apoc_data/scrape/__init__.py +14 -0
apoc_data-0.1.0/src/apoc_data/scrape/__main__.py +47 -0
apoc_data-0.1.0/src/apoc_data/scrape/_filters.py +73 -0
apoc_data-0.1.0/src/apoc_data/scrape/_scraper.py +237 -0

apoc_data-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,44 @@
+Metadata-Version: 2.1
+Name: apoc-data
+Version: 0.1.0
+Summary: Data from the Alaska Public Offices Commission.
+Author-Email: Nick Crews <nicholas.b.crews@gmail.com>
+License: MIT
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+# APOC Data
+Data from the [Alaska Public Offices Commission](https://aws.state.ak.us/ApocReports/Campaign/).
+This scrapes the CSV files from the APOC website once a day and uploads them to
+[this repo's releases](https://github.com/NickCrews/apoc-data/releases).
+## Download
+You can download these CSVs using the direct URLs from the releases page
+using curl, pandas, ibis, whatever!
+```bash
+curl -L https://github.com/NickCrews/apoc-data/releases/download/20240716-025636/candidate_registration.csv > candidate_registration.csv
+```
+Alternatively, there is a tiny Python script that makes this a little nicer: e.g. it can
+get the latest release, choose the download directory, etc. Read the script for more info.
+```bash
+curl -s https://raw.githubusercontent.com/NickCrews/apoc-data/main/src/apoc_data/download.py | python - --release latest
+```
+## Dev Notes
+```shell
+pdm install
+playwright install chromium
+```
+scrape:
+```shell
+python -m apoc_data.scrape --directory downloads --no-headless
+```

apoc_data-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,35 @@
+# APOC Data
+Data from the [Alaska Public Offices Commission](https://aws.state.ak.us/ApocReports/Campaign/).
+This scrapes the CSV files from the APOC website once a day and uploads them to
+[this repo's releases](https://github.com/NickCrews/apoc-data/releases).
+## Download
+You can download these CSVs using the direct URLs from the releases page
+using curl, pandas, ibis, whatever!
+```bash
+curl -L https://github.com/NickCrews/apoc-data/releases/download/20240716-025636/candidate_registration.csv > candidate_registration.csv
+```
+Alternatively, there is a tiny Python script that makes this a little nicer: e.g. it can
+get the latest release, choose the download directory, etc. Read the script for more info.
+```bash
+curl -s https://raw.githubusercontent.com/NickCrews/apoc-data/main/src/apoc_data/download.py | python - --release latest
+```
+## Dev Notes
+```shell
+pdm install
+playwright install chromium
+```
+scrape:
+```shell
+python -m apoc_data.scrape --directory downloads --no-headless
+```

apoc_data-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,28 @@
+[project]
+name = "apoc-data"
+version = "0.1.0"
+description = "Data from the Alaska Public Offices Commission."
+authors = [
+    { name = "Nick Crews", email = "nicholas.b.crews@gmail.com" },
+]
+dependencies = []
+requires-python = ">=3.9"
+readme = "README.md"
+[project.license]
+text = "MIT"
+[tool.pdm]
+distribution = true
+[tool.pdm.dev-dependencies]
+dev = [
+    "playwright",
+    "ruff",
+]
+[build-system]
+requires = [
+    "pdm-backend",
+]
+build-backend = "pdm.backend"

apoc_data-0.1.0/src/apoc_data/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""Data from the Alaska Public Offices Commission."""
+from apoc_data import scrape as scrape

apoc_data-0.1.0/src/apoc_data/download.py ADDED Viewed

@@ -0,0 +1,97 @@
+"""Download CSV(s) of APOC data from https://github.com/NickCrews/apoc-data/releases
+A no-install way to use this script is to fetch it from GitHub with curl and pipe it to python:
+```shell
+curl -s https://raw.githubusercontent.com/NickCrews/apoc-data/main/src/apoc_data/download.py | python - --release latest
+```
+"""
+import argparse
+import json
+from pathlib import Path
+from urllib.request import urlopen
+def download(
+    *,
+    release: str = "latest",
+    filename: str | None = None,
+    destination: str | Path = "downloads/",
+) -> None:
+    """Download CSV(s) of APOC data from https://github.com/NickCrews/apoc-data/releases.
+    Parameters
+    ----------
+    release : str, optional
+        The name of the release to download.
+        Default is None, which means latest release
+    filename : str, optional
+        The name of the file to download.
+        Default is None, which downloads all files.
+    destination : str or Path, optional
+        Where to save the file(s).
+        If this looks like a file (the final path segment contains a `.`),
+        then we can only download a single file, and it will be saved to that location.
+        Otherwise, the file(s) will be saved underneath there.
+    """
+    destination = Path(destination)
+    release, assets = _get_release_info(release)
+    if filename is not None:
+        if filename not in assets:
+            raise ValueError(f"Release {release} does not have a file named {filename}")
+        if not _is_file(destination):
+            destination = destination / filename
+        _download_asset(assets[filename], destination)
+    else:
+        if _is_file(destination):
+            raise ValueError("Can't download all files to a single file")
+        for name, url in assets.items():
+            _download_asset(url, destination / name)
+def _is_file(destination: Path) -> bool:
+    return "." in destination.name
+def _get_release_info(release: str) -> tuple[str, dict[str, str]]:
+    url = f"https://api.github.com/repos/NickCrews/apoc-data/releases/{release}"
+    with urlopen(url) as response:
+        info = json.loads(response.read())
+    assets = {asset["name"]: asset["browser_download_url"] for asset in info["assets"]}
+    return info["tag_name"], assets
+def _download_asset(url: str, destination: Path) -> None:
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    with urlopen(url) as response, open(destination, "wb") as file:
+        file.write(response.read())
+def cli():
+    parser = argparse.ArgumentParser(
+        description="Download data from the Alaska Public Offices Commission"
+    )
+    parser.add_argument(
+        "--release",
+        type=str,
+        default="latest",
+        help="The name of the release to download",
+    )
+    parser.add_argument(
+        "--filename",
+        type=str,
+        help="The name of the file to download",
+    )
+    parser.add_argument(
+        "--destination",
+        type=str,
+        default="downloads/",
+        help="Where to save the file(s)",
+    )
+    args = parser.parse_args()
+    download(release=args.release, filename=args.filename, destination=args.destination)
+# Run the command-line interface when this file is executed as a script.
+if __name__ == "__main__":
+    cli()

apoc_data-0.1.0/src/apoc_data/scrape/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from apoc_data.scrape._scraper import (
+    CandidateRegistrationScraper as CandidateRegistrationScraper,
+)
+from apoc_data.scrape._scraper import (
+    EntityRegistrationScraper as EntityRegistrationScraper,
+)
+from apoc_data.scrape._scraper import (
+    GroupRegistrationScraper as GroupRegistrationScraper,
+)
+from apoc_data.scrape._scraper import IncomeScraper as IncomeScraper
+from apoc_data.scrape._scraper import LetterOfIntentScraper as LetterOfIntentScraper
+from apoc_data.scrape._scraper import PScraper as PScraper
+from apoc_data.scrape._scraper import run_scrapers as run_scrapers
+from apoc_data.scrape._scraper import scrape_all as scrape_all

apoc_data-0.1.0/src/apoc_data/scrape/__main__.py ADDED Viewed

@@ -0,0 +1,47 @@
+import argparse
+import logging
+from pathlib import Path
+from apoc_data.scrape import scrape_all
+from apoc_data.scrape._scraper import DEFAULT_DIRECTORY
+def main():
+    parser = argparse.ArgumentParser(
+        description="Download data from the Alaska Public Offices Commission"
+    )
+    parser.add_argument(
+        "--directory",
+        type=str,
+        default=DEFAULT_DIRECTORY,
+        help="The directory to save the data to",
+    )
+    parser.add_argument(
+        "--headless",
+        "--no-headless",
+        dest="headless",
+        default=True,
+        action=_BooleanAction,
+        help="Run the browser in headless mode",
+    )
+    args = parser.parse_args()
+    directory = Path(args.directory).absolute()
+    if directory.is_file():
+        raise ValueError("The directory can't be a file")
+    logging.basicConfig(level=logging.INFO)
+    scrape_all(directory, headless=args.headless)
+# from https://thisdataguy.com/2017/07/03/no-options-with-argparse-and-python/
+class _BooleanAction(argparse.Action):
+    def __init__(self, option_strings, dest, nargs=None, **kwargs):
+        super(_BooleanAction, self).__init__(option_strings, dest, nargs=0, **kwargs)
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(
+            namespace, self.dest, False if option_string.startswith("--no") else True
+        )
+# Entry point for `python -m apoc_data.scrape`.
+if __name__ == "__main__":
+    main()

apoc_data-0.1.0/src/apoc_data/scrape/_filters.py ADDED Viewed

@@ -0,0 +1,73 @@
+from enum import Enum
+from typing import NamedTuple
+class YearEnum(str, Enum):
+    """The possible `Year` values you can filter by at
+    https://aws.state.ak.us/ApocReports/CampaignDisclosure/CDIncome.aspx"""
+    any = "Any"
+    _2008 = "2008"
+    _2009 = "2009"
+    _2010 = "2010"
+    _2011 = "2011"
+    _2012 = "2012"
+    _2013 = "2013"
+    _2014 = "2014"
+    _2015 = "2015"
+    _2016 = "2016"
+    _2017 = "2017"
+    _2018 = "2018"
+    _2019 = "2019"
+    _2020 = "2020"
+    _2021 = "2021"
+    _2022 = "2022"
+    _2023 = "2023"
+    _2024 = "2024"
+    _2025 = "2025"
+    _2026 = "2026"
+    @classmethod
+    def _missing_(cls, value):
+        """Convert to str, in case you pass in an int."""
+        # https://stackoverflow.com/a/55052069/5156887
+        if isinstance(value, int):
+            return cls(str(value))
+        return super()._missing_(value)
+    def __repr__(self) -> str:
+        return f"'{self.value}'"
+class StatusEnum(str, Enum):
+    """The possible `Status` values you can filter by at
+    https://aws.state.ak.us/ApocReports/CampaignDisclosure/CDIncome.aspx"""
+    complete_not_amended = "Complete, Not Amended"
+    all_complete_forms = "All Complete Forms"
+    non_amended_only = "Non-Amended Only"
+    amended_only = "Amended Only"
+    non_amendments_only = "Non-Amendments Only"
+    amendments_only = "Amendments Only"
+    def __repr__(self) -> str:
+        return f"'{self.value}'"
+# allow a __new__ with coercion:
+# https://github.com/python/typing/issues/526#issuecomment-665750135
+class ScrapeFilters(
+    NamedTuple("Foo", [("report_year", YearEnum), ("status", StatusEnum)])
+):
+    """Which filters to run in the APOC UI.
+    See https://aws.state.ak.us/ApocReports/CampaignDisclosure/CDIncome.aspx
+    for an example.
+    """
+    def __new__(
+        cls,
+        report_year: YearEnum | int | str = YearEnum.any,
+        status: StatusEnum | str = StatusEnum.complete_not_amended,
+    ):
+        return super().__new__(cls, YearEnum(report_year), StatusEnum(status))

apoc_data-0.1.0/src/apoc_data/scrape/_scraper.py ADDED Viewed

@@ -0,0 +1,237 @@
+"""Scrape the Alaska Public Offices Commission website for campaign finance data.
+Uses Playwright to emulate me going to the page and clicking the buttons.
+The site appears to be built with ASP.NET, so it's not easy to scrape
+using requests and BeautifulSoup. All the state is stored in the session,
+so you can't just make a GET request to the export URL.
+You need to actually have a browser session that has gone through the
+proper steps to get the data.
+"""
+from __future__ import annotations
+import asyncio
+import logging
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING, AsyncIterable, ClassVar, Coroutine, Iterable, Protocol
+from playwright.async_api import BrowserContext, async_playwright, expect
+from ._filters import ScrapeFilters, YearEnum
+if TYPE_CHECKING:
+    from playwright.async_api import BrowserContext, Download, Page
+_logger = logging.getLogger(__name__)
+DEFAULT_DIRECTORY = "scraped/"
+@asynccontextmanager
+async def make_browser_async(headless: bool = True) -> AsyncIterable[BrowserContext]:
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(
+            headless=headless,
+            # This sometimes avoids race conditions?
+            # slow_mo=200,
+        )
+        yield await browser.new_context(
+            **p.devices["Desktop Chrome"],
+        )
+async def _run_scrape_flow(page: Page, url: str, filters: ScrapeFilters) -> Download:
+    """Drive the search-and-export UI at `url` and return the resulting download.
+    Steps: load the page, select the status and report-year filters, click
+    'Search', wait for the "Press 'Search' to Load Results." placeholder to
+    be hidden, click 'Export', click the '.CSV' link below 'Export All Pages:'
+    while waiting for a download to start, and finally click 'Close'.
+    Parameters
+    ----------
+    page : Page
+        The browser page to drive.
+    url : str
+        The address of the search page to load.
+    filters : ScrapeFilters
+        The status and report-year options to select before searching.
+    Returns
+    -------
+    Download
+        The download triggered by the '.CSV' export link.
+    """
+    # unconditionally reload the page to clear out any old state
+    await page.goto(url)
+    # after page load it takes a bit for the dropdowns to be ready?
+    await page.wait_for_timeout(100)
+    await page.select_option("select:below(:text('Status:'))", filters.status.value)
+    await page.select_option(
+        "select:below(:text('Report Year:'))", filters.report_year.value
+    )
+    # it still appears a manual wait is needed??
+    await page.wait_for_timeout(100)
+    await page.click("//input[@value='Search']")
+    await page.wait_for_timeout(100)
+    # Wait for either 1. results to come in or 2. the "no results" message to show.
+    # Otherwise if we export too early we won't get any data.
+    await expect(page.get_by_text("Press 'Search' to Load Results.")).to_be_hidden()
+    await page.click("//input[@value='Export']")
+    # This has to wait for the server to actually begin the download.
+    # When it is really busy, this can take a long time.
+    # So we make this timeout quite large.
+    async with page.expect_download(timeout=120_000) as download_info:
+        # The first link with text ".CSV" below the text "Export All Pages:"
+        await page.click("a:text('.CSV'):below(:text('Export All Pages:'))")
+    await page.click("//input[@value='Close']")
+    return await download_info.value
+class PScraper(Protocol):
+    def __call__(self, browser_context: BrowserContext) -> None: ...
+async def run_scrapers(
+    scrapers: Iterable[PScraper],
+    *,
+    browser_context: BrowserContext
+    | Coroutine[None, None, BrowserContext]
+    | None = None,
+) -> None:
+    if isinstance(browser_context, BrowserContext):
+        for s in scrapers:
+            await s(browser_context)
+    elif isinstance(browser_context, Coroutine):
+        browser_context = await browser_context
+        await run_scrapers(scrapers, browser_context=browser_context)
+    elif browser_context is None:
+        async with make_browser_async() as ctx:
+            await run_scrapers(scrapers, browser_context=ctx)
+class _ScraperBase:
+    _HOME_URL: ClassVar[str]
+    name: ClassVar[str]
+    def __init__(
+        self,
+        *,
+        filters: ScrapeFilters | None = None,
+        destination: str | Path,
+    ):
+        self.destination = Path(destination)
+        self.filters = filters or ScrapeFilters()
+    async def __call__(self, browser_context: BrowserContext) -> None:
+        page = (
+            browser_context.pages[0]
+            if browser_context.pages
+            else await browser_context.new_page()
+        )
+        _logger.info(
+            f"Downloading {self.name} to {self.destination} using {self.filters}"
+        )
+        download = await _run_scrape_flow(page, self._HOME_URL, self.filters)
+        _logger.info("Download started")
+        path = await download.path()
+        if path.stat().st_size == 0:
+            # We end up with an empty file, instead of a CSV with a header row and
+            # no data rows. In downstream processing this makes importing data
+            # with *.csv barf. So we simply don't include it.
+            # We could abort the download further upstream if we wanted.
+            _logger.info(f"No results. Not writing to {self.destination}")
+        else:
+            await download.save_as(self.destination)
+            _logger.info(f"Downloaded {self.destination}")
+    def run(
+        self,
+        browser_context: BrowserContext
+        | Coroutine[None, None, BrowserContext]
+        | None = None,
+    ) -> None:
+        """Run the download in the given browser"""
+        asyncio.run(run_scrapers([self], browser_context=browser_context))
+class CandidateRegistrationScraper(_ScraperBase):
+    _HOME_URL = "https://aws.state.ak.us/ApocReports/Registration/CandidateRegistration/CRForms.aspx"
+    name = "candidate_registration"
+class LetterOfIntentScraper(_ScraperBase):
+    _HOME_URL = (
+        "https://aws.state.ak.us/ApocReports/Registration/LetterOfIntent/LOIForms.aspx"
+    )
+    name = "letter_of_intent"
+class GroupRegistrationScraper(_ScraperBase):
+    _HOME_URL = "https://aws.state.ak.us/ApocReports/Registration/GroupRegistration/GRForms.aspx"
+    name = "group_registration"
+class EntityRegistrationScraper(_ScraperBase):
+    _HOME_URL = "https://aws.state.ak.us/ApocReports/Registration/EntityRegistration/ERForms.aspx"
+    name = "entity_registration"
+class DebtScraper(_ScraperBase):
+    _HOME_URL = "https://aws.state.ak.us/ApocReports/CampaignDisclosure/CDDebt.aspx"
+    name = "debt"
+class ExpenditureScraper(_ScraperBase):
+    _HOME_URL = (
+        "https://aws.state.ak.us/ApocReports/CampaignDisclosure/CDExpenditures.aspx"
+    )
+    name = "expenditures"
+class IncomeScraper(_ScraperBase):
+    _HOME_URL = "https://aws.state.ak.us/ApocReports/CampaignDisclosure/CDIncome.aspx"
+    name = "income"
+    def __init__(self, *, filters: ScrapeFilters, destination: str | Path):
+        super().__init__(filters=filters, destination=destination)
+        if self.filters.report_year == YearEnum.any:
+            raise ValueError("For Receipts, can't use report_year=Any")
+def scrape_all(
+    directory: str | Path = DEFAULT_DIRECTORY,
+    *,
+    headless: bool = True,
+) -> None:
+    """Scrape .CSVs from https://aws.state.ak.us/ApocReports/Campaign/
+    This will download the following files:
+    - candidate_registration.csv
+    - letter_of_intent.csv
+    - group_registration.csv
+    - entity_registration.csv
+    - expenditure.csv
+    - debt.csv
+    - income_{year}.csv for each year where there is data
+    Parameters
+    ----------
+    directory : str or Path
+        The directory to save the files to.
+    browser_context : BrowserContext, optional
+        A browser context to use for downloading.
+        If not provided, a temporary one will be created.
+    """
+    directory = Path(directory)
+    classes: list[_ScraperBase] = [
+        CandidateRegistrationScraper,
+        LetterOfIntentScraper,
+        GroupRegistrationScraper,
+        EntityRegistrationScraper,
+        DebtScraper,
+        ExpenditureScraper,
+    ]
+    scrapers = [cls(destination=directory / f"{cls.name}.csv") for cls in classes] + [
+        IncomeScraper(
+            filters=ScrapeFilters(report_year=year),
+            destination=directory / f"{IncomeScraper.name}_{year.value}.csv",
+        )
+        for year in YearEnum
+        if year != YearEnum.any
+    ]
+    async def run():
+        async with make_browser_async(headless=headless) as browser_context:
+            await run_scrapers(scrapers, browser_context=browser_context)
+    asyncio.run(run())
+# When executed directly: enable INFO logging and scrape with the defaults.
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    scrape_all()