apoc-data 0.1.2__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- apoc_data-0.2.0/PKG-INFO +93 -0
- apoc_data-0.2.0/README.md +81 -0
- apoc_data-0.2.0/pyproject.toml +29 -0
- apoc_data-0.2.0/src/apoc_data/__main__.py +4 -0
- apoc_data-0.2.0/src/apoc_data/cli.py +261 -0
- apoc_data-0.2.0/src/apoc_data/releases/__init__.py +17 -0
- apoc_data-0.2.0/src/apoc_data/releases/_gh.py +213 -0
- {apoc_data-0.1.2 → apoc_data-0.2.0}/src/apoc_data/scrape/_filters.py +2 -0
- {apoc_data-0.1.2 → apoc_data-0.2.0}/src/apoc_data/scrape/_scraper.py +28 -4
- apoc_data-0.1.2/PKG-INFO +0 -62
- apoc_data-0.1.2/README.md +0 -53
- apoc_data-0.1.2/pyproject.toml +0 -28
- apoc_data-0.1.2/src/apoc_data/download.py +0 -111
- apoc_data-0.1.2/src/apoc_data/scrape/__main__.py +0 -47
- {apoc_data-0.1.2 → apoc_data-0.2.0}/src/apoc_data/__init__.py +0 -0
- {apoc_data-0.1.2 → apoc_data-0.2.0}/src/apoc_data/scrape/__init__.py +0 -0
apoc_data-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: apoc-data
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Data from the Alaska Public Offices Commission.
|
|
5
|
+
Author: Nick Crews
|
|
6
|
+
Author-email: Nick Crews <nicholas.b.crews@gmail.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Requires-Dist: playwright ; extra == 'scrape'
|
|
9
|
+
Requires-Python: >=3.9
|
|
10
|
+
Provides-Extra: scrape
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# APOC Data
|
|
14
|
+
|
|
15
|
+
Alaska campaign financial disclosure data from the [Alaska Public Offices Commission](https://aws.state.ak.us/ApocReports/Campaign/).
|
|
16
|
+
|
|
17
|
+
This scrapes the CSV files from the APOC website once a day and uploads them to
|
|
18
|
+
[this repo's releases](https://github.com/NickCrews/apoc-data/releases).
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Download a Recent Scrape
|
|
23
|
+
|
|
24
|
+
You can download the daily-scraped CSVs from the GitHub releases (this is what most users want).
|
|
25
|
+
|
|
26
|
+
### From the Web Interface
|
|
27
|
+
|
|
28
|
+
Browse from [this repo's releases](https://github.com/NickCrews/apoc-data/releases).
|
|
29
|
+
|
|
30
|
+
### From the CLI
|
|
31
|
+
|
|
32
|
+
Using [uv](https://docs.astral.sh/uv/)'s `uvx`:
|
|
33
|
+
|
|
34
|
+
```shell
|
|
35
|
+
uvx apoc-data release download # downloads all files from the latest release to ./downloads/ folder
|
|
36
|
+
uvx apoc-data release download "20260702-125614" --destination mydownloads/ # specify explicitly
|
|
37
|
+
uvx apoc-data asset download debt.csv --destination apoc_debt.csv # download a single file
|
|
38
|
+
uvx apoc-data release list # see what releases are available
|
|
39
|
+
uvx apoc-data asset list --json # see what files are in the latest release, as JSON
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Or, you can download these CSVs directly using the direct URLs from the releases page
|
|
43
|
+
using curl, pandas, ibis, whatever!
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
curl -L https://github.com/NickCrews/apoc-data/releases/latest/download/candidate_registration.csv > candidate_registration.csv # get latest
|
|
47
|
+
curl -L https://github.com/NickCrews/apoc-data/releases/download/20240716-025636/candidate_registration.csv > candidate_registration.csv # or a different url pattern for specific releases
|
|
48
|
+
|
|
49
|
+
# query directly using duckdb
|
|
50
|
+
duckdb -c "SELECT count(*) FROM 'https://github.com/NickCrews/apoc-data/releases/latest/download/candidate_registration.csv'"
|
|
51
|
+
duckdb -c "SELECT count(*) FROM 'https://github.com/NickCrews/apoc-data/releases/download/20240716-025636/candidate_registration.csv'"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### From python
|
|
55
|
+
|
|
56
|
+
We provide a python API too. `uv add apoc-data` and then
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from apoc_data.releases import asset_download, release_download, release_list
|
|
60
|
+
|
|
61
|
+
release_list() # all releases, newest first, as `Release` objects
|
|
62
|
+
release_download(destination="downloads/") # all files from the latest release
|
|
63
|
+
asset_download("debt.csv", destination="apoc_debt.csv") # a single file
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## Scrape Yourself
|
|
69
|
+
|
|
70
|
+
You can also scrape fresh data directly from the APOC website
|
|
71
|
+
(requires the `scrape` extra for playwright):
|
|
72
|
+
|
|
73
|
+
```shell
|
|
74
|
+
uvx "apoc-data[scrape]" scrape --directory scraped/
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
There is also a python API. Read the source code.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Dev Notes
|
|
82
|
+
|
|
83
|
+
Create venv and install dev deps:
|
|
84
|
+
|
|
85
|
+
```shell
|
|
86
|
+
uv sync
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
scrape:
|
|
90
|
+
|
|
91
|
+
```shell
|
|
92
|
+
uv run apoc-data scrape --directory downloads --no-headless
|
|
93
|
+
```
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# APOC Data
|
|
2
|
+
|
|
3
|
+
Alaska campaign financial disclosure data from the [Alaska Public Offices Commission](https://aws.state.ak.us/ApocReports/Campaign/).
|
|
4
|
+
|
|
5
|
+
This scrapes the CSV files from the APOC website once a day and uploads them to
|
|
6
|
+
[this repo's releases](https://github.com/NickCrews/apoc-data/releases).
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## Download a Recent Scrape
|
|
11
|
+
|
|
12
|
+
You can download the daily-scraped CSVs from the GitHub releases (this is what most users want).
|
|
13
|
+
|
|
14
|
+
### From the Web Interface
|
|
15
|
+
|
|
16
|
+
Browse from [this repo's releases](https://github.com/NickCrews/apoc-data/releases).
|
|
17
|
+
|
|
18
|
+
### From the CLI
|
|
19
|
+
|
|
20
|
+
Using [uv](https://docs.astral.sh/uv/)'s `uvx`:
|
|
21
|
+
|
|
22
|
+
```shell
|
|
23
|
+
uvx apoc-data release download # downloads all files from the latest release to ./downloads/ folder
|
|
24
|
+
uvx apoc-data release download "20260702-125614" --destination mydownloads/ # specify explicitly
|
|
25
|
+
uvx apoc-data asset download debt.csv --destination apoc_debt.csv # download a single file
|
|
26
|
+
uvx apoc-data release list # see what releases are available
|
|
27
|
+
uvx apoc-data asset list --json # see what files are in the latest release, as JSON
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Or, you can download these CSVs directly using the direct URLs from the releases page
|
|
31
|
+
using curl, pandas, ibis, whatever!
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
curl -L https://github.com/NickCrews/apoc-data/releases/latest/download/candidate_registration.csv > candidate_registration.csv # get latest
|
|
35
|
+
curl -L https://github.com/NickCrews/apoc-data/releases/download/20240716-025636/candidate_registration.csv > candidate_registration.csv # or a different url pattern for specific releases
|
|
36
|
+
|
|
37
|
+
# query directly using duckdb
|
|
38
|
+
duckdb -c "SELECT count(*) FROM 'https://github.com/NickCrews/apoc-data/releases/latest/download/candidate_registration.csv'"
|
|
39
|
+
duckdb -c "SELECT count(*) FROM 'https://github.com/NickCrews/apoc-data/releases/download/20240716-025636/candidate_registration.csv'"
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### From python
|
|
43
|
+
|
|
44
|
+
We provide a python API too. `uv add apoc-data` and then
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from apoc_data.releases import asset_download, release_download, release_list
|
|
48
|
+
|
|
49
|
+
release_list() # all releases, newest first, as `Release` objects
|
|
50
|
+
release_download(destination="downloads/") # all files from the latest release
|
|
51
|
+
asset_download("debt.csv", destination="apoc_debt.csv") # a single file
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Scrape Yourself
|
|
57
|
+
|
|
58
|
+
You can also scrape fresh data directly from the APOC website
|
|
59
|
+
(requires the `scrape` extra for playwright):
|
|
60
|
+
|
|
61
|
+
```shell
|
|
62
|
+
uvx "apoc-data[scrape]" scrape --directory scraped/
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
There is also a python API. Read the source code.
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Dev Notes
|
|
70
|
+
|
|
71
|
+
Create venv and install dev deps:
|
|
72
|
+
|
|
73
|
+
```shell
|
|
74
|
+
uv sync
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
scrape:
|
|
78
|
+
|
|
79
|
+
```shell
|
|
80
|
+
uv run apoc-data scrape --directory downloads --no-headless
|
|
81
|
+
```
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "apoc-data"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "Data from the Alaska Public Offices Commission."
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Nick Crews", email = "nicholas.b.crews@gmail.com"},
|
|
7
|
+
]
|
|
8
|
+
dependencies = []
|
|
9
|
+
requires-python = ">=3.9"
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
|
|
13
|
+
[project.scripts]
|
|
14
|
+
apoc-data = "apoc_data.cli:main"
|
|
15
|
+
|
|
16
|
+
[project.optional-dependencies]
|
|
17
|
+
scrape = [
|
|
18
|
+
"playwright",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[dependency-groups]
|
|
22
|
+
dev = [
|
|
23
|
+
"apoc-data[scrape]",
|
|
24
|
+
"ruff",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[build-system]
|
|
28
|
+
requires = ["uv_build>=0.11,<0.12"]
|
|
29
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""Single CLI entry point for apoc-data.
|
|
2
|
+
|
|
3
|
+
Subcommands:
|
|
4
|
+
|
|
5
|
+
- ``release list|get|download``: inspect the available releases and download
|
|
6
|
+
all files in one (what most end users want; no extra dependencies).
|
|
7
|
+
- ``asset list|download``: inspect and fetch individual files within a release.
|
|
8
|
+
- ``scrape``: scrape the data from the APOC website using playwright
|
|
9
|
+
(requires installing the ``scrape`` extra, e.g. ``apoc-data[scrape]``).
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
|
|
13
|
+
```shell
|
|
14
|
+
uvx apoc-data release download
|
|
15
|
+
uvx apoc-data release list --json
|
|
16
|
+
uvx apoc-data asset list --release 20240716-025636
|
|
17
|
+
uvx apoc-data asset download debt.csv --destination apoc_debt.csv
|
|
18
|
+
uvx "apoc-data[scrape]" scrape --directory scraped/
|
|
19
|
+
```
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import argparse
|
|
25
|
+
import json
|
|
26
|
+
import logging
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
from apoc_data.releases import (
|
|
30
|
+
Asset,
|
|
31
|
+
Release,
|
|
32
|
+
asset_download,
|
|
33
|
+
asset_list,
|
|
34
|
+
release_download,
|
|
35
|
+
release_get,
|
|
36
|
+
release_list,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _add_json_flag(parser: argparse.ArgumentParser) -> None:
|
|
41
|
+
parser.add_argument(
|
|
42
|
+
"--json",
|
|
43
|
+
action="store_true",
|
|
44
|
+
help="Output JSON instead of human-readable text",
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _add_release_flag(parser: argparse.ArgumentParser) -> None:
|
|
49
|
+
parser.add_argument(
|
|
50
|
+
"--release",
|
|
51
|
+
type=str,
|
|
52
|
+
default="latest",
|
|
53
|
+
help='A release tag, or "latest" (the default)',
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _human_size(size: int) -> str:
|
|
58
|
+
n = float(size)
|
|
59
|
+
for unit in ("B", "KB", "MB", "GB"):
|
|
60
|
+
if n < 1024 or unit == "GB":
|
|
61
|
+
return f"{n:.0f} {unit}" if unit == "B" else f"{n:.1f} {unit}"
|
|
62
|
+
n /= 1024
|
|
63
|
+
raise AssertionError("unreachable")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _print_release(release: Release) -> None:
|
|
67
|
+
print(f"tag: {release.tag}")
|
|
68
|
+
print(f"name: {release.name}")
|
|
69
|
+
print(f"url: {release.url}")
|
|
70
|
+
print(f"published_at: {release.published_at.isoformat()}")
|
|
71
|
+
print("assets:")
|
|
72
|
+
for asset in release.assets:
|
|
73
|
+
print(f" {asset.name} ({_human_size(asset.size)})")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _print_assets(assets: list[Asset]) -> None:
|
|
77
|
+
if not assets:
|
|
78
|
+
print("<no files>")
|
|
79
|
+
return
|
|
80
|
+
width = max(len(a.name) for a in assets)
|
|
81
|
+
for asset in assets:
|
|
82
|
+
print(f"{asset.name:<{width}} {_human_size(asset.size):>9} {asset.url}")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _run_release_list(args: argparse.Namespace) -> None:
    """Handle ``release list``: print every release.

    With ``args.json`` set the releases are dumped as a JSON array;
    otherwise one summary line (tag, publish time, asset count) is printed each.
    """
    releases = release_list()
    if not args.json:
        for release in releases:
            n_assets = len(release.assets)
            print(f"{release.tag} {release.published_at.isoformat()} {n_assets} assets")
        return
    payload = [r.to_dict() for r in releases]
    print(json.dumps(payload, indent=2))
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _run_release_get(args: argparse.Namespace) -> None:
    """Handle ``release get``: show the release named by ``args.release``.

    Output is JSON when ``args.json`` is set, readable text otherwise.
    """
    found = release_get(args.release)
    if not args.json:
        _print_release(found)
        return
    print(json.dumps(found.to_dict(), indent=2))
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _run_asset_list(args: argparse.Namespace) -> None:
    """Handle ``asset list``: show the files in the release ``args.release``.

    Output is a JSON array when ``args.json`` is set, an aligned table otherwise.
    """
    found = asset_list(args.release)
    if not args.json:
        _print_assets(found)
        return
    payload = [a.to_dict() for a in found]
    print(json.dumps(payload, indent=2))
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _run_release_download(args: argparse.Namespace) -> None:
    """Handle ``release download``: fetch every file of a release.

    Files go under ``args.destination``; the saved paths are then printed,
    as a JSON array when ``args.json`` is set or one per line otherwise.
    """
    saved = release_download(args.release, destination=args.destination)
    if args.json:
        print(json.dumps(list(map(str, saved)), indent=2))
        return
    for path in saved:
        print(path)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _run_asset_download(args: argparse.Namespace) -> None:
    """Handle ``asset download``: fetch a single file and print where it was saved.

    The path is printed as a JSON string when ``args.json`` is set, plain otherwise.
    """
    saved = asset_download(
        args.filename,
        release=args.release,
        destination=args.destination,
    )
    shown = json.dumps(str(saved)) if args.json else saved
    print(shown)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _run_scrape(args: argparse.Namespace) -> None:
    """Handle ``scrape``: scrape the APOC website into a directory.

    The scraper is imported lazily so the other subcommands work without the
    optional ``scrape`` extra; a failed import exits with install instructions.

    Raises
    ------
    SystemExit
        If the scraping dependencies cannot be imported.
    ValueError
        If the target directory is an existing file.
    """
    try:
        from apoc_data.scrape import scrape_all
        from apoc_data.scrape._scraper import DEFAULT_DIRECTORY
    except ImportError as e:
        hint = (
            "The scrape command requires extra dependencies. "
            'Install them with the "scrape" extra, e.g. '
            '`pip install "apoc-data[scrape]"` or `uvx "apoc-data[scrape]" scrape`.'
            f"\n(import failed: {e})"
        )
        raise SystemExit(hint) from e

    # An omitted (or empty) --directory falls back to the scraper's default.
    chosen = args.directory if args.directory else DEFAULT_DIRECTORY
    directory = Path(chosen).absolute()
    if directory.is_file():
        raise ValueError("The directory can't be a file")
    logging.basicConfig(level=logging.INFO)
    scrape_all(directory, headless=args.headless)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def main(argv: list[str] | None = None) -> None:
    """Build the ``apoc-data`` argument parser and run the chosen subcommand.

    Parameters
    ----------
    argv :
        The command-line arguments, handed straight to ``parse_args``.
        Default is None.

    Each leaf subcommand stores its handler as ``func`` via ``set_defaults``;
    after parsing, that handler is called with the parsed namespace.
    """
    release_help = 'A release tag, or "latest" (the default)'

    def add_release_positional(target: argparse.ArgumentParser) -> None:
        # Optional positional tag shared by ``release get`` and ``release download``.
        target.add_argument("release", nargs="?", default="latest", help=release_help)

    root = argparse.ArgumentParser(
        prog="apoc-data",
        description="Data from the Alaska Public Offices Commission",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # ---- release list | get | download ----
    release_commands = commands.add_parser(
        "release",
        help="Inspect the available releases",
    ).add_subparsers(dest="release_command", required=True)

    rel_list = release_commands.add_parser(
        "list",
        help="List all releases, newest first",
    )
    _add_json_flag(rel_list)
    rel_list.set_defaults(func=_run_release_list)

    rel_get = release_commands.add_parser(
        "get",
        help="Show a single release",
    )
    add_release_positional(rel_get)
    _add_json_flag(rel_get)
    rel_get.set_defaults(func=_run_release_get)

    rel_download = release_commands.add_parser(
        "download",
        help="Download all files in a release to a folder",
        description="Download all CSVs of APOC data from "
        "https://github.com/NickCrews/apoc-data/releases",
    )
    add_release_positional(rel_download)
    rel_download.add_argument(
        "--destination",
        type=str,
        default="downloads/",
        help="The folder to save the files under (default: downloads/)",
    )
    _add_json_flag(rel_download)
    rel_download.set_defaults(func=_run_release_download)

    # ---- asset list | download ----
    asset_commands = commands.add_parser(
        "asset",
        help="Inspect and download the files within a release",
    ).add_subparsers(dest="asset_command", required=True)

    asset_listing = asset_commands.add_parser(
        "list",
        help="List the files in a release",
    )
    _add_release_flag(asset_listing)
    _add_json_flag(asset_listing)
    asset_listing.set_defaults(func=_run_asset_list)

    asset_fetch = asset_commands.add_parser(
        "download",
        help="Download file(s) from a release",
    )
    asset_fetch.add_argument(
        "filename",
        help="The name of the file to download, e.g. debt.csv",
    )
    _add_release_flag(asset_fetch)
    asset_fetch.add_argument(
        "--destination",
        type=str,
        default=None,
        help="The file path to save to (default: the filename in the current directory)",
    )
    _add_json_flag(asset_fetch)
    asset_fetch.set_defaults(func=_run_asset_download)

    # ---- scrape ----
    scraper = commands.add_parser(
        "scrape",
        help='Scrape the data from the APOC website (requires the "scrape" extra)',
        description="Scrape .CSVs from https://aws.state.ak.us/ApocReports/Campaign/",
    )
    scraper.add_argument(
        "--directory",
        type=str,
        default=None,
        help="The directory to save the data to (default: scraped/)",
    )
    # BooleanOptionalAction also generates the matching --no-headless flag.
    scraper.add_argument(
        "--headless",
        default=True,
        action=argparse.BooleanOptionalAction,
        help="Run the browser in headless mode",
    )
    scraper.set_defaults(func=_run_scrape)

    parsed = root.parse_args(argv)
    parsed.func(parsed)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
if __name__ == "__main__":
|
|
261
|
+
main()
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from apoc_data.releases._gh import Asset as Asset
|
|
2
|
+
from apoc_data.releases._gh import Release as Release
|
|
3
|
+
from apoc_data.releases._gh import asset_download as asset_download
|
|
4
|
+
from apoc_data.releases._gh import asset_list as asset_list
|
|
5
|
+
from apoc_data.releases._gh import release_download as release_download
|
|
6
|
+
from apoc_data.releases._gh import release_get as release_get
|
|
7
|
+
from apoc_data.releases._gh import release_list as release_list
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"Asset",
|
|
11
|
+
"Release",
|
|
12
|
+
"asset_download",
|
|
13
|
+
"asset_list",
|
|
14
|
+
"release_download",
|
|
15
|
+
"release_get",
|
|
16
|
+
"release_list",
|
|
17
|
+
]
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""Access the APOC data published at https://github.com/NickCrews/apoc-data/releases."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import dataclasses
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
from urllib.error import HTTPError
|
|
12
|
+
from urllib.request import Request, urlopen
|
|
13
|
+
|
|
14
|
+
_REPO = "NickCrews/apoc-data"
|
|
15
|
+
_API_ROOT = f"https://api.github.com/repos/{_REPO}"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclasses.dataclass(frozen=True)
class Asset:
    """One downloadable file attached to a release."""

    name: str
    """The filename, e.g. ``candidate_registration.csv``."""
    url: str
    """The URL the file can be downloaded from directly."""
    size: int
    """The file size, in bytes."""
    updated_at: datetime
    """The time of the asset's last update."""

    @classmethod
    def _from_api(cls, raw: dict[str, Any]) -> Asset:
        """Build an ``Asset`` from one entry of a GitHub API ``assets`` array."""
        fields = {
            "name": raw["name"],
            "url": raw["browser_download_url"],
            "size": raw["size"],
            "updated_at": _parse_timestamp(raw["updated_at"]),
        }
        return cls(**fields)

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable dict; the timestamp becomes an ISO 8601 string."""
        return dict(
            name=self.name,
            url=self.url,
            size=self.size,
            updated_at=self.updated_at.isoformat(),
        )

    def download(self, destination: str | Path) -> Path:
        """Save this asset at ``destination`` (a file path) and return that path.

        Missing parent directories are created first.
        """
        target = Path(destination)
        target.parent.mkdir(parents=True, exist_ok=True)
        payload = _get(self.url)
        target.write_bytes(payload)
        return target
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclasses.dataclass(frozen=True)
class Release:
    """One GitHub release of the APOC data."""

    tag: str
    """The git tag, e.g. ``20240716-025636``."""
    name: str
    """The release's human-readable title."""
    url: str
    """The release's web page."""
    published_at: datetime
    """The time the release was published."""
    assets: tuple[Asset, ...]
    """The files that can be downloaded from this release."""

    @classmethod
    def _from_api(cls, raw: dict[str, Any]) -> Release:
        """Build a ``Release`` from a GitHub API release object."""
        fields = {
            "tag": raw["tag_name"],
            # An empty or missing title falls back to the tag.
            "name": raw["name"] or raw["tag_name"],
            "url": raw["html_url"],
            "published_at": _parse_timestamp(raw["published_at"]),
            "assets": tuple(Asset._from_api(a) for a in raw["assets"]),
        }
        return cls(**fields)

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable dict, with assets converted recursively."""
        return dict(
            tag=self.tag,
            name=self.name,
            url=self.url,
            published_at=self.published_at.isoformat(),
            assets=[a.to_dict() for a in self.assets],
        )

    def asset(self, filename: str) -> Asset:
        """Return the asset named ``filename``.

        Raises
        ------
        ValueError
            If the release has no such file; the message lists the files it has.
        """
        match = next((a for a in self.assets if a.name == filename), None)
        if match is not None:
            return match
        available = ", ".join(sorted(a.name for a in self.assets)) or "<no files>"
        raise ValueError(
            f"Release {self.tag} does not have a file named {filename}. "
            f"Available files: {available}"
        )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def release_list() -> list[Release]:
    """List all releases of the APOC data, newest first.

    The GitHub releases endpoint is paginated, so a single request only
    returns the first page. Pages are requested at the maximum page size
    until a short (or empty) page signals the end.

    Returns
    -------
    list[Release]
        Every release, in the order the API returns them.
    """
    per_page = 100  # the largest page size the GitHub REST API accepts
    releases: list[Release] = []
    page = 1
    while True:
        raw = json.loads(_get(f"{_API_ROOT}/releases?per_page={per_page}&page={page}"))
        releases.extend(Release._from_api(r) for r in raw)
        if len(raw) < per_page:
            # A page that isn't full is the last one.
            return releases
        page += 1
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def release_get(release: str = "latest") -> Release:
    """Get a single release by tag, or the latest release.

    Parameters
    ----------
    release :
        A tag such as ``20240716-025636``, or ``latest`` for the most recent release.

    Raises
    ------
    ValueError
        If the API answers 404 for the requested release. The message lists
        the releases that do exist (best effort).
    urllib.error.HTTPError
        For any other HTTP error status, re-raised unchanged.
    """
    # Imported locally so this block does not depend on a file-level import.
    from urllib.parse import quote

    if release == "latest":
        url = f"{_API_ROOT}/releases/latest"
    else:
        # Percent-encode the tag: a raw space, "?" or "#" from user input would
        # otherwise yield an invalid URL or silently change the request.
        # quote() keeps "/" by default, so tags containing slashes still work.
        url = f"{_API_ROOT}/releases/tags/{quote(release)}"
    try:
        raw = json.loads(_get(url))
    except HTTPError as e:
        if e.code == 404:
            raise ValueError(
                f"No release found for {release!r}. "
                f"Available releases: {_available_releases_hint()}"
            ) from e
        raise
    return Release._from_api(raw)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def asset_list(release: str = "latest") -> list[Asset]:
    """Return the downloadable files of a release as a new list.

    Parameters
    ----------
    release :
        A tag such as ``20240716-025636``, or ``latest`` for the most recent release.
    """
    found = release_get(release)
    return [*found.assets]
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def release_download(
    release: str = "latest",
    *,
    destination: str | Path = "downloads/",
) -> list[Path]:
    """Download every file of a release into a folder.

    Parameters
    ----------
    release :
        A tag such as ``20240716-025636``, or ``latest`` for the most recent release.
    destination :
        The folder to save the files under. Each file keeps its asset name.

    Returns
    -------
    list[Path]
        The saved paths, in the release's asset order.
    """
    folder = Path(destination)
    saved: list[Path] = []
    for asset in release_get(release).assets:
        saved.append(asset.download(folder / asset.name))
    return saved
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def asset_download(
    filename: str,
    *,
    release: str = "latest",
    destination: str | Path | None = None,
) -> Path:
    """Download one file from a release and return the path it was saved to.

    Parameters
    ----------
    filename :
        The name of the file to download, e.g. ``debt.csv``.
    release :
        A tag such as ``20240716-025636``, or ``latest`` for the most recent release.
    destination :
        The file path to save to.
        Default is None, which saves to ``filename`` in the current directory.
    """
    wanted = release_get(release).asset(filename)
    target = Path(wanted.name) if destination is None else destination
    return wanted.download(target)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _available_releases_hint() -> str:
    """Describe the valid release names, for use inside error messages.

    Returns
    -------
    str
        ``"latest"`` followed by every tag, comma-separated; ``"<no releases>"``
        when there are no releases; or ``"<unable to fetch releases>"`` when
        listing them fails.
    """
    try:
        tags = [r.tag for r in release_list()]
    except Exception:
        # Best effort: this text only decorates another error, so a second
        # failure here must not replace the one being reported.
        return "<unable to fetch releases>"
    if not tags:
        # Checked explicitly: the joined string below always contains "latest",
        # so an `or` fallback on it could never trigger.
        return "<no releases>"
    return ", ".join(["latest", *tags])
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _parse_timestamp(raw: str) -> datetime:
|
|
197
|
+
# GitHub timestamps look like "2024-07-16T02:56:36Z".
|
|
198
|
+
# datetime.fromisoformat can't parse the trailing "Z" until python 3.11.
|
|
199
|
+
return datetime.strptime(raw, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _get(url: str) -> bytes:
|
|
203
|
+
# I'm getting hit by rate limits when using streamlit cloud, I assume because
|
|
204
|
+
# the IP address is shared. So I'm trying to use a personal access token to
|
|
205
|
+
# authenticate.
|
|
206
|
+
headers = {"Accept": "application/vnd.github.v3+json"}
|
|
207
|
+
try:
|
|
208
|
+
pat = os.environ["GITHUB_PAT"]
|
|
209
|
+
headers["Authorization"] = f"token {pat}"
|
|
210
|
+
except KeyError:
|
|
211
|
+
pass
|
|
212
|
+
with urlopen(Request(url, headers=headers)) as response:
|
|
213
|
+
return response.read()
|
|
@@ -13,10 +13,19 @@ from __future__ import annotations
|
|
|
13
13
|
import asyncio
|
|
14
14
|
import csv
|
|
15
15
|
import logging
|
|
16
|
+
import subprocess
|
|
17
|
+
import sys
|
|
16
18
|
import tempfile
|
|
17
19
|
from contextlib import asynccontextmanager
|
|
18
20
|
from pathlib import Path
|
|
19
|
-
from typing import
|
|
21
|
+
from typing import (
|
|
22
|
+
TYPE_CHECKING,
|
|
23
|
+
AsyncGenerator,
|
|
24
|
+
ClassVar,
|
|
25
|
+
Coroutine,
|
|
26
|
+
Iterable,
|
|
27
|
+
Protocol,
|
|
28
|
+
)
|
|
20
29
|
|
|
21
30
|
from playwright.async_api import BrowserContext, async_playwright, expect
|
|
22
31
|
|
|
@@ -30,9 +39,24 @@ _logger = logging.getLogger(__name__)
|
|
|
30
39
|
DEFAULT_DIRECTORY = "scraped/"
|
|
31
40
|
|
|
32
41
|
|
|
42
|
+
def _ensure_chromium_installed(executable_path: str) -> None:
|
|
43
|
+
"""Install the playwright chromium browser if it isn't already.
|
|
44
|
+
|
|
45
|
+
This saves users from needing to run `playwright install chromium`
|
|
46
|
+
manually before their first scrape.
|
|
47
|
+
"""
|
|
48
|
+
if Path(executable_path).exists():
|
|
49
|
+
return
|
|
50
|
+
_logger.info("Chromium not found. Running 'playwright install chromium'...")
|
|
51
|
+
subprocess.run(
|
|
52
|
+
[sys.executable, "-m", "playwright", "install", "chromium"], check=True
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
|
|
33
56
|
@asynccontextmanager
|
|
34
|
-
async def make_browser_async(headless: bool = True) ->
|
|
57
|
+
async def make_browser_async(headless: bool = True) -> AsyncGenerator[BrowserContext]:
|
|
35
58
|
async with async_playwright() as p:
|
|
59
|
+
_ensure_chromium_installed(p.chromium.executable_path)
|
|
36
60
|
browser = await p.chromium.launch(
|
|
37
61
|
headless=headless,
|
|
38
62
|
# This sometimes avoids race conditions?
|
|
@@ -79,7 +103,7 @@ async def _run_scrape_flow(page: Page, url: str, filters: ScrapeFilters) -> Down
|
|
|
79
103
|
|
|
80
104
|
|
|
81
105
|
class PScraper(Protocol):
|
|
82
|
-
def __call__(self, browser_context: BrowserContext) -> None:
|
|
106
|
+
async def __call__(self, browser_context: BrowserContext) -> None:
|
|
83
107
|
"""Given a browser context, scrape the data.
|
|
84
108
|
|
|
85
109
|
The destination path, the chosen filters, etc all should be known
|
|
@@ -309,7 +333,7 @@ def scrape_all(
|
|
|
309
333
|
If not provided, a temporary one will be created.
|
|
310
334
|
"""
|
|
311
335
|
directory = Path(directory)
|
|
312
|
-
classes: list[_ScraperBase] = [
|
|
336
|
+
classes: list[type[_ScraperBase]] = [
|
|
313
337
|
CampaignFormScraper,
|
|
314
338
|
IncomeScraper,
|
|
315
339
|
CandidateRegistrationScraper,
|
apoc_data-0.1.2/PKG-INFO
DELETED
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: apoc-data
|
|
3
|
-
Version: 0.1.2
|
|
4
|
-
Summary: Data from the Alaska Public Offices Commission.
|
|
5
|
-
Author-Email: Nick Crews <nicholas.b.crews@gmail.com>
|
|
6
|
-
License: MIT
|
|
7
|
-
Requires-Python: >=3.9
|
|
8
|
-
Description-Content-Type: text/markdown
|
|
9
|
-
|
|
10
|
-
# APOC Data
|
|
11
|
-
|
|
12
|
-
Data from the [Alaska Public Offices Commission](https://aws.state.ak.us/ApocReports/Campaign/).
|
|
13
|
-
|
|
14
|
-
This scrapes the CSV files from the APOC website once a day and uploads them to
|
|
15
|
-
[this repo's releases](https://github.com/NickCrews/apoc-data/releases).
|
|
16
|
-
|
|
17
|
-
## Manual
|
|
18
|
-
|
|
19
|
-
Browse from [this repo's releases](https://github.com/NickCrews/apoc-data/releases).
|
|
20
|
-
|
|
21
|
-
## Python
|
|
22
|
-
|
|
23
|
-
`pip install apoc-data` and then
|
|
24
|
-
|
|
25
|
-
```python
|
|
26
|
-
from apoc_data.download import download
|
|
27
|
-
|
|
28
|
-
download(
|
|
29
|
-
release="latest",
|
|
30
|
-
filename="debt.csv",
|
|
31
|
-
destination="apoc_debt.csv",
|
|
32
|
-
)
|
|
33
|
-
```
|
|
34
|
-
|
|
35
|
-
## Shell
|
|
36
|
-
|
|
37
|
-
You can download these CSVs using the direct URLs from the releases page
|
|
38
|
-
using curl, pandas, ibis, whatever!
|
|
39
|
-
|
|
40
|
-
```bash
|
|
41
|
-
curl -L https://github.com/NickCrews/apoc-data/releases/download/20240716-025636/candidate_registration.csv > candidate_registration.csv
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
or we have a tiny python script that makes this a little nicer, eg get the latest
|
|
45
|
-
release, choose the download directory, etc. Read the script for more info.
|
|
46
|
-
|
|
47
|
-
```bash
|
|
48
|
-
curl -s https://raw.githubusercontent.com/NickCrews/apoc-data/main/src/apoc_data/download.py | python - --release latest
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
## Dev Notes
|
|
52
|
-
|
|
53
|
-
```shell
|
|
54
|
-
pdm install
|
|
55
|
-
playwright install chromium
|
|
56
|
-
```
|
|
57
|
-
|
|
58
|
-
scrape:
|
|
59
|
-
|
|
60
|
-
```shell
|
|
61
|
-
python -m apoc_data.scrape --directory downloads --no-headless
|
|
62
|
-
```
|
apoc_data-0.1.2/README.md
DELETED
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
# APOC Data
|
|
2
|
-
|
|
3
|
-
Data from the [Alaska Public Offices Commission](https://aws.state.ak.us/ApocReports/Campaign/).
|
|
4
|
-
|
|
5
|
-
This scrapes the CSV files from the APOC website once a day and uploads them to
|
|
6
|
-
[this repo's releases](https://github.com/NickCrews/apoc-data/releases).
|
|
7
|
-
|
|
8
|
-
## Manual
|
|
9
|
-
|
|
10
|
-
Browse from [this repo's releases](https://github.com/NickCrews/apoc-data/releases).
|
|
11
|
-
|
|
12
|
-
## Python
|
|
13
|
-
|
|
14
|
-
`pip install apoc-data` and then
|
|
15
|
-
|
|
16
|
-
```python
|
|
17
|
-
from apoc_data.download import download
|
|
18
|
-
|
|
19
|
-
download(
|
|
20
|
-
release="latest",
|
|
21
|
-
filename="debt.csv",
|
|
22
|
-
destination="apoc_debt.csv",
|
|
23
|
-
)
|
|
24
|
-
```
|
|
25
|
-
|
|
26
|
-
## Shell
|
|
27
|
-
|
|
28
|
-
You can download these CSVs using the direct URLs from the releases page
|
|
29
|
-
using curl, pandas, ibis, whatever!
|
|
30
|
-
|
|
31
|
-
```bash
|
|
32
|
-
curl -L https://github.com/NickCrews/apoc-data/releases/download/20240716-025636/candidate_registration.csv > candidate_registration.csv
|
|
33
|
-
```
|
|
34
|
-
|
|
35
|
-
or we have a tiny python script that makes this a little nicer, eg get the latest
|
|
36
|
-
release, choose the download directory, etc. Read the script for more info.
|
|
37
|
-
|
|
38
|
-
```bash
|
|
39
|
-
curl -s https://raw.githubusercontent.com/NickCrews/apoc-data/main/src/apoc_data/download.py | python - --release latest
|
|
40
|
-
```
|
|
41
|
-
|
|
42
|
-
## Dev Notes
|
|
43
|
-
|
|
44
|
-
```shell
|
|
45
|
-
pdm install
|
|
46
|
-
playwright install chromium
|
|
47
|
-
```
|
|
48
|
-
|
|
49
|
-
scrape:
|
|
50
|
-
|
|
51
|
-
```shell
|
|
52
|
-
python -m apoc_data.scrape --directory downloads --no-headless
|
|
53
|
-
```
|
apoc_data-0.1.2/pyproject.toml
DELETED
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
[project]
|
|
2
|
-
name = "apoc-data"
|
|
3
|
-
version = "0.1.2"
|
|
4
|
-
description = "Data from the Alaska Public Offices Commission."
|
|
5
|
-
authors = [
|
|
6
|
-
{ name = "Nick Crews", email = "nicholas.b.crews@gmail.com" },
|
|
7
|
-
]
|
|
8
|
-
dependencies = []
|
|
9
|
-
requires-python = ">=3.9"
|
|
10
|
-
readme = "README.md"
|
|
11
|
-
|
|
12
|
-
[project.license]
|
|
13
|
-
text = "MIT"
|
|
14
|
-
|
|
15
|
-
[tool.pdm]
|
|
16
|
-
distribution = true
|
|
17
|
-
|
|
18
|
-
[tool.pdm.dev-dependencies]
|
|
19
|
-
dev = [
|
|
20
|
-
"playwright",
|
|
21
|
-
"ruff",
|
|
22
|
-
]
|
|
23
|
-
|
|
24
|
-
[build-system]
|
|
25
|
-
requires = [
|
|
26
|
-
"pdm-backend",
|
|
27
|
-
]
|
|
28
|
-
build-backend = "pdm.backend"
|
|
@@ -1,111 +0,0 @@
|
|
|
1
|
-
"""Download CSV(s) of APOC data from https://github.com/NickCrews/apoc-data/releases
|
|
2
|
-
|
|
3
|
-
A no-install way to use this script is to download it from github and pipe to curl:
|
|
4
|
-
|
|
5
|
-
```shell
|
|
6
|
-
curl -s https://raw.githubusercontent.com/NickCrews/apoc-data/main/src/apoc_data/download.py | python - --release latest
|
|
7
|
-
```
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
import argparse
|
|
11
|
-
import json
|
|
12
|
-
import os
|
|
13
|
-
from pathlib import Path
|
|
14
|
-
from urllib.request import Request, urlopen
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def download(
|
|
18
|
-
*,
|
|
19
|
-
release: str = "latest",
|
|
20
|
-
filename: str | None = None,
|
|
21
|
-
destination: str | Path = "downloads/",
|
|
22
|
-
) -> None:
|
|
23
|
-
"""Download CSV(s) of APOC data from https://github.com/NickCrews/apoc-data/releases.
|
|
24
|
-
|
|
25
|
-
Parameters
|
|
26
|
-
----------
|
|
27
|
-
release : str, optional
|
|
28
|
-
The name of the release to download.
|
|
29
|
-
Default is None, which means latest release
|
|
30
|
-
filename : str, optional
|
|
31
|
-
The name of the file to download.
|
|
32
|
-
Default is None, which downloads all files.
|
|
33
|
-
destination : str or Path, optional
|
|
34
|
-
Where to save the file(s).
|
|
35
|
-
If this looks like a file (the final path segment contains a `.`),
|
|
36
|
-
then we can only download a single file, and it will be saved to that location.
|
|
37
|
-
Otherwise, the file(s) will be saved underneath there.
|
|
38
|
-
"""
|
|
39
|
-
destination = Path(destination)
|
|
40
|
-
release, assets = _get_release_info(release)
|
|
41
|
-
if filename is not None:
|
|
42
|
-
if filename not in assets:
|
|
43
|
-
raise ValueError(f"Release {release} does not have a file named {filename}")
|
|
44
|
-
if not _is_file(destination):
|
|
45
|
-
destination = destination / filename
|
|
46
|
-
_download_asset(assets[filename], destination)
|
|
47
|
-
else:
|
|
48
|
-
if _is_file(destination):
|
|
49
|
-
raise ValueError("Can't download all files to a single file")
|
|
50
|
-
for name, url in assets.items():
|
|
51
|
-
_download_asset(url, destination / name)
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def _is_file(destination: Path) -> bool:
|
|
55
|
-
return "." in destination.name
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def _get_release_info(release: str) -> tuple[str, dict[str, str]]:
|
|
59
|
-
url = f"https://api.github.com/repos/NickCrews/apoc-data/releases/{release}"
|
|
60
|
-
info = json.loads(_get(url))
|
|
61
|
-
assets = {asset["name"]: asset["browser_download_url"] for asset in info["assets"]}
|
|
62
|
-
return info["tag_name"], assets
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def _download_asset(url: str, destination: Path) -> None:
|
|
66
|
-
destination.parent.mkdir(parents=True, exist_ok=True)
|
|
67
|
-
with open(destination, "wb") as file:
|
|
68
|
-
file.write(_get(url))
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def cli():
|
|
72
|
-
parser = argparse.ArgumentParser(
|
|
73
|
-
description="Download data from the Alaska Public Offices Commission"
|
|
74
|
-
)
|
|
75
|
-
parser.add_argument(
|
|
76
|
-
"--release",
|
|
77
|
-
type=str,
|
|
78
|
-
default="latest",
|
|
79
|
-
help="The name of the release to download",
|
|
80
|
-
)
|
|
81
|
-
parser.add_argument(
|
|
82
|
-
"--filename",
|
|
83
|
-
type=str,
|
|
84
|
-
help="The name of the file to download",
|
|
85
|
-
)
|
|
86
|
-
parser.add_argument(
|
|
87
|
-
"--destination",
|
|
88
|
-
type=str,
|
|
89
|
-
default="downloads/",
|
|
90
|
-
help="Where to save the file(s)",
|
|
91
|
-
)
|
|
92
|
-
args = parser.parse_args()
|
|
93
|
-
download(release=args.release, filename=args.filename, destination=args.destination)
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def _get(url: str) -> str:
|
|
97
|
-
# I'm getting hit by rate limits when using streamlit cloud, I assume because
|
|
98
|
-
# the IP address is shared. So I'm trying to use a personal access token to
|
|
99
|
-
# authenticate.
|
|
100
|
-
headers = {"Accept": "application/vnd.github.v3+json"}
|
|
101
|
-
try:
|
|
102
|
-
pat = os.environ["GITHUB_PAT"]
|
|
103
|
-
headers["Authorization"] = f"toasdasken {pat}"
|
|
104
|
-
except KeyError:
|
|
105
|
-
pass
|
|
106
|
-
with urlopen(Request(url, headers=headers)) as response:
|
|
107
|
-
return response.read()
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
if __name__ == "__main__":
|
|
111
|
-
cli()
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
import argparse
|
|
2
|
-
import logging
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
from apoc_data.scrape import scrape_all
|
|
6
|
-
from apoc_data.scrape._scraper import DEFAULT_DIRECTORY
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def main():
|
|
10
|
-
parser = argparse.ArgumentParser(
|
|
11
|
-
description="Download data from the Alaska Public Offices Commission"
|
|
12
|
-
)
|
|
13
|
-
parser.add_argument(
|
|
14
|
-
"--directory",
|
|
15
|
-
type=str,
|
|
16
|
-
default=DEFAULT_DIRECTORY,
|
|
17
|
-
help="The directory to save the data to",
|
|
18
|
-
)
|
|
19
|
-
parser.add_argument(
|
|
20
|
-
"--headless",
|
|
21
|
-
"--no-headless",
|
|
22
|
-
dest="headless",
|
|
23
|
-
default=True,
|
|
24
|
-
action=_BooleanAction,
|
|
25
|
-
help="Run the browser in headless mode",
|
|
26
|
-
)
|
|
27
|
-
args = parser.parse_args()
|
|
28
|
-
directory = Path(args.directory).absolute()
|
|
29
|
-
if directory.is_file():
|
|
30
|
-
raise ValueError("The directory can't be a file")
|
|
31
|
-
logging.basicConfig(level=logging.INFO)
|
|
32
|
-
scrape_all(directory, headless=args.headless)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# from https://thisdataguy.com/2017/07/03/no-options-with-argparse-and-python/
|
|
36
|
-
class _BooleanAction(argparse.Action):
|
|
37
|
-
def __init__(self, option_strings, dest, nargs=None, **kwargs):
|
|
38
|
-
super(_BooleanAction, self).__init__(option_strings, dest, nargs=0, **kwargs)
|
|
39
|
-
|
|
40
|
-
def __call__(self, parser, namespace, values, option_string=None):
|
|
41
|
-
setattr(
|
|
42
|
-
namespace, self.dest, False if option_string.startswith("--no") else True
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
if __name__ == "__main__":
|
|
47
|
-
main()
|
|
File without changes
|
|
File without changes
|