koreaapi 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- koreaapi-0.1.0/.env.example +32 -0
- koreaapi-0.1.0/.github/workflows/collect.yml +57 -0
- koreaapi-0.1.0/.github/workflows/pages.yml +68 -0
- koreaapi-0.1.0/.github/workflows/publish.yml +24 -0
- koreaapi-0.1.0/.github/workflows/test.yml +26 -0
- koreaapi-0.1.0/.gitignore +15 -0
- koreaapi-0.1.0/LICENSE +21 -0
- koreaapi-0.1.0/PKG-INFO +199 -0
- koreaapi-0.1.0/PRINCIPLES.md +54 -0
- koreaapi-0.1.0/README.md +170 -0
- koreaapi-0.1.0/ROADMAP.md +278 -0
- koreaapi-0.1.0/SCOPE.md +144 -0
- koreaapi-0.1.0/docs/API_KEYS.md +58 -0
- koreaapi-0.1.0/docs/LAUNCH.md +56 -0
- koreaapi-0.1.0/docs/MCP_INSTALL.md +74 -0
- koreaapi-0.1.0/docs/MIGRATION.md +64 -0
- koreaapi-0.1.0/docs/research/agent-blockchain-conference-2026-05-27.html +1652 -0
- koreaapi-0.1.0/llms.txt +36 -0
- koreaapi-0.1.0/pyproject.toml +46 -0
- koreaapi-0.1.0/robots.txt +10 -0
- koreaapi-0.1.0/scripts/split-koreaapi.sh +35 -0
- koreaapi-0.1.0/smithery.yaml +37 -0
- koreaapi-0.1.0/src/koreaapi/__init__.py +3 -0
- koreaapi-0.1.0/src/koreaapi/admin.py +902 -0
- koreaapi-0.1.0/src/koreaapi/models.py +76 -0
- koreaapi-0.1.0/src/koreaapi/pipeline/__init__.py +5 -0
- koreaapi-0.1.0/src/koreaapi/pipeline/ingest.py +252 -0
- koreaapi-0.1.0/src/koreaapi/pipeline/scheduler.py +15 -0
- koreaapi-0.1.0/src/koreaapi/pipeline/store.py +206 -0
- koreaapi-0.1.0/src/koreaapi/romanize.py +39 -0
- koreaapi-0.1.0/src/koreaapi/roster.py +21 -0
- koreaapi-0.1.0/src/koreaapi/server.py +57 -0
- koreaapi-0.1.0/src/koreaapi/service.py +176 -0
- koreaapi-0.1.0/src/koreaapi/skill_score.py +62 -0
- koreaapi-0.1.0/src/koreaapi/sources/__init__.py +5 -0
- koreaapi-0.1.0/src/koreaapi/sources/base.py +18 -0
- koreaapi-0.1.0/src/koreaapi/sources/circlechart.py +146 -0
- koreaapi-0.1.0/src/koreaapi/sources/mock.py +23 -0
- koreaapi-0.1.0/src/koreaapi/sources/wikidata.py +341 -0
- koreaapi-0.1.0/src/koreaapi/sources/wikipedia.py +97 -0
- koreaapi-0.1.0/src/koreaapi/sources/youtube.py +266 -0
- koreaapi-0.1.0/tests/fixtures/wikidata_bts.json +18 -0
- koreaapi-0.1.0/tests/fixtures/wikidata_bts_agency.json +26 -0
- koreaapi-0.1.0/tests/fixtures/wikidata_bts_full.json +31 -0
- koreaapi-0.1.0/tests/fixtures/wikidata_label_bighit.json +11 -0
- koreaapi-0.1.0/tests/fixtures/wikidata_labelmates.json +25 -0
- koreaapi-0.1.0/tests/fixtures/wikidata_members.json +6 -0
- koreaapi-0.1.0/tests/fixtures/wikidata_poisoned_bts.json +1 -0
- koreaapi-0.1.0/tests/fixtures/wikidata_search_bts.json +15 -0
- koreaapi-0.1.0/tests/fixtures/wikipedia_bts.json +13 -0
- koreaapi-0.1.0/tests/fixtures/youtube_channel_bts.json +24 -0
- koreaapi-0.1.0/tests/fixtures/youtube_latest_bts.json +17 -0
- koreaapi-0.1.0/tests/fixtures/youtube_search_bts.json +23 -0
- koreaapi-0.1.0/tests/test_admin_pull.py +47 -0
- koreaapi-0.1.0/tests/test_circlechart.py +111 -0
- koreaapi-0.1.0/tests/test_export.py +57 -0
- koreaapi-0.1.0/tests/test_geo.py +122 -0
- koreaapi-0.1.0/tests/test_pipeline.py +77 -0
- koreaapi-0.1.0/tests/test_romanize.py +65 -0
- koreaapi-0.1.0/tests/test_roster.py +77 -0
- koreaapi-0.1.0/tests/test_server.py +57 -0
- koreaapi-0.1.0/tests/test_service.py +126 -0
- koreaapi-0.1.0/tests/test_signal.py +73 -0
- koreaapi-0.1.0/tests/test_wikidata.py +120 -0
- koreaapi-0.1.0/tests/test_wikidata_live.py +51 -0
- koreaapi-0.1.0/tests/test_wikidata_verify.py +66 -0
- koreaapi-0.1.0/tests/test_wikipedia.py +70 -0
- koreaapi-0.1.0/tests/test_wikipedia_live.py +36 -0
- koreaapi-0.1.0/tests/test_youtube.py +136 -0
- koreaapi-0.1.0/uv.lock +1661 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# --- LLM (extraction / translation / verification) ---
|
|
2
|
+
ANTHROPIC_API_KEY=
|
|
3
|
+
|
|
4
|
+
# --- Sources (official APIs first; scraping is last-resort) ---
|
|
5
|
+
SPOTIFY_CLIENT_ID=
|
|
6
|
+
SPOTIFY_CLIENT_SECRET=
|
|
7
|
+
YOUTUBE_API_KEY=
|
|
8
|
+
|
|
9
|
+
# --- Append-only store ---
|
|
10
|
+
KOREAAPI_DB=koreaapi.db # dev SQLite path (default); used by admin / pull / export
|
|
11
|
+
DATABASE_URL= # Postgres (production scale); same insert-only contract
|
|
12
|
+
|
|
13
|
+
# --- Commerce rails (engine 1; cold-start optional; commission is a later switch) ---
|
|
14
|
+
SKIMLINKS_PUBLISHER_ID=
|
|
15
|
+
AMAZON_ASSOCIATES_TAG=
|
|
16
|
+
|
|
17
|
+
# --- Trend digest (engine 2; "Korea Rising" newsletter) ---
|
|
18
|
+
BEEHIIV_API_KEY=
|
|
19
|
+
BEEHIIV_PUBLICATION_ID=
|
|
20
|
+
|
|
21
|
+
# --- Phase 2: agent payments (x402 / USDC) — wire only when traffic qualifies ---
|
|
22
|
+
CDP_API_KEY_ID= # Coinbase Developer Platform (x402 facilitator)
|
|
23
|
+
CDP_API_KEY_SECRET=
|
|
24
|
+
KOREAAPI_PAYOUT_ADDRESS= # USDC receiving address (Base). Not "receive BNB"; no token.
|
|
25
|
+
|
|
26
|
+
# --- Accounts & keys shopping list: docs/API_KEYS.md ---
|
|
27
|
+
|
|
28
|
+
# --- Collection cadence (seconds; see src/koreaapi/pipeline/scheduler.py) ---
|
|
29
|
+
CADENCE_CHARTS_SEC=43200
|
|
30
|
+
CADENCE_EVENTS_SEC=86400
|
|
31
|
+
|
|
32
|
+
LOG_LEVEL=INFO
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# KoreaAPI daily collector — the cold-start data engine.
|
|
2
|
+
# Runs the live Wikidata pull + cross-verify + exports the growing data asset, then commits
|
|
3
|
+
# data/ back to the repo. GitHub's runners have open network access, so the live pull works
|
|
4
|
+
# here. Manual run: Actions tab -> "collect" -> Run workflow. The daily schedule is active
|
|
5
|
+
# once this file is on the default branch (main).
|
|
6
|
+
name: collect
|
|
7
|
+
|
|
8
|
+
on:
|
|
9
|
+
workflow_dispatch: {}
|
|
10
|
+
schedule:
|
|
11
|
+
- cron: "17 0 * * *" # daily ~00:17 UTC
|
|
12
|
+
|
|
13
|
+
permissions:
|
|
14
|
+
contents: write
|
|
15
|
+
|
|
16
|
+
concurrency:
|
|
17
|
+
group: koreaapi-collect
|
|
18
|
+
cancel-in-progress: false
|
|
19
|
+
|
|
20
|
+
jobs:
|
|
21
|
+
collect:
|
|
22
|
+
runs-on: ubuntu-latest
|
|
23
|
+
steps:
|
|
24
|
+
- uses: actions/checkout@v4
|
|
25
|
+
|
|
26
|
+
- name: Set up uv (Python env)
|
|
27
|
+
uses: astral-sh/setup-uv@v5
|
|
28
|
+
|
|
29
|
+
- name: Install deps
|
|
30
|
+
run: uv sync
|
|
31
|
+
|
|
32
|
+
- name: Pull live Wikidata + export the data asset
|
|
33
|
+
env:
|
|
34
|
+
PYTHONPATH: src
|
|
35
|
+
KOREAAPI_DB: ${{ runner.temp }}/koreaapi.db # fresh per run; history lives in data/
|
|
36
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} # romanization + Circle Chart extract (best-effort; skipped if unset)
|
|
37
|
+
YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }} # official-channel release snapshots (best-effort; skipped if unset)
|
|
38
|
+
run: |
|
|
39
|
+
uv run python -m koreaapi.admin pull
|
|
40
|
+
uv run python -m koreaapi.admin sweep
|
|
41
|
+
uv run python -m koreaapi.admin chart
|
|
42
|
+
uv run python -m koreaapi.admin youtube
|
|
43
|
+
uv run python -m koreaapi.admin export
|
|
44
|
+
uv run python -m koreaapi.admin digest
|
|
45
|
+
uv run python -m koreaapi.admin stats
|
|
46
|
+
|
|
47
|
+
- name: Commit accumulated data
|
|
48
|
+
run: |
|
|
49
|
+
git config user.name "github-actions[bot]"
|
|
50
|
+
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
|
|
51
|
+
git add data
|
|
52
|
+
if git diff --cached --quiet; then
|
|
53
|
+
echo "no data change"
|
|
54
|
+
else
|
|
55
|
+
git commit -m "data: daily Wikidata snapshot [skip ci]"
|
|
56
|
+
git push
|
|
57
|
+
fi
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# Public GEO page (answer-engine-crawlable) on GitHub Pages.
|
|
2
|
+
# Builds report.html + monitor.html from LIVE data (the pull runs on GitHub's open-network
|
|
3
|
+
# runner) and deploys to GitHub Pages — a free, public, JSON-LD-bearing URL that
|
|
4
|
+
# Perplexity / ChatGPT / Google AI Overviews can crawl and cite.
|
|
5
|
+
#
|
|
6
|
+
# ONE-TIME ENABLE: repo Settings -> Pages -> Build and deployment -> Source: "GitHub Actions".
|
|
7
|
+
name: pages
|
|
8
|
+
|
|
9
|
+
on:
|
|
10
|
+
workflow_dispatch: {}
|
|
11
|
+
push:
|
|
12
|
+
branches: [main]
|
|
13
|
+
schedule:
|
|
14
|
+
- cron: "37 1 * * *" # daily, after the collect job
|
|
15
|
+
|
|
16
|
+
permissions:
|
|
17
|
+
contents: read
|
|
18
|
+
pages: write
|
|
19
|
+
id-token: write
|
|
20
|
+
|
|
21
|
+
concurrency:
|
|
22
|
+
group: pages
|
|
23
|
+
cancel-in-progress: false
|
|
24
|
+
|
|
25
|
+
jobs:
|
|
26
|
+
build-deploy:
|
|
27
|
+
runs-on: ubuntu-latest
|
|
28
|
+
environment:
|
|
29
|
+
name: github-pages
|
|
30
|
+
url: ${{ steps.deploy.outputs.page_url }}
|
|
31
|
+
steps:
|
|
32
|
+
- uses: actions/checkout@v4
|
|
33
|
+
|
|
34
|
+
- name: Set up uv (Python env)
|
|
35
|
+
uses: astral-sh/setup-uv@v5
|
|
36
|
+
|
|
37
|
+
- name: Build the public GEO page from live data
|
|
38
|
+
env:
|
|
39
|
+
PYTHONPATH: src
|
|
40
|
+
KOREAAPI_DB: ${{ runner.temp }}/koreaapi.db
|
|
41
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} # romanization + Circle Chart extract (best-effort; skipped if unset)
|
|
42
|
+
YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }} # official-channel release snapshots (best-effort; skipped if unset)
|
|
43
|
+
run: |
|
|
44
|
+
uv sync
|
|
45
|
+
uv run python -m koreaapi.admin pull
|
|
46
|
+
uv run python -m koreaapi.admin sweep
|
|
47
|
+
uv run python -m koreaapi.admin chart
|
|
48
|
+
uv run python -m koreaapi.admin youtube
|
|
49
|
+
uv run python -m koreaapi.admin report
|
|
50
|
+
uv run python -m koreaapi.admin entitypages
|
|
51
|
+
uv run python -m koreaapi.admin digest
|
|
52
|
+
uv run python -m koreaapi.admin monitor
|
|
53
|
+
uv run python -m koreaapi.admin export
|
|
54
|
+
mkdir -p _site
|
|
55
|
+
cp report.html _site/index.html
|
|
56
|
+
cp monitor.html _site/monitor.html # human data-quality cockpit at /monitor.html
|
|
57
|
+
cp llms.txt _site/llms.txt # agent-discoverable at /llms.txt (AEO/GEO)
|
|
58
|
+
cp data/korea-rising.md _site/korea-rising.md # shareable verified digest
|
|
59
|
+
cp data/latest.json _site/latest.json # open machine-readable verified data
|
|
60
|
+
cp robots.txt _site/robots.txt # explicitly welcome answer-engine crawlers (AEO)
|
|
61
|
+
cp -r site/artist _site/artist # per-entity citable answer pages (/artist/<slug>.html)
|
|
62
|
+
uv run python -m koreaapi.admin sitemap # sitemap.xml incl. every entity page (daily lastmod)
|
|
63
|
+
cp sitemap.xml _site/sitemap.xml
|
|
64
|
+
|
|
65
|
+
- uses: actions/configure-pages@v5
|
|
66
|
+
- uses: actions/upload-pages-artifact@v3 # publishes ./_site by default
|
|
67
|
+
- id: deploy
|
|
68
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Publish koreaapi to PyPI — no local tools needed.
|
|
2
|
+
# Trigger: Actions -> "publish" -> Run workflow (or publish a GitHub Release).
|
|
3
|
+
# Requires a repo secret PYPI_API_TOKEN (pypi.org -> Account settings -> API tokens).
|
|
4
|
+
name: publish
|
|
5
|
+
|
|
6
|
+
on:
|
|
7
|
+
workflow_dispatch: {}
|
|
8
|
+
release:
|
|
9
|
+
types: [published]
|
|
10
|
+
|
|
11
|
+
permissions:
|
|
12
|
+
contents: read
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
pypi:
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
- name: Set up uv
|
|
20
|
+
uses: astral-sh/setup-uv@v5
|
|
21
|
+
- name: Build sdist + wheel
|
|
22
|
+
run: uv build
|
|
23
|
+
- name: Publish to PyPI
|
|
24
|
+
run: uv publish --token ${{ secrets.PYPI_API_TOKEN }}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# koreaapi CI — run the test suite + linter on every push and PR.
|
|
2
|
+
# On GitHub's open-network runners the live smoke tests actually exercise Wikidata/Wikipedia
|
|
3
|
+
# (they auto-skip only on transient network failure), so this gates both offline correctness
|
|
4
|
+
# and the live adapters.
|
|
5
|
+
name: test
|
|
6
|
+
|
|
7
|
+
on:
|
|
8
|
+
workflow_dispatch: {}
|
|
9
|
+
push:
|
|
10
|
+
branches: [main]
|
|
11
|
+
pull_request: {}
|
|
12
|
+
|
|
13
|
+
permissions:
|
|
14
|
+
contents: read
|
|
15
|
+
|
|
16
|
+
jobs:
|
|
17
|
+
pytest:
|
|
18
|
+
runs-on: ubuntu-latest
|
|
19
|
+
steps:
|
|
20
|
+
- uses: actions/checkout@v4
|
|
21
|
+
- uses: astral-sh/setup-uv@v5
|
|
22
|
+
- run: uv sync --extra dev
|
|
23
|
+
- name: Lint
|
|
24
|
+
run: uv run ruff check src tests
|
|
25
|
+
- name: Test
|
|
26
|
+
run: uv run pytest -q
|
koreaapi-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 kwangdol-star
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
koreaapi-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: koreaapi
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: The verifiable data layer for Korean culture & commerce, callable by any AI agent (MCP).
|
|
5
|
+
Project-URL: Homepage, https://kwangdol-star.github.io/koreaapi/
|
|
6
|
+
Project-URL: Repository, https://github.com/kwangdol-star/koreaapi
|
|
7
|
+
Project-URL: Open data (JSON), https://kwangdol-star.github.io/koreaapi/latest.json
|
|
8
|
+
Author: kwangdol-star
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: aeo,ai-agents,geo,kculture,korea,kpop,mcp,model-context-protocol,verifiable-data,wikidata
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Requires-Dist: anthropic
|
|
20
|
+
Requires-Dist: fastmcp
|
|
21
|
+
Requires-Dist: httpx
|
|
22
|
+
Requires-Dist: pydantic>=2
|
|
23
|
+
Requires-Dist: python-dotenv
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
27
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# KoreaAPI
|
|
31
|
+
|
|
32
|
+
**The verifiable data layer for Korean culture & commerce, callable by any AI agent.**
|
|
33
|
+
*The MCP gateway to Korea — verifiable.*
|
|
34
|
+
|
|
35
|
+
KoreaAPI exposes Korean culture, entertainment, and commerce data to AI agents via
|
|
36
|
+
Anthropic's Model Context Protocol (MCP). Every response carries machine-readable
|
|
37
|
+
**provenance** and a **Skill Score** so an agent can decide whether to trust and cite it.
|
|
38
|
+
|
|
39
|
+
> **Status:** Phase 1 (cold-start). The locked spec is in [`SCOPE.md`](./SCOPE.md); what is
|
|
40
|
+
> built / decided and why is in [`ROADMAP.md`](./ROADMAP.md).
|
|
41
|
+
> **Live, verified, public data (Schema.org JSON-LD + `/llms.txt`):**
|
|
42
|
+
> **https://kwangdol-star.github.io/koreaapi/**
|
|
43
|
+
> **Repository:** [`kwangdol-star/koreaapi`](https://github.com/kwangdol-star/koreaapi) — a
|
|
44
|
+
> standalone repo, split out from its incubation home with full git history preserved.
|
|
45
|
+
|
|
46
|
+
## What's live now (verified, on the public page + via MCP)
|
|
47
|
+
- **Cross-verification** — Wikidata + Wikipedia must agree on the canonical bilingual name
|
|
48
|
+
before a fact clears the single-source cap (high Skill Score = independent concurrence).
|
|
49
|
+
- **Identity guard** (rejects a contradictory label) + **hallucination guard** (LLM-extracted
|
|
50
|
+
data must appear verbatim in its source, else dropped — caught a fabricated chart entry live).
|
|
51
|
+
- **소속사/Agency hub** — each artist anchored to its label (Wikidata P264); the roster grows by
|
|
52
|
+
discovering cross-verified **labelmates** (SPARQL) and is queryable via `get_agency`.
|
|
53
|
+
- **YouTube** official-channel release/stats (live-state) · **LLM romanization** at ingest.
|
|
54
|
+
- **GEO/AEO** — JSON-LD (incl. `recordLabel`) + a ready-to-cite line on every record + `/llms.txt`.
|
|
55
|
+
|
|
56
|
+
## Why this exists
|
|
57
|
+
Raw Korean API wrappers are a commodity (20+ already exist on GitHub). Our moat is the
|
|
58
|
+
combination nobody else ships:
|
|
59
|
+
|
|
60
|
+
- **Aggregation** of fragmented K-culture / commerce sources
|
|
61
|
+
- **Verification** — Skill Score + provenance, exactly where LLMs confidently hallucinate
|
|
62
|
+
- **Append-only time-series** — a latecomer cannot reconstruct our history
|
|
63
|
+
- **Behavioral signal** — what agents query / buy through us becomes trend data
|
|
64
|
+
|
|
65
|
+
The customer is the **AI agent** (consumer); humans / brands / enterprises pay.
|
|
66
|
+
|
|
67
|
+
## Why now — the land-grab window
|
|
68
|
+
The compounding assets accrue to **early, high-quality** entrants: only ~13% of public MCP
|
|
69
|
+
servers are high-trust, and AI answer engines concentrate citations on content **refreshed in the
|
|
70
|
+
last 1–3 years** (Seer Interactive). A verified hub that re-verifies **daily** compounds a citation
|
|
71
|
+
lead latecomers can't backfill. We are **"picks-and-shovels"** — the data agents consume, not a
|
|
72
|
+
chat wrapper (a category that the same market analyses find largely fails to monetize). *(An independent 2026 AI-agent
|
|
73
|
+
opportunity ranking places this exact model at its top — see [`ROADMAP.md`](./ROADMAP.md).)*
|
|
74
|
+
|
|
75
|
+
## Revenue flywheel (engines ① + ②)
|
|
76
|
+
K-culture current-state is the magnet. ① commerce commission + ② trend-intelligence
|
|
77
|
+
subscription reinforce each other: transactions generate the behavioral signal that
|
|
78
|
+
becomes the trend product, which improves commerce conversion. See [`SCOPE.md`](./SCOPE.md) §3.
|
|
79
|
+
|
|
80
|
+
## The heart: append-only ingestion (component A)
|
|
81
|
+
```
|
|
82
|
+
fetch → LLM-extract → cross-verify → bilingual-normalize → append (+ Skill Score)
|
|
83
|
+
```
|
|
84
|
+
**Overwrite = wrapper. Append timestamped snapshots = an asset.**
|
|
85
|
+
|
|
86
|
+
## Bilingual by design
|
|
87
|
+
Korean = canonical (provenance anchor). English = distribution layer.
|
|
88
|
+
Names carry `ko` / `en_official` / `romanized`. See [`SCOPE.md`](./SCOPE.md) §5.
|
|
89
|
+
|
|
90
|
+
## Layout
|
|
91
|
+
```
|
|
92
|
+
koreaapi/
|
|
93
|
+
├── SCOPE.md # locked Phase 1 spec
|
|
94
|
+
├── llms.txt # agent-facing description
|
|
95
|
+
├── pyproject.toml
|
|
96
|
+
└── src/koreaapi/
|
|
97
|
+
├── models.py # bilingual records + Provenance (the data contract)
|
|
98
|
+
├── skill_score.py # transparent 0–1 quality score
|
|
99
|
+
├── pipeline/ # component A: append-only ingestion (the heart)
|
|
100
|
+
│ ├── ingest.py # fetch→extract→verify→translate→append
|
|
101
|
+
│ ├── store.py # APPEND-ONLY store (the moat)
|
|
102
|
+
│ └── scheduler.py # tiered collection cadence
|
|
103
|
+
└── sources/ # source adapters (official APIs first)
|
|
104
|
+
└── base.py
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Dev
|
|
108
|
+
```bash
|
|
109
|
+
cd koreaapi
|
|
110
|
+
uv sync --extra dev # installs pytest/ruff too; or: pip install pydantic pytest
|
|
111
|
+
|
|
112
|
+
# run the offline end-to-end pipeline test (no API keys / network needed)
|
|
113
|
+
PYTHONPATH=src python -m pytest tests -q
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
The append-only ingestion heart (store + ingest + Skill Score + bilingual normalization) is
|
|
117
|
+
implemented and **tested offline** via a `MockSource`. Real source adapters, all with pure
|
|
118
|
+
fixture-tested parse steps + best-effort live fetch (graceful when egress/keys are absent):
|
|
119
|
+
|
|
120
|
+
- **Wikidata** (#1) — bilingual labels via a curated entity→Q-id fast path (each anchor's
|
|
121
|
+
identity verified, so a contradictory label is **rejected, not ingested**) + live
|
|
122
|
+
`wbsearchentities`. Also pulls the **소속사/label** (P264) and discovers **labelmates** (SPARQL).
|
|
123
|
+
- **Wikipedia** (#2) — independent cross-check; when both agree on the bilingual name the Skill
|
|
124
|
+
Score clears the single-source cap (the verification moat).
|
|
125
|
+
- **YouTube Data API** (#3.5) — official-channel stats + latest release (live-state event data),
|
|
126
|
+
identity-guarded; deliberately *not* a name cross-verifier.
|
|
127
|
+
- **Circle Chart** (#3) — official chart, LLM-extracted **with an anti-hallucination grounding
|
|
128
|
+
guard** (entries must appear verbatim in the page HTML). The page is JS-rendered, so the raw
|
|
129
|
+
chart awaits a data endpoint; the guard ensures it ships *nothing* over anything false.
|
|
130
|
+
- **LLM romanization** (Haiku) fills `romanized` at ingest — "cheap AI as collection labor".
|
|
131
|
+
|
|
132
|
+
Spotify is **skipped** (its Web API now requires Premium, 2026); a keyless EN-mostly source
|
|
133
|
+
would only lower the cross-verified scores. See [`ROADMAP.md`](./ROADMAP.md) for the full log.
|
|
134
|
+
|
|
135
|
+
> **Egress note:** the live pull needs outbound access to `*.wikidata.org`. In the
|
|
136
|
+
> web/sandbox environment egress is allowlist-gated — if Wikidata isn't allowlisted the
|
|
137
|
+
> live test skips (HTTP 403 `host_not_allowed`) while the offline parser tests still
|
|
138
|
+
> cover correctness.
|
|
139
|
+
|
|
140
|
+
## Viewing & managing it (human console)
|
|
141
|
+
The product is agent-facing (MCP), but you (human) need a cockpit. There are
|
|
142
|
+
**two faces over one source of truth** (the append-only store): the MCP server for
|
|
143
|
+
agents, and a read-only console for you.
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
cd koreaapi
|
|
147
|
+
PYTHONPATH=src python -m koreaapi.admin seed # populate koreaapi.db (offline sample)
|
|
148
|
+
PYTHONPATH=src python -m koreaapi.admin pull # LIVE: Wikidata+Wikipedia cross-verified snapshots (+agency)
|
|
149
|
+
PYTHONPATH=src python -m koreaapi.admin sweep # LIVE: discover labelmates from each anchored agency (SPARQL)
|
|
150
|
+
PYTHONPATH=src python -m koreaapi.admin youtube # LIVE: official-channel release snapshots (needs YOUTUBE_API_KEY)
|
|
151
|
+
PYTHONPATH=src python -m koreaapi.admin chart # LIVE: Circle Chart (LLM-extract, grounding-guarded; needs key)
|
|
152
|
+
PYTHONPATH=src python -m koreaapi.admin export # write data/ asset (history + latest.json)
|
|
153
|
+
PYTHONPATH=src python -m koreaapi.admin signals # top behavioral signals (engine 2: what agents query)
|
|
154
|
+
PYTHONPATH=src python -m koreaapi.admin stats # data-quality summary
|
|
155
|
+
PYTHONPATH=src python -m koreaapi.admin dump # print recent snapshots
|
|
156
|
+
PYTHONPATH=src python -m koreaapi.admin report # -> report.html (open in a browser)
|
|
157
|
+
|
|
158
|
+
# zero-code interactive browse + query + JSON API over the same DB:
|
|
159
|
+
pip install datasette && datasette koreaapi.db
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
**Automated collection (cron).** `.github/workflows/collect.yml` runs `admin pull` +
|
|
163
|
+
`admin export` daily (and on manual dispatch) and commits the growing data asset back to
|
|
164
|
+
the repo: `data/snapshots.jsonl` (append-only history) + `latest.json` (current
|
|
165
|
+
state, crawlable for GEO). It runs on GitHub's runners — **open network, so the live pull
|
|
166
|
+
works there** even though the dev sandbox blocks Wikidata egress. Production scales this to
|
|
167
|
+
Postgres behind the same insert-only contract (see `pipeline/store.py`); the repo file set
|
|
168
|
+
is the zero-cost cold-start "database".
|
|
169
|
+
|
|
170
|
+
**Public GEO page.** `.github/workflows/pages.yml` builds `report.html` from live data and
|
|
171
|
+
deploys it to GitHub Pages (one-time enable: Settings → Pages → Source: GitHub Actions) — a
|
|
172
|
+
public, crawlable, JSON-LD-bearing URL so answer engines can surface and cite the verified data.
|
|
173
|
+
|
|
174
|
+
Watch the headline metrics of a verifiable-data business: **avg Skill Score,
|
|
175
|
+
freshness, and source agreement** — that is literally watching the moat.
|
|
176
|
+
|
|
177
|
+
## Agent face (MCP server)
|
|
178
|
+
The product itself: an MCP server exposing 5 tools, each returning verified, bilingual,
|
|
179
|
+
provenance-bearing data (with a ready-to-cite line) from the same store the console reads.
|
|
180
|
+
|
|
181
|
+
| Tool | Returns |
|
|
182
|
+
|---|---|
|
|
183
|
+
| `get_artist_status(artist_id)` | latest status across kinds + verified facts + agency |
|
|
184
|
+
| `get_kculture_calendar(window_days)` | upcoming comebacks / releases / concerts |
|
|
185
|
+
| `get_agency(name)` | artists verified under a 소속사/label (the agency hub) |
|
|
186
|
+
| `get_korea_rising(category, limit)` | what's rising now, ranked by observed demand + Skill Score |
|
|
187
|
+
| `get_buy_options(item)` | where to buy (Phase 1: rail pending; logs buy-intent) |
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
cd koreaapi
|
|
191
|
+
pip install fastmcp # use a venv if system deps clash
|
|
192
|
+
PYTHONPATH=src python -m koreaapi.server # serves over MCP (stdio)
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
Logic lives in `service.py` (pure, offline-tested); `server.py` is the thin MCP
|
|
196
|
+
binding. Tools register cleanly (verified in an isolated venv).
|
|
197
|
+
|
|
198
|
+
**Install / connect it in your agent:** see [`docs/MCP_INSTALL.md`](./docs/MCP_INSTALL.md)
|
|
199
|
+
(run command, Claude-Desktop config, and [`smithery.yaml`](./smithery.yaml) for the Smithery registry).
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# KoreaAPI — North Star & Durability Principles
|
|
2
|
+
|
|
3
|
+
> The load-bearing doctrine. Every design and build decision is checked against this.
|
|
4
|
+
|
|
5
|
+
## North star
|
|
6
|
+
Accumulate — faster and more verifiably than anyone in the world — the **live state of
|
|
7
|
+
Korean culture & commerce**, using cheap AI as the collection labor, and monetize the
|
|
8
|
+
**transactions and trust** that flow through it.
|
|
9
|
+
|
|
10
|
+
**Hard requirement: the model must keep making money as AI models get *better*, not worse.**
|
|
11
|
+
|
|
12
|
+
## Why this gets STRONGER as AI advances
|
|
13
|
+
The bottleneck of the AI era is shifting from *intelligence* (becoming abundant and
|
|
14
|
+
cheap) to *trustworthy real-world input + the right to act* (scarce). We own the scarce
|
|
15
|
+
complement for Korea. Better models feed us; they do not erode us.
|
|
16
|
+
|
|
17
|
+
| As AI models advance... | Effect on us |
|
|
18
|
+
|---|---|
|
|
19
|
+
| Training cutoff can't hold live state | Agents need fresh feeds → **demand ↑** |
|
|
20
|
+
| Smarter models hallucinate more *plausibly* | Verifiable ground truth + provenance worth **more**, not less |
|
|
21
|
+
| Our append-only history was never in any training set | **Un-reconstructable**; compounds daily |
|
|
22
|
+
| Behavioral signal is generated by usage | Exclusive, never trainable; grows with adoption |
|
|
23
|
+
| Agents recommend & transact more confidently | More volume through our **commission rail** |
|
|
24
|
+
| Extraction models get cheaper/better | Our **collection cost ↓** (the progress that helps rivals helps us first) |
|
|
25
|
+
|
|
26
|
+
Demand ↑ + our cost ↓ + history compounding = the flywheel **accelerates** with model progress.
|
|
27
|
+
|
|
28
|
+
## What we DO and DON'T build
|
|
29
|
+
**DO (model-advancement-proof):**
|
|
30
|
+
- Live / future state (calendars, charts, prices) — structurally outside any model
|
|
31
|
+
- Verification: Skill Score + provenance on every record
|
|
32
|
+
- Append-only history (never overwrite) — the un-reconstructable asset
|
|
33
|
+
- Proprietary behavioral signal (what agents query / buy)
|
|
34
|
+
- Transaction-attached monetization (commission) + data subscription
|
|
35
|
+
|
|
36
|
+
**DON'T (eroded by better models):**
|
|
37
|
+
- Static evergreen knowledge the model already has (generic encyclopedia)
|
|
38
|
+
- Commodity raw data a browsing agent fetches itself (plain public-API wrapping)
|
|
39
|
+
- Pure reformatting / translation as the *only* value (verification, not translation, is the value)
|
|
40
|
+
|
|
41
|
+
## Design invariants (check every change against these)
|
|
42
|
+
1. **Append, never overwrite.** History is the moat.
|
|
43
|
+
2. **Every record carries provenance + Skill Score.** No unverifiable data ships.
|
|
44
|
+
3. **Korean canonical, English distribution; official names over translation.**
|
|
45
|
+
4. **Every new tool must add live-state, verification, or history value** — never just restate model-known facts.
|
|
46
|
+
5. **Prefer transaction-attached revenue** (commission) over pure read fees.
|
|
47
|
+
6. **Use cheap AI as collection labor;** collection cost should fall as models improve.
|
|
48
|
+
7. **One source of truth, two faces** (agent MCP + human console) — never a second data path.
|
|
49
|
+
|
|
50
|
+
## The main long-term risk (and the answer)
|
|
51
|
+
Disintermediation by a model/platform building the Korean layer themselves. Answer:
|
|
52
|
+
they build *horizontal intelligence*, not *vertical Korean ground truth*. We win by
|
|
53
|
+
owning the **cited default + the transaction rail + the behavioral flywheel** inside the
|
|
54
|
+
12–18 month window — before anyone bothers.
|