koreaapi 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. koreaapi-0.1.0/.env.example +32 -0
  2. koreaapi-0.1.0/.github/workflows/collect.yml +57 -0
  3. koreaapi-0.1.0/.github/workflows/pages.yml +68 -0
  4. koreaapi-0.1.0/.github/workflows/publish.yml +24 -0
  5. koreaapi-0.1.0/.github/workflows/test.yml +26 -0
  6. koreaapi-0.1.0/.gitignore +15 -0
  7. koreaapi-0.1.0/LICENSE +21 -0
  8. koreaapi-0.1.0/PKG-INFO +199 -0
  9. koreaapi-0.1.0/PRINCIPLES.md +54 -0
  10. koreaapi-0.1.0/README.md +170 -0
  11. koreaapi-0.1.0/ROADMAP.md +278 -0
  12. koreaapi-0.1.0/SCOPE.md +144 -0
  13. koreaapi-0.1.0/docs/API_KEYS.md +58 -0
  14. koreaapi-0.1.0/docs/LAUNCH.md +56 -0
  15. koreaapi-0.1.0/docs/MCP_INSTALL.md +74 -0
  16. koreaapi-0.1.0/docs/MIGRATION.md +64 -0
  17. koreaapi-0.1.0/docs/research/agent-blockchain-conference-2026-05-27.html +1652 -0
  18. koreaapi-0.1.0/llms.txt +36 -0
  19. koreaapi-0.1.0/pyproject.toml +46 -0
  20. koreaapi-0.1.0/robots.txt +10 -0
  21. koreaapi-0.1.0/scripts/split-koreaapi.sh +35 -0
  22. koreaapi-0.1.0/smithery.yaml +37 -0
  23. koreaapi-0.1.0/src/koreaapi/__init__.py +3 -0
  24. koreaapi-0.1.0/src/koreaapi/admin.py +902 -0
  25. koreaapi-0.1.0/src/koreaapi/models.py +76 -0
  26. koreaapi-0.1.0/src/koreaapi/pipeline/__init__.py +5 -0
  27. koreaapi-0.1.0/src/koreaapi/pipeline/ingest.py +252 -0
  28. koreaapi-0.1.0/src/koreaapi/pipeline/scheduler.py +15 -0
  29. koreaapi-0.1.0/src/koreaapi/pipeline/store.py +206 -0
  30. koreaapi-0.1.0/src/koreaapi/romanize.py +39 -0
  31. koreaapi-0.1.0/src/koreaapi/roster.py +21 -0
  32. koreaapi-0.1.0/src/koreaapi/server.py +57 -0
  33. koreaapi-0.1.0/src/koreaapi/service.py +176 -0
  34. koreaapi-0.1.0/src/koreaapi/skill_score.py +62 -0
  35. koreaapi-0.1.0/src/koreaapi/sources/__init__.py +5 -0
  36. koreaapi-0.1.0/src/koreaapi/sources/base.py +18 -0
  37. koreaapi-0.1.0/src/koreaapi/sources/circlechart.py +146 -0
  38. koreaapi-0.1.0/src/koreaapi/sources/mock.py +23 -0
  39. koreaapi-0.1.0/src/koreaapi/sources/wikidata.py +341 -0
  40. koreaapi-0.1.0/src/koreaapi/sources/wikipedia.py +97 -0
  41. koreaapi-0.1.0/src/koreaapi/sources/youtube.py +266 -0
  42. koreaapi-0.1.0/tests/fixtures/wikidata_bts.json +18 -0
  43. koreaapi-0.1.0/tests/fixtures/wikidata_bts_agency.json +26 -0
  44. koreaapi-0.1.0/tests/fixtures/wikidata_bts_full.json +31 -0
  45. koreaapi-0.1.0/tests/fixtures/wikidata_label_bighit.json +11 -0
  46. koreaapi-0.1.0/tests/fixtures/wikidata_labelmates.json +25 -0
  47. koreaapi-0.1.0/tests/fixtures/wikidata_members.json +6 -0
  48. koreaapi-0.1.0/tests/fixtures/wikidata_poisoned_bts.json +1 -0
  49. koreaapi-0.1.0/tests/fixtures/wikidata_search_bts.json +15 -0
  50. koreaapi-0.1.0/tests/fixtures/wikipedia_bts.json +13 -0
  51. koreaapi-0.1.0/tests/fixtures/youtube_channel_bts.json +24 -0
  52. koreaapi-0.1.0/tests/fixtures/youtube_latest_bts.json +17 -0
  53. koreaapi-0.1.0/tests/fixtures/youtube_search_bts.json +23 -0
  54. koreaapi-0.1.0/tests/test_admin_pull.py +47 -0
  55. koreaapi-0.1.0/tests/test_circlechart.py +111 -0
  56. koreaapi-0.1.0/tests/test_export.py +57 -0
  57. koreaapi-0.1.0/tests/test_geo.py +122 -0
  58. koreaapi-0.1.0/tests/test_pipeline.py +77 -0
  59. koreaapi-0.1.0/tests/test_romanize.py +65 -0
  60. koreaapi-0.1.0/tests/test_roster.py +77 -0
  61. koreaapi-0.1.0/tests/test_server.py +57 -0
  62. koreaapi-0.1.0/tests/test_service.py +126 -0
  63. koreaapi-0.1.0/tests/test_signal.py +73 -0
  64. koreaapi-0.1.0/tests/test_wikidata.py +120 -0
  65. koreaapi-0.1.0/tests/test_wikidata_live.py +51 -0
  66. koreaapi-0.1.0/tests/test_wikidata_verify.py +66 -0
  67. koreaapi-0.1.0/tests/test_wikipedia.py +70 -0
  68. koreaapi-0.1.0/tests/test_wikipedia_live.py +36 -0
  69. koreaapi-0.1.0/tests/test_youtube.py +136 -0
  70. koreaapi-0.1.0/uv.lock +1661 -0
@@ -0,0 +1,32 @@
1
+ # --- LLM (extraction / translation / verification) ---
2
+ ANTHROPIC_API_KEY=
3
+
4
+ # --- Sources (official APIs first; scraping is last-resort) ---
5
+ SPOTIFY_CLIENT_ID=
6
+ SPOTIFY_CLIENT_SECRET=
7
+ YOUTUBE_API_KEY=
8
+
9
+ # --- Append-only store ---
10
+ KOREAAPI_DB=koreaapi.db # dev SQLite path (default); used by admin / pull / export
11
+ DATABASE_URL= # Postgres (production scale); same insert-only contract
12
+
13
+ # --- Commerce rails (engine 1; cold-start optional; commission is a later switch) ---
14
+ SKIMLINKS_PUBLISHER_ID=
15
+ AMAZON_ASSOCIATES_TAG=
16
+
17
+ # --- Trend digest (engine 2; "Korea Rising" newsletter) ---
18
+ BEEHIIV_API_KEY=
19
+ BEEHIIV_PUBLICATION_ID=
20
+
21
+ # --- Phase 2: agent payments (x402 / USDC) — wire only when traffic qualifies ---
22
+ CDP_API_KEY_ID= # Coinbase Developer Platform (x402 facilitator)
23
+ CDP_API_KEY_SECRET=
24
+ KOREAAPI_PAYOUT_ADDRESS= # USDC receiving address (Base). Not "receive BNB"; no token.
25
+
26
+ # --- Accounts & keys shopping list: docs/API_KEYS.md ---
27
+
28
+ # --- Collection cadence (seconds; see src/koreaapi/pipeline/scheduler.py) ---
29
+ CADENCE_CHARTS_SEC=43200
30
+ CADENCE_EVENTS_SEC=86400
31
+
32
+ LOG_LEVEL=INFO
@@ -0,0 +1,57 @@
1
+ # KoreaAPI daily collector — the cold-start data engine.
2
+ # Runs the live Wikidata pull + cross-verify + exports the growing data asset, then commits
3
+ # data/ back to the repo. GitHub's runners have open network access, so the live pull works
4
+ # here. Manual run: Actions tab -> "collect" -> Run workflow. The daily schedule is active
5
+ # once this file is on the default branch (main).
6
+ name: collect
7
+
8
+ on:
9
+ workflow_dispatch: {}
10
+ schedule:
11
+ - cron: "17 0 * * *" # daily ~00:17 UTC
12
+
13
+ permissions:
14
+ contents: write
15
+
16
+ concurrency:
17
+ group: koreaapi-collect
18
+ cancel-in-progress: false
19
+
20
+ jobs:
21
+ collect:
22
+ runs-on: ubuntu-latest
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+
26
+ - name: Set up uv (Python env)
27
+ uses: astral-sh/setup-uv@v5
28
+
29
+ - name: Install deps
30
+ run: uv sync
31
+
32
+ - name: Pull live Wikidata + export the data asset
33
+ env:
34
+ PYTHONPATH: src
35
+ KOREAAPI_DB: ${{ runner.temp }}/koreaapi.db # fresh per run; history lives in data/
36
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} # romanization + Circle Chart extract (best-effort; skipped if unset)
37
+ YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }} # official-channel release snapshots (best-effort; skipped if unset)
38
+ run: |
39
+ uv run python -m koreaapi.admin pull
40
+ uv run python -m koreaapi.admin sweep
41
+ uv run python -m koreaapi.admin chart
42
+ uv run python -m koreaapi.admin youtube
43
+ uv run python -m koreaapi.admin export
44
+ uv run python -m koreaapi.admin digest
45
+ uv run python -m koreaapi.admin stats
46
+
47
+ - name: Commit accumulated data
48
+ run: |
49
+ git config user.name "github-actions[bot]"
50
+ git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
51
+ git add data
52
+ if git diff --cached --quiet; then
53
+ echo "no data change"
54
+ else
55
+ git commit -m "data: daily Wikidata snapshot [skip ci]"
56
+ git push
57
+ fi
@@ -0,0 +1,68 @@
1
+ # Public GEO page (answer-engine-crawlable) on GitHub Pages.
2
+ # Builds report.html + monitor.html from LIVE data (the pull runs on GitHub's open-network
3
+ # runner) and deploys to GitHub Pages — a free, public, JSON-LD-bearing URL that
4
+ # Perplexity / ChatGPT / Google AI Overviews can crawl and cite.
5
+ #
6
+ # ONE-TIME ENABLE: repo Settings -> Pages -> Build and deployment -> Source: "GitHub Actions".
7
+ name: pages
8
+
9
+ on:
10
+ workflow_dispatch: {}
11
+ push:
12
+ branches: [main]
13
+ schedule:
14
+ - cron: "37 1 * * *" # daily, after the collect job
15
+
16
+ permissions:
17
+ contents: read
18
+ pages: write
19
+ id-token: write
20
+
21
+ concurrency:
22
+ group: pages
23
+ cancel-in-progress: false
24
+
25
+ jobs:
26
+ build-deploy:
27
+ runs-on: ubuntu-latest
28
+ environment:
29
+ name: github-pages
30
+ url: ${{ steps.deploy.outputs.page_url }}
31
+ steps:
32
+ - uses: actions/checkout@v4
33
+
34
+ - name: Set up uv (Python env)
35
+ uses: astral-sh/setup-uv@v5
36
+
37
+ - name: Build the public GEO page from live data
38
+ env:
39
+ PYTHONPATH: src
40
+ KOREAAPI_DB: ${{ runner.temp }}/koreaapi.db
41
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} # romanization + Circle Chart extract (best-effort; skipped if unset)
42
+ YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }} # official-channel release snapshots (best-effort; skipped if unset)
43
+ run: |
44
+ uv sync
45
+ uv run python -m koreaapi.admin pull
46
+ uv run python -m koreaapi.admin sweep
47
+ uv run python -m koreaapi.admin chart
48
+ uv run python -m koreaapi.admin youtube
49
+ uv run python -m koreaapi.admin report
50
+ uv run python -m koreaapi.admin entitypages
51
+ uv run python -m koreaapi.admin digest
52
+ uv run python -m koreaapi.admin monitor
53
+ uv run python -m koreaapi.admin export
54
+ mkdir -p _site
55
+ cp report.html _site/index.html
56
+ cp monitor.html _site/monitor.html # human data-quality cockpit at /monitor.html
57
+ cp llms.txt _site/llms.txt # agent-discoverable at /llms.txt (AEO/GEO)
58
+ cp data/korea-rising.md _site/korea-rising.md # shareable verified digest
59
+ cp data/latest.json _site/latest.json # open machine-readable verified data
60
+ cp robots.txt _site/robots.txt # explicitly welcome answer-engine crawlers (AEO)
61
+ cp -r site/artist _site/artist # per-entity citable answer pages (/artist/<slug>.html)
62
+ uv run python -m koreaapi.admin sitemap # sitemap.xml incl. every entity page (daily lastmod)
63
+ cp sitemap.xml _site/sitemap.xml
64
+
65
+ - uses: actions/configure-pages@v5
66
+ - uses: actions/upload-pages-artifact@v3 # publishes ./_site by default
67
+ - id: deploy
68
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,24 @@
1
+ # Publish koreaapi to PyPI — no local tools needed.
2
+ # Trigger: Actions -> "publish" -> Run workflow (or publish a GitHub Release).
3
+ # Requires a repo secret PYPI_API_TOKEN (pypi.org -> Account settings -> API tokens).
4
+ name: publish
5
+
6
+ on:
7
+ workflow_dispatch: {}
8
+ release:
9
+ types: [published]
10
+
11
+ permissions:
12
+ contents: read
13
+
14
+ jobs:
15
+ pypi:
16
+ runs-on: ubuntu-latest
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+ - name: Set up uv
20
+ uses: astral-sh/setup-uv@v5
21
+ - name: Build sdist + wheel
22
+ run: uv build
23
+ - name: Publish to PyPI
24
+ run: uv publish --token ${{ secrets.PYPI_API_TOKEN }}
@@ -0,0 +1,26 @@
1
+ # koreaapi CI — run the test suite + linter on every push and PR.
2
+ # On GitHub's open-network runners the live smoke tests actually exercise Wikidata/Wikipedia
3
+ # (they auto-skip only on transient network failure), so this gates both offline correctness
4
+ # and the live adapters.
5
+ name: test
6
+
7
+ on:
8
+ workflow_dispatch: {}
9
+ push:
10
+ branches: [main]
11
+ pull_request: {}
12
+
13
+ permissions:
14
+ contents: read
15
+
16
+ jobs:
17
+ pytest:
18
+ runs-on: ubuntu-latest
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+ - uses: astral-sh/setup-uv@v5
22
+ - run: uv sync --extra dev
23
+ - name: Lint
24
+ run: uv run ruff check src tests
25
+ - name: Test
26
+ run: uv run pytest -q
@@ -0,0 +1,15 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .venv/
4
+ .env
5
+ .env.local
6
+ dist/
7
+ build/
8
+ *.egg-info/
9
+ .pytest_cache/
10
+ .ruff_cache/
11
+ *.db
12
+ report.html
13
+ monitor.html
14
+ sitemap.xml
15
+ site/
koreaapi-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 kwangdol-star
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,199 @@
1
+ Metadata-Version: 2.4
2
+ Name: koreaapi
3
+ Version: 0.1.0
4
+ Summary: The verifiable data layer for Korean culture & commerce, callable by any AI agent (MCP).
5
+ Project-URL: Homepage, https://kwangdol-star.github.io/koreaapi/
6
+ Project-URL: Repository, https://github.com/kwangdol-star/koreaapi
7
+ Project-URL: Open data (JSON), https://kwangdol-star.github.io/koreaapi/latest.json
8
+ Author: kwangdol-star
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: aeo,ai-agents,geo,kculture,korea,kpop,mcp,model-context-protocol,verifiable-data,wikidata
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
+ Requires-Python: >=3.11
19
+ Requires-Dist: anthropic
20
+ Requires-Dist: fastmcp
21
+ Requires-Dist: httpx
22
+ Requires-Dist: pydantic>=2
23
+ Requires-Dist: python-dotenv
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest; extra == 'dev'
26
+ Requires-Dist: pytest-asyncio; extra == 'dev'
27
+ Requires-Dist: ruff; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # KoreaAPI
31
+
32
+ **The verifiable data layer for Korean culture & commerce, callable by any AI agent.**
33
+ *The MCP gateway to Korea — verifiable.*
34
+
35
+ KoreaAPI exposes Korean culture, entertainment, and commerce data to AI agents via
36
+ Anthropic's Model Context Protocol (MCP). Every response carries machine-readable
37
+ **provenance** and a **Skill Score** so an agent can decide whether to trust and cite it.
38
+
39
+ > **Status:** Phase 1 (cold-start). The locked spec is in [`SCOPE.md`](./SCOPE.md); what is
40
+ > built / decided and why is in [`ROADMAP.md`](./ROADMAP.md).
41
+ > **Live, verified, public data (Schema.org JSON-LD + `/llms.txt`):**
42
+ > **https://kwangdol-star.github.io/koreaapi/**
43
+ > **Repository:** [`kwangdol-star/koreaapi`](https://github.com/kwangdol-star/koreaapi) — a
44
+ > standalone repo, split out from its incubation home with full git history preserved.
45
+
46
+ ## What's live now (verified, on the public page + via MCP)
47
+ - **Cross-verification** — Wikidata + Wikipedia must agree on the canonical bilingual name
48
+ before a fact clears the single-source cap (high Skill Score = independent concurrence).
49
+ - **Identity guard** (rejects a contradictory label) + **hallucination guard** (LLM-extracted
50
+ data must appear verbatim in its source, else dropped — caught a fabricated chart entry live).
51
+ - **소속사/Agency hub** — each artist anchored to its label (Wikidata P264); the roster grows by
52
+ discovering cross-verified **labelmates** (SPARQL) and is queryable via `get_agency`.
53
+ - **YouTube** official-channel release/stats (live-state) · **LLM romanization** at ingest.
54
+ - **GEO/AEO** — JSON-LD (incl. `recordLabel`) + a ready-to-cite line on every record + `/llms.txt`.
55
+
56
+ ## Why this exists
57
+ Raw Korean API wrappers are a commodity (20+ already exist on GitHub). Our moat is the
58
+ combination nobody else ships:
59
+
60
+ - **Aggregation** of fragmented K-culture / commerce sources
61
+ - **Verification** — Skill Score + provenance, exactly where LLMs confidently hallucinate
62
+ - **Append-only time-series** — a latecomer cannot reconstruct our history
63
+ - **Behavioral signal** — what agents query / buy through us becomes trend data
64
+
65
+ The customer is the **AI agent** (consumer); humans / brands / enterprises pay.
66
+
67
+ ## Why now — the land-grab window
68
+ The compounding assets accrue to **early, high-quality** entrants: only ~13% of public MCP
69
+ servers are high-trust, and AI answer engines concentrate citations on content **refreshed in the
70
+ last 1–3 years** (Seer Interactive). A verified hub that re-verifies **daily** compounds a citation
71
+ lead latecomers can't backfill. We are **"picks-and-shovels"** — the data agents consume, not a
72
+ chat wrapper (a category the same market analyses find largely fails to monetize). *(An independent 2026 AI-agent
73
+ opportunity ranking places this exact model at its top — see [`ROADMAP.md`](./ROADMAP.md).)*
74
+
75
+ ## Revenue flywheel (engines ① + ②)
76
+ K-culture current-state is the magnet. ① commerce commission + ② trend-intelligence
77
+ subscription reinforce each other: transactions generate the behavioral signal that
78
+ becomes the trend product, which improves commerce conversion. See [`SCOPE.md`](./SCOPE.md) §3.
79
+
80
+ ## The heart: append-only ingestion (component A)
81
+ ```
82
+ fetch → LLM-extract → cross-verify → bilingual-normalize → append (+ Skill Score)
83
+ ```
84
+ **Overwrite = wrapper. Append timestamped snapshots = an asset.**
85
+
86
+ ## Bilingual by design
87
+ Korean = canonical (provenance anchor). English = distribution layer.
88
+ Names carry `ko` / `en_official` / `romanized`. See [`SCOPE.md`](./SCOPE.md) §5.
89
+
90
+ ## Layout
91
+ ```
92
+ koreaapi/
93
+ ├── SCOPE.md # locked Phase 1 spec
94
+ ├── llms.txt # agent-facing description
95
+ ├── pyproject.toml
96
+ └── src/koreaapi/
97
+ ├── models.py # bilingual records + Provenance (the data contract)
98
+ ├── skill_score.py # transparent 0–1 quality score
99
+ ├── pipeline/ # component A: append-only ingestion (the heart)
100
+ │ ├── ingest.py # fetch→extract→verify→translate→append
101
+ │ ├── store.py # APPEND-ONLY store (the moat)
102
+ │ └── scheduler.py # tiered collection cadence
103
+ └── sources/ # source adapters (official APIs first)
104
+ └── base.py
105
+ ```
106
+
107
+ ## Dev
108
+ ```bash
109
+ cd koreaapi
110
+ uv sync # or: pip install pydantic pytest
111
+
112
+ # run the offline end-to-end pipeline test (no API keys / network needed)
113
+ PYTHONPATH=src python -m pytest tests -q
114
+ ```
115
+
116
+ The append-only ingestion heart (store + ingest + Skill Score + bilingual normalization) is
117
+ implemented and **tested offline** via a `MockSource`. Real source adapters, all with pure
118
+ fixture-tested parse steps + best-effort live fetch (graceful when egress/keys are absent):
119
+
120
+ - **Wikidata** (#1) — bilingual labels via a curated entity→Q-id fast path (each anchor's
121
+ identity verified, so a contradictory label is **rejected, not ingested**) + live
122
+ `wbsearchentities`. Also pulls the **소속사/label** (P264) and discovers **labelmates** (SPARQL).
123
+ - **Wikipedia** (#2) — independent cross-check; when both agree on the bilingual name the Skill
124
+ Score clears the single-source cap (the verification moat).
125
+ - **YouTube Data API** (#3.5) — official-channel stats + latest release (live-state event data),
126
+ identity-guarded; deliberately *not* a name cross-verifier.
127
+ - **Circle Chart** (#3) — official chart, LLM-extracted **with an anti-hallucination grounding
128
+ guard** (entries must appear verbatim in the page HTML). The page is JS-rendered, so the raw
129
+ chart awaits a data endpoint; the guard ensures it ships *nothing* rather than anything false.
130
+ - **LLM romanization** (Haiku) fills `romanized` at ingest — "cheap AI as collection labor".
131
+
132
+ Spotify is **skipped** (its Web API now requires Premium, 2026); a keyless EN-mostly source
133
+ would only lower the cross-verified scores. See [`ROADMAP.md`](./ROADMAP.md) for the full log.
134
+
135
+ > **Egress note:** the live pull needs outbound access to `*.wikidata.org`. In the
136
+ > web/sandbox environment egress is allowlist-gated — if Wikidata isn't allowlisted the
137
+ > live test skips (HTTP 403 `host_not_allowed`) while the offline parser tests still
138
+ > cover correctness.
139
+
140
+ ## Viewing & managing it (human console)
141
+ The product is agent-facing (MCP), but you, the human operator, need a cockpit. There are
142
+ **two faces over one source of truth** (the append-only store): the MCP server for
143
+ agents, and a read-only console for you.
144
+
145
+ ```bash
146
+ cd koreaapi
147
+ PYTHONPATH=src python -m koreaapi.admin seed # populate koreaapi.db (offline sample)
148
+ PYTHONPATH=src python -m koreaapi.admin pull # LIVE: Wikidata+Wikipedia cross-verified snapshots (+agency)
149
+ PYTHONPATH=src python -m koreaapi.admin sweep # LIVE: discover labelmates from each anchored agency (SPARQL)
150
+ PYTHONPATH=src python -m koreaapi.admin youtube # LIVE: official-channel release snapshots (needs YOUTUBE_API_KEY)
151
+ PYTHONPATH=src python -m koreaapi.admin chart # LIVE: Circle Chart (LLM-extract, grounding-guarded; needs key)
152
+ PYTHONPATH=src python -m koreaapi.admin export # write data/ asset (history + latest.json)
153
+ PYTHONPATH=src python -m koreaapi.admin signals # top behavioral signals (engine 2: what agents query)
154
+ PYTHONPATH=src python -m koreaapi.admin stats # data-quality summary
155
+ PYTHONPATH=src python -m koreaapi.admin dump # print recent snapshots
156
+ PYTHONPATH=src python -m koreaapi.admin report # -> report.html (open in a browser)
157
+
158
+ # zero-code interactive browse + query + JSON API over the same DB:
159
+ pip install datasette && datasette koreaapi.db
160
+ ```
161
+
162
+ **Automated collection (cron).** `.github/workflows/collect.yml` runs the collection steps
163
+ (`admin pull`, `sweep`, `chart`, `youtube`, `export`, `digest`, `stats`) daily (and on manual dispatch) and commits the growing data asset back to
164
+ the repo: `koreaapi/data/snapshots.jsonl` (append-only history) + `latest.json` (current
165
+ state, crawlable for GEO). It runs on GitHub's runners — **open network, so the live pull
166
+ works there** even though the dev sandbox blocks Wikidata egress. Production scales this to
167
+ Postgres behind the same insert-only contract (see `pipeline/store.py`); the repo file set
168
+ is the zero-cost cold-start "database".
169
+
170
+ **Public GEO page.** `.github/workflows/pages.yml` builds `report.html` from live data and
171
+ deploys it to GitHub Pages (one-time enable: Settings → Pages → Source: GitHub Actions) — a
172
+ public, crawlable, JSON-LD-bearing URL so answer engines can surface and cite the verified data.
173
+
174
+ Watch the headline metrics of a verifiable-data business: **avg Skill Score,
175
+ freshness, and source agreement** — that is literally watching the moat.
176
+
177
+ ## Agent face (MCP server)
178
+ The product itself: an MCP server exposing 5 tools, each returning verified, bilingual,
179
+ provenance-bearing data (with a ready-to-cite line) from the same store the console reads.
180
+
181
+ | Tool | Returns |
182
+ |---|---|
183
+ | `get_artist_status(artist_id)` | latest status across kinds + verified facts + agency |
184
+ | `get_kculture_calendar(window_days)` | upcoming comebacks / releases / concerts |
185
+ | `get_agency(name)` | artists verified under a 소속사/label (the agency hub) |
186
+ | `get_korea_rising(category, limit)` | what's rising now, ranked by observed demand + Skill Score |
187
+ | `get_buy_options(item)` | where to buy (Phase 1: rail pending; logs buy-intent) |
188
+
189
+ ```bash
190
+ cd koreaapi
191
+ pip install fastmcp # use a venv if system deps clash
192
+ PYTHONPATH=src python -m koreaapi.server # serves over MCP (stdio)
193
+ ```
194
+
195
+ Logic lives in `service.py` (pure, offline-tested); `server.py` is the thin MCP
196
+ binding. Tools register cleanly (verified in an isolated venv).
197
+
198
+ **Install / connect it in your agent:** see [`docs/MCP_INSTALL.md`](./docs/MCP_INSTALL.md)
199
+ (run command, Claude-Desktop config, and [`smithery.yaml`](./smithery.yaml) for the Smithery registry).
@@ -0,0 +1,54 @@
1
+ # KoreaAPI — North Star & Durability Principles
2
+
3
+ > The load-bearing doctrine. Every design and build decision is checked against this.
4
+
5
+ ## North star
6
+ Accumulate — faster and more verifiably than anyone in the world — the **live state of
7
+ Korean culture & commerce**, using cheap AI as the collection labor, and monetize the
8
+ **transactions and trust** that flow through it.
9
+
10
+ **Hard requirement: the model must keep making money as AI models get *better*, not worse.**
11
+
12
+ ## Why this gets STRONGER as AI advances
13
+ The bottleneck of the AI era is shifting from *intelligence* (becoming abundant and
14
+ cheap) to *trustworthy real-world input + the right to act* (scarce). We own the scarce
15
+ complement for Korea. Better models feed us; they do not erode us.
16
+
17
+ | As AI models advance... | Effect on us |
18
+ |---|---|
19
+ | Training cutoff can't hold live state | Agents need fresh feeds → **demand ↑** |
20
+ | Smarter models hallucinate more *plausibly* | Verifiable ground truth + provenance worth **more**, not less |
21
+ | Our append-only history was never in any training set | **Un-reconstructable**; compounds daily |
22
+ | Behavioral signal is generated by usage | Exclusive, never trainable; grows with adoption |
23
+ | Agents recommend & transact more confidently | More volume through our **commission rail** |
24
+ | Extraction models get cheaper/better | Our **collection cost ↓** (the progress that helps rivals helps us first) |
25
+
26
+ Demand ↑ + our cost ↓ + history compounding = the flywheel **accelerates** with model progress.
27
+
28
+ ## What we DO and DON'T build
29
+ **DO (model-advancement-proof):**
30
+ - Live / future state (calendars, charts, prices) — structurally outside any model
31
+ - Verification: Skill Score + provenance on every record
32
+ - Append-only history (never overwrite) — the un-reconstructable asset
33
+ - Proprietary behavioral signal (what agents query / buy)
34
+ - Transaction-attached monetization (commission) + data subscription
35
+
36
+ **DON'T (eroded by better models):**
37
+ - Static evergreen knowledge the model already has (generic encyclopedia)
38
+ - Commodity raw data a browsing agent fetches itself (plain public-API wrapping)
39
+ - Pure reformatting / translation as the *only* value (verification, not translation, is the value)
40
+
41
+ ## Design invariants (check every change against these)
42
+ 1. **Append, never overwrite.** History is the moat.
43
+ 2. **Every record carries provenance + Skill Score.** No unverifiable data ships.
44
+ 3. **Korean canonical, English distribution; official names over translation.**
45
+ 4. **Every new tool must add live-state, verification, or history value** — never just restate model-known facts.
46
+ 5. **Prefer transaction-attached revenue** (commission) over pure read fees.
47
+ 6. **Use cheap AI as collection labor;** collection cost should fall as models improve.
48
+ 7. **One source of truth, two faces** (agent MCP + human console) — never a second data path.
49
+
50
+ ## The main long-term risk (and the answer)
51
+ Disintermediation by a model/platform building the Korean layer themselves. Answer:
52
+ they build *horizontal intelligence*, not *vertical Korean ground truth*. We win by
53
+ owning the **cited default + the transaction rail + the behavioral flywheel** inside the
54
+ 12–18 month window — before anyone bothers.