@pencil-agent/nano-pencil 2.0.0-beta.8 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241) hide show
  1. package/README.md +267 -267
  2. package/dist/build-meta.json +3 -3
  3. package/dist/core/export-html/AGENT.md +11 -11
  4. package/dist/core/export-html/template.css +971 -971
  5. package/dist/core/export-html/template.html +54 -54
  6. package/dist/core/extensions-host/index.d.ts +1 -1
  7. package/dist/core/extensions-host/loader.js +1 -1
  8. package/dist/core/extensions-host/runner.d.ts +1 -0
  9. package/dist/core/extensions-host/runner.js +2 -2
  10. package/dist/core/extensions-host/types.d.ts +17 -22
  11. package/dist/core/lib/ai/src/types.d.ts +12 -2
  12. package/dist/core/persona/persona-manager.js +5 -2
  13. package/dist/core/runtime/agent-session.js +3 -3
  14. package/dist/core/runtime/extension-core-bindings.d.ts +1 -0
  15. package/dist/core/runtime/extension-core-bindings.js +2 -2
  16. package/dist/extensions/builtin/AGENT.md +115 -115
  17. package/dist/extensions/builtin/browser/AGENT.md +17 -17
  18. package/dist/extensions/builtin/browser/agent-workspace/agent_helpers.py +12 -12
  19. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/amazon/product-search.md +198 -198
  20. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/archive-org/scraping.md +341 -341
  21. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/arxiv/scraping.md +311 -311
  22. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/arxiv-bulk/scraping.md +333 -333
  23. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/atlas/overview.md +70 -70
  24. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/booking-com/scraping.md +578 -578
  25. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/capterra/scraping.md +440 -440
  26. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/centilebrain/generate-estimates.md +110 -110
  27. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coingecko/scraping.md +325 -325
  28. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coinmarketcap/scraping.md +463 -463
  29. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coursera/scraping.md +360 -360
  30. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/craigslist/scraping.md +390 -390
  31. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/crossref/scraping.md +568 -568
  32. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/dev-to/scraping.md +323 -323
  33. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/duckduckgo/scraping.md +349 -349
  34. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/ebay/scraping.md +435 -435
  35. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/etsy/scraping.md +506 -506
  36. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/eventbrite/scraping.md +363 -363
  37. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/expedia/automation.md +168 -168
  38. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/facebook/groups.md +236 -236
  39. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/facebook/pages.md +295 -295
  40. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/framer/editor.md +108 -108
  41. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/fred/scraping.md +493 -493
  42. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/g2/scraping.md +580 -580
  43. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/genius/scraping.md +511 -511
  44. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/github/repo-actions.md +65 -65
  45. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/github/scraping.md +184 -184
  46. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/glassdoor/scraping.md +543 -543
  47. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/gmail/compose.md +122 -122
  48. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/goodreads/scraping.md +461 -461
  49. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/gutenberg/scraping.md +383 -383
  50. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/hackernews/scraping.md +243 -243
  51. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/howlongtobeat/scraping.md +473 -473
  52. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/imdb/scraping.md +271 -271
  53. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/itch-io/scraping.md +436 -436
  54. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/job-boards/indeed-glassdoor.md +1021 -1021
  55. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/letterboxd/scraping.md +349 -349
  56. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/linkedin/invitation-manager.md +109 -109
  57. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/loom/folder-enumeration.md +170 -170
  58. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/macrotrends/scraping.md +537 -537
  59. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/medium/article-hydration.md +120 -120
  60. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/medium/scraping.md +414 -414
  61. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/metacritic/scraping.md +477 -477
  62. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/musicbrainz/scraping.md +478 -478
  63. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/nasa/scraping.md +339 -339
  64. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/news-aggregation/multi-source.md +205 -205
  65. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/open-library/scraping.md +472 -472
  66. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/openalex/scraping.md +470 -470
  67. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/openstreetmap/scraping.md +490 -490
  68. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/package-registries/npm-pypi.md +478 -478
  69. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/polymarket/scraping.md +234 -234
  70. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/producthunt/scraping.md +307 -307
  71. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/pubmed/scraping.md +421 -421
  72. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/quora/scraping.md +364 -364
  73. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/rawg/scraping.md +352 -352
  74. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/reddit/scraping.md +124 -124
  75. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/rest-countries/scraping.md +233 -233
  76. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/sec-edgar/scraping.md +361 -361
  77. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/README.md +36 -36
  78. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/embedded-apps.md +72 -72
  79. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/knowledge-base.md +109 -109
  80. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/polaris-inputs.md +137 -137
  81. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/soundcloud/scraping.md +362 -362
  82. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/spotify/scraping.md +339 -339
  83. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/stackoverflow/scraping.md +435 -435
  84. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/steam/scraping.md +575 -575
  85. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/substack/scraping.md +338 -338
  86. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/thetechgeeks/pricing.md +52 -52
  87. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/tiktok/upload.md +107 -107
  88. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/tradingview/scraping.md +309 -309
  89. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/trello/boards-and-lists.md +88 -88
  90. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/trustpilot/scraping.md +375 -375
  91. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/walmart/scraping.md +444 -444
  92. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/wayback-machine/scraping.md +306 -306
  93. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/weather/scraping.md +398 -398
  94. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/wellfound/scraping.md +596 -596
  95. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/world-bank/scraping.md +356 -356
  96. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/xiaohongshu/scraping.md +84 -84
  97. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/youtube/scraping.md +418 -418
  98. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/zillow/scraping.md +433 -433
  99. package/dist/extensions/builtin/browser/browser.md +73 -73
  100. package/dist/extensions/builtin/browser/install.md +142 -142
  101. package/dist/extensions/builtin/browser/interaction-skills/connection.md +48 -48
  102. package/dist/extensions/builtin/browser/interaction-skills/cookies.md +3 -3
  103. package/dist/extensions/builtin/browser/interaction-skills/cross-origin-iframes.md +3 -3
  104. package/dist/extensions/builtin/browser/interaction-skills/dialogs.md +64 -64
  105. package/dist/extensions/builtin/browser/interaction-skills/downloads.md +3 -3
  106. package/dist/extensions/builtin/browser/interaction-skills/drag-and-drop.md +3 -3
  107. package/dist/extensions/builtin/browser/interaction-skills/dropdowns.md +3 -3
  108. package/dist/extensions/builtin/browser/interaction-skills/iframes.md +3 -3
  109. package/dist/extensions/builtin/browser/interaction-skills/network-requests.md +3 -3
  110. package/dist/extensions/builtin/browser/interaction-skills/print-as-pdf.md +3 -3
  111. package/dist/extensions/builtin/browser/interaction-skills/profile-sync.md +90 -90
  112. package/dist/extensions/builtin/browser/interaction-skills/screenshots.md +17 -17
  113. package/dist/extensions/builtin/browser/interaction-skills/scrolling.md +3 -3
  114. package/dist/extensions/builtin/browser/interaction-skills/shadow-dom.md +3 -3
  115. package/dist/extensions/builtin/browser/interaction-skills/tabs.md +69 -69
  116. package/dist/extensions/builtin/browser/interaction-skills/uploads.md +1 -1
  117. package/dist/extensions/builtin/browser/interaction-skills/viewport.md +3 -3
  118. package/dist/extensions/builtin/browser/src/browser_harness/AGENT.md +15 -15
  119. package/dist/extensions/builtin/browser/src/browser_harness/__init__.py +8 -8
  120. package/dist/extensions/builtin/browser/src/browser_harness/_ipc.py +90 -90
  121. package/dist/extensions/builtin/browser/src/browser_harness/admin.py +722 -722
  122. package/dist/extensions/builtin/browser/src/browser_harness/daemon.py +328 -328
  123. package/dist/extensions/builtin/browser/src/browser_harness/helpers.py +396 -396
  124. package/dist/extensions/builtin/browser/src/browser_harness/run.py +103 -103
  125. package/dist/extensions/builtin/discipline/skills/brainstorming/SKILL.md +33 -33
  126. package/dist/extensions/builtin/discipline/skills/executing-plans/SKILL.md +25 -25
  127. package/dist/extensions/builtin/discipline/skills/finishing-development-branch/SKILL.md +25 -25
  128. package/dist/extensions/builtin/discipline/skills/receiving-code-review/SKILL.md +22 -22
  129. package/dist/extensions/builtin/discipline/skills/requesting-code-review/SKILL.md +31 -31
  130. package/dist/extensions/builtin/discipline/skills/systematic-debugging/SKILL.md +28 -28
  131. package/dist/extensions/builtin/discipline/skills/test-driven-development/SKILL.md +32 -32
  132. package/dist/extensions/builtin/discipline/skills/using-git-worktrees/SKILL.md +25 -25
  133. package/dist/extensions/builtin/discipline/skills/verification-before-completion/SKILL.md +27 -27
  134. package/dist/extensions/builtin/discipline/skills/writing-plans/SKILL.md +26 -26
  135. package/dist/extensions/builtin/goal/README.md +67 -67
  136. package/dist/extensions/builtin/goal/goal-controller.d.ts +39 -10
  137. package/dist/extensions/builtin/goal/goal-controller.js +1 -1
  138. package/dist/extensions/builtin/goal/goal-format.js +1 -1
  139. package/dist/extensions/builtin/goal/goal-prompts.d.ts +2 -0
  140. package/dist/extensions/builtin/goal/goal-prompts.js +5 -4
  141. package/dist/extensions/builtin/goal/goal-store.js +1 -1
  142. package/dist/extensions/builtin/goal/index.d.ts +1 -1
  143. package/dist/extensions/builtin/goal/index.js +10 -7
  144. package/dist/extensions/builtin/grub/README.md +112 -112
  145. package/dist/extensions/builtin/link-world/agent-workspace/README.md +16 -16
  146. package/dist/extensions/builtin/link-world/index.js +6 -6
  147. package/dist/extensions/builtin/link-world/internet-search/internet-search.md +65 -65
  148. package/dist/extensions/builtin/link-world/link-world-agent.md +82 -82
  149. package/dist/extensions/builtin/link-world/linkworld.md +313 -313
  150. package/dist/extensions/builtin/link-world/{network-routing.md → network-routing/network-routing.md} +67 -67
  151. package/dist/extensions/builtin/loop/README.md +92 -92
  152. package/dist/extensions/builtin/mcp/figma-design.md +68 -68
  153. package/dist/extensions/builtin/mcp/mcp-management.md +85 -85
  154. package/dist/extensions/builtin/plan/index.js +1 -1
  155. package/dist/extensions/builtin/recap/AGENT.md +15 -15
  156. package/dist/extensions/builtin/sal/README.md +72 -72
  157. package/dist/extensions/builtin/security-audit/README.md +289 -289
  158. package/dist/extensions/builtin/task/task-store.d.ts +4 -0
  159. package/dist/extensions/builtin/task/task-store.js +1 -1
  160. package/dist/extensions/builtin/team/AGENT.md +112 -112
  161. package/dist/extensions/builtin/team/TESTING.md +299 -299
  162. package/dist/extensions/builtin/token-save/README.md +56 -56
  163. package/dist/extensions/optional/AGENT.md +10 -10
  164. package/dist/index.d.ts +5 -30
  165. package/dist/index.js +1 -1
  166. package/dist/models.d.ts +7 -0
  167. package/dist/models.js +1 -0
  168. package/dist/modes/interactive/components/footer.js +1 -1
  169. package/dist/modes/interactive/components/task-status-panel.d.ts +36 -0
  170. package/dist/modes/interactive/components/task-status-panel.js +1 -0
  171. package/dist/modes/interactive/controllers/stream-render-controller.d.ts +7 -0
  172. package/dist/modes/interactive/controllers/stream-render-controller.js +2 -2
  173. package/dist/modes/interactive/interactive-mode.js +40 -40
  174. package/dist/modes/interactive/state/interactive-state.d.ts +2 -0
  175. package/dist/modes/interactive/state/interactive-state.js +1 -1
  176. package/dist/modes/interactive/theme/dark.json +85 -85
  177. package/dist/modes/interactive/theme/light.json +84 -84
  178. package/dist/modes/interactive/theme/theme-schema.json +335 -335
  179. package/dist/modes/interactive/theme/warm.json +81 -81
  180. package/dist/node_modules/@pencil-agent/ai/dist/cli.js +0 -0
  181. package/dist/node_modules/@pencil-agent/ai/dist/models.generated.js +1 -1
  182. package/dist/node_modules/@pencil-agent/ai/dist/providers/anthropic.js +2 -2
  183. package/dist/node_modules/@pencil-agent/ai/dist/providers/openai-completions.js +5 -5
  184. package/dist/node_modules/@pencil-agent/ai/dist/providers/openai-responses.js +1 -1
  185. package/dist/node_modules/@pencil-agent/ai/dist/stream.js +1 -1
  186. package/dist/packages/protocol/src/commands.d.ts +33 -0
  187. package/dist/packages/protocol/src/flags.d.ts +20 -0
  188. package/dist/packages/protocol/src/hooks.d.ts +17 -0
  189. package/dist/packages/protocol/src/hooks.js +0 -0
  190. package/dist/packages/{extension-sdk → protocol}/src/index.d.ts +7 -4
  191. package/dist/packages/protocol/src/index.js +1 -0
  192. package/dist/packages/{extension-sdk → protocol}/src/lifecycle.d.ts +15 -27
  193. package/dist/packages/protocol/src/lifecycle.js +0 -0
  194. package/dist/packages/{extension-sdk → protocol}/src/tools.d.ts +1 -1
  195. package/dist/packages/protocol/src/tools.js +0 -0
  196. package/dist/public-config.d.ts +12 -0
  197. package/dist/public-config.js +1 -0
  198. package/dist/runtime.d.ts +9 -0
  199. package/dist/runtime.js +1 -0
  200. package/dist/session-compaction.d.ts +7 -0
  201. package/dist/session-compaction.js +1 -0
  202. package/dist/session.d.ts +7 -0
  203. package/dist/session.js +1 -0
  204. package/dist/skills.d.ts +7 -0
  205. package/dist/skills.js +1 -0
  206. package/dist/tools.d.ts +7 -0
  207. package/dist/tools.js +1 -0
  208. package/docs/ACP/345/215/217/350/256/256/351/233/206/346/210/220/345/274/200/345/217/221/346/226/207/346/241/243.md +851 -0
  209. package/docs/SDK-TESTING.md +364 -0
  210. package/docs/codex-goal-command-impl.md +1055 -1055
  211. package/docs/codex-goal-vs-grub.md +500 -500
  212. package/docs/custom-provider.md +27 -27
  213. package/docs/extensions.md +27 -27
  214. package/docs/keybindings.md +27 -27
  215. package/docs/loop 重构完成总结.md +250 -250
  216. package/docs/loop 重构完成报告.md +122 -122
  217. package/docs/loop 重构方案.md +1222 -1222
  218. package/docs/loop 重构方案实现报告.md +158 -158
  219. package/docs/loop 重构方案对比分析.md +128 -128
  220. package/docs/loop 重构计划.md +320 -320
  221. package/docs/loop-usage-examples.md +214 -214
  222. package/docs/mem-core/346/212/200/346/234/257/346/226/207/346/241/243.md +593 -0
  223. package/docs/models.md +27 -27
  224. package/docs/packages.md +27 -27
  225. package/docs/pi-design-philosophy.md +457 -457
  226. package/docs/planmode.md +1987 -1987
  227. package/docs/prompt-templates.md +27 -27
  228. package/docs/providers.md +27 -27
  229. package/docs/sdk.md +27 -27
  230. package/docs/skills.md +27 -27
  231. package/docs/startup-performance-optimization.md +301 -0
  232. package/docs/themes.md +27 -27
  233. package/docs/tui.md +27 -27
  234. package/docs/认知地图.md +47 -0
  235. package/package.json +190 -162
  236. package/dist/packages/extension-sdk/src/index.js +0 -1
  237. package/docs/cc-agent-design.md +0 -1297
  238. package/docs/cc-tui-design.md +0 -1333
  239. package/docs//345/257/271/346/240/207Claude-Code.md +0 -1775
  240. /package/dist/packages/{extension-sdk/src/lifecycle.js → protocol/src/commands.js} +0 -0
  241. /package/dist/packages/{extension-sdk/src/tools.js → protocol/src/flags.js} +0 -0
@@ -1,511 +1,511 @@
1
- # Genius — Data Extraction
2
-
3
- Field-tested against genius.com on 2026-04-18.
4
- No authentication required for any approach documented here.
5
-
6
- ---
7
-
8
- ## Anti-Bot: http_get Fails, Custom UA Required
9
-
10
- `http_get` uses `User-Agent: Mozilla/5.0` (bare string). Genius returns HTTP 403 for that UA on both HTML pages and internal API endpoints. Adding any OS token (e.g. `(Macintosh; Intel Mac OS X 10_15_7)`) immediately lifts the block — no cookies, no session, no JavaScript required.
11
-
12
- ```python
13
- from helpers import http_get
14
-
15
- def genius_get(url, extra_headers=None):
16
- """Drop-in replacement for http_get on genius.com endpoints."""
17
- headers = {
18
- "User-Agent": (
19
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
20
- "AppleWebKit/537.36 (KHTML, like Gecko) "
21
- "Chrome/120.0.0.0 Safari/537.36"
22
- ),
23
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
24
- "Accept-Language": "en-US,en;q=0.5",
25
- "Accept-Encoding": "gzip",
26
- }
27
- if extra_headers:
28
- headers.update(extra_headers)
29
- return http_get(url, headers=headers)
30
- ```
31
-
32
- Use `genius_get` everywhere in this document instead of bare `http_get`.
33
-
34
- ---
35
-
36
- ## Approach 1 (Fastest): Internal JSON API — No Auth, No Browser
37
-
38
- Genius's own website calls `genius.com/api/*` (not `api.genius.com`) from
39
- its server-side rendering layer. These endpoints are public and require only
40
- a browser-like User-Agent. They return rich structured JSON in ~0.13s.
41
-
42
- ### Song metadata
43
-
44
- ```python
45
- import json
46
- from helpers import http_get
47
-
48
- def genius_get(url, extra_headers=None):
49
- headers = {
50
- "User-Agent": (
51
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
52
- "AppleWebKit/537.36 (KHTML, like Gecko) "
53
- "Chrome/120.0.0.0 Safari/537.36"
54
- ),
55
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
56
- "Accept-Language": "en-US,en;q=0.5",
57
- "Accept-Encoding": "gzip",
58
- }
59
- if extra_headers:
60
- headers.update(extra_headers)
61
- return http_get(url, headers=headers)
62
-
63
- def genius_song(song_id):
64
- """Fetch full song metadata by Genius song ID."""
65
- data = json.loads(genius_get(f"https://genius.com/api/songs/{song_id}"))
66
- return data["response"]["song"]
67
-
68
- song = genius_song(1063)
69
- # All fields available in one call (no auth):
70
- # song["title"] → "Bohemian Rhapsody"
71
- # song["full_title"] → "Bohemian Rhapsody by Queen"
72
- # song["artist_names"] → "Queen"
73
- # song["primary_artist"]["name"] → "Queen"
74
- # song["primary_artist"]["id"] → 563
75
- # song["primary_artist"]["url"] → "https://genius.com/artists/Queen"
76
- # song["release_date"] → "1975-10-31"
77
- # song["release_date_for_display"] → "October 31, 1975"
78
- # song["release_date_components"] → {"year": 1975, "month": 10, "day": 31}
79
- # song["stats"]["pageviews"] → 11067562
80
- # song["stats"]["contributors"] → 516
81
- # song["stats"]["accepted_annotations"] → 20
82
- # song["pyongs_count"] → 703
83
- # song["annotation_count"] → 33
84
- # song["comment_count"] → 253
85
- # song["album"]["name"] → "Studio Collection" (varies by region)
86
- # song["albums"][0]["name"] → "A Night at the Opera" (first = original)
87
- # song["url"] → "https://genius.com/Queen-bohemian-rhapsody-lyrics"
88
- # song["path"] → "/Queen-bohemian-rhapsody-lyrics"
89
- # song["song_art_image_url"] → "https://images.genius.com/718de9d..."
90
- # song["explicit"] → False
91
- # song["language"] → "en"
92
- # song["lyrics_state"] → "complete"
93
- # song["lyrics_verified"] → False
94
- # song["spotify_uuid"] → "7tFiyTwD0nx5a1eklYtX2J"
95
- # song["youtube_url"] → "https://www.youtube.com/watch?v=fJ9rUzIMcZQ"
96
- # song["writer_artists"] → [{"name": "Freddie Mercury", ...}]
97
- # song["producer_artists"] → [{"name": "Roy Thomas Baker"}, {"name": "Queen"}]
98
- # song["featured_artists"] → []
99
-
100
- # Primary album (first in list = original release):
101
- primary_album = song["albums"][0]["name"] # "A Night at the Opera"
102
- ```
103
-
104
- ### Search
105
-
106
- ```python
107
- def genius_search(query, per_page=5):
108
- """Search Genius. Returns sections: top_hit, song, lyric, artist, album, video, article, user."""
109
- url = f"https://genius.com/api/search/multi?per_page={per_page}&q={urllib.parse.quote(query)}"
110
- data = json.loads(genius_get(url))
111
- return data["response"]["sections"]
112
-
113
- import urllib.parse
114
- sections = genius_search("Bohemian Rhapsody Queen", per_page=5)
115
- # sections is a list of dicts with keys: "type", "hits"
116
- # Each hit has: "type", "result"
117
- # For type="song", result has: id, full_title, url, primary_artist, stats, ...
118
-
119
- for section in sections:
120
- if section["type"] == "song":
121
- for hit in section["hits"]:
122
- r = hit["result"]
123
- print(r["full_title"], r["url"], r["id"])
124
- # Bohemian Rhapsody by Queen https://genius.com/Queen-bohemian-rhapsody-lyrics 1063
125
- break
126
-
127
- # Simpler search (song section only):
128
- def genius_search_songs(query, per_page=5):
129
- sections = genius_search(query, per_page)
130
- for s in sections:
131
- if s["type"] == "song":
132
- return [h["result"] for h in s["hits"]]
133
- return []
134
- ```
135
-
136
- ### Artist songs (paginated)
137
-
138
- ```python
139
- def genius_artist_songs(artist_id, per_page=20, sort="popularity"):
140
- """Fetch paginated list of songs for an artist. sort: 'popularity' or 'title'."""
141
- page = 1
142
- while True:
143
- url = (f"https://genius.com/api/artists/{artist_id}/songs"
144
- f"?per_page={per_page}&page={page}&sort={sort}")
145
- data = json.loads(genius_get(url))["response"]
146
- songs = data["songs"]
147
- if not songs:
148
- break
149
- yield from songs
150
- if data["next_page"] is None:
151
- break
152
- page = data["next_page"]
153
-
154
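- # Example: get top 5 Queen songs by popularity. Caution: list(...) drains every page of the generator before slicing; wrap it in itertools.islice(..., 5) to stop after the first page.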
155
- for song in list(genius_artist_songs(563, per_page=5))[:5]:
156
- print(f"{song['full_title']} — {song['stats']['pageviews']:,} views")
157
- # Bohemian Rhapsody by Queen — 11,067,663 views
158
- # Don't Stop Me Now by Queen — 2,453,240 views
159
- # Under Pressure by Queen & David Bowie — 1,972,606 views
160
- # Somebody to Love by Queen — 1,241,740 views
161
- # Killer Queen by Queen — 1,146,813 views
162
- ```
163
-
164
- ---
165
-
166
- ## Approach 2: Lyrics from HTML — Regex on data-lyrics-container
167
-
168
- The lyrics live in `<div data-lyrics-container="true">` elements on the song's
169
- lyrics page. There are usually 3–5 such divs (the song is split across sections).
170
- Each div can contain nested child divs for annotation highlights — including a
171
- `data-exclude-from-selection="true"` header div that must be stripped first.
172
-
173
- ```python
174
- import re, json
175
- from helpers import http_get
176
-
177
- def genius_get(url, extra_headers=None):
178
- headers = {
179
- "User-Agent": (
180
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
181
- "AppleWebKit/537.36 (KHTML, like Gecko) "
182
- "Chrome/120.0.0.0 Safari/537.36"
183
- ),
184
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
185
- "Accept-Language": "en-US,en;q=0.5",
186
- "Accept-Encoding": "gzip",
187
- }
188
- if extra_headers:
189
- headers.update(extra_headers)
190
- return http_get(url, headers=headers)
191
-
192
- def _remove_excluded_divs(html):
193
- """Strip all <div data-exclude-from-selection="true"> subtrees (contributor headers)."""
194
- while True:
195
- idx = html.find('data-exclude-from-selection="true"')
196
- if idx == -1:
197
- break
198
- tag_start = html.rfind("<div", 0, idx)
199
- depth, pos = 0, tag_start
200
- while pos < len(html):
201
- if html[pos:pos+4] == "<div":
202
- depth += 1; pos += 4
203
- elif html[pos:pos+6] == "</div>":
204
- depth -= 1; pos += 6
205
- if depth == 0:
206
- html = html[:tag_start] + html[pos:]
207
- break
208
- else:
209
- pos += 1
210
- else:
211
- break
212
- return html
213
-
214
- def _extract_div_content(html, marker):
215
- """Extract all <div> subtrees that contain the given attribute marker."""
216
- parts = []
217
- start = 0
218
- while True:
219
- idx = html.find(marker, start)
220
- if idx == -1:
221
- break
222
- tag_start = html.rfind("<div", 0, idx)
223
- depth, pos = 0, tag_start
224
- while pos < len(html):
225
- if html[pos:pos+4] == "<div":
226
- depth += 1; pos += 4
227
- elif html[pos:pos+6] == "</div>":
228
- depth -= 1; pos += 6
229
- if depth == 0:
230
- parts.append(html[tag_start:pos])
231
- break
232
- else:
233
- pos += 1
234
- start = idx + 1
235
- return parts
236
-
237
- def _html_to_text(html_str):
238
- """Convert lyrics HTML to plain text, preserving line breaks."""
239
- text = re.sub(r"<br\s*/?>", "\n", html_str)
240
- text = re.sub(r"<[^>]+>", "", text)
241
- text = (text
242
- .replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
243
- .replace("&#39;", "'").replace("&quot;", '"').replace("&#x27;", "'")
244
- .replace("&#x2F;", "/").replace("&nbsp;", " "))
245
- # Collapse multiple blank lines to one
246
- lines = [l.strip() for l in text.split("\n")]
247
- result, prev_blank = [], False
248
- for line in lines:
249
- if not line:
250
- if not prev_blank:
251
- result.append("")
252
- prev_blank = True
253
- else:
254
- result.append(line)
255
- prev_blank = False
256
- return "\n".join(result).strip()
257
-
258
- def genius_lyrics(url):
259
- """
260
- Scrape lyrics from a Genius song URL.
261
-
262
- url: the canonical lyrics URL, e.g. 'https://genius.com/Queen-bohemian-rhapsody-lyrics'
263
- Returns: plain-text lyrics string with section headers like [Verse 1], [Chorus].
264
- """
265
- html = genius_get(url)
266
- cleaned = _remove_excluded_divs(html)
267
- containers = _extract_div_content(cleaned, 'data-lyrics-container="true"')
268
- parts = []
269
- for c in containers:
270
- text = _html_to_text(c).strip()
271
- if text:
272
- parts.append(text)
273
- return "\n\n".join(parts)
274
-
275
- lyrics = genius_lyrics("https://genius.com/Queen-bohemian-rhapsody-lyrics")
276
- # Returns 2076 chars, 62 lines, structured as:
277
- # [Intro]
278
- # Is this the real life? Is this just fantasy?
279
- # Caught in a landslide, no escape from reality
280
- # ...
281
- # [Verse 1]
282
- # Mama, just killed a man
283
- # ...
284
- # [Outro]
285
- # Nothing really matters to me
286
- # Any way the wind blows
287
- ```
288
-
289
- **Performance:** Lyrics page is ~1.2 MB. One `genius_get` call takes ~0.18s.
290
- No rate limiting observed across 10 rapid sequential requests.
291
-
292
- ---
293
-
294
- ## Approach 3: Combined Workflow — Metadata + Lyrics
295
-
296
- The fastest complete extraction pattern: one API call for all metadata,
297
- one HTML call for lyrics. Song ID can be derived several ways.
298
-
299
- ```python
300
- import json, re, urllib.parse
301
- from helpers import http_get
302
-
303
- def genius_get(url, extra_headers=None):
304
- headers = {
305
- "User-Agent": (
306
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
307
- "AppleWebKit/537.36 (KHTML, like Gecko) "
308
- "Chrome/120.0.0.0 Safari/537.36"
309
- ),
310
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
311
- "Accept-Language": "en-US,en;q=0.5",
312
- "Accept-Encoding": "gzip",
313
- }
314
- if extra_headers:
315
- headers.update(extra_headers)
316
- return http_get(url, headers=headers)
317
-
318
- def genius_song_id_from_url(lyrics_url):
319
- """
320
- Fetch a lyrics page and extract the Genius song ID from its HTML.
320
- Returns None if no content="genius://songs/{id}" attribute is found in the page.
322
- """
323
- # Not possible from the slug alone; must fetch the page or use search.
324
- # From the HTML: <meta content="genius://songs/{id}" name="twitter:app:url:iphone">
325
- html = genius_get(lyrics_url)
326
- m = re.search(r'content="genius://songs/(\d+)"', html)
327
- return int(m.group(1)) if m else None
328
-
329
- def genius_full(query):
330
- """
331
- Search for a song, return metadata + lyrics in two HTTP calls.
332
- """
333
- # Call 1: search for song
334
- sections = json.loads(
335
- genius_get(f"https://genius.com/api/search/multi?per_page=3&q={urllib.parse.quote(query)}")
336
- )["response"]["sections"]
337
- song_result = None
338
- for s in sections:
339
- if s["type"] == "song" and s["hits"]:
340
- song_result = s["hits"][0]["result"]
341
- break
342
- if not song_result:
343
- return None
344
-
345
- song_id = song_result["id"]
346
- lyrics_url = song_result["url"]
347
-
348
- # Call 2: full metadata from internal API
349
- meta = json.loads(genius_get(f"https://genius.com/api/songs/{song_id}"))["response"]["song"]
350
-
351
- # Call 3: lyrics from HTML
352
- lyrics = genius_lyrics(lyrics_url) # uses the function from Approach 2
353
-
354
- return {
355
- "id": meta["id"],
356
- "title": meta["title"],
357
- "artist": meta["primary_artist"]["name"],
358
- "artist_id": meta["primary_artist"]["id"],
359
- "album": meta["albums"][0]["name"] if meta.get("albums") else None,
360
- "release_date": meta["release_date"], # "1975-10-31"
361
- "pageviews": meta["stats"]["pageviews"], # 11067562
362
- "contributors": meta["stats"]["contributors"], # 516
363
- "writers": [a["name"] for a in meta["writer_artists"]],
364
- "producers": [a["name"] for a in meta["producer_artists"]],
365
- "spotify_uuid": meta["spotify_uuid"],
366
- "youtube_url": meta["youtube_url"],
367
- "song_art_url": meta["song_art_image_url"],
368
- "lyrics_url": meta["url"],
369
- "lyrics": lyrics,
370
- }
371
-
372
- result = genius_full("Queen Bohemian Rhapsody")
373
- # {
374
- # "id": 1063,
375
- # "title": "Bohemian Rhapsody",
376
- # "artist": "Queen",
377
- # "artist_id": 563,
378
- # "album": "A Night at the Opera",
379
- # "release_date": "1975-10-31",
380
- # "pageviews": 11067562,
381
- # "contributors": 516,
382
- # "writers": ["Freddie Mercury"],
383
- # "producers": ["Roy Thomas Baker", "Queen"],
384
- # "spotify_uuid": "7tFiyTwD0nx5a1eklYtX2J",
385
- # "youtube_url": "https://www.youtube.com/watch?v=fJ9rUzIMcZQ",
386
- # "song_art_url": "https://images.genius.com/718de9d1fbcaae9f3c9b1bf483bfa8f1.1000x1000x1.png",
387
- # "lyrics_url": "https://genius.com/Queen-bohemian-rhapsody-lyrics",
388
- # "lyrics": "[Intro]\nIs this the real life?..."
389
- # }
390
- ```
391
-
392
- ---
393
-
394
- ## URL and ID Patterns
395
-
396
- | Resource | URL pattern | Notes |
397
- |-------------|-----------------------------------------------|----------------------------------|
398
- | Song page | `genius.com/{Artist}-{song-slug}-lyrics` | Slug is lowercased, hyphenated |
399
- | Artist page | `genius.com/artists/{Artist}` | Title-cased artist name |
400
- | Album page | `genius.com/albums/{Artist}/{album-slug}` | |
401
- | Song API | `genius.com/api/songs/{id}` | Internal; no auth required |
402
- | Artist API | `genius.com/api/artists/{id}` | Internal; no auth required |
403
- | Artist songs| `genius.com/api/artists/{id}/songs?...` | per_page, page, sort params |
404
- | Search API | `genius.com/api/search/multi?per_page=N&q=...`| Internal; multi-section results |
405
-
406
- **Extracting song ID from a known lyrics URL:**
407
-
408
- ```python
409
- # The slug alone cannot be decoded to an ID. Must fetch HTML or search.
410
- # From lyrics page HTML (fastest — one line):
411
- song_id = re.search(r'content="genius://songs/(\d+)"', html).group(1)
412
-
413
- # Or from __PRELOADED_STATE__ (same page, equally reliable):
414
- song_id = re.search(r'\\"song\\":\s*(\d+)', html).group(1)
415
-
416
- # Or from search API (no HTML required):
417
- sections = json.loads(genius_get(f"https://genius.com/api/search/multi?per_page=1&q={query}"))
418
- # then walk sections for type="song"
419
- ```
420
-
421
- ---
422
-
423
- ## What Requires a Browser
424
-
425
- The following are **not available** via `genius_get` / HTTP:
426
-
427
- - **Search results page** (`/search?q=...`): renders client-side only. The
428
- returned HTML contains no song results matching the query. Use the internal
429
- search API (`/api/search/multi`) instead — it works without a browser.
430
-
431
- - **Public API** (`api.genius.com`): returns HTTP 401 without a Bearer token
432
- even with a browser-like User-Agent. Must register at genius.com/developers
433
- to obtain a client access token. The internal site API (`genius.com/api/*`)
434
- is the no-auth alternative and returns equivalent data.
435
-
436
- - **Annotations content**: annotation HTML is embedded in `__PRELOADED_STATE__`
437
- but the JSON is multi-escaped (six levels of backslash nesting) and cannot
438
- be reliably parsed with plain string operations. Annotation IDs are
439
- available but their body text is not easily extractable.
440
-
441
- - **Login-gated features**: user library, personalization, editor tools.
442
-
443
- ---
444
-
445
- ## Public API (api.genius.com) — Requires Bearer Token
446
-
447
- If you have a token (free registration at genius.com/developers):
448
-
449
- ```python
450
- def genius_api(path, token):
451
- """Call the official public API. path example: '/songs/1063'"""
452
- import json
453
- from helpers import http_get
454
- url = f"https://api.genius.com{path}"
455
- return json.loads(http_get(url, headers={"Authorization": f"Bearer {token}"}))
456
-
457
- # Returns same structure as the internal /api/* endpoints.
458
- # Endpoints: /songs/{id}, /artists/{id}, /artists/{id}/songs, /search?q=...
459
- # Without a token: HTTP 401 with body:
460
- # {"meta": {"status": 401, "message": "This call requires an access_token..."}}
461
- ```
462
-
463
- ---
464
-
465
- ## Gotchas
466
-
467
- - **`http_get` returns 403**: The default `User-Agent: Mozilla/5.0` (bare) is
468
- blocked. Add any OS string — `(Macintosh; Intel Mac OS X 10_15_7)` is
469
- sufficient. Use the `genius_get` wrapper from this document.
470
-
471
- - **`data-lyrics-container` split across 3–5 divs**: Don't look for a single
472
- lyrics block. Use `_extract_div_content` on all occurrences, then join.
473
- Empty containers (`<div ...></div>`, 87 bytes) appear between sections —
474
- the `if text:` guard skips them cleanly.
475
-
476
- - **`data-exclude-from-selection` header in first container**: The first
477
- lyrics container includes a contributor credit header div. It must be
478
- stripped before text extraction or the output will begin with
479
- `"516 ContributorsTranslations..."` instead of `"[Intro]"`.
480
-
481
- - **`album` field vs `albums[0]`**: `song["album"]` is the "primary" album
482
- used by Genius's album link (often a compilation or reissue). `song["albums"][0]`
483
- is the first album in the full list and is typically the original release.
484
- Verified: for Bohemian Rhapsody, `album.name` = "Studio Collection" but
485
- `albums[0].name` = "A Night at the Opera".
486
-
487
- - **`__PRELOADED_STATE__` is not parseable**: The state is embedded as
488
- `JSON.parse('...')` where the inner JSON is escaped six levels deep
489
- (`\\\\\"` for a literal quote inside HTML content). Standard string
490
- replacement fails due to `\\'` and `\$` sequences. Don't try to parse it —
491
- use the `/api/songs/{id}` endpoint instead.
492
-
493
- - **No `__NEXT_DATA__`**: Genius does not use Next.js. There is no
494
- `<script id="__NEXT_DATA__">` on any page.
495
-
496
- - **No JSON-LD**: Genius does not emit `<script type="application/ld+json">`.
497
- Open Graph tags are present but minimal (only `og:title`, `og:image`,
498
- `og:description`, `og:url`, `og:type`). Use the API for structured data.
499
-
500
- - **Search page is client-side only**: `GET /search?q=...` returns an HTML
501
- shell with ~5 unrelated song links (trending, not query-matched). The actual
502
- search results are fetched client-side by JavaScript. Use `/api/search/multi`
503
- instead — it works without a browser and returns properly filtered results.
504
-
505
- - **Rate limiting**: No rate limiting observed across 10 rapid sequential
506
- requests to `/api/songs/{id}` (avg 0.13s/request). Song lyrics pages
507
- average 0.18s. No Retry-After headers observed.
508
-
509
- - **Cloudflare**: Present (confirmed by `<meta itemprop="cf-country">` and
510
- `cf-cache-status` tags), but in pass-through mode — no JS challenge, no
511
- CAPTCHA. A browser-like User-Agent is all that's needed.
1
+ # Genius — Data Extraction
2
+
3
+ Field-tested against genius.com on 2026-04-18.
4
+ No authentication required for any approach documented here.
5
+
6
+ ---
7
+
8
+ ## Anti-Bot: http_get Fails, Custom UA Required
9
+
10
+ `http_get` uses `User-Agent: Mozilla/5.0` (bare string). Genius returns HTTP 403 for that UA on both HTML pages and internal API endpoints. Adding any OS token (e.g. `(Macintosh; Intel Mac OS X 10_15_7)`) immediately lifts the block — no cookies, no session, no JavaScript required.
11
+
12
+ ```python
13
+ from helpers import http_get
14
+
15
+ def genius_get(url, extra_headers=None):
16
+ """Drop-in replacement for http_get on genius.com endpoints."""
17
+ headers = {
18
+ "User-Agent": (
19
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
20
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
21
+ "Chrome/120.0.0.0 Safari/537.36"
22
+ ),
23
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
24
+ "Accept-Language": "en-US,en;q=0.5",
25
+ "Accept-Encoding": "gzip",
26
+ }
27
+ if extra_headers:
28
+ headers.update(extra_headers)
29
+ return http_get(url, headers=headers)
30
+ ```
31
+
32
+ Use `genius_get` everywhere in this document instead of bare `http_get`.
33
+
34
+ ---
35
+
36
+ ## Approach 1 (Fastest): Internal JSON API — No Auth, No Browser
37
+
38
+ Genius's own website calls `genius.com/api/*` (not `api.genius.com`) from
39
+ its server-side rendering layer. These endpoints are public and require only
40
+ a browser-like User-Agent. They return rich structured JSON in ~0.13s.
41
+
42
+ ### Song metadata
43
+
44
+ ```python
45
+ import json
46
+ from helpers import http_get
47
+
48
+ def genius_get(url, extra_headers=None):
49
+ headers = {
50
+ "User-Agent": (
51
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
52
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
53
+ "Chrome/120.0.0.0 Safari/537.36"
54
+ ),
55
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
56
+ "Accept-Language": "en-US,en;q=0.5",
57
+ "Accept-Encoding": "gzip",
58
+ }
59
+ if extra_headers:
60
+ headers.update(extra_headers)
61
+ return http_get(url, headers=headers)
62
+
63
+ def genius_song(song_id):
64
+ """Fetch full song metadata by Genius song ID."""
65
+ data = json.loads(genius_get(f"https://genius.com/api/songs/{song_id}"))
66
+ return data["response"]["song"]
67
+
68
+ song = genius_song(1063)
69
+ # All fields available in one call (no auth):
70
+ # song["title"] → "Bohemian Rhapsody"
71
+ # song["full_title"] → "Bohemian Rhapsody by Queen"
72
+ # song["artist_names"] → "Queen"
73
+ # song["primary_artist"]["name"] → "Queen"
74
+ # song["primary_artist"]["id"] → 563
75
+ # song["primary_artist"]["url"] → "https://genius.com/artists/Queen"
76
+ # song["release_date"] → "1975-10-31"
77
+ # song["release_date_for_display"] → "October 31, 1975"
78
+ # song["release_date_components"] → {"year": 1975, "month": 10, "day": 31}
79
+ # song["stats"]["pageviews"] → 11067562
80
+ # song["stats"]["contributors"] → 516
81
+ # song["stats"]["accepted_annotations"] → 20
82
+ # song["pyongs_count"] → 703
83
+ # song["annotation_count"] → 33
84
+ # song["comment_count"] → 253
85
+ # song["album"]["name"] → "Studio Collection" (varies by region)
86
+ # song["albums"][0]["name"] → "A Night at the Opera" (first = original)
87
+ # song["url"] → "https://genius.com/Queen-bohemian-rhapsody-lyrics"
88
+ # song["path"] → "/Queen-bohemian-rhapsody-lyrics"
89
+ # song["song_art_image_url"] → "https://images.genius.com/718de9d..."
90
+ # song["explicit"] → False
91
+ # song["language"] → "en"
92
+ # song["lyrics_state"] → "complete"
93
+ # song["lyrics_verified"] → False
94
+ # song["spotify_uuid"] → "7tFiyTwD0nx5a1eklYtX2J"
95
+ # song["youtube_url"] → "https://www.youtube.com/watch?v=fJ9rUzIMcZQ"
96
+ # song["writer_artists"] → [{"name": "Freddie Mercury", ...}]
97
+ # song["producer_artists"] → [{"name": "Roy Thomas Baker"}, {"name": "Queen"}]
98
+ # song["featured_artists"] → []
99
+
100
+ # Primary album (first in list = original release):
101
+ primary_album = song["albums"][0]["name"] # "A Night at the Opera"
102
+ ```
103
+
104
+ ### Search
105
+
106
+ ```python
107
+ def genius_search(query, per_page=5):
108
+ """Search Genius. Returns sections: top_hit, song, lyric, artist, album, video, article, user."""
109
+ url = f"https://genius.com/api/search/multi?per_page={per_page}&q={urllib.parse.quote(query)}"
110
+ data = json.loads(genius_get(url))
111
+ return data["response"]["sections"]
112
+
113
+ import urllib.parse
114
+ sections = genius_search("Bohemian Rhapsody Queen", per_page=5)
115
+ # sections is a list of dicts with keys: "type", "hits"
116
+ # Each hit has: "type", "result"
117
+ # For type="song", result has: id, full_title, url, primary_artist, stats, ...
118
+
119
+ for section in sections:
120
+ if section["type"] == "song":
121
+ for hit in section["hits"]:
122
+ r = hit["result"]
123
+ print(r["full_title"], r["url"], r["id"])
124
+ # Bohemian Rhapsody by Queen https://genius.com/Queen-bohemian-rhapsody-lyrics 1063
125
+ break
126
+
127
+ # Simpler search (song section only):
128
+ def genius_search_songs(query, per_page=5):
129
+ sections = genius_search(query, per_page)
130
+ for s in sections:
131
+ if s["type"] == "song":
132
+ return [h["result"] for h in s["hits"]]
133
+ return []
134
+ ```
135
+
136
+ ### Artist songs (paginated)
137
+
138
+ ```python
139
+ def genius_artist_songs(artist_id, per_page=20, sort="popularity"):
140
+ """Fetch paginated list of songs for an artist. sort: 'popularity' or 'title'."""
141
+ page = 1
142
+ while True:
143
+ url = (f"https://genius.com/api/artists/{artist_id}/songs"
144
+ f"?per_page={per_page}&page={page}&sort={sort}")
145
+ data = json.loads(genius_get(url))["response"]
146
+ songs = data["songs"]
147
+ if not songs:
148
+ break
149
+ yield from songs
150
+ if data["next_page"] is None:
151
+ break
152
+ page = data["next_page"]
153
+
154
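+ # Example: get top 5 Queen songs by popularity (note: list() drains every page before slicing; use itertools.islice(gen, 5) to stop after the first page)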
155
+ for song in list(genius_artist_songs(563, per_page=5))[:5]:
156
+ print(f"{song['full_title']} — {song['stats']['pageviews']:,} views")
157
+ # Bohemian Rhapsody by Queen — 11,067,663 views
158
+ # Don't Stop Me Now by Queen — 2,453,240 views
159
+ # Under Pressure by Queen & David Bowie — 1,972,606 views
160
+ # Somebody to Love by Queen — 1,241,740 views
161
+ # Killer Queen by Queen — 1,146,813 views
162
+ ```
163
+
164
+ ---
165
+
166
+ ## Approach 2: Lyrics from HTML — Regex on data-lyrics-container
167
+
168
+ The lyrics live in `<div data-lyrics-container="true">` elements on the song's
169
+ lyrics page. There are usually 3–5 such divs (the song is split across sections).
170
+ Each div can contain nested child divs for annotation highlights — including a
171
+ `data-exclude-from-selection="true"` header div that must be stripped first.
172
+
173
+ ```python
174
+ import re, json
175
+ from helpers import http_get
176
+
177
+ def genius_get(url, extra_headers=None):
178
+ headers = {
179
+ "User-Agent": (
180
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
181
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
182
+ "Chrome/120.0.0.0 Safari/537.36"
183
+ ),
184
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
185
+ "Accept-Language": "en-US,en;q=0.5",
186
+ "Accept-Encoding": "gzip",
187
+ }
188
+ if extra_headers:
189
+ headers.update(extra_headers)
190
+ return http_get(url, headers=headers)
191
+
192
+ def _remove_excluded_divs(html):
193
+ """Strip all <div data-exclude-from-selection="true"> subtrees (contributor headers)."""
194
+ while True:
195
+ idx = html.find('data-exclude-from-selection="true"')
196
+ if idx == -1:
197
+ break
198
+ tag_start = html.rfind("<div", 0, idx)
199
+ depth, pos = 0, tag_start
200
+ while pos < len(html):
201
+ if html[pos:pos+4] == "<div":
202
+ depth += 1; pos += 4
203
+ elif html[pos:pos+6] == "</div>":
204
+ depth -= 1; pos += 6
205
+ if depth == 0:
206
+ html = html[:tag_start] + html[pos:]
207
+ break
208
+ else:
209
+ pos += 1
210
+ else:
211
+ break
212
+ return html
213
+
214
+ def _extract_div_content(html, marker):
215
+ """Extract all <div> subtrees that contain the given attribute marker."""
216
+ parts = []
217
+ start = 0
218
+ while True:
219
+ idx = html.find(marker, start)
220
+ if idx == -1:
221
+ break
222
+ tag_start = html.rfind("<div", 0, idx)
223
+ depth, pos = 0, tag_start
224
+ while pos < len(html):
225
+ if html[pos:pos+4] == "<div":
226
+ depth += 1; pos += 4
227
+ elif html[pos:pos+6] == "</div>":
228
+ depth -= 1; pos += 6
229
+ if depth == 0:
230
+ parts.append(html[tag_start:pos])
231
+ break
232
+ else:
233
+ pos += 1
234
+ start = idx + 1
235
+ return parts
236
+
237
+ def _html_to_text(html_str):
238
+ """Convert lyrics HTML to plain text, preserving line breaks."""
239
+ text = re.sub(r"<br\s*/?>", "\n", html_str)
240
+ text = re.sub(r"<[^>]+>", "", text)
241
+ text = (text
242
+ .replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
243
+ .replace("&#39;", "'").replace("&quot;", '"').replace("&#x27;", "'")
244
+ .replace("&#x2F;", "/").replace("&nbsp;", " "))
245
+ # Collapse multiple blank lines to one
246
+ lines = [l.strip() for l in text.split("\n")]
247
+ result, prev_blank = [], False
248
+ for line in lines:
249
+ if not line:
250
+ if not prev_blank:
251
+ result.append("")
252
+ prev_blank = True
253
+ else:
254
+ result.append(line)
255
+ prev_blank = False
256
+ return "\n".join(result).strip()
257
+
258
+ def genius_lyrics(url):
259
+ """
260
+ Scrape lyrics from a Genius song URL.
261
+
262
+ url: the canonical lyrics URL, e.g. 'https://genius.com/Queen-bohemian-rhapsody-lyrics'
263
+ Returns: plain-text lyrics string with section headers like [Verse 1], [Chorus].
264
+ """
265
+ html = genius_get(url)
266
+ cleaned = _remove_excluded_divs(html)
267
+ containers = _extract_div_content(cleaned, 'data-lyrics-container="true"')
268
+ parts = []
269
+ for c in containers:
270
+ text = _html_to_text(c).strip()
271
+ if text:
272
+ parts.append(text)
273
+ return "\n\n".join(parts)
274
+
275
+ lyrics = genius_lyrics("https://genius.com/Queen-bohemian-rhapsody-lyrics")
276
+ # Returns 2076 chars, 62 lines, structured as:
277
+ # [Intro]
278
+ # Is this the real life? Is this just fantasy?
279
+ # Caught in a landslide, no escape from reality
280
+ # ...
281
+ # [Verse 1]
282
+ # Mama, just killed a man
283
+ # ...
284
+ # [Outro]
285
+ # Nothing really matters to me
286
+ # Any way the wind blows
287
+ ```
288
+
289
+ **Performance:** Lyrics page is ~1.2 MB. One `genius_get` call takes ~0.18s.
290
+ No rate limiting observed across 10 rapid sequential requests.
291
+
292
+ ---
293
+
294
+ ## Approach 3: Combined Workflow — Metadata + Lyrics
295
+
296
+ The fastest complete extraction pattern: one API call for all metadata,
297
+ one HTML call for lyrics (plus one search call when starting from a text query). Song ID can be derived several ways.
298
+
299
+ ```python
300
+ import json, re, urllib.parse
301
+ from helpers import http_get
302
+
303
+ def genius_get(url, extra_headers=None):
304
+ headers = {
305
+ "User-Agent": (
306
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
307
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
308
+ "Chrome/120.0.0.0 Safari/537.36"
309
+ ),
310
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
311
+ "Accept-Language": "en-US,en;q=0.5",
312
+ "Accept-Encoding": "gzip",
313
+ }
314
+ if extra_headers:
315
+ headers.update(extra_headers)
316
+ return http_get(url, headers=headers)
317
+
318
+ def genius_song_id_from_url(lyrics_url):
319
+ """
320
+ Extract Genius song ID from a lyrics page URL by fetching the page HTML.
321
+ Returns None if the genius://songs/{id} meta tag is not present in the page.
322
+ """
323
+ # Not possible from the slug alone; must fetch the page or use search.
324
+ # From the HTML: <meta content="genius://songs/{id}" name="twitter:app:url:iphone">
325
+ html = genius_get(lyrics_url)
326
+ m = re.search(r'content="genius://songs/(\d+)"', html)
327
+ return int(m.group(1)) if m else None
328
+
329
+ def genius_full(query):
330
+ """
331
+ Search for a song, return metadata + lyrics in three HTTP calls (search, song API, lyrics page).
332
+ """
333
+ # Call 1: search for song
334
+ sections = json.loads(
335
+ genius_get(f"https://genius.com/api/search/multi?per_page=3&q={urllib.parse.quote(query)}")
336
+ )["response"]["sections"]
337
+ song_result = None
338
+ for s in sections:
339
+ if s["type"] == "song" and s["hits"]:
340
+ song_result = s["hits"][0]["result"]
341
+ break
342
+ if not song_result:
343
+ return None
344
+
345
+ song_id = song_result["id"]
346
+ lyrics_url = song_result["url"]
347
+
348
+ # Call 2: full metadata from internal API
349
+ meta = json.loads(genius_get(f"https://genius.com/api/songs/{song_id}"))["response"]["song"]
350
+
351
+ # Call 3: lyrics from HTML
352
+ lyrics = genius_lyrics(lyrics_url) # uses the function from Approach 2
353
+
354
+ return {
355
+ "id": meta["id"],
356
+ "title": meta["title"],
357
+ "artist": meta["primary_artist"]["name"],
358
+ "artist_id": meta["primary_artist"]["id"],
359
+ "album": meta["albums"][0]["name"] if meta.get("albums") else None,
360
+ "release_date": meta["release_date"], # "1975-10-31"
361
+ "pageviews": meta["stats"]["pageviews"], # 11067562
362
+ "contributors": meta["stats"]["contributors"], # 516
363
+ "writers": [a["name"] for a in meta["writer_artists"]],
364
+ "producers": [a["name"] for a in meta["producer_artists"]],
365
+ "spotify_uuid": meta["spotify_uuid"],
366
+ "youtube_url": meta["youtube_url"],
367
+ "song_art_url": meta["song_art_image_url"],
368
+ "lyrics_url": meta["url"],
369
+ "lyrics": lyrics,
370
+ }
371
+
372
+ result = genius_full("Queen Bohemian Rhapsody")
373
+ # {
374
+ # "id": 1063,
375
+ # "title": "Bohemian Rhapsody",
376
+ # "artist": "Queen",
377
+ # "artist_id": 563,
378
+ # "album": "A Night at the Opera",
379
+ # "release_date": "1975-10-31",
380
+ # "pageviews": 11067562,
381
+ # "contributors": 516,
382
+ # "writers": ["Freddie Mercury"],
383
+ # "producers": ["Roy Thomas Baker", "Queen"],
384
+ # "spotify_uuid": "7tFiyTwD0nx5a1eklYtX2J",
385
+ # "youtube_url": "https://www.youtube.com/watch?v=fJ9rUzIMcZQ",
386
+ # "song_art_url": "https://images.genius.com/718de9d1fbcaae9f3c9b1bf483bfa8f1.1000x1000x1.png",
387
+ # "lyrics_url": "https://genius.com/Queen-bohemian-rhapsody-lyrics",
388
+ # "lyrics": "[Intro]\nIs this the real life?..."
389
+ # }
390
+ ```
391
+
392
+ ---
393
+
394
+ ## URL and ID Patterns
395
+
396
+ | Resource | URL pattern | Notes |
397
+ |-------------|-----------------------------------------------|----------------------------------|
398
+ | Song page | `genius.com/{Artist}-{song-slug}-lyrics` | Slug is lowercased, hyphenated |
399
+ | Artist page | `genius.com/artists/{Artist}` | Title-cased artist name |
400
+ | Album page | `genius.com/albums/{Artist}/{album-slug}` | |
401
+ | Song API | `genius.com/api/songs/{id}` | Internal; no auth required |
402
+ | Artist API | `genius.com/api/artists/{id}` | Internal; no auth required |
403
+ | Artist songs| `genius.com/api/artists/{id}/songs?...` | per_page, page, sort params |
404
+ | Search API | `genius.com/api/search/multi?per_page=N&q=...`| Internal; multi-section results |
405
+
406
+ **Extracting song ID from a known lyrics URL:**
407
+
408
+ ```python
409
+ # The slug alone cannot be decoded to an ID. Must fetch HTML or search.
410
+ # From lyrics page HTML (fastest — one line):
411
+ song_id = re.search(r'content="genius://songs/(\d+)"', html).group(1)
412
+
413
+ # Or from __PRELOADED_STATE__ (same page, equally reliable):
414
+ song_id = re.search(r'\\"song\\":\s*(\d+)', html).group(1)
415
+
416
+ # Or from search API (no HTML required):
417
+ sections = json.loads(genius_get(f"https://genius.com/api/search/multi?per_page=1&q={query}"))
418
+ # then walk sections["response"]["sections"] for type="song" (URL-encode the query with urllib.parse.quote first)
419
+ ```
420
+
421
+ ---
422
+
423
+ ## What Requires a Browser
424
+
425
+ The following are **not available** via plain `genius_get` / HTTP (each needs a browser, an API token, or a login):
426
+
427
+ - **Search results page** (`/search?q=...`): renders client-side only. The
428
+ returned HTML contains no song results matching the query. Use the internal
429
+ search API (`/api/search/multi`) instead — it works without a browser.
430
+
431
+ - **Public API** (`api.genius.com`): returns HTTP 401 without a Bearer token
432
+ even with a browser-like User-Agent. Must register at genius.com/developers
433
+ to obtain a client access token. The internal site API (`genius.com/api/*`)
434
+ is the no-auth alternative and returns equivalent data.
435
+
436
+ - **Annotations content**: annotation HTML is embedded in `__PRELOADED_STATE__`
437
+ but the JSON is multi-escaped (six levels of backslash nesting) and cannot
438
+ be reliably parsed with plain string operations. Annotation IDs are
439
+ available but their body text is not easily extractable.
440
+
441
+ - **Login-gated features**: user library, personalization, editor tools.
442
+
443
+ ---
444
+
445
+ ## Public API (api.genius.com) — Requires Bearer Token
446
+
447
+ If you have a token (free registration at genius.com/developers):
448
+
449
+ ```python
450
+ def genius_api(path, token):
451
+ """Call the official public API. path example: '/songs/1063'"""
452
+ import json
453
+ from helpers import http_get
454
+ url = f"https://api.genius.com{path}"
455
+ return json.loads(http_get(url, headers={"Authorization": f"Bearer {token}"}))
456
+
457
+ # Returns same structure as the internal /api/* endpoints.
458
+ # Endpoints: /songs/{id}, /artists/{id}, /artists/{id}/songs, /search?q=...
459
+ # Without a token: HTTP 401 with body:
460
+ # {"meta": {"status": 401, "message": "This call requires an access_token..."}}
461
+ ```
462
+
463
+ ---
464
+
465
+ ## Gotchas
466
+
467
+ - **`http_get` returns 403**: The default `User-Agent: Mozilla/5.0` (bare) is
468
+ blocked. Add any OS string — `(Macintosh; Intel Mac OS X 10_15_7)` is
469
+ sufficient. Use the `genius_get` wrapper from this document.
470
+
471
+ - **`data-lyrics-container` split across 3–5 divs**: Don't look for a single
472
+ lyrics block. Use `_extract_div_content` on all occurrences, then join.
473
+ Empty containers (`<div ...></div>`, 87 bytes) appear between sections —
474
+ the `if text:` guard skips them cleanly.
475
+
476
+ - **`data-exclude-from-selection` header in first container**: The first
477
+ lyrics container includes a contributor credit header div. It must be
478
+ stripped before text extraction or the output will begin with
479
+ `"516 ContributorsTranslations..."` instead of `"[Intro]"`.
480
+
481
+ - **`album` field vs `albums[0]`**: `song["album"]` is the "primary" album
482
+ used by Genius's album link (often a compilation or reissue). `song["albums"][0]`
483
+ is the first album in the full list and is typically the original release.
484
+ Verified: for Bohemian Rhapsody, `album.name` = "Studio Collection" but
485
+ `albums[0].name` = "A Night at the Opera".
486
+
487
+ - **`__PRELOADED_STATE__` is not parseable**: The state is embedded as
488
+ `JSON.parse('...')` where the inner JSON is escaped six levels deep
489
+ (`\\\\\"` for a literal quote inside HTML content). Standard string
490
+ replacement fails due to `\\'` and `\$` sequences. Don't try to parse it —
491
+ use the `/api/songs/{id}` endpoint instead.
492
+
493
+ - **No `__NEXT_DATA__`**: Genius does not use Next.js. There is no
494
+ `<script id="__NEXT_DATA__">` on any page.
495
+
496
+ - **No JSON-LD**: Genius does not emit `<script type="application/ld+json">`.
497
+ Open Graph tags are present but minimal (only `og:title`, `og:image`,
498
+ `og:description`, `og:url`, `og:type`). Use the API for structured data.
499
+
500
+ - **Search page is client-side only**: `GET /search?q=...` returns an HTML
501
+ shell with ~5 unrelated song links (trending, not query-matched). The actual
502
+ search results are fetched client-side by JavaScript. Use `/api/search/multi`
503
+ instead — it works without a browser and returns properly filtered results.
504
+
505
+ - **Rate limiting**: No rate limiting observed across 10 rapid sequential
506
+ requests to `/api/songs/{id}` (avg 0.13s/request). Song lyrics pages
507
+ average 0.18s. No Retry-After headers observed.
508
+
509
+ - **Cloudflare**: Present (confirmed by `<meta itemprop="cf-country">` and
510
+ `cf-cache-status` tags), but in pass-through mode — no JS challenge, no
511
+ CAPTCHA. A browser-like User-Agent is all that's needed.