@synsci/cli-darwin-x64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373)
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,362 @@
1
+ # TransformerLens API Reference
2
+
3
+ ## HookedTransformer
4
+
5
+ The core class for mechanistic interpretability, wrapping transformer models with hooks on every activation.
6
+
7
+ ### Loading Models
8
+
9
+ ```python
10
+ from transformer_lens import HookedTransformer
11
+
12
+ # Basic loading
13
+ model = HookedTransformer.from_pretrained("gpt2-small")
14
+
15
+ # With specific device/dtype
16
+ model = HookedTransformer.from_pretrained(
17
+ "gpt2-medium",
18
+ device="cuda",
19
+ dtype=torch.float16
20
+ )
21
+
22
+ # Gated models (LLaMA, Mistral)
23
+ import os
24
+ os.environ["HF_TOKEN"] = "your_token"
25
+ model = HookedTransformer.from_pretrained("meta-llama/Llama-2-7b-hf")
26
+ ```
27
+
28
+ ### from_pretrained() Parameters
29
+
30
+ | Parameter | Type | Default | Description |
31
+ |-----------|------|---------|-------------|
32
+ | `model_name` | str | required | Model name from OFFICIAL_MODEL_NAMES |
33
+ | `fold_ln` | bool | True | Fold LayerNorm weights into subsequent layers |
34
+ | `center_writing_weights` | bool | True | Center residual stream writer means |
35
+ | `center_unembed` | bool | True | Center unembedding weights |
36
+ | `dtype` | torch.dtype | None | Model precision |
37
+ | `device` | str | None | Target device |
38
+ | `n_devices` | int | 1 | Number of devices for model parallelism |
39
+
40
+ ### Weight Matrices
41
+
42
+ | Property | Shape | Description |
43
+ |----------|-------|-------------|
44
+ | `W_E` | [d_vocab, d_model] | Token embedding matrix |
45
+ | `W_U` | [d_model, d_vocab] | Unembedding matrix |
46
+ | `W_pos` | [n_ctx, d_model] | Positional embedding |
47
+ | `W_Q` | [n_layers, n_heads, d_model, d_head] | Query weights |
48
+ | `W_K` | [n_layers, n_heads, d_model, d_head] | Key weights |
49
+ | `W_V` | [n_layers, n_heads, d_model, d_head] | Value weights |
50
+ | `W_O` | [n_layers, n_heads, d_head, d_model] | Output weights |
51
+ | `W_in` | [n_layers, d_model, d_mlp] | MLP input weights |
52
+ | `W_out` | [n_layers, d_mlp, d_model] | MLP output weights |
53
+
54
+ ### Core Methods
55
+
56
+ #### forward()
57
+
58
+ ```python
59
+ logits = model(tokens)
60
+ logits = model(tokens, return_type="logits")
61
+ loss = model(tokens, return_type="loss")
62
+ logits, loss = model(tokens, return_type="both")
63
+ ```
64
+
65
+ Parameters:
66
+ - `input`: Token tensor or string
67
+ - `return_type`: "logits", "loss", "both", or None
68
+ - `prepend_bos`: Whether to prepend BOS token
69
+ - `start_at_layer`: Start execution from specific layer
70
+ - `stop_at_layer`: Stop execution at specific layer
71
+
72
+ #### run_with_cache()
73
+
74
+ ```python
75
+ logits, cache = model.run_with_cache(tokens)
76
+
77
+ # Selective caching (saves memory)
78
+ logits, cache = model.run_with_cache(
79
+ tokens,
80
+ names_filter=lambda name: "resid_post" in name
81
+ )
82
+
83
+ # Cache on CPU
84
+ logits, cache = model.run_with_cache(tokens, device="cpu")
85
+ ```
86
+
87
+ #### run_with_hooks()
88
+
89
+ ```python
90
+ def my_hook(activation, hook):
91
+ # Modify activation
92
+ activation[:, :, 0] = 0
93
+ return activation
94
+
95
+ logits = model.run_with_hooks(
96
+ tokens,
97
+ fwd_hooks=[("blocks.5.hook_resid_post", my_hook)]
98
+ )
99
+ ```
100
+
101
+ #### generate()
102
+
103
+ ```python
104
+ output = model.generate(
105
+ tokens,
106
+ max_new_tokens=50,
107
+ temperature=0.7,
108
+ top_k=40,
109
+ top_p=0.9,
110
+ freq_penalty=1.0,
111
+ use_past_kv_cache=True
112
+ )
113
+ ```
114
+
115
+ ### Tokenization Methods
116
+
117
+ ```python
118
+ # String to tokens
119
+ tokens = model.to_tokens("Hello world") # [1, seq_len]
120
+ tokens = model.to_tokens("Hello", prepend_bos=False)
121
+
122
+ # Tokens to string
123
+ text = model.to_string(tokens)
124
+
125
+ # Get string tokens (for debugging)
126
+ str_tokens = model.to_str_tokens("Hello world")
127
+ # ['<|endoftext|>', 'Hello', ' world']
128
+
129
+ # Single token validation
130
+ token_id = model.to_single_token(" Paris") # Returns int or raises error
131
+ ```
132
+
133
+ ### Hook Management
134
+
135
+ ```python
136
+ # Clear all hooks
137
+ model.reset_hooks()
138
+
139
+ # Add permanent hook
140
+ model.add_hook("blocks.0.hook_resid_post", my_hook)
141
+
142
+ # Remove specific hook
143
+ model.remove_hook("blocks.0.hook_resid_post")
144
+ ```
145
+
146
+ ---
147
+
148
+ ## ActivationCache
149
+
150
+ Stores and provides access to all activations from a forward pass.
151
+
152
+ ### Accessing Activations
153
+
154
+ ```python
155
+ logits, cache = model.run_with_cache(tokens)
156
+
157
+ # By name and layer
158
+ residual = cache["resid_post", 5]
159
+ attention = cache["pattern", 3]
160
+ mlp_out = cache["mlp_out", 7]
161
+
162
+ # Full name string
163
+ residual = cache["blocks.5.hook_resid_post"]
164
+ ```
165
+
166
+ ### Cache Keys
167
+
168
+ | Key Pattern | Shape | Description |
169
+ |-------------|-------|-------------|
170
+ | `hook_embed` | [batch, pos, d_model] | Token embeddings |
171
+ | `hook_pos_embed` | [batch, pos, d_model] | Positional embeddings |
172
+ | `resid_pre, layer` | [batch, pos, d_model] | Residual before attention |
173
+ | `resid_mid, layer` | [batch, pos, d_model] | Residual after attention |
174
+ | `resid_post, layer` | [batch, pos, d_model] | Residual after MLP |
175
+ | `attn_out, layer` | [batch, pos, d_model] | Attention output |
176
+ | `mlp_out, layer` | [batch, pos, d_model] | MLP output |
177
+ | `pattern, layer` | [batch, head, q_pos, k_pos] | Attention pattern (post-softmax) |
178
+ | `attn_scores, layer` | [batch, head, q_pos, k_pos] | Attention scores (pre-softmax) |
179
+ | `q, layer` | [batch, pos, head, d_head] | Query vectors |
180
+ | `k, layer` | [batch, pos, head, d_head] | Key vectors |
181
+ | `v, layer` | [batch, pos, head, d_head] | Value vectors |
182
+ | `z, layer` | [batch, pos, head, d_head] | Attention output per head |
183
+
184
+ ### Analysis Methods
185
+
186
+ #### decompose_resid()
187
+
188
+ Decomposes residual stream into component contributions:
189
+
190
+ ```python
191
+ components, labels = cache.decompose_resid(
192
+ layer=5,
193
+ return_labels=True,
194
+ mode="attn" # or "mlp" or "all" (default "all")
195
+ )
196
+ ```
197
+
198
+ #### accumulated_resid()
199
+
200
+ Get accumulated residual at each layer (for Logit Lens):
201
+
202
+ ```python
203
+ accumulated = cache.accumulated_resid(
204
+ layer=None, # All layers
205
+ incl_mid=False,
206
+ apply_ln=True # Apply final LayerNorm
207
+ )
208
+ ```
209
+
210
+ #### logit_attrs()
211
+
212
+ Calculate logit attribution for components:
213
+
214
+ ```python
215
+ attrs = cache.logit_attrs(
216
+ residual_stack,
217
+ tokens=target_tokens,
218
+ incorrect_tokens=incorrect_tokens
219
+ )
220
+ ```
221
+
222
+ #### stack_head_results()
223
+
224
+ Stack attention head outputs:
225
+
226
+ ```python
227
+ head_results = cache.stack_head_results(
228
+ layer=-1, # All layers
229
+ pos_slice=None # All positions
230
+ )
231
+ # Shape: [n_layers, n_heads, batch, pos, d_model]
232
+ ```
233
+
234
+ ### Utility Methods
235
+
236
+ ```python
237
+ # Move cache to device
238
+ cache = cache.to("cpu")
239
+
240
+ # Remove batch dimension (for batch_size=1)
241
+ cache = cache.remove_batch_dim()
242
+
243
+ # Get all keys
244
+ keys = cache.keys()
245
+
246
+ # Iterate
247
+ for name, activation in cache.items():
248
+ print(name, activation.shape)
249
+ ```
250
+
251
+ ---
252
+
253
+ ## HookPoint
254
+
255
+ The fundamental hook mechanism wrapping every activation.
256
+
257
+ ### Hook Function Signature
258
+
259
+ ```python
260
+ def hook_fn(activation: torch.Tensor, hook: HookPoint) -> torch.Tensor:
261
+ """
262
+ Args:
263
+ activation: Current activation value
264
+ hook: The HookPoint object (has .name attribute)
265
+
266
+ Returns:
267
+ Modified activation (or None to keep original)
268
+ """
269
+ # Modify activation
270
+ return activation
271
+ ```
272
+
273
+ ### Common Hook Patterns
274
+
275
+ ```python
276
+ # Zero ablation
277
+ def zero_hook(act, hook):
278
+ act[:, :, :] = 0
279
+ return act
280
+
281
+ # Mean ablation
282
+ def mean_hook(act, hook):
283
+ act[:, :, :] = act.mean(dim=0, keepdim=True)
284
+ return act
285
+
286
+ # Patch from cache
287
+ def patch_hook(act, hook):
288
+ act[:, 5, :] = clean_cache[hook.name][:, 5, :]
289
+ return act
290
+
291
+ # Add steering vector
292
+ def steer_hook(act, hook):
293
+ act += 0.5 * steering_vector
294
+ return act
295
+ ```
296
+
297
+ ---
298
+
299
+ ## Utility Functions
300
+
301
+ ### patching module
302
+
303
+ ```python
304
+ from transformer_lens import patching
305
+
306
+ # Generic activation patching
307
+ results = patching.generic_activation_patch(
308
+ model=model,
309
+ corrupted_tokens=corrupted,
310
+ clean_cache=clean_cache,
311
+ patching_metric=metric_fn,
312
+ patch_setter=patch_fn,
313
+ activation_name="resid_post",
314
+ index_axis_names=("layer", "pos")
315
+ )
316
+ ```
317
+
318
+ ### FactoredMatrix
319
+
320
+ Efficient operations on factored weight matrices:
321
+
322
+ ```python
323
+ from transformer_lens import FactoredMatrix
324
+
325
+ # QK circuit
326
+ QK = FactoredMatrix(model.W_Q[layer], model.W_K[layer].T)
327
+
328
+ # OV circuit
329
+ OV = FactoredMatrix(model.W_V[layer], model.W_O[layer])
330
+
331
+ # Get full matrix
332
+ full = QK.AB
333
+
334
+ # SVD decomposition
335
+ U, S, V = QK.svd()
336
+ ```
337
+
338
+ ---
339
+
340
+ ## Configuration
341
+
342
+ ### HookedTransformerConfig
343
+
344
+ Key configuration attributes:
345
+
346
+ | Attribute | Description |
347
+ |-----------|-------------|
348
+ | `n_layers` | Number of transformer layers |
349
+ | `n_heads` | Number of attention heads |
350
+ | `d_model` | Model dimension |
351
+ | `d_head` | Head dimension |
352
+ | `d_mlp` | MLP hidden dimension |
353
+ | `d_vocab` | Vocabulary size |
354
+ | `n_ctx` | Maximum context length |
355
+ | `act_fn` | Activation function name |
356
+ | `normalization_type` | "LN" or "LNPre" |
357
+
358
+ Access via:
359
+ ```python
360
+ model.cfg.n_layers
361
+ model.cfg.d_model
362
+ ```
@@ -0,0 +1,339 @@
1
+ # TransformerLens Tutorials
2
+
3
+ ## Tutorial 1: Basic Activation Analysis
4
+
5
+ ### Goal
6
+ Understand how to load models, cache activations, and inspect model internals.
7
+
8
+ ### Step-by-Step
9
+
10
+ ```python
11
+ from transformer_lens import HookedTransformer
12
+ import torch
13
+
14
+ # 1. Load model
15
+ model = HookedTransformer.from_pretrained("gpt2-small")
16
+ print(f"Model has {model.cfg.n_layers} layers, {model.cfg.n_heads} heads")
17
+
18
+ # 2. Tokenize input
19
+ prompt = "The capital of France is"
20
+ tokens = model.to_tokens(prompt)
21
+ print(f"Tokens shape: {tokens.shape}")
22
+ print(f"String tokens: {model.to_str_tokens(prompt)}")
23
+
24
+ # 3. Run with cache
25
+ logits, cache = model.run_with_cache(tokens)
26
+ print(f"Logits shape: {logits.shape}")
27
+ print(f"Cache keys: {len(cache.keys())}")
28
+
29
+ # 4. Inspect activations
30
+ for layer in range(model.cfg.n_layers):
31
+ resid = cache["resid_post", layer]
32
+ print(f"Layer {layer} residual norm: {resid.norm().item():.2f}")
33
+
34
+ # 5. Look at attention patterns
35
+ attn = cache["pattern", 0] # Layer 0
36
+ print(f"Attention shape: {attn.shape}") # [batch, heads, q_pos, k_pos]
37
+
38
+ # 6. Get top predictions
39
+ probs = torch.softmax(logits[0, -1], dim=-1)
40
+ top_tokens = probs.topk(5)
41
+ for token_id, prob in zip(top_tokens.indices, top_tokens.values):
42
+ print(f"{model.to_string(token_id.unsqueeze(0))}: {prob.item():.3f}")
43
+ ```
44
+
45
+ ---
46
+
47
+ ## Tutorial 2: Activation Patching
48
+
49
+ ### Goal
50
+ Identify which activations causally affect model output.
51
+
52
+ ### Concept
53
+ 1. Run model on "clean" input, cache activations
54
+ 2. Run model on "corrupted" input
55
+ 3. Patch clean activations into corrupted run
56
+ 4. Measure effect on output
57
+
58
+ ### Step-by-Step
59
+
60
+ ```python
61
+ from transformer_lens import HookedTransformer
62
+ import torch
63
+
64
+ model = HookedTransformer.from_pretrained("gpt2-small")
65
+
66
+ # Define clean and corrupted prompts
67
+ clean_prompt = "The Eiffel Tower is in the city of"
68
+ corrupted_prompt = "The Colosseum is in the city of"
69
+
70
+ clean_tokens = model.to_tokens(clean_prompt)
71
+ corrupted_tokens = model.to_tokens(corrupted_prompt)
72
+
73
+ # Get clean activations
74
+ _, clean_cache = model.run_with_cache(clean_tokens)
75
+
76
+ # Define metric
77
+ paris_token = model.to_single_token(" Paris")
78
+ rome_token = model.to_single_token(" Rome")
79
+
80
+ def logit_diff(logits):
81
+ """Positive = model prefers Paris over Rome"""
82
+ return (logits[0, -1, paris_token] - logits[0, -1, rome_token]).item()
83
+
84
+ # Baseline measurements
85
+ clean_logits = model(clean_tokens)
86
+ corrupted_logits = model(corrupted_tokens)
87
+ print(f"Clean logit diff: {logit_diff(clean_logits):.3f}")
88
+ print(f"Corrupted logit diff: {logit_diff(corrupted_logits):.3f}")
89
+
90
+ # Patch each layer
91
+ results = []
92
+ for layer in range(model.cfg.n_layers):
93
+ def patch_hook(activation, hook, layer=layer):
94
+ activation[:] = clean_cache["resid_post", layer]
95
+ return activation
96
+
97
+ patched_logits = model.run_with_hooks(
98
+ corrupted_tokens,
99
+ fwd_hooks=[(f"blocks.{layer}.hook_resid_post", patch_hook)]
100
+ )
101
+ results.append(logit_diff(patched_logits))
102
+ print(f"Layer {layer}: {results[-1]:.3f}")
103
+
104
+ # Find most important layer
105
+ best_layer = max(range(len(results)), key=lambda i: results[i])
106
+ print(f"\nMost important layer: {best_layer}")
107
+ ```
108
+
109
+ ### Position-Specific Patching
110
+
111
+ ```python
112
+ import torch
113
+
114
+ seq_len = clean_tokens.shape[1]
115
+ results = torch.zeros(model.cfg.n_layers, seq_len)
116
+
117
+ for layer in range(model.cfg.n_layers):
118
+ for pos in range(seq_len):
119
+ def patch_hook(activation, hook, layer=layer, pos=pos):
120
+ activation[:, pos, :] = clean_cache["resid_post", layer][:, pos, :]
121
+ return activation
122
+
123
+ patched_logits = model.run_with_hooks(
124
+ corrupted_tokens,
125
+ fwd_hooks=[(f"blocks.{layer}.hook_resid_post", patch_hook)]
126
+ )
127
+ results[layer, pos] = logit_diff(patched_logits)
128
+
129
+ # Visualize as heatmap
130
+ import matplotlib.pyplot as plt
131
+ plt.figure(figsize=(12, 8))
132
+ plt.imshow(results.numpy(), aspect='auto', cmap='RdBu')
133
+ plt.xlabel('Position')
134
+ plt.ylabel('Layer')
135
+ plt.colorbar(label='Logit Difference')
136
+ plt.title('Activation Patching Results')
137
+ ```
138
+
139
+ ---
140
+
141
+ ## Tutorial 3: Direct Logit Attribution
142
+
143
+ ### Goal
144
+ Identify which components (heads, neurons) contribute to specific predictions.
145
+
146
+ ### Step-by-Step
147
+
148
+ ```python
149
+ from transformer_lens import HookedTransformer
150
+ import torch
151
+
152
+ model = HookedTransformer.from_pretrained("gpt2-small")
153
+
154
+ prompt = "The capital of France is"
155
+ tokens = model.to_tokens(prompt)
156
+ logits, cache = model.run_with_cache(tokens)
157
+
158
+ # Target token
159
+ target_token = model.to_single_token(" Paris")
160
+
161
+ # Get unembedding direction for target
162
+ target_direction = model.W_U[:, target_token] # [d_model]
163
+
164
+ # Attribution per attention head
165
+ head_contributions = torch.zeros(model.cfg.n_layers, model.cfg.n_heads)
166
+
167
+ for layer in range(model.cfg.n_layers):
168
+ # Get per-head output at final position
169
+ z = cache["z", layer][0, -1] # [n_heads, d_head]
170
+
171
+ for head in range(model.cfg.n_heads):
172
+ # Project through W_O to get contribution to residual
173
+ head_out = z[head] @ model.W_O[layer, head] # [d_model]
174
+
175
+ # Dot with target direction
176
+ contribution = (head_out @ target_direction).item()
177
+ head_contributions[layer, head] = contribution
178
+
179
+ # Find top contributing heads
180
+ flat_idx = head_contributions.flatten().topk(10)
181
+ print("Top 10 heads for predicting 'Paris':")
182
+ for idx, val in zip(flat_idx.indices, flat_idx.values):
183
+ layer = idx.item() // model.cfg.n_heads
184
+ head = idx.item() % model.cfg.n_heads
185
+ print(f" L{layer}H{head}: {val.item():.3f}")
186
+ ```
187
+
188
+ ---
189
+
190
+ ## Tutorial 4: Induction Head Detection
191
+
192
+ ### Goal
193
+ Find attention heads that implement the [A][B]...[A] → [B] pattern.
194
+
195
+ ### Step-by-Step
196
+
197
+ ```python
198
+ from transformer_lens import HookedTransformer
199
+ import torch
200
+
201
+ model = HookedTransformer.from_pretrained("gpt2-small")
202
+
203
+ # Create repeated sequence pattern
204
+ # Pattern: [A][B][C][A] - model should attend from last A to B
205
+ seq = torch.randint(1000, 5000, (1, 20))
206
+ # Repeat first half
207
+ seq[0, 10:] = seq[0, :10]
208
+
209
+ _, cache = model.run_with_cache(seq)
210
+
211
+ # For induction heads: position i should attend to position (i - seq_len/2 + 1)
212
+ # At position 10 (second A), should attend to position 1 (first B)
213
+
214
+ induction_scores = torch.zeros(model.cfg.n_layers, model.cfg.n_heads)
215
+
216
+ for layer in range(model.cfg.n_layers):
217
+ pattern = cache["pattern", layer][0] # [heads, q_pos, k_pos]
218
+
219
+ # Check attention from repeated positions to position after first occurrence
220
+ for offset in range(1, 10):
221
+ q_pos = 10 + offset # Position in second half
222
+ k_pos = offset + 1 # Induction target: the token AFTER the corresponding first-half position
223
+
224
+ # Average attention to the "correct" position
225
+ induction_scores[layer] += pattern[:, q_pos, k_pos]
226
+
227
+ induction_scores[layer] /= 9 # Average over offsets
228
+
229
+ # Find top induction heads
230
+ print("Top induction heads:")
231
+ for layer in range(model.cfg.n_layers):
232
+ for head in range(model.cfg.n_heads):
233
+ score = induction_scores[layer, head].item()
234
+ if score > 0.3:
235
+ print(f" L{layer}H{head}: {score:.3f}")
236
+ ```
237
+
238
+ ---
239
+
240
+ ## Tutorial 5: Logit Lens
241
+
242
+ ### Goal
243
+ See what the model "believes" at each layer before final unembedding.
244
+
245
+ ### Step-by-Step
246
+
247
+ ```python
248
+ from transformer_lens import HookedTransformer
249
+ import torch
250
+
251
+ model = HookedTransformer.from_pretrained("gpt2-small")
252
+
253
+ prompt = "The quick brown fox jumps over the lazy"
254
+ tokens = model.to_tokens(prompt)
255
+ logits, cache = model.run_with_cache(tokens)
256
+
257
+ # Get accumulated residual at each layer
258
+ # Apply LayerNorm to match what unembedding sees
259
+ accumulated = cache.accumulated_resid(layer=None, incl_mid=False, apply_ln=True)
260
+ # Shape: [n_layers + 1, batch, pos, d_model]
261
+
262
+ # Project to vocabulary
263
+ layer_logits = accumulated @ model.W_U # [n_layers + 1, batch, pos, d_vocab]
264
+
265
+ # Look at predictions for final position
266
+ print("Layer-by-layer predictions for final token:")
267
+ for layer in range(model.cfg.n_layers + 1):
268
+ probs = torch.softmax(layer_logits[layer, 0, -1], dim=-1)
269
+ top_token = probs.argmax()
270
+ top_prob = probs[top_token].item()
271
+ print(f"Layer {layer}: {model.to_string(top_token.unsqueeze(0))!r} ({top_prob:.3f})")
272
+ ```
273
+
274
+ ---
275
+
276
+ ## Tutorial 6: Steering with Activation Addition
277
+
278
+ ### Goal
279
+ Add a steering vector to change model behavior.
280
+
281
+ ### Step-by-Step
282
+
283
+ ```python
284
+ from transformer_lens import HookedTransformer
285
+ import torch
286
+
287
+ model = HookedTransformer.from_pretrained("gpt2-small")
288
+
289
+ # Get activations for contrasting prompts
290
+ positive_prompt = "I love this! It's absolutely wonderful and"
291
+ negative_prompt = "I hate this! It's absolutely terrible and"
292
+
293
+ _, pos_cache = model.run_with_cache(model.to_tokens(positive_prompt))
294
+ _, neg_cache = model.run_with_cache(model.to_tokens(negative_prompt))
295
+
296
+ # Compute steering vector (positive - negative direction)
297
+ layer = 6
298
+ steering_vector = (
299
+ pos_cache["resid_post", layer].mean(dim=1) -
300
+ neg_cache["resid_post", layer].mean(dim=1)
301
+ )
302
+
303
+ # Generate with steering
304
+ test_prompt = "The movie was"
305
+ test_tokens = model.to_tokens(test_prompt)
306
+
307
+ def steer_hook(activation, hook):
308
+ activation += 2.0 * steering_vector
309
+ return activation
310
+
311
+ # Without steering
312
+ normal_output = model.generate(test_tokens, max_new_tokens=20)
313
+ print(f"Normal: {model.to_string(normal_output[0])}")
314
+
315
+ # With positive steering
316
+ steered_output = model.generate(
317
+ test_tokens,
318
+ max_new_tokens=20,
319
+ fwd_hooks=[(f"blocks.{layer}.hook_resid_post", steer_hook)]
320
+ )
321
+ print(f"Steered: {model.to_string(steered_output[0])}")
322
+ ```
323
+
324
+ ---
325
+
326
+ ## External Resources
327
+
328
+ ### Official Tutorials
329
+ - [Main Demo](https://transformerlensorg.github.io/TransformerLens/generated/demos/Main_Demo.html)
330
+ - [Exploratory Analysis](https://transformerlensorg.github.io/TransformerLens/generated/demos/Exploratory_Analysis_Demo.html)
331
+ - [Activation Patching Demo](https://colab.research.google.com/github/TransformerLensOrg/TransformerLens/blob/main/demos/Activation_Patching_in_TL_Demo.ipynb)
332
+
333
+ ### ARENA Course
334
+ Comprehensive 200+ hour curriculum: https://arena-foundation.github.io/ARENA/
335
+
336
+ ### Neel Nanda's Resources
337
+ - [Getting Started in Mech Interp](https://www.neelnanda.io/mechanistic-interpretability/getting-started)
338
+ - [Mech Interp Glossary](https://www.neelnanda.io/mechanistic-interpretability/glossary)
339
+ - [YouTube Channel](https://www.youtube.com/@neelnanda)