@wentorai/research-plugins 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +204 -0
  3. package/curated/analysis/README.md +64 -0
  4. package/curated/domains/README.md +104 -0
  5. package/curated/literature/README.md +53 -0
  6. package/curated/research/README.md +62 -0
  7. package/curated/tools/README.md +87 -0
  8. package/curated/writing/README.md +61 -0
  9. package/index.ts +39 -0
  10. package/mcp-configs/academic-db/ChatSpatial.json +17 -0
  11. package/mcp-configs/academic-db/academia-mcp.json +17 -0
  12. package/mcp-configs/academic-db/academic-paper-explorer.json +17 -0
  13. package/mcp-configs/academic-db/academic-search-mcp-server.json +17 -0
  14. package/mcp-configs/academic-db/agentinterviews-mcp.json +17 -0
  15. package/mcp-configs/academic-db/all-in-mcp.json +17 -0
  16. package/mcp-configs/academic-db/apple-health-mcp.json +17 -0
  17. package/mcp-configs/academic-db/arxiv-latex-mcp.json +17 -0
  18. package/mcp-configs/academic-db/arxiv-mcp-server.json +17 -0
  19. package/mcp-configs/academic-db/bgpt-mcp.json +17 -0
  20. package/mcp-configs/academic-db/biomcp.json +17 -0
  21. package/mcp-configs/academic-db/biothings-mcp.json +17 -0
  22. package/mcp-configs/academic-db/catalysishub-mcp-server.json +17 -0
  23. package/mcp-configs/academic-db/clinicaltrialsgov-mcp-server.json +17 -0
  24. package/mcp-configs/academic-db/deep-research-mcp.json +17 -0
  25. package/mcp-configs/academic-db/dicom-mcp.json +17 -0
  26. package/mcp-configs/academic-db/enrichr-mcp-server.json +17 -0
  27. package/mcp-configs/academic-db/fec-mcp-server.json +17 -0
  28. package/mcp-configs/academic-db/fhir-mcp-server-themomentum.json +17 -0
  29. package/mcp-configs/academic-db/fhir-mcp.json +19 -0
  30. package/mcp-configs/academic-db/gget-mcp.json +17 -0
  31. package/mcp-configs/academic-db/google-researcher-mcp.json +17 -0
  32. package/mcp-configs/academic-db/idea-reality-mcp.json +17 -0
  33. package/mcp-configs/academic-db/legiscan-mcp.json +19 -0
  34. package/mcp-configs/academic-db/lex.json +17 -0
  35. package/mcp-configs/ai-platform/Adaptive-Graph-of-Thoughts-MCP-server.json +17 -0
  36. package/mcp-configs/ai-platform/ai-counsel.json +17 -0
  37. package/mcp-configs/ai-platform/atlas-mcp-server.json +17 -0
  38. package/mcp-configs/ai-platform/counsel-mcp.json +17 -0
  39. package/mcp-configs/ai-platform/cross-llm-mcp.json +17 -0
  40. package/mcp-configs/ai-platform/gptr-mcp.json +17 -0
  41. package/mcp-configs/browser/decipher-research-agent.json +17 -0
  42. package/mcp-configs/browser/deep-research.json +17 -0
  43. package/mcp-configs/browser/everything-claude-code.json +17 -0
  44. package/mcp-configs/browser/gpt-researcher.json +17 -0
  45. package/mcp-configs/browser/heurist-agent-framework.json +17 -0
  46. package/mcp-configs/data-platform/4everland-hosting-mcp.json +17 -0
  47. package/mcp-configs/data-platform/context-keeper.json +17 -0
  48. package/mcp-configs/data-platform/context7.json +19 -0
  49. package/mcp-configs/data-platform/contextstream-mcp.json +17 -0
  50. package/mcp-configs/data-platform/email-mcp.json +17 -0
  51. package/mcp-configs/note-knowledge/ApeRAG.json +17 -0
  52. package/mcp-configs/note-knowledge/In-Memoria.json +17 -0
  53. package/mcp-configs/note-knowledge/agent-memory.json +17 -0
  54. package/mcp-configs/note-knowledge/aimemo.json +17 -0
  55. package/mcp-configs/note-knowledge/biel-mcp.json +19 -0
  56. package/mcp-configs/note-knowledge/cognee.json +17 -0
  57. package/mcp-configs/note-knowledge/context-awesome.json +17 -0
  58. package/mcp-configs/note-knowledge/context-mcp.json +17 -0
  59. package/mcp-configs/note-knowledge/conversation-handoff-mcp.json +17 -0
  60. package/mcp-configs/note-knowledge/cortex.json +17 -0
  61. package/mcp-configs/note-knowledge/devrag.json +17 -0
  62. package/mcp-configs/note-knowledge/easy-obsidian-mcp.json +17 -0
  63. package/mcp-configs/note-knowledge/engram.json +17 -0
  64. package/mcp-configs/note-knowledge/gnosis-mcp.json +17 -0
  65. package/mcp-configs/note-knowledge/graphlit-mcp-server.json +19 -0
  66. package/mcp-configs/reference-mgr/arxiv-cli.json +17 -0
  67. package/mcp-configs/reference-mgr/arxiv-search-mcp.json +17 -0
  68. package/mcp-configs/reference-mgr/chiken.json +17 -0
  69. package/mcp-configs/reference-mgr/claude-scholar.json +17 -0
  70. package/mcp-configs/reference-mgr/devonthink-mcp.json +17 -0
  71. package/mcp-configs/registry.json +447 -0
  72. package/openclaw.plugin.json +21 -0
  73. package/package.json +61 -0
  74. package/skills/analysis/dataviz/color-accessibility-guide/SKILL.md +230 -0
  75. package/skills/analysis/dataviz/geospatial-viz-guide/SKILL.md +218 -0
  76. package/skills/analysis/dataviz/interactive-viz-guide/SKILL.md +287 -0
  77. package/skills/analysis/dataviz/network-visualization-guide/SKILL.md +195 -0
  78. package/skills/analysis/dataviz/publication-figures-guide/SKILL.md +238 -0
  79. package/skills/analysis/dataviz/python-dataviz-guide/SKILL.md +195 -0
  80. package/skills/analysis/econometrics/causal-inference-guide/SKILL.md +197 -0
  81. package/skills/analysis/econometrics/iv-regression-guide/SKILL.md +198 -0
  82. package/skills/analysis/econometrics/panel-data-guide/SKILL.md +274 -0
  83. package/skills/analysis/econometrics/robustness-checks/SKILL.md +250 -0
  84. package/skills/analysis/econometrics/stata-regression/SKILL.md +117 -0
  85. package/skills/analysis/econometrics/time-series-guide/SKILL.md +235 -0
  86. package/skills/analysis/statistics/bayesian-statistics-guide/SKILL.md +221 -0
  87. package/skills/analysis/statistics/hypothesis-testing-guide/SKILL.md +210 -0
  88. package/skills/analysis/statistics/meta-analysis-guide/SKILL.md +206 -0
  89. package/skills/analysis/statistics/nonparametric-tests-guide/SKILL.md +221 -0
  90. package/skills/analysis/statistics/power-analysis-guide/SKILL.md +240 -0
  91. package/skills/analysis/statistics/sem-guide/SKILL.md +231 -0
  92. package/skills/analysis/statistics/survival-analysis-guide/SKILL.md +195 -0
  93. package/skills/analysis/wrangling/missing-data-handling/SKILL.md +224 -0
  94. package/skills/analysis/wrangling/pandas-data-wrangling/SKILL.md +242 -0
  95. package/skills/analysis/wrangling/questionnaire-design-guide/SKILL.md +234 -0
  96. package/skills/analysis/wrangling/text-mining-guide/SKILL.md +225 -0
  97. package/skills/domains/ai-ml/computer-vision-guide/SKILL.md +213 -0
  98. package/skills/domains/ai-ml/deep-learning-papers-guide/SKILL.md +200 -0
  99. package/skills/domains/ai-ml/llm-evaluation-guide/SKILL.md +194 -0
  100. package/skills/domains/ai-ml/prompt-engineering-research/SKILL.md +233 -0
  101. package/skills/domains/ai-ml/reinforcement-learning-guide/SKILL.md +254 -0
  102. package/skills/domains/ai-ml/transformer-architecture-guide/SKILL.md +233 -0
  103. package/skills/domains/biomedical/clinical-research-guide/SKILL.md +232 -0
  104. package/skills/domains/biomedical/clinicaltrials-api/SKILL.md +177 -0
  105. package/skills/domains/biomedical/epidemiology-guide/SKILL.md +200 -0
  106. package/skills/domains/biomedical/genomics-analysis-guide/SKILL.md +270 -0
  107. package/skills/domains/business/market-analysis-guide/SKILL.md +112 -0
  108. package/skills/domains/business/strategic-management-guide/SKILL.md +154 -0
  109. package/skills/domains/chemistry/computational-chemistry-guide/SKILL.md +266 -0
  110. package/skills/domains/chemistry/retrosynthesis-guide/SKILL.md +215 -0
  111. package/skills/domains/cs/algorithms-complexity-guide/SKILL.md +194 -0
  112. package/skills/domains/cs/dblp-api/SKILL.md +129 -0
  113. package/skills/domains/cs/software-engineering-research/SKILL.md +218 -0
  114. package/skills/domains/ecology/biodiversity-data-guide/SKILL.md +296 -0
  115. package/skills/domains/ecology/conservation-biology-guide/SKILL.md +198 -0
  116. package/skills/domains/ecology/gbif-api/SKILL.md +158 -0
  117. package/skills/domains/ecology/inaturalist-api/SKILL.md +173 -0
  118. package/skills/domains/economics/behavioral-economics-guide/SKILL.md +239 -0
  119. package/skills/domains/economics/development-economics-guide/SKILL.md +181 -0
  120. package/skills/domains/economics/fred-api/SKILL.md +189 -0
  121. package/skills/domains/education/curriculum-design-guide/SKILL.md +144 -0
  122. package/skills/domains/education/learning-science-guide/SKILL.md +150 -0
  123. package/skills/domains/finance/financial-data-analysis/SKILL.md +152 -0
  124. package/skills/domains/finance/quantitative-finance-guide/SKILL.md +151 -0
  125. package/skills/domains/geoscience/climate-science-guide/SKILL.md +158 -0
  126. package/skills/domains/geoscience/gis-remote-sensing-guide/SKILL.md +129 -0
  127. package/skills/domains/humanities/digital-humanities-guide/SKILL.md +181 -0
  128. package/skills/domains/humanities/philosophy-research-guide/SKILL.md +148 -0
  129. package/skills/domains/law/courtlistener-api/SKILL.md +213 -0
  130. package/skills/domains/law/legal-research-guide/SKILL.md +250 -0
  131. package/skills/domains/math/linear-algebra-applications/SKILL.md +227 -0
  132. package/skills/domains/math/numerical-methods-guide/SKILL.md +236 -0
  133. package/skills/domains/math/oeis-api/SKILL.md +158 -0
  134. package/skills/domains/pharma/clinical-pharmacology-guide/SKILL.md +165 -0
  135. package/skills/domains/pharma/drug-development-guide/SKILL.md +177 -0
  136. package/skills/domains/physics/computational-physics-guide/SKILL.md +300 -0
  137. package/skills/domains/physics/nasa-ads-api/SKILL.md +150 -0
  138. package/skills/domains/physics/quantum-computing-guide/SKILL.md +234 -0
  139. package/skills/domains/social-science/social-research-methods/SKILL.md +194 -0
  140. package/skills/domains/social-science/survey-research-guide/SKILL.md +182 -0
  141. package/skills/literature/discovery/citation-alert-guide/SKILL.md +154 -0
  142. package/skills/literature/discovery/conference-proceedings-guide/SKILL.md +142 -0
  143. package/skills/literature/discovery/literature-mapping-guide/SKILL.md +175 -0
  144. package/skills/literature/discovery/paper-tracking-guide/SKILL.md +211 -0
  145. package/skills/literature/discovery/rss-paper-feeds/SKILL.md +214 -0
  146. package/skills/literature/discovery/semantic-scholar-recs-guide/SKILL.md +164 -0
  147. package/skills/literature/fulltext/doaj-api/SKILL.md +120 -0
  148. package/skills/literature/fulltext/interlibrary-loan-guide/SKILL.md +163 -0
  149. package/skills/literature/fulltext/open-access-guide/SKILL.md +183 -0
  150. package/skills/literature/fulltext/pmc-oai-api/SKILL.md +184 -0
  151. package/skills/literature/fulltext/preprint-servers-guide/SKILL.md +128 -0
  152. package/skills/literature/fulltext/repository-harvesting-guide/SKILL.md +207 -0
  153. package/skills/literature/fulltext/unpaywall-api/SKILL.md +113 -0
  154. package/skills/literature/metadata/altmetrics-guide/SKILL.md +132 -0
  155. package/skills/literature/metadata/citation-network-guide/SKILL.md +236 -0
  156. package/skills/literature/metadata/crossref-api/SKILL.md +133 -0
  157. package/skills/literature/metadata/datacite-api/SKILL.md +126 -0
  158. package/skills/literature/metadata/doi-resolution-guide/SKILL.md +168 -0
  159. package/skills/literature/metadata/h-index-guide/SKILL.md +183 -0
  160. package/skills/literature/metadata/journal-metrics-guide/SKILL.md +188 -0
  161. package/skills/literature/metadata/opencitations-api/SKILL.md +128 -0
  162. package/skills/literature/metadata/orcid-api/SKILL.md +136 -0
  163. package/skills/literature/metadata/orcid-integration-guide/SKILL.md +178 -0
  164. package/skills/literature/search/arxiv-api/SKILL.md +95 -0
  165. package/skills/literature/search/biorxiv-api/SKILL.md +123 -0
  166. package/skills/literature/search/boolean-search-guide/SKILL.md +199 -0
  167. package/skills/literature/search/citation-chaining-guide/SKILL.md +148 -0
  168. package/skills/literature/search/database-comparison-guide/SKILL.md +100 -0
  169. package/skills/literature/search/europe-pmc-api/SKILL.md +120 -0
  170. package/skills/literature/search/google-scholar-guide/SKILL.md +182 -0
  171. package/skills/literature/search/mesh-terms-guide/SKILL.md +164 -0
  172. package/skills/literature/search/openalex-api/SKILL.md +134 -0
  173. package/skills/literature/search/pubmed-api/SKILL.md +130 -0
  174. package/skills/literature/search/scientify-literature-survey/SKILL.md +203 -0
  175. package/skills/literature/search/semantic-scholar-api/SKILL.md +134 -0
  176. package/skills/literature/search/systematic-search-strategy/SKILL.md +214 -0
  177. package/skills/research/automation/ai-scientist-guide/SKILL.md +228 -0
  178. package/skills/research/automation/data-collection-automation/SKILL.md +248 -0
  179. package/skills/research/automation/research-workflow-automation/SKILL.md +266 -0
  180. package/skills/research/deep-research/meta-synthesis-guide/SKILL.md +174 -0
  181. package/skills/research/deep-research/research-cog/SKILL.md +153 -0
  182. package/skills/research/deep-research/scoping-review-guide/SKILL.md +217 -0
  183. package/skills/research/deep-research/systematic-review-guide/SKILL.md +250 -0
  184. package/skills/research/funding/figshare-api/SKILL.md +163 -0
  185. package/skills/research/funding/grant-writing-guide/SKILL.md +233 -0
  186. package/skills/research/funding/nsf-grant-guide/SKILL.md +206 -0
  187. package/skills/research/funding/open-science-guide/SKILL.md +255 -0
  188. package/skills/research/funding/zenodo-api/SKILL.md +174 -0
  189. package/skills/research/methodology/action-research-guide/SKILL.md +201 -0
  190. package/skills/research/methodology/experimental-design-guide/SKILL.md +236 -0
  191. package/skills/research/methodology/grad-school-guide/SKILL.md +182 -0
  192. package/skills/research/methodology/grounded-theory-guide/SKILL.md +171 -0
  193. package/skills/research/methodology/mixed-methods-guide/SKILL.md +208 -0
  194. package/skills/research/methodology/qualitative-research-guide/SKILL.md +234 -0
  195. package/skills/research/methodology/scientify-idea-generation/SKILL.md +222 -0
  196. package/skills/research/paper-review/paper-reading-assistant/SKILL.md +266 -0
  197. package/skills/research/paper-review/peer-review-guide/SKILL.md +227 -0
  198. package/skills/research/paper-review/rebuttal-writing-guide/SKILL.md +185 -0
  199. package/skills/research/paper-review/scientify-write-review-paper/SKILL.md +209 -0
  200. package/skills/tools/code-exec/jupyter-notebook-guide/SKILL.md +178 -0
  201. package/skills/tools/code-exec/python-reproducibility-guide/SKILL.md +341 -0
  202. package/skills/tools/code-exec/r-reproducibility-guide/SKILL.md +236 -0
  203. package/skills/tools/code-exec/sandbox-execution-guide/SKILL.md +221 -0
  204. package/skills/tools/diagram/mermaid-diagram-guide/SKILL.md +269 -0
  205. package/skills/tools/diagram/plantuml-guide/SKILL.md +397 -0
  206. package/skills/tools/diagram/scientific-illustration-guide/SKILL.md +225 -0
  207. package/skills/tools/document/anystyle-api/SKILL.md +199 -0
  208. package/skills/tools/document/grobid-pdf-parsing/SKILL.md +294 -0
  209. package/skills/tools/document/markdown-academic-guide/SKILL.md +217 -0
  210. package/skills/tools/document/pdf-extraction-guide/SKILL.md +321 -0
  211. package/skills/tools/knowledge-graph/knowledge-graph-construction/SKILL.md +306 -0
  212. package/skills/tools/knowledge-graph/ontology-design-guide/SKILL.md +214 -0
  213. package/skills/tools/knowledge-graph/rag-methodology-guide/SKILL.md +325 -0
  214. package/skills/tools/ocr-translate/formula-recognition-guide/SKILL.md +367 -0
  215. package/skills/tools/ocr-translate/handwriting-recognition-guide/SKILL.md +211 -0
  216. package/skills/tools/ocr-translate/latex-ocr-guide/SKILL.md +204 -0
  217. package/skills/tools/ocr-translate/multilingual-research-guide/SKILL.md +234 -0
  218. package/skills/tools/scraping/academic-web-scraping/SKILL.md +326 -0
  219. package/skills/tools/scraping/api-data-collection-guide/SKILL.md +301 -0
  220. package/skills/tools/scraping/web-scraping-ethics-guide/SKILL.md +250 -0
  221. package/skills/writing/citation/bibtex-management-guide/SKILL.md +246 -0
  222. package/skills/writing/citation/citation-style-guide/SKILL.md +248 -0
  223. package/skills/writing/citation/reference-manager-comparison/SKILL.md +208 -0
  224. package/skills/writing/citation/zotero-api/SKILL.md +188 -0
  225. package/skills/writing/composition/abstract-writing-guide/SKILL.md +188 -0
  226. package/skills/writing/composition/discussion-writing-guide/SKILL.md +194 -0
  227. package/skills/writing/composition/introduction-writing-guide/SKILL.md +194 -0
  228. package/skills/writing/composition/literature-review-writing/SKILL.md +196 -0
  229. package/skills/writing/composition/methods-section-guide/SKILL.md +185 -0
  230. package/skills/writing/composition/response-to-reviewers/SKILL.md +215 -0
  231. package/skills/writing/composition/scientific-writing-guide/SKILL.md +152 -0
  232. package/skills/writing/latex/bibliography-management-guide/SKILL.md +206 -0
  233. package/skills/writing/latex/latex-drawing-guide/SKILL.md +234 -0
  234. package/skills/writing/latex/latex-ecosystem-guide/SKILL.md +240 -0
  235. package/skills/writing/latex/math-typesetting-guide/SKILL.md +231 -0
  236. package/skills/writing/latex/overleaf-collaboration-guide/SKILL.md +211 -0
  237. package/skills/writing/latex/tikz-diagrams-guide/SKILL.md +211 -0
  238. package/skills/writing/polish/academic-translation-guide/SKILL.md +175 -0
  239. package/skills/writing/polish/academic-writing-refiner/SKILL.md +143 -0
  240. package/skills/writing/polish/ai-writing-humanizer/SKILL.md +178 -0
  241. package/skills/writing/polish/grammar-checker-guide/SKILL.md +184 -0
  242. package/skills/writing/polish/plagiarism-detection-guide/SKILL.md +167 -0
  243. package/skills/writing/templates/beamer-presentation-guide/SKILL.md +263 -0
  244. package/skills/writing/templates/conference-paper-template/SKILL.md +219 -0
  245. package/skills/writing/templates/thesis-template-guide/SKILL.md +200 -0
  246. package/skills/writing/templates/thesis-writing-guide/SKILL.md +220 -0
  247. package/src/tools/arxiv.ts +131 -0
  248. package/src/tools/crossref.ts +112 -0
  249. package/src/tools/openalex.ts +174 -0
  250. package/src/tools/pubmed.ts +166 -0
  251. package/src/tools/semantic-scholar.ts +108 -0
  252. package/src/tools/unpaywall.ts +58 -0
@@ -0,0 +1,194 @@
1
+ ---
2
+ name: llm-evaluation-guide
3
+ description: "Evaluate and benchmark large language models for research applications"
4
+ metadata:
5
+ openclaw:
6
+ emoji: "brain"
7
+ category: "domains"
8
+ subcategory: "ai-ml"
9
+ keywords: ["LLM evaluation", "benchmarking", "language models", "model evaluation", "NLP metrics", "BLEU", "perplexity"]
10
+ source: "wentor-research-plugins"
11
+ ---
12
+
13
+ # LLM Evaluation Guide
14
+
15
+ A skill for evaluating and benchmarking large language models (LLMs) in research settings. Covers automatic metrics, human evaluation protocols, benchmark suites, evaluation pitfalls, and best practices for reporting LLM performance.
16
+
17
+ ## Evaluation Taxonomy
18
+
19
+ ### Types of Evaluation
20
+
21
+ ```
22
+ 1. Intrinsic evaluation:
23
+ Measures model quality on its own terms
24
+ - Perplexity, likelihood, calibration
25
+ - Useful for comparing architectures and training procedures
26
+
27
+ 2. Extrinsic evaluation:
28
+ Measures model quality on downstream tasks
29
+ - Task-specific benchmarks (QA, summarization, classification)
30
+ - Closer to real-world usefulness
31
+
32
+ 3. Human evaluation:
33
+ Human judges rate model outputs
34
+ - Fluency, correctness, helpfulness, safety
35
+ - Gold standard but expensive and slow
36
+ ```
37
+
38
+ ## Automatic Metrics
39
+
40
+ ### Common Metrics by Task
41
+
42
+ | Task | Metric | Description |
43
+ |------|--------|-------------|
44
+ | Language modeling | Perplexity | Lower is better; measures prediction quality |
45
+ | Machine translation | BLEU, COMET | N-gram overlap; learned quality estimation |
46
+ | Summarization | ROUGE-1/2/L | Recall of n-grams against reference |
47
+ | Question answering | Exact Match, F1 | Token-level match against reference answer |
48
+ | Classification | Accuracy, F1 | Standard classification metrics |
49
+ | Generation quality | BERTScore | Semantic similarity via embeddings |
50
+ | Factuality | FActScore | Proportion of atomic facts supported by evidence |
51
+
52
+ ### Computing Key Metrics
53
+
54
+ ```python
55
+ from collections import Counter
56
+ import math
57
+
58
+
59
+ def compute_bleu(reference: list[str], hypothesis: list[str],
60
+ max_n: int = 4) -> float:
61
+ """
62
+ Compute corpus-level BLEU score (simplified).
63
+
64
+ Args:
65
+ reference: List of reference token sequences
66
+ hypothesis: List of hypothesis token sequences
67
+ max_n: Maximum n-gram order
68
+ """
69
+ precisions = []
70
+
71
+ for n in range(1, max_n + 1):
72
+ num = 0
73
+ den = 0
74
+ for ref_tokens, hyp_tokens in zip(reference, hypothesis):
75
+ ref_ngrams = Counter(
76
+ tuple(ref_tokens[i:i+n]) for i in range(len(ref_tokens) - n + 1)
77
+ )
78
+ hyp_ngrams = Counter(
79
+ tuple(hyp_tokens[i:i+n]) for i in range(len(hyp_tokens) - n + 1)
80
+ )
81
+ clipped = {ng: min(c, ref_ngrams.get(ng, 0))
82
+ for ng, c in hyp_ngrams.items()}
83
+ num += sum(clipped.values())
84
+ den += max(sum(hyp_ngrams.values()), 1)
85
+
86
+ precisions.append(num / max(den, 1))
87
+
88
+ # Brevity penalty
89
+ ref_len = sum(len(r) for r in reference)
90
+ hyp_len = sum(len(h) for h in hypothesis)
91
+ bp = math.exp(1 - ref_len / max(hyp_len, 1)) if hyp_len < ref_len else 1.0
92
+
93
+ # Geometric mean of precisions
94
+ log_avg = sum(math.log(max(p, 1e-10)) for p in precisions) / max_n
95
+ return bp * math.exp(log_avg)
96
+ ```
97
+
98
+ ## Benchmark Suites
99
+
100
+ ### Major LLM Benchmarks
101
+
102
+ ```
103
+ General knowledge and reasoning:
104
+ - MMLU (Massive Multitask Language Understanding): 57 subjects, MCQ
105
+ - HellaSwag: Commonsense sentence completion
106
+ - ARC (AI2 Reasoning Challenge): Science questions
107
+ - WinoGrande: Coreference resolution / commonsense
108
+
109
+ Coding:
110
+ - HumanEval: Python function completion (pass@k)
111
+ - MBPP: Mostly basic Python problems
112
+ - SWE-bench: Real-world software engineering tasks
113
+
114
+ Math:
115
+ - GSM8K: Grade school math word problems
116
+ - MATH: Competition-level mathematics
117
+
118
+ Safety and alignment:
119
+ - TruthfulQA: Resistance to common misconceptions
120
+ - BBQ (Bias Benchmark for QA): Social bias in QA
121
+ - RealToxicityPrompts: Tendency to generate toxic text
122
+
123
+ Instruction following:
124
+ - MT-Bench: Multi-turn conversation quality (LLM-as-judge)
125
+ - AlpacaEval: Instruction-following quality
126
+ - Chatbot Arena: ELO-based human preference ranking
127
+ ```
128
+
129
+ ## Human Evaluation
130
+
131
+ ### Designing a Human Evaluation Protocol
132
+
133
+ ```python
134
+ def design_human_eval(task: str, n_annotators: int = 3,
135
+ n_examples: int = 200) -> dict:
136
+ """
137
+ Design a human evaluation protocol for LLM outputs.
138
+
139
+ Args:
140
+ task: The task being evaluated
141
+ n_annotators: Number of independent annotators per example
142
+ n_examples: Number of examples to evaluate
143
+ """
144
+ return {
145
+ "task": task,
146
+ "n_annotators": n_annotators,
147
+ "n_examples": n_examples,
148
+ "criteria": [
149
+ {"name": "Fluency", "scale": "1-5",
150
+ "description": "Is the text grammatically correct and natural?"},
151
+ {"name": "Relevance", "scale": "1-5",
152
+ "description": "Does the output address the input/question?"},
153
+ {"name": "Correctness", "scale": "1-5",
154
+ "description": "Is the factual content accurate?"},
155
+ {"name": "Helpfulness", "scale": "1-5",
156
+ "description": "Would a user find this response useful?"}
157
+ ],
158
+ "agreement_metric": "Krippendorff's alpha (ordinal)",
159
+ "presentation": "Randomize model order; blind annotators to model identity",
160
+ "calibration": "Have all annotators rate 20 shared examples first",
161
+ "cost_estimate": f"~{n_examples * n_annotators * 0.50:.0f} USD at typical rates"
162
+ }
163
+ ```
164
+
165
+ ## Evaluation Pitfalls
166
+
167
+ ### Common Mistakes
168
+
169
+ ```
170
+ 1. Data contamination:
171
+ Test data may appear in the LLM's training set.
172
+ Mitigation: Use held-out datasets, check for contamination,
173
+ create new test sets.
174
+
175
+ 2. Metric gaming:
176
+ High BLEU does not mean high quality; ROUGE rewards verbosity.
177
+ Mitigation: Use multiple metrics and human evaluation.
178
+
179
+ 3. Cherry-picking examples:
180
+ Showing only best-case outputs misrepresents model capabilities.
181
+ Mitigation: Report aggregate metrics over full test sets.
182
+
183
+ 4. Ignoring variance:
184
+ LLM outputs vary with temperature and random seeds.
185
+ Mitigation: Report mean and standard deviation over multiple runs.
186
+
187
+ 5. Unfair comparisons:
188
+ Comparing models with different prompt formats or few-shot counts.
189
+ Mitigation: Standardize prompts and report all hyperparameters.
190
+ ```
191
+
192
+ ## Reporting Standards
193
+
194
+ When publishing LLM evaluation results, report: model name and version, parameter count and architecture, evaluation dataset with version number, exact prompts used (include in appendix), number of few-shot examples, decoding parameters (temperature, top-p, max tokens), multiple metrics (not just one), confidence intervals or significance tests, and hardware and inference cost where relevant.
@@ -0,0 +1,233 @@
1
+ ---
2
+ name: prompt-engineering-research
3
+ description: "Systematic prompt engineering methods for AI-assisted academic research workflows"
4
+ metadata:
5
+ openclaw:
6
+ emoji: "robot"
7
+ category: "domains"
8
+ subcategory: "ai-ml"
9
+ keywords: ["machine learning", "deep learning", "NLP", "AI coding", "prompt engineering", "LLM"]
10
+ source: "wentor"
11
+ ---
12
+
13
+ # Prompt Engineering for Research
14
+
15
+ A skill for applying systematic prompt engineering techniques in academic research contexts. Covers prompt design patterns, evaluation methodologies, and practical workflows for using large language models (LLMs) as research tools.
16
+
17
+ ## Prompt Design Patterns
18
+
19
+ ### Core Prompting Strategies
20
+
21
+ | Strategy | Description | Best For | Reliability |
22
+ |----------|------------|---------|-------------|
23
+ | Zero-shot | Direct instruction, no examples | Simple, well-defined tasks | Moderate |
24
+ | Few-shot | Include 2-5 examples in prompt | Pattern matching, formatting | High |
25
+ | Chain-of-thought | "Think step by step" | Reasoning, math, analysis | High |
26
+ | Role prompting | "You are an expert in..." | Domain-specific tasks | Moderate |
27
+ | Structured output | Request JSON/YAML/table format | Data extraction | High |
28
+ | Self-consistency | Sample multiple times, majority vote | Fact-checking, reasoning | Very high |
29
+
30
+ ### Research-Specific Prompt Templates
31
+
32
+ ```python
33
+ def create_research_prompt(task_type: str, context: dict) -> str:
34
+ """
35
+ Generate a structured prompt for common research tasks.
36
+
37
+ Args:
38
+ task_type: One of 'literature_summary', 'methodology_critique',
39
+ 'code_review', 'data_interpretation', 'writing_feedback'
40
+ context: Dict with task-specific context
41
+ """
42
+ templates = {
43
+ 'literature_summary': """
44
+ You are an academic researcher specializing in {domain}.
45
+
46
+ Summarize the following paper excerpt, focusing on:
47
+ 1. The research question and its significance
48
+ 2. The methodology used
49
+ 3. Key findings and their implications
50
+ 4. Limitations acknowledged by the authors
51
+ 5. How this work relates to {related_topic}
52
+
53
+ Paper excerpt:
54
+ {text}
55
+
56
+ Provide a structured summary in 200-300 words. Distinguish clearly
57
+ between what the authors claim and what the evidence supports.
58
+ """,
59
+ 'methodology_critique': """
60
+ You are a methods expert reviewing a research design.
61
+
62
+ Evaluate the following methodology description:
63
+ {text}
64
+
65
+ Assess the following:
66
+ 1. Internal validity: Are there confounding variables not controlled?
67
+ 2. External validity: How generalizable are the findings?
68
+ 3. Statistical approach: Is the analysis appropriate for the data?
69
+ 4. Sample: Is the sample size adequate? Any selection bias?
70
+ 5. Reproducibility: Could another researcher replicate this?
71
+
72
+ For each concern, rate severity (minor/moderate/major) and suggest
73
+ a specific improvement.
74
+ """,
75
+ 'data_interpretation': """
76
+ You are a statistical consultant helping interpret results.
77
+
78
+ Given these results:
79
+ {results}
80
+
81
+ Context: {context_description}
82
+
83
+ Provide:
84
+ 1. Plain-language interpretation of each result
85
+ 2. Effect size interpretation (is it practically significant?)
86
+ 3. Potential alternative explanations
87
+ 4. Caveats the authors should mention
88
+ 5. Suggested follow-up analyses
89
+
90
+ Be precise about what the data does and does not support.
91
+ Do not overstate findings.
92
+ """
93
+ }
94
+
95
+ template = templates.get(task_type, templates['literature_summary'])
96
+ return template.format(**context)
97
+ ```
98
+
99
+ ## Chain-of-Thought for Complex Research Tasks
100
+
101
+ ### Structured Reasoning
102
+
103
+ ```python
104
+ def research_cot_prompt(question: str, data: str) -> str:
105
+ """
106
+ Create a chain-of-thought prompt for complex research analysis.
107
+ """
108
+ return f"""
109
+ I need to analyze the following research question step by step.
110
+
111
+ Research Question: {question}
112
+
113
+ Available Data:
114
+ {data}
115
+
116
+ Please reason through this systematically:
117
+
118
+ Step 1: Identify the key variables and their relationships
119
+ Step 2: Consider what statistical test or analytical approach is appropriate
120
+ Step 3: Check assumptions required for this approach
121
+ Step 4: Perform the analysis or describe how to perform it
122
+ Step 5: Interpret the results in context
123
+ Step 6: State limitations and alternative interpretations
124
+
125
+ Show your reasoning at each step before moving to the next.
126
+ If you are uncertain about any step, explicitly state the uncertainty
127
+ rather than guessing.
128
+ """
129
+ ```
130
+
131
+ ## Evaluation and Reliability
132
+
133
+ ### Measuring Prompt Effectiveness
134
+
135
+ ```python
136
+ def evaluate_prompt(prompt_template: str, test_cases: list[dict],
137
+ expected_outputs: list[str],
138
+ model_fn: callable) -> dict:
139
+ """
140
+ Systematically evaluate a prompt template's reliability.
141
+
142
+ Args:
143
+ prompt_template: The prompt template with {placeholders}
144
+ test_cases: List of dicts with placeholder values
145
+ expected_outputs: Expected outputs for each test case
146
+ model_fn: Function that takes a prompt string and returns model output
147
+ """
148
+ results = []
149
+ for case, expected in zip(test_cases, expected_outputs):
150
+ prompt = prompt_template.format(**case)
151
+
152
+ # Run multiple times for consistency check
153
+ outputs = [model_fn(prompt) for _ in range(3)]
154
+
155
+ # Measure consistency (self-agreement)
156
+ from difflib import SequenceMatcher
157
+ similarities = []
158
+ for i in range(len(outputs)):
159
+ for j in range(i+1, len(outputs)):
160
+ sim = SequenceMatcher(None, outputs[i], outputs[j]).ratio()
161
+ similarities.append(sim)
162
+
163
+ avg_similarity = sum(similarities) / len(similarities) if similarities else 0
164
+
165
+ results.append({
166
+ 'test_case': case,
167
+ 'n_runs': 3,
168
+ 'consistency': round(avg_similarity, 3),
169
+ 'outputs': outputs
170
+ })
171
+
172
+ return {
173
+ 'n_test_cases': len(test_cases),
174
+ 'avg_consistency': round(
175
+ sum(r['consistency'] for r in results) / len(results), 3
176
+ ),
177
+ 'results': results,
178
+ 'reliability': (
179
+ 'high' if all(r['consistency'] > 0.8 for r in results)
180
+ else 'moderate' if all(r['consistency'] > 0.5 for r in results)
181
+ else 'low -- prompt needs refinement'
182
+ )
183
+ }
184
+ ```
185
+
186
+ ## Research Workflow Integration
187
+
188
+ ### Automated Literature Screening
189
+
190
+ ```python
191
+ def screen_paper_relevance(title: str, abstract: str,
192
+ inclusion_criteria: list[str],
193
+ exclusion_criteria: list[str]) -> str:
194
+ """
195
+ Generate a prompt for AI-assisted paper screening in systematic reviews.
196
+ """
197
+ return f"""
198
+ You are screening papers for a systematic review.
199
+
200
+ Paper:
201
+ Title: {title}
202
+ Abstract: {abstract}
203
+
204
+ Inclusion criteria:
205
+ {chr(10).join(f'- {c}' for c in inclusion_criteria)}
206
+
207
+ Exclusion criteria:
208
+ {chr(10).join(f'- {c}' for c in exclusion_criteria)}
209
+
210
+ Evaluate the paper against each criterion and respond with:
211
+ 1. INCLUDE, EXCLUDE, or UNCERTAIN
212
+ 2. Which specific criteria were met or not met
213
+ 3. Confidence level (high/medium/low)
214
+
215
+ Important: When uncertain, err on the side of INCLUDE (to be screened
216
+ at full-text stage). False exclusions are worse than false inclusions
217
+ in systematic review screening.
218
+ """
219
+ ```
220
+
221
+ ## Ethical Considerations
222
+
223
+ - **Transparency**: Always disclose AI usage in your research methodology
224
+ - **Verification**: Never trust LLM outputs without independent verification -- check facts, citations, and calculations
225
+ - **Bias awareness**: LLMs can introduce biases; use structured prompts and diverse perspectives
226
+ - **Citation integrity**: LLMs may hallucinate citations; verify every reference exists
227
+ - **Authorship**: AI tools do not meet authorship criteria (ICMJE); they are tools, not co-authors
228
+ - **Reproducibility**: Document the model, version, temperature, and exact prompts used
229
+
230
+ ## Key References
231
+
232
+ - Wei, J., et al. (2022). Chain-of-thought prompting elicits reasoning in LLMs. *NeurIPS*.
233
+ - Brown, T., et al. (2020). Language models are few-shot learners. *NeurIPS*.
@@ -0,0 +1,254 @@
1
+ ---
2
+ name: reinforcement-learning-guide
3
+ description: "Reinforcement learning fundamentals, algorithms, and research"
4
+ metadata:
5
+ openclaw:
6
+ emoji: "robot"
7
+ category: "domains"
8
+ subcategory: "ai-ml"
9
+ keywords: ["reinforcement learning", "machine learning", "deep learning", "neural network"]
10
+ source: "wentor-research-plugins"
11
+ ---
12
+
13
+ # Reinforcement Learning Guide
14
+
15
+ Understand and implement reinforcement learning algorithms from tabular methods through deep RL, including policy gradients, actor-critic, and model-based approaches.
16
+
17
+ ## RL Fundamentals
18
+
19
+ ### The RL Framework
20
+
21
+ An agent interacts with an environment to maximize cumulative reward:
22
+
23
+ ```
24
+ Agent Environment
25
+ | |
26
+ |--- action a_t ---------->|
27
+ | |--- next state s_{t+1}
28
+ |<-- reward r_t, state s_t |--- reward r_{t+1}
29
+ | |
30
+ ```
31
+
32
+ | Concept | Symbol | Definition |
33
+ |---------|--------|-----------|
34
+ | State | s | Observation of the environment |
35
+ | Action | a | Decision made by the agent |
36
+ | Reward | r | Scalar feedback signal |
37
+ | Policy | pi(a\|s) | Mapping from states to actions |
38
+ | Value function | V(s) | Expected cumulative reward from state s |
39
+ | Q-function | Q(s, a) | Expected cumulative reward from (s, a) |
40
+ | Discount factor | gamma | Weight of future vs. immediate rewards (0-1) |
41
+ | Return | G_t | Sum of discounted future rewards from time t |
42
+
43
+ ### Key Equations
44
+
45
+ ```
46
+ # Return (discounted cumulative reward)
47
+ G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
48
+
49
+ # Bellman equation for V
50
+ V(s) = E[r + gamma * V(s') | s]
51
+
52
+ # Bellman equation for Q
53
+ Q(s, a) = E[r + gamma * max_a' Q(s', a') | s, a]
54
+
55
+ # Policy gradient theorem
56
+ gradient J(theta) = E[gradient log pi_theta(a|s) * Q(s, a)]
57
+ ```
58
+
59
+ ## Algorithm Taxonomy
60
+
61
+ | Category | Algorithm | Key Idea | On/Off Policy |
62
+ |----------|-----------|----------|--------------|
63
+ | **Value-based** | Q-Learning | Learn Q(s,a), act greedily | Off-policy |
64
+ | | DQN | Q-Learning + neural net + replay buffer | Off-policy |
65
+ | | Double DQN | Two networks to reduce overestimation | Off-policy |
66
+ | | Dueling DQN | Separate value and advantage streams | Off-policy |
67
+ | **Policy gradient** | REINFORCE | Monte Carlo policy gradient | On-policy |
68
+ | | PPO | Clipped surrogate objective | On-policy |
69
+ | | TRPO | Trust region constraint | On-policy |
70
+ | **Actor-Critic** | A2C/A3C | Advantage actor-critic (parallel) | On-policy |
71
+ | | SAC | Maximum entropy + off-policy AC | Off-policy |
72
+ | | TD3 | Twin delayed DDPG | Off-policy |
73
+ | **Model-based** | Dreamer | World model + imagination | On-policy |
74
+ | | MBPO | Model-based policy optimization | Off-policy |
75
+ | | MuZero | Learned model + planning (MCTS) | Off-policy |
76
+
77
+ ## Implementation: DQN
78
+
79
+ ```python
80
+ import torch
81
+ import torch.nn as nn
82
+ import torch.optim as optim
83
+ import numpy as np
84
+ from collections import deque
85
+ import random
86
+
87
class QNetwork(nn.Module):
    """MLP mapping a state vector to one Q-value per discrete action.

    Architecture: two hidden Linear+ReLU layers followed by a linear
    output head of size ``action_dim`` (no output activation).
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Assemble the layer stack as a list first, then splat into
        # nn.Sequential; parameter names ("net.0.weight", ...) are
        # identical to building Sequential inline.
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return Q-values with shape (..., action_dim)."""
        return self.net(x)
101
+ class DQNAgent:
102
+ def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99,
103
+ epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01,
104
+ buffer_size=10000, batch_size=64):
105
+ self.action_dim = action_dim
106
+ self.gamma = gamma
107
+ self.epsilon = epsilon
108
+ self.epsilon_decay = epsilon_decay
109
+ self.epsilon_min = epsilon_min
110
+ self.batch_size = batch_size
111
+
112
+ self.q_network = QNetwork(state_dim, action_dim)
113
+ self.target_network = QNetwork(state_dim, action_dim)
114
+ self.target_network.load_state_dict(self.q_network.state_dict())
115
+ self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
116
+
117
+ self.replay_buffer = deque(maxlen=buffer_size)
118
+
119
+ def select_action(self, state):
120
+ if random.random() < self.epsilon:
121
+ return random.randint(0, self.action_dim - 1)
122
+ with torch.no_grad():
123
+ q_values = self.q_network(torch.FloatTensor(state))
124
+ return q_values.argmax().item()
125
+
126
+ def store_transition(self, state, action, reward, next_state, done):
127
+ self.replay_buffer.append((state, action, reward, next_state, done))
128
+
129
+ def train_step(self):
130
+ if len(self.replay_buffer) < self.batch_size:
131
+ return 0.0
132
+
133
+ batch = random.sample(self.replay_buffer, self.batch_size)
134
+ states, actions, rewards, next_states, dones = zip(*batch)
135
+
136
+ states = torch.FloatTensor(np.array(states))
137
+ actions = torch.LongTensor(actions)
138
+ rewards = torch.FloatTensor(rewards)
139
+ next_states = torch.FloatTensor(np.array(next_states))
140
+ dones = torch.FloatTensor(dones)
141
+
142
+ # Current Q values
143
+ q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze()
144
+
145
+ # Target Q values (Double DQN variant)
146
+ with torch.no_grad():
147
+ best_actions = self.q_network(next_states).argmax(1)
148
+ next_q = self.target_network(next_states).gather(1, best_actions.unsqueeze(1)).squeeze()
149
+ targets = rewards + self.gamma * next_q * (1 - dones)
150
+
151
+ loss = nn.MSELoss()(q_values, targets)
152
+ self.optimizer.zero_grad()
153
+ loss.backward()
154
+ self.optimizer.step()
155
+
156
+ self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
157
+ return loss.item()
158
+
159
+ def update_target(self):
160
+ self.target_network.load_state_dict(self.q_network.state_dict())
161
+ ```
162
+
163
+ ## Implementation: PPO
164
+
165
+ ```python
166
class PPOAgent:
    """PPO agent (clipped surrogate objective) with a categorical actor,
    a scalar critic, and GAE advantage estimation. Actor and critic are
    optimized jointly by a single Adam optimizer."""

    def __init__(self, state_dim, action_dim, lr=3e-4, gamma=0.99,
                 lam=0.95, clip_ratio=0.2, epochs=10):
        self.gamma = gamma
        self.lam = lam
        self.clip_ratio = clip_ratio
        self.epochs = epochs

        # Actor outputs a probability distribution over discrete actions.
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64), nn.Tanh(),
            nn.Linear(64, 64), nn.Tanh(),
            nn.Linear(64, action_dim), nn.Softmax(dim=-1)
        )
        # Critic outputs a single state-value estimate.
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64), nn.Tanh(),
            nn.Linear(64, 64), nn.Tanh(),
            nn.Linear(64, 1)
        )
        # One optimizer over both networks' parameters.
        joint_params = [*self.actor.parameters(), *self.critic.parameters()]
        self.optimizer = optim.Adam(joint_params, lr=lr)

    def compute_gae(self, rewards, values, dones):
        """Generalized Advantage Estimation.

        Walks the rollout backwards, accumulating exponentially weighted
        TD residuals; the running accumulator is reset across episode
        boundaries by the (1 - done) factor. Terminal bootstrap value is
        taken as 0 when no value beyond the rollout is available.
        """
        advantages = []
        running_gae = 0
        for t in reversed(range(len(rewards))):
            bootstrap = values[t + 1] if t + 1 < len(values) else 0
            delta = rewards[t] + self.gamma * bootstrap * (1 - dones[t]) - values[t]
            running_gae = delta + self.gamma * self.lam * (1 - dones[t]) * running_gae
            advantages.append(running_gae)
        advantages.reverse()
        return torch.FloatTensor(advantages)

    def update(self, states, actions, old_log_probs, rewards, dones):
        """Run ``self.epochs`` clipped-surrogate updates on one rollout.

        ``old_log_probs`` are the log-probabilities recorded at collection
        time; the ratio new/old drives the clipped policy objective.
        """
        values = self.critic(states).squeeze().detach().numpy()
        advantages = self.compute_gae(rewards, values, dones)
        returns = advantages + torch.FloatTensor(values[:len(advantages)])
        # Normalize advantages for optimization stability.
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        for _ in range(self.epochs):
            action_probs = self.actor(states)
            dist = torch.distributions.Categorical(action_probs)
            new_log_probs = dist.log_prob(actions)
            entropy_bonus = dist.entropy().mean()

            # Importance ratio and its clipped counterpart; taking the
            # elementwise min makes the objective pessimistic.
            ratio = (new_log_probs - old_log_probs).exp()
            clipped_ratio = torch.clamp(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio)
            policy_loss = -torch.min(ratio * advantages, clipped_ratio * advantages).mean()

            value_loss = nn.MSELoss()(self.critic(states).squeeze(), returns)

            total_loss = policy_loss + 0.5 * value_loss - 0.01 * entropy_bonus
            self.optimizer.zero_grad()
            total_loss.backward()
            self.optimizer.step()
221
+ ```
222
+
223
+ ## Research Environments
224
+
225
+ | Environment | Domain | Complexity | Key Paper |
226
+ |-------------|--------|-----------|-----------|
227
+ | Gymnasium (ex-Gym) | Classic control, Atari | Low-High | Brockman et al., 2016 |
228
+ | MuJoCo | Continuous control, robotics | Medium-High | Todorov et al., 2012 |
229
+ | DMControl | Continuous control from pixels | High | Tassa et al., 2018 |
230
+ | ProcGen | Procedurally generated games | High (generalization) | Cobbe et al., 2020 |
231
+ | Minigrid | Grid-world navigation | Low-Medium | Chevalier-Boisvert et al. |
232
+ | Isaac Gym | GPU-accelerated physics sim | High | Makoviychuk et al., 2021 |
233
+ | NetHack | Complex roguelike game | Very High | Küttler et al., 2020 |
234
+
235
+ ## Top Venues
236
+
237
+ | Venue | Type | Focus |
238
+ |-------|------|-------|
239
+ | NeurIPS | Conference | Broad ML including RL |
240
+ | ICML | Conference | Broad ML including RL |
241
+ | ICLR | Conference | Representation learning, deep RL |
242
+ | AAAI | Conference | Broad AI |
243
+ | CoRL | Conference | Robot learning |
244
+ | JMLR | Journal | Broad ML (open access) |
245
+ | L4DC | Conference | Learning for dynamics and control |
246
+
247
+ ## Key Research Directions (2024-2025)
248
+
249
+ 1. **RLHF / RLAIF**: RL from human or AI feedback for LLM alignment
250
+ 2. **Offline RL**: Learning from pre-collected datasets without environment interaction
251
+ 3. **Foundation models for control**: Using pre-trained LLMs/VLMs as world models or planners
252
+ 4. **Multi-agent RL**: Cooperative and competitive settings with communication
253
+ 5. **Safe RL**: Constrained optimization to ensure safety during training and deployment
254
+ 6. **Sample-efficient RL**: Reducing the gap between model-free and model-based sample complexity