flonat-research 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (285) hide show
  1. package/.claude/agents/domain-reviewer.md +336 -0
  2. package/.claude/agents/fixer.md +226 -0
  3. package/.claude/agents/paper-critic.md +370 -0
  4. package/.claude/agents/peer-reviewer.md +289 -0
  5. package/.claude/agents/proposal-reviewer.md +215 -0
  6. package/.claude/agents/referee2-reviewer.md +367 -0
  7. package/.claude/agents/references/journal-referee-profiles.md +354 -0
  8. package/.claude/agents/references/paper-critic/council-personas.md +77 -0
  9. package/.claude/agents/references/paper-critic/council-prompts.md +198 -0
  10. package/.claude/agents/references/peer-reviewer/report-template.md +199 -0
  11. package/.claude/agents/references/peer-reviewer/sa-prompts.md +260 -0
  12. package/.claude/agents/references/peer-reviewer/security-scan.md +188 -0
  13. package/.claude/agents/references/proposal-reviewer/report-template.md +144 -0
  14. package/.claude/agents/references/proposal-reviewer/sa-prompts.md +149 -0
  15. package/.claude/agents/references/referee-config.md +114 -0
  16. package/.claude/agents/references/referee2-reviewer/audit-checklists.md +287 -0
  17. package/.claude/agents/references/referee2-reviewer/report-template.md +334 -0
  18. package/.claude/rules/design-before-results.md +52 -0
  19. package/.claude/rules/ignore-agents-md.md +17 -0
  20. package/.claude/rules/ignore-gemini-md.md +17 -0
  21. package/.claude/rules/lean-claude-md.md +45 -0
  22. package/.claude/rules/learn-tags.md +99 -0
  23. package/.claude/rules/overleaf-separation.md +67 -0
  24. package/.claude/rules/plan-first.md +175 -0
  25. package/.claude/rules/read-docs-first.md +50 -0
  26. package/.claude/rules/scope-discipline.md +28 -0
  27. package/.claude/settings.json +125 -0
  28. package/.context/current-focus.md +33 -0
  29. package/.context/preferences/priorities.md +36 -0
  30. package/.context/preferences/task-naming.md +28 -0
  31. package/.context/profile.md +29 -0
  32. package/.context/projects/_index.md +41 -0
  33. package/.context/projects/papers/nudge-exp.md +22 -0
  34. package/.context/projects/papers/uncertainty.md +31 -0
  35. package/.context/resources/claude-scientific-writer-review.md +48 -0
  36. package/.context/resources/cunningham-multi-analyst-agents.md +104 -0
  37. package/.context/resources/cunningham-multilang-code-audit.md +62 -0
  38. package/.context/resources/google-ai-co-scientist-review.md +72 -0
  39. package/.context/resources/karpathy-llm-council-review.md +58 -0
  40. package/.context/resources/multi-coder-reliability-protocol.md +175 -0
  41. package/.context/resources/pedro-santanna-takeaways.md +96 -0
  42. package/.context/resources/venue-rankings/abs_ajg_2024.csv +1823 -0
  43. package/.context/resources/venue-rankings/abs_ajg_2024_econ.csv +356 -0
  44. package/.context/resources/venue-rankings/cabs_4_4star_theory.csv +40 -0
  45. package/.context/resources/venue-rankings/core_2026.csv +801 -0
  46. package/.context/resources/venue-rankings.md +147 -0
  47. package/.context/workflows/README.md +69 -0
  48. package/.context/workflows/daily-review.md +91 -0
  49. package/.context/workflows/meeting-actions.md +108 -0
  50. package/.context/workflows/replication-protocol.md +155 -0
  51. package/.context/workflows/weekly-review.md +113 -0
  52. package/.mcp-server-biblio/formatters.py +158 -0
  53. package/.mcp-server-biblio/pyproject.toml +11 -0
  54. package/.mcp-server-biblio/server.py +678 -0
  55. package/.mcp-server-biblio/sources/__init__.py +14 -0
  56. package/.mcp-server-biblio/sources/base.py +73 -0
  57. package/.mcp-server-biblio/sources/formatters.py +83 -0
  58. package/.mcp-server-biblio/sources/models.py +22 -0
  59. package/.mcp-server-biblio/sources/multi_source.py +243 -0
  60. package/.mcp-server-biblio/sources/openalex_source.py +183 -0
  61. package/.mcp-server-biblio/sources/scopus_source.py +309 -0
  62. package/.mcp-server-biblio/sources/wos_source.py +508 -0
  63. package/.mcp-server-biblio/uv.lock +896 -0
  64. package/.scripts/README.md +161 -0
  65. package/.scripts/ai_pattern_density.py +446 -0
  66. package/.scripts/conf +445 -0
  67. package/.scripts/config.py +122 -0
  68. package/.scripts/count_inventory.py +275 -0
  69. package/.scripts/daily_digest.py +288 -0
  70. package/.scripts/done +177 -0
  71. package/.scripts/extract_meeting_actions.py +223 -0
  72. package/.scripts/focus +176 -0
  73. package/.scripts/generate-codex-agents-md.py +217 -0
  74. package/.scripts/inbox +194 -0
  75. package/.scripts/notion_helpers.py +325 -0
  76. package/.scripts/openalex/query_helpers.py +306 -0
  77. package/.scripts/papers +227 -0
  78. package/.scripts/query +223 -0
  79. package/.scripts/session-history.py +201 -0
  80. package/.scripts/skill-health.py +516 -0
  81. package/.scripts/skill-log-miner.py +273 -0
  82. package/.scripts/sync-to-codex.sh +252 -0
  83. package/.scripts/task +213 -0
  84. package/.scripts/tasks +190 -0
  85. package/.scripts/week +206 -0
  86. package/CLAUDE.md +197 -0
  87. package/LICENSE +21 -0
  88. package/MEMORY.md +38 -0
  89. package/README.md +269 -0
  90. package/docs/agents.md +44 -0
  91. package/docs/bibliography-setup.md +55 -0
  92. package/docs/council-mode.md +36 -0
  93. package/docs/getting-started.md +245 -0
  94. package/docs/hooks.md +38 -0
  95. package/docs/mcp-servers.md +82 -0
  96. package/docs/notion-setup.md +109 -0
  97. package/docs/rules.md +33 -0
  98. package/docs/scripts.md +303 -0
  99. package/docs/setup-overview/setup-overview.pdf +0 -0
  100. package/docs/skills.md +70 -0
  101. package/docs/system.md +159 -0
  102. package/hooks/block-destructive-git.sh +66 -0
  103. package/hooks/context-monitor.py +114 -0
  104. package/hooks/postcompact-restore.py +157 -0
  105. package/hooks/precompact-autosave.py +181 -0
  106. package/hooks/promise-checker.sh +124 -0
  107. package/hooks/protect-source-files.sh +81 -0
  108. package/hooks/resume-context-loader.sh +53 -0
  109. package/hooks/startup-context-loader.sh +102 -0
  110. package/package.json +51 -0
  111. package/packages/cli-council/.github/workflows/claude-code-review.yml +44 -0
  112. package/packages/cli-council/.github/workflows/claude.yml +50 -0
  113. package/packages/cli-council/README.md +100 -0
  114. package/packages/cli-council/pyproject.toml +43 -0
  115. package/packages/cli-council/src/cli_council/__init__.py +19 -0
  116. package/packages/cli-council/src/cli_council/__main__.py +185 -0
  117. package/packages/cli-council/src/cli_council/backends/__init__.py +8 -0
  118. package/packages/cli-council/src/cli_council/backends/base.py +81 -0
  119. package/packages/cli-council/src/cli_council/backends/claude.py +25 -0
  120. package/packages/cli-council/src/cli_council/backends/codex.py +27 -0
  121. package/packages/cli-council/src/cli_council/backends/gemini.py +26 -0
  122. package/packages/cli-council/src/cli_council/checkpoint.py +212 -0
  123. package/packages/cli-council/src/cli_council/config.py +51 -0
  124. package/packages/cli-council/src/cli_council/council.py +391 -0
  125. package/packages/cli-council/src/cli_council/models.py +46 -0
  126. package/packages/llm-council/.github/workflows/claude-code-review.yml +44 -0
  127. package/packages/llm-council/.github/workflows/claude.yml +50 -0
  128. package/packages/llm-council/README.md +453 -0
  129. package/packages/llm-council/pyproject.toml +42 -0
  130. package/packages/llm-council/src/llm_council/__init__.py +23 -0
  131. package/packages/llm-council/src/llm_council/__main__.py +259 -0
  132. package/packages/llm-council/src/llm_council/checkpoint.py +193 -0
  133. package/packages/llm-council/src/llm_council/client.py +253 -0
  134. package/packages/llm-council/src/llm_council/config.py +232 -0
  135. package/packages/llm-council/src/llm_council/council.py +482 -0
  136. package/packages/llm-council/src/llm_council/models.py +46 -0
  137. package/packages/mcp-bibliography/MEMORY.md +31 -0
  138. package/packages/mcp-bibliography/_app.py +226 -0
  139. package/packages/mcp-bibliography/formatters.py +158 -0
  140. package/packages/mcp-bibliography/log/2026-03-13-2100.md +35 -0
  141. package/packages/mcp-bibliography/pyproject.toml +15 -0
  142. package/packages/mcp-bibliography/run.sh +20 -0
  143. package/packages/mcp-bibliography/scholarly_formatters.py +83 -0
  144. package/packages/mcp-bibliography/server.py +1857 -0
  145. package/packages/mcp-bibliography/tools/__init__.py +28 -0
  146. package/packages/mcp-bibliography/tools/_registry.py +19 -0
  147. package/packages/mcp-bibliography/tools/altmetric.py +107 -0
  148. package/packages/mcp-bibliography/tools/core.py +92 -0
  149. package/packages/mcp-bibliography/tools/dblp.py +52 -0
  150. package/packages/mcp-bibliography/tools/openalex.py +296 -0
  151. package/packages/mcp-bibliography/tools/opencitations.py +102 -0
  152. package/packages/mcp-bibliography/tools/openreview.py +179 -0
  153. package/packages/mcp-bibliography/tools/orcid.py +131 -0
  154. package/packages/mcp-bibliography/tools/scholarly.py +575 -0
  155. package/packages/mcp-bibliography/tools/unpaywall.py +63 -0
  156. package/packages/mcp-bibliography/tools/zenodo.py +123 -0
  157. package/packages/mcp-bibliography/uv.lock +711 -0
  158. package/scripts/setup.sh +143 -0
  159. package/skills/beamer-deck/SKILL.md +199 -0
  160. package/skills/beamer-deck/references/quality-rubric.md +54 -0
  161. package/skills/beamer-deck/references/review-prompts.md +106 -0
  162. package/skills/bib-validate/SKILL.md +261 -0
  163. package/skills/bib-validate/references/council-mode.md +34 -0
  164. package/skills/bib-validate/references/deep-verify.md +79 -0
  165. package/skills/bib-validate/references/fix-mode.md +36 -0
  166. package/skills/bib-validate/references/openalex-verification.md +45 -0
  167. package/skills/bib-validate/references/preprint-check.md +31 -0
  168. package/skills/bib-validate/references/ref-manager-crossref.md +41 -0
  169. package/skills/bib-validate/references/report-template.md +82 -0
  170. package/skills/code-archaeology/SKILL.md +141 -0
  171. package/skills/code-review/SKILL.md +265 -0
  172. package/skills/code-review/references/quality-rubric.md +67 -0
  173. package/skills/consolidate-memory/SKILL.md +208 -0
  174. package/skills/context-status/SKILL.md +126 -0
  175. package/skills/creation-guard/SKILL.md +230 -0
  176. package/skills/devils-advocate/SKILL.md +130 -0
  177. package/skills/devils-advocate/references/competing-hypotheses.md +83 -0
  178. package/skills/init-project/SKILL.md +115 -0
  179. package/skills/init-project-course/references/memory-and-settings.md +92 -0
  180. package/skills/init-project-course/references/organise-templates.md +94 -0
  181. package/skills/init-project-course/skill.md +147 -0
  182. package/skills/init-project-light/skill.md +139 -0
  183. package/skills/init-project-research/SKILL.md +368 -0
  184. package/skills/init-project-research/references/atlas-pipeline-sync.md +70 -0
  185. package/skills/init-project-research/references/atlas-schema.md +81 -0
  186. package/skills/init-project-research/references/confirmation-report.md +39 -0
  187. package/skills/init-project-research/references/domain-profile-template.md +104 -0
  188. package/skills/init-project-research/references/interview-round3.md +34 -0
  189. package/skills/init-project-research/references/literature-discovery.md +43 -0
  190. package/skills/init-project-research/references/scaffold-details.md +197 -0
  191. package/skills/init-project-research/templates/field-calibration.md +60 -0
  192. package/skills/init-project-research/templates/pipeline-manifest.md +63 -0
  193. package/skills/init-project-research/templates/run-all.sh +116 -0
  194. package/skills/init-project-research/templates/seed-files.md +337 -0
  195. package/skills/insights-deck/SKILL.md +151 -0
  196. package/skills/interview-me/SKILL.md +157 -0
  197. package/skills/latex/SKILL.md +141 -0
  198. package/skills/latex/references/latex-configs.md +183 -0
  199. package/skills/latex-autofix/SKILL.md +230 -0
  200. package/skills/latex-autofix/references/known-errors.md +183 -0
  201. package/skills/latex-autofix/references/quality-rubric.md +50 -0
  202. package/skills/latex-health-check/SKILL.md +161 -0
  203. package/skills/learn/SKILL.md +220 -0
  204. package/skills/learn/scripts/validate_skill.py +265 -0
  205. package/skills/lessons-learned/SKILL.md +201 -0
  206. package/skills/literature/SKILL.md +335 -0
  207. package/skills/literature/references/agent-templates.md +393 -0
  208. package/skills/literature/references/bibliometric-apis.md +44 -0
  209. package/skills/literature/references/cli-council-search.md +79 -0
  210. package/skills/literature/references/openalex-api-guide.md +371 -0
  211. package/skills/literature/references/openalex-common-queries.md +381 -0
  212. package/skills/literature/references/openalex-workflows.md +248 -0
  213. package/skills/literature/references/reference-manager-sync.md +36 -0
  214. package/skills/literature/references/scopus-api-guide.md +208 -0
  215. package/skills/literature/references/wos-api-guide.md +308 -0
  216. package/skills/multi-perspective/SKILL.md +311 -0
  217. package/skills/multi-perspective/references/computational-many-analysts.md +77 -0
  218. package/skills/pipeline-manifest/SKILL.md +226 -0
  219. package/skills/pre-submission-report/SKILL.md +153 -0
  220. package/skills/process-reviews/SKILL.md +244 -0
  221. package/skills/process-reviews/references/rr-routing.md +101 -0
  222. package/skills/project-deck/SKILL.md +87 -0
  223. package/skills/project-safety/SKILL.md +135 -0
  224. package/skills/proofread/SKILL.md +254 -0
  225. package/skills/proofread/references/quality-rubric.md +104 -0
  226. package/skills/python-env/SKILL.md +57 -0
  227. package/skills/quarto-deck/SKILL.md +226 -0
  228. package/skills/quarto-deck/references/markdown-format.md +143 -0
  229. package/skills/quarto-deck/references/quality-rubric.md +54 -0
  230. package/skills/save-context/SKILL.md +174 -0
  231. package/skills/session-log/SKILL.md +98 -0
  232. package/skills/shared/concept-validation-gate.md +161 -0
  233. package/skills/shared/council-protocol.md +265 -0
  234. package/skills/shared/distribution-diagnostics.md +164 -0
  235. package/skills/shared/engagement-stratified-sampling.md +218 -0
  236. package/skills/shared/escalation-protocol.md +74 -0
  237. package/skills/shared/external-audit-protocol.md +205 -0
  238. package/skills/shared/intercoder-reliability.md +256 -0
  239. package/skills/shared/mcp-degradation.md +81 -0
  240. package/skills/shared/method-probing-questions.md +163 -0
  241. package/skills/shared/multi-language-conventions.md +143 -0
  242. package/skills/shared/paid-api-safety.md +174 -0
  243. package/skills/shared/palettes.md +90 -0
  244. package/skills/shared/progressive-disclosure.md +92 -0
  245. package/skills/shared/project-documentation-content.md +443 -0
  246. package/skills/shared/project-documentation-format.md +281 -0
  247. package/skills/shared/project-documentation.md +100 -0
  248. package/skills/shared/publication-output.md +138 -0
  249. package/skills/shared/quality-scoring.md +70 -0
  250. package/skills/shared/reference-resolution.md +77 -0
  251. package/skills/shared/research-quality-rubric.md +165 -0
  252. package/skills/shared/rhetoric-principles.md +54 -0
  253. package/skills/shared/skill-design-patterns.md +272 -0
  254. package/skills/shared/skill-index.md +240 -0
  255. package/skills/shared/system-documentation.md +334 -0
  256. package/skills/shared/tikz-rules.md +402 -0
  257. package/skills/shared/validation-tiers.md +121 -0
  258. package/skills/shared/venue-guides/README.md +46 -0
  259. package/skills/shared/venue-guides/cell_press_style.md +483 -0
  260. package/skills/shared/venue-guides/conferences_formatting.md +564 -0
  261. package/skills/shared/venue-guides/cs_conference_style.md +463 -0
  262. package/skills/shared/venue-guides/examples/cell_summary_example.md +247 -0
  263. package/skills/shared/venue-guides/examples/medical_structured_abstract.md +313 -0
  264. package/skills/shared/venue-guides/examples/nature_abstract_examples.md +213 -0
  265. package/skills/shared/venue-guides/examples/neurips_introduction_example.md +245 -0
  266. package/skills/shared/venue-guides/journals_formatting.md +486 -0
  267. package/skills/shared/venue-guides/medical_journal_styles.md +535 -0
  268. package/skills/shared/venue-guides/ml_conference_style.md +556 -0
  269. package/skills/shared/venue-guides/nature_science_style.md +405 -0
  270. package/skills/shared/venue-guides/reviewer_expectations.md +417 -0
  271. package/skills/shared/venue-guides/venue_writing_styles.md +321 -0
  272. package/skills/split-pdf/SKILL.md +172 -0
  273. package/skills/split-pdf/methodology.md +48 -0
  274. package/skills/sync-notion/SKILL.md +93 -0
  275. package/skills/system-audit/SKILL.md +157 -0
  276. package/skills/system-audit/references/sub-agent-prompts.md +294 -0
  277. package/skills/task-management/SKILL.md +131 -0
  278. package/skills/update-focus/SKILL.md +204 -0
  279. package/skills/update-project-doc/SKILL.md +194 -0
  280. package/skills/validate-bib/SKILL.md +242 -0
  281. package/skills/validate-bib/references/council-mode.md +34 -0
  282. package/skills/validate-bib/references/deep-verify.md +71 -0
  283. package/skills/validate-bib/references/openalex-verification.md +45 -0
  284. package/skills/validate-bib/references/preprint-check.md +31 -0
  285. package/skills/validate-bib/references/report-template.md +62 -0
@@ -0,0 +1,28 @@
1
+ # Task Naming Conventions
2
+
3
+ ## Structure
4
+
5
+ `[Action verb] [Object] - [Brief context]`
6
+
7
+ ## Action Verbs
8
+
9
+ | Verb | When to use |
10
+ |------|-------------|
11
+ | Write | Creating new content from scratch |
12
+ | Draft | First version, expecting revisions |
13
+ | Edit | Modifying existing content |
14
+ | Review | Reading and providing feedback |
15
+ | Read | Reading for understanding |
16
+ | Send | Communication tasks |
17
+ | Submit | Final submission |
18
+ | Prepare | Getting something ready |
19
+ | Research | Investigating a topic |
20
+ | Decide | Making a choice |
21
+ | Follow up | Checking on something sent |
22
+
23
+ ## Examples
24
+
25
+ - "Draft methodology section - uncertainty paper"
26
+ - "Review co-author feedback - nudge-exp intro"
27
+ - "Submit ethics application - experiment 1"
28
+ - "Read Smith (2024) - lit review"
@@ -0,0 +1,29 @@
1
+ # Profile
2
+
3
+ <!-- Customise this with your own details. This file helps Claude understand
4
+ your background and calibrate its responses. -->
5
+
6
+ ## Identity
7
+
8
+ - **Name:** [Your Name]
9
+ - **Role:** PhD Researcher
10
+ - **Institution:** UK University
11
+
12
+ ## Supervisors
13
+
14
+ | Name | Institution | Role | Focus |
15
+ |------|-------------|------|-------|
16
+ | [Supervisor 1] | [University] | Primary | [Their focus] |
17
+ | [Supervisor 2] | [University] | Secondary | [Their focus] |
18
+
19
+ ## Teaching
20
+
21
+ - Graduate TA for Research Methods (Autumn term)
22
+ - Workshop facilitator for Data Analysis in R
23
+
24
+ ## Tools & Preferences
25
+
26
+ - **Writing:** LaTeX (papers), Markdown (notes)
27
+ - **Data analysis:** R (primary), Python (secondary)
28
+ - **Version control:** Git
29
+ - **Reference management:** Paperpile (synced to `paperpile.bib`)
@@ -0,0 +1,41 @@
1
+ # Projects Index
2
+
3
+ > Overview of all active research projects. Updated as projects progress.
4
+
5
+ ## Active Projects
6
+
7
+ | # | Short name | Title | Stage | Target | Co-authors |
8
+ |---|-----------|-------|-------|--------|------------|
9
+ | 1 | uncertainty | Decision Making Under Uncertainty | Drafting | Management Science | J. Smith |
10
+ | 2 | nudge-exp | Nudge Effectiveness in Org Settings | Literature Review | JDM | T. Brown |
11
+ | 3 | survey-methods | Survey Design for Behavioural Research | Idea | OBHDP | — |
12
+
13
+ ## Project Details
14
+
15
+ ### 1. uncertainty
16
+
17
+ - **Directory:** `~/Research/uncertainty/`
18
+ - **Paper:** `paper/` (Overleaf symlink)
19
+ - **Status:** Methodology and results drafted, introduction needs revision
20
+ - **Next:** Finish results tables, send to co-author for review
21
+ - **Pipeline stage:** Drafting
22
+
23
+ ### 2. nudge-exp
24
+
25
+ - **Directory:** `~/Research/nudge-exp/`
26
+ - **Paper:** `paper/` (Overleaf symlink)
27
+ - **Status:** Literature review in progress, 40+ papers collected
28
+ - **Next:** Synthesise literature, draft theoretical framework
29
+ - **Pipeline stage:** Literature Review
30
+
31
+ ### 3. survey-methods
32
+
33
+ - **Directory:** `~/Research/survey-methods/`
34
+ - **Paper:** Not started
35
+ - **Status:** Idea phase — exploring feasibility
36
+ - **Next:** Read 5 key papers, write 1-page research question memo
37
+ - **Pipeline stage:** Idea
38
+
39
+ ## Archived Projects
40
+
41
+ None yet.
@@ -0,0 +1,22 @@
1
+ # Paper: Nudge Effectiveness in Organisational Settings
2
+
3
+ ## Metadata
4
+
5
+ | Field | Value |
6
+ |-------|-------|
7
+ | Short name | nudge-exp |
8
+ | Title | When Do Nudges Work? Context Effects in Organisational Decision Making |
9
+ | Authors | Alex Chen, Tom Brown |
10
+ | Target journal | Judgment and Decision Making |
11
+ | Stage | Literature Review |
12
+ | Directory | `~/Research/nudge-exp/` |
13
+
14
+ ## Research Question
15
+
16
+ Under what conditions do behavioural nudges improve decision quality in organisational contexts?
17
+
18
+ ## Current Status
19
+
20
+ - Literature review in progress (40+ papers collected)
21
+ - Theoretical framework emerging
22
+ - Experiment not yet designed
@@ -0,0 +1,31 @@
1
+ # Paper: Decision Making Under Uncertainty
2
+
3
+ ## Metadata
4
+
5
+ | Field | Value |
6
+ |-------|-------|
7
+ | Short name | uncertainty |
8
+ | Title | Decision Making Under Uncertainty: A Behavioural Approach |
9
+ | Authors | Alex Chen, Jane Smith |
10
+ | Target journal | Management Science |
11
+ | Stage | Drafting |
12
+ | Directory | `~/Research/uncertainty/` |
13
+
14
+ ## Research Question
15
+
16
+ How do individuals' decision-making strategies change when uncertainty increases in organisational settings?
17
+
18
+ ## Key Contributions
19
+
20
+ 1. Novel experimental design for measuring decision quality under varying uncertainty
21
+ 2. Theoretical framework linking uncertainty perception to strategy selection
22
+ 3. Empirical evidence from a lab experiment with N=200 participants
23
+
24
+ ## Current Status
25
+
26
+ - Introduction: First draft (needs revision based on co-author feedback)
27
+ - Literature review: Complete
28
+ - Methodology: Complete
29
+ - Results: In progress (Table 2 pending)
30
+ - Discussion: Not started
31
+ - Conclusion: Not started
@@ -0,0 +1,48 @@
1
+ # Review: K-Dense-AI/claude-scientific-writer
2
+
3
+ > Reviewed 2026-02-24. Repo: https://github.com/K-Dense-AI/claude-scientific-writer (856 stars)
4
+
5
+ ## What It Is
6
+
7
+ A monolithic scientific document generation tool combining Claude with real-time literature search (Perplexity via OpenRouter) and AI diagram generation (Nano Banana Pro). Available as Claude Code plugin, CLI, or Python API. MIT-licensed.
8
+
9
+ ## Architecture
10
+
11
+ - **24 skills**: writing, research-lookup, peer-review, citation-management, clinical-reports, research-grants, latex-posters, scientific-slides, hypothesis-generation, market-research-reports, scientific-schematics, infographics, literature-review, venue-templates, etc.
12
+ - **External APIs**: Perplexity Sonar Pro Search (literature), Nano Banana Pro (diagrams), Parallel Web API (general web search/extraction)
13
+ - **LaTeX-first** output with BibTeX citations by default
14
+ - **Multi-pass writing**: skeleton → research per section → write → verify citations → compile → PDF review via image conversion
15
+ - **Version management**: v1_draft.tex, v2_draft.tex, etc. — never overwrites previous versions
16
+ - All research results saved to `sources/` folder for auditability and context recovery
17
+
18
+ ## Key Strengths
19
+
20
+ - **Citation verification loop**: research before writing, verify every BibTeX entry has complete metadata (DOI, volume, pages), web-search for missing fields
21
+ - **Peer review**: quantitative ScholarEval framework (8-dimension scoring)
22
+ - **Figure generation**: minimum counts per document type, mandatory graphical abstract
23
+ - **Full lifecycle**: hypothesis → writing → review → revision
24
+ - **Structured output pipeline** with progress tracking and summary reports
25
+
26
+ ## Overlap with Our Setup
27
+
28
+ | Their Feature | Our Equivalent |
29
+ |---------------|---------------|
30
+ | `research-lookup` + `citation-management` | `/literature` skill + `packages/mcp-bibliography` |
31
+ | `peer-review` / `scholar-evaluation` | `referee2-reviewer`, `paper-critic`, `domain-reviewer` agents |
32
+ | `venue-templates` | Scout `venues.json` + framing workflow |
33
+ | `latex-posters` / `scientific-slides` | `/beamer-deck`, `/quarto-deck` |
34
+
35
+ ## Key Differences from Our Approach
36
+
37
+ - **Monolithic vs modular**: one tool does everything; ours is skills + agents + standalone apps
38
+ - **External API-heavy**: requires Perplexity, Nano Banana Pro, Parallel Web API keys (more cost)
39
+ - **"Never stop" philosophy**: their CLAUDE.md says "NEVER ask 'would you like me to continue?'"; ours has plan-first discipline and phase boundaries
40
+ - **No novelty scoring or venue-specific framing**: that's unique to our Scout
41
+ - **Heavy CLAUDE.md**: ~500 lines of instructions; ours follows lean-claude-md rule
42
+ - **No design-before-results discipline**: no equivalent to our research design rules
43
+
44
+ ## Ideas Worth Borrowing
45
+
46
+ 1. **Citation metadata verification loop** — after BibTeX creation, systematically search for missing DOI/volume/pages. Could enhance `/bib-validate`.
47
+ 2. **PDF review via image conversion** — convert PDF to images and visually inspect each page for formatting issues. Could add as a step in `/latex-autofix`.
48
+ 3. **Progress logging per section** — timestamped logs of word count and citation count per section during writing. Lighter version could fit our session logging.
@@ -0,0 +1,104 @@
1
+ # Scott Cunningham — Multi-Analyst Agent Designs & Non-Standard Errors
2
+
3
+ > Source: Substack posts 21, 25, 26, 27 (Feb–Mar 2026). Series on Claude Code for causal inference.
4
+
5
+ ## Post 26: Computational Many-Analysts Design (DiD Part 2)
6
+
7
+ ### Core Idea
8
+
9
+ Run N independent AI agents on the same dataset + estimator, isolating each in a temp directory with no shared memory. This approximates a many-analysts design (Silberzahn et al. 2018, Huntington-Klein et al. 2021, Menkveld et al. 2024 "non-standard errors") at near-zero marginal cost.
10
+
11
+ ### Experiment
12
+
13
+ - 15 agents × 5 packages (csdid, csdid2, did, differences, diff-diff) × 3 languages
14
+ - Each launched via `claude -p` in isolated temp dirs, no shared history
15
+ - Instructions: Callaway & Sant'Anna estimator, universal base period, not-yet-treated controls
16
+ - **Primary discretionary node:** covariate selection for conditional parallel trends
17
+
18
+ ### Key Findings
19
+
20
+ - **Structural decisions unanimous (15/15):** control group, base period, balanced cohorts, trimming
21
+ - **All variation in covariate selection:** log GDP (14/15), population (12/15), poverty (10/15), health spending (7/15), Bolsa Familia (2/15), geographic vars (1/15)
22
+ - Agents drew the confounder/mediator boundary differently — same reasoning, different thresholds
23
+ - All 15 chose doubly robust; Stata agents split on DRIPW vs DRIMP variant
24
+
25
+ ### Relevance
26
+
27
+ - Demonstrates that agent-based many-analysts designs are feasible and cheap
28
+ - Covariate selection as the key discretionary node in DiD — directly measurable
29
+ - Forest plots of agent estimates could become a standard robustness diagnostic
30
+ - The spread across agents quantifies researcher degrees of freedom computationally
31
+
32
+ ## Post 21: Attention, Verification, and Convex Costs
33
+
34
+ ### Framework
35
+
36
+ Isoquants for cognitive work have flattened → human time ≈ machine time for many tasks → rational substitution toward cheaper machine time → reduced human attention despite more output.
37
+
38
+ ### Key Claims
39
+
40
+ 1. **5x productivity, >5x mess.** "Stock pollutants" (excess files, duplicate outputs, hard-coded results, branching pipelines) grow convex in productivity, not linearly.
41
+ 2. **Three binding constraints in human-AI research:**
42
+ - Human verification (Karpathy: "the new skill is verification")
43
+ - Sustained attention (resist automation of the learning process itself)
44
+ - Congestion management (finding things in your own output)
45
+ 3. **Legacy projects are harder.** New projects scaffold cleanly; revived R&Rs become "Frankenstein hodge-podge" of old and new organisation.
46
+
47
+ ### Relevance
48
+
49
+ - Empirical account of human-AI collaboration friction from a power user
50
+ - The convex cost function is testable: does error rate grow faster than output rate?
51
+ - "Beautiful decks" as attention maintenance strategy — works at low volume, breaks at scale
52
+ - Directly maps to the user's research themes (human-AI collaboration, org behaviour)
53
+
54
+ ## Post 25: OpenClaw Security & Anthropic's Response
55
+
56
+ ### What Happened
57
+
58
+ OpenClaw (always-on WhatsApp AI agent, 230K GitHub stars) had critical security failures:
59
+ - No authentication by default; ~1000 open installations found via Shodan
60
+ - Prompt injection: embedded instructions in emails forwarded data to attackers
61
+ - 230+ malicious plugins in one week on unmoderated skill marketplace
62
+ - Cisco found a #1-ranked skill that was literal malware (curl exfiltration)
63
+
64
+ ### Anthropic's Response
65
+
66
+ - Cowork (scheduled tasks) and Remote Control (phone → local machine)
67
+ - Key architectural differences: no inbound ports, short-lived credentials, sandboxed, TLS encryption
68
+ - Trade-off: more constrained but safer; computer must be awake for Cowork
69
+
70
+ ### Relevance
71
+
72
+ - Case study in safety norms emerging in AI agent ecosystems
73
+ - The semantic attack surface (prompt = attack vector) vs traditional code exploits
74
+ - Anthropic's "safety as brand equity" thesis — early safety investment paying off in agent era
75
+
76
+ ## Post 27: Research vs Publishing Economics
77
+
78
+ ### Core Argument
79
+
80
+ AI collapses the cost of producing submission-quality manuscripts → 5x submissions → acceptance rates crater → journals earn windfall fees → referee system breaks → prisoner's dilemma.
81
+
82
+ ### Numbers
83
+
84
+ - ~12,000 research-active economists, ~39,000 submissions/year currently
85
+ - At 5x: top-5 acceptance drops from 5% to 1%; 87 journals go from $6.2M to $31M in fees
86
+ - Referee need: 146K reports/year against ~54K realistic supply
87
+ - Individual cost of 3x scaling: ~$3,200/year (fees + Claude Max)
88
+
89
+ ### Project APE (Zurich Social Catalyst Lab)
90
+
91
+ - 204 fully automated econ papers; 60 added in one week
92
+ - 4.7% win rate vs AER-equivalent in head-to-head, improving to 7.6% in latest cohort
93
+ - Goal: 1,000 papers
94
+
95
+ ### Cunningham's Own Test
96
+
97
+ Fully automated a paper end-to-end: idea generation → shift-share IV → web data → analysis → writing → refine.ink review ($40-50) → revision → referee2 audit → cross-language code audit. Total: ~$100, few hours.
98
+
99
+ ### Relevance
100
+
101
+ - The "binding constraint shifts from production to evaluation" thesis
102
+ - 75 working papers on a website = "lottery player" signal, not "serious researcher"
103
+ - Refine.ink as a verification service that gets paid multiple times per paper (polishing → submission → desk screen → R&R)
104
+ - For the user: the institutional response to AI-generated research volume is a research topic in itself (org behaviour, mechanism design)
@@ -0,0 +1,62 @@
1
+ # Scott Cunningham — Multi-Language Code Audits
2
+
3
+ > Source: Substack (Feb 2026). First in a series on Claude Code for causal inference pipelines.
4
+
5
+ ## Core Idea
6
+
7
+ Frame LLM coding errors as **measurement error**: random, language-specific syntax mistakes that are independent across languages. If errors in R, Python, and Stata are independent:
8
+
9
+ ```
10
+ P(all three wrong) = ε_R × ε_P × ε_S ≈ very small
11
+ ```
12
+
13
+ Therefore: replicate your entire data pipeline in 2-3 languages and compare outputs numerically (coefficients, test statistics, table values) to catch implementation bugs.
14
+
15
+ ## Key Claims
16
+
17
+ 1. **Hallucination as measurement error.** LLM code errors are random draws from a language-specific error distribution — analogous to classical measurement error (variable = true value + noise).
18
+ 2. **Independence across languages.** Syntax-specific errors (Stata's missing-value trap, R's factor ordering, Python's 0-indexing) are plausibly independent because they stem from different language grammars.
19
+ 3. **Full pipeline replication, not just code review.** Replicate cleaning, merging, and estimation end-to-end in 2+ languages. Compare outputs table-by-table, coefficient-by-coefficient.
20
+
21
+ ## When It Works
22
+
23
+ - Deterministic computations: OLS, DiD, IV, F-tests, analytical standard errors, R-squared
24
+ - Data processing: cleaning, merging, variable construction
25
+ - Any pipeline where the same inputs should produce identical numerical outputs
26
+
27
+ ## When It Doesn't Work
28
+
29
+ - Bootstrap (language-specific seeds)
30
+ - Simulation-based estimators (simulated MLE, method of simulated moments)
31
+ - Bayesian MCMC (Gibbs, HMC)
32
+ - EM algorithms with random starting points
33
+ - Machine learning (SGD, random forests, neural net initialisation)
34
+
35
+ ## Limitation: Independence Breaks Down for Conceptual Errors
36
+
37
+ The independence assumption holds for *syntax-specific* bugs but NOT for *design errors* (wrong estimand, wrong merge logic, wrong identification strategy). Conceptual errors replicate across languages because they're language-agnostic. This approach catches implementation bugs, not design bugs.
38
+
39
+ ## Illustrative Example: Stata Missing-Value Trap
40
+
41
+ ```stata
42
+ * WRONG — also sets missing values to 10 (Stata treats missing as larger than any number)
43
+ replace olddog = 10 if olddog > 10
44
+
45
+ * CORRECT — excludes missing values
46
+ replace olddog = 10 if olddog > 10 & olddog ~= .
47
+ ```
48
+
49
+ Claude Code knows the correct version (trained on Stata manuals + Nick Cox's listserv posts) but may randomly omit the missing-value guard. If you only run Stata, this propagates silently. If you also run R and Python (where `NA` handling differs), the discrepancy surfaces in the output comparison.
50
+
51
+ ## Case Study
52
+
53
+ Callaway & Sant'Anna DiD applied to Brazilian deinstitutionalisation (CAPS) and homicides. Packages used:
54
+ - **Stata:** `csdid`, `csdid2`
55
+ - **R:** `did`
56
+ - **Python:** `differences` (Dionisi), `diff-diff` (Gerber)
57
+
58
+ ## Relevance
59
+
60
+ - Complements existing `/code-review` skill (which already has cross-language verification as category 7/11) — but Cunningham's version is more aggressive: full pipeline replication, not just spot-checking
61
+ - Does NOT replace design-level audits (`/devils-advocate`, `design-before-results` rule) — those catch the conceptual errors this approach misses
62
+ - Most applicable to empirical papers with deterministic estimation pipelines
@@ -0,0 +1,72 @@
1
+ # Paper Note: Towards an AI Co-Scientist (Google, Feb 2025)
2
+
3
+ > arXiv 2502.18864 | 81 pages | Saved at `to-sort/downloads/2502.18864-ai-co-scientist.pdf`
4
+
5
+ ## Summary
6
+
7
+ Multi-agent system built on Gemini 2.0 for scientific hypothesis generation and refinement. Six specialized agents (Generation, Reflection, Ranking, Evolution, Proximity, Meta-review) orchestrated by a Supervisor agent. Applied to three biomedical domains with wet-lab validation.
8
+
9
+ ## Architecture
10
+
11
+ - **Generation agent**: Literature search, simulated scientific debates (self-play), iterative assumption identification, research expansion
12
+ - **Reflection agent**: Multi-tier review (initial safety screen, full literature-grounded review, deep verification, observational review, recontextualized tournament review)
13
+ - **Ranking agent**: Pairwise tournament using Elo ratings. Evaluates on alignment, plausibility, novelty, testability, safety
14
+ - **Proximity agent**: Builds similarity graph over hypotheses for clustering and gap identification
15
+ - **Evolution agent**: Refines top-ranked hypotheses via enhancement, feasibility improvement, combination, simplification, out-of-box thinking
16
+ - **Meta-review agent**: Synthesizes patterns across all reviews, provides feedback to other agents' prompts (learning without backpropagation)
17
+
18
+ Key design: tournament-based evolution with Elo ratings replaces "pick best from list" with a scalable ranking mechanism. More compute = monotonically better results (test-time compute scaling).
19
+
20
+ ## Key Results
21
+
22
+ | Metric | Result |
23
+ |--------|--------|
24
+ | GPQA diamond accuracy (top-1) | 78.4% |
25
+ | Expert preference ranking | 2.36/4 (best among all systems) |
26
+ | Expert novelty rating | 3.64/5 |
27
+ | Expert impact rating | 3.09/5 |
28
+ | Drug repurposing (AML) | 3/6 expert-selected drugs confirmed in vitro; KIRA6 is a genuine novel discovery |
29
+ | Liver fibrosis targets | 2/3 epigenetic modifiers showed anti-fibrotic activity |
30
+ | AMR (cf-PICIs) | Independently recapitulated an unpublished finding in 2 days vs. decade-long conventional programme |
31
+
32
+ Expert-in-the-loop results (Fig 6): AI-augmented expert hypotheses eventually surpass both pure-AI and pure-expert baselines.
33
+
34
+ ## Limitations (Acknowledged)
35
+
36
+ - Open-access literature only (misses paywalled papers)
37
+ - No negative results data (published literature skews positive)
38
+ - No multimodal reasoning (can't read figures, charts, omics data)
39
+ - Elo is self-evaluated, not grounded in external truth
40
+ - Biomedical only (no social science, economics, or humanities validation)
41
+ - Small expert evaluation (11 goals, 7 experts)
42
+
43
+ ## Relevance to the user's Research
44
+
45
+ ### For human-AI collaboration research
46
+ - Concrete case study of human-AI complementarity (citable: Fig 6 shows AI+expert > either alone)
47
+ - Expert-in-the-loop design: scientists refine goals, provide manual reviews, contribute hypotheses that compete alongside AI-generated ones
48
+ - Gap in their framework: no MCDM perspective on how scientists should *decide* which AI-generated hypotheses to pursue (novelty vs. feasibility vs. resource cost vs. alignment)
49
+
50
+ ### For multi-agent systems research
51
+ - Well-documented architecture with emergent capabilities through orchestration
52
+ - Tournament + meta-review loop is a novel coordination mechanism distinct from sequential pipelines or debate protocols
53
+ - Proximity graph for idea deduplication and exploration-space mapping
54
+
55
+ ### Borrowed for Claude Code infrastructure
56
+ - **Multi-turn debate** added to `/devils-advocate` (3-round: critic -> defense -> adjudication)
57
+
58
+ ### Not worth implementing
59
+ - Full tournament/Elo infrastructure (overkill for 6 projects vs. hundreds of hypotheses)
60
+ - Proximity graphs (not enough parallel outputs to need clustering)
61
+ - Evolution agent (existing review cycles already iterate)
62
+
63
+ ## Citation
64
+
65
+ ```bibtex
66
+ @article{gottweis2025towards,
67
+ title={Towards an AI Co-Scientist},
68
+ author={Gottweis, Juraj and others},
69
+ year={2025},
70
+ journal={arXiv preprint arXiv:2502.18864}
71
+ }
72
+ ```
@@ -0,0 +1,58 @@
1
+ # karpathy/llm-council — Review
2
+
3
+ > Reviewed 2026-02-25. Repo: https://github.com/karpathy/llm-council (Karpathy "Saturday hack")
4
+
5
+ ## What It Does
6
+
7
+ Multi-model council that answers questions in 3 stages:
8
+ 1. **Stage 1 — Individual responses**: Query all council members (4 models) in parallel via OpenRouter
9
+ 2. **Stage 2 — Peer review**: Each model ranks all other responses (anonymised as "Response A/B/C/D"), parses `FINAL RANKING:` section
10
+ 3. **Stage 3 — Chairman synthesis**: A designated chairman model synthesises all responses + rankings into a final answer
11
+
12
+ ## Architecture
13
+
14
+ - **Backend**: FastAPI + httpx (async OpenRouter calls) + JSON file storage
15
+ - **Frontend**: React + Vite (separate app)
16
+ - **API**: OpenRouter only (same as our Scout migration)
17
+ - **Config**: `COUNCIL_MODELS` list + `CHAIRMAN_MODEL` in `config.py`
18
+ - **Streaming**: SSE endpoint streams stage completions to frontend
19
+ - Dependencies: `fastapi`, `httpx`, `pydantic`, `uvicorn`, `python-dotenv`
20
+
21
+ ## Key Design Choices
22
+
23
+ - Anonymous peer review: models see "Response A/B/C" not model names — prevents brand-bias
24
+ - Parallel queries via `asyncio.gather` — all models queried simultaneously
25
+ - Aggregate rankings: Borda-count-style average position across all reviewers
26
+ - Chairman can be any model (default: Gemini 3 Pro)
27
+ - Regex parsing of `FINAL RANKING:` section — fragile but works
28
+
29
+ ## What's Good
30
+
31
+ - Clean separation: `openrouter.py` (API client) → `council.py` (orchestration) → `main.py` (routes)
32
+ - Parallel execution throughout — fast despite 4+ model calls per stage
33
+ - SSE streaming so frontend shows progressive results
34
+ - Anonymisation prevents model-name bias in peer review
35
+
36
+ ## Weaknesses
37
+
38
+ - No system prompts — all queries are bare user messages
39
+ - JSON file storage — no DB, no caching
40
+ - `FINAL RANKING:` parsing is fragile (regex on free-form text)
41
+ - No retry logic on model failures
42
+ - No cost tracking or token counting
43
+ - Fixed council membership — no per-query model selection
44
+ - No conversation context (each turn is independent, no history passed)
45
+
46
+ ## Relevance for Scout
47
+
48
+ Core 3-stage pattern is directly reusable:
49
+ 1. Query N models with the same prompt (research idea to evaluate)
50
+ 2. Each model reviews/ranks the other responses (anonymised)
51
+ 3. Chairman synthesises into a consensus evaluation
52
+
53
+ Key adaptations needed:
54
+ - Domain-specific system prompts (research novelty/framing context, not general Q&A)
55
+ - Use structured JSON output (Pydantic models) instead of free-form markdown
56
+ - Integrate with existing OpenRouter client (`LLMService`) rather than raw httpx
57
+ - Store results in SQLite (existing pattern) not JSON files
58
+ - HTMX partial rendering, not React SPA
@@ -0,0 +1,175 @@
1
+ # Multi-Coder Reliability Protocol
2
+
3
+ > Source: weiai-wayne-xu/commDAAF — Nonprofit Mission Framing Study (March 2026)
4
+ > Problem solved: Establishing inter-coder reliability for AI-assisted content analysis without human validation
5
+
6
+ ## Overview
7
+
8
+ Traditional content analysis requires human coders for reliability. This protocol uses **three independent AI models** to establish reliability without human intervention while avoiding "single-coder circularity."
9
+
10
+ ## The 3-Model Protocol
11
+
12
+ ```
13
+ ┌─────────────┐ ┌─────────────┐
14
+ │ Coder A │ │ Coder B │
15
+ │ (Codex) │ │ (Gemini) │
16
+ │ Batch 1 │ │ Batch 2 │
17
+ └──────┬──────┘ └──────┬──────┘
18
+ │ │
19
+ └────────┬─────────┘
20
+
21
+ ┌─────────────┐
22
+ │ Coder C │
23
+ │ (Claude) │
24
+ │ 30-item │
25
+ │ reliability │
26
+ │ sample │
27
+ └─────────────┘
28
+ ```
29
+
30
+ ## Protocol Steps
31
+
32
+ ### Step 1: Split Primary Coding
33
+ ```python
34
+ # Divide dataset between two coders
35
+ batch_1 = missions[:len(missions)//2] # → Codex
36
+ batch_2 = missions[len(missions)//2:] # → Gemini
37
+ ```
38
+
39
+ ### Step 2: Code Independently
40
+ Each coder receives identical instructions but codes independently:
41
+ ```python
42
+ codex_results = code_with_codex(batch_1, instructions)
43
+ gemini_results = code_with_gemini(batch_2, instructions)
44
+ ```
45
+
46
+ ### Step 3: Sample for Reliability
47
+ ```python
48
+ import random
49
+ random.seed(42) # Reproducible
50
+
51
+ # Sample from BOTH batches
52
+ combined = codex_results + gemini_results
53
+ reliability_sample = random.sample(combined, 30)
54
+ ```
55
+
56
+ ### Step 4: Third-Model Validation
57
+ ```python
58
+ # Claude codes the 30-item sample independently
59
+ claude_results = code_with_claude(reliability_sample, instructions)
60
+
61
+ # Compare against original coders
62
+ kappa_vs_codex = cohens_kappa(
63
+ [r for r in claude_results if r["original_coder"] == "codex"],
64
+ [r for r in codex_results if r["id"] in sample_ids]
65
+ )
66
+ kappa_vs_gemini = cohens_kappa(
67
+ [r for r in claude_results if r["original_coder"] == "gemini"],
68
+ [r for r in gemini_results if r["id"] in sample_ids]
69
+ )
70
+ ```
71
+
72
+ ### Step 5: Report Both kappa Values
73
+ ```markdown
74
+ ### Inter-Coder Reliability
75
+
76
+ | Comparison | kappa | Agreement |
77
+ |------------|-------|-----------|
78
+ | Claude vs Codex | 0.94 | 97% |
79
+ | Claude vs Gemini | 0.92 | 96% |
80
+ | **Overall** | **0.935** | **96.7%** |
81
+ ```
82
+
83
+ ## Why This Works
84
+
85
+ | Concern | How Protocol Addresses It |
86
+ |---------|---------------------------|
87
+ | Single-model bias | Three different model families |
88
+ | Circular validation | Validator (Claude) codes independently and never sees the primary coders' outputs |
89
+ | Provider collusion | Codex (OpenAI), Gemini (Google), Claude (Anthropic) |
90
+ | Replicability | Fixed random seed, identical instructions |
91
+
92
+ ## Minimum Requirements
93
+
94
+ - **3 distinct models** from different providers
95
+ - **30+ items** in reliability sample
96
+ - **Fixed random seed** for reproducibility
97
+ - **Identical instructions** to all coders
98
+ - **kappa > 0.80** for acceptable reliability (Landis & Koch)
99
+
100
+ ## Landis & Koch kappa Interpretation
101
+
102
+ | kappa | Interpretation |
103
+ |-------|----------------|
104
+ | < 0.00 | Poor |
105
+ | 0.00-0.20 | Slight |
106
+ | 0.21-0.40 | Fair |
107
+ | 0.41-0.60 | Moderate |
108
+ | 0.61-0.80 | Substantial |
109
+ | **0.81-1.00** | **Almost perfect** |
110
+
111
+ ## Code Template
112
+
113
+ ```python
114
+ from sklearn.metrics import cohen_kappa_score
115
+ import random
116
+
117
+ def compute_reliability(primary_results: list, validator_results: list) -> dict:
118
+ """Compute inter-coder reliability between primary coder and validator."""
119
+
120
+ # Match items by ID
121
+ matched = []
122
+ for v in validator_results:
123
+ for p in primary_results:
124
+ if p["id"] == v["id"]:
125
+ matched.append((p["code"], v["code"]))
126
+ break
127
+
128
+ primary_codes = [m[0] for m in matched]
129
+ validator_codes = [m[1] for m in matched]
130
+
131
+ # Cohen's kappa
132
+ kappa = cohen_kappa_score(primary_codes, validator_codes)
133
+
134
+ # Simple agreement
135
+ agreement = sum(p == v for p, v in matched) / len(matched)
136
+
137
+ return {
138
+ "n_sample": len(matched),
139
+ "cohens_kappa": round(kappa, 3),
140
+ "simple_agreement": round(agreement * 100, 1),
141
+ "interpretation": interpret_kappa(kappa),
142
+ }
143
+
144
+ def interpret_kappa(k: float) -> str:
145
+ if k >= 0.81: return "almost perfect"
146
+ if k >= 0.61: return "substantial"
147
+ if k >= 0.41: return "moderate"
148
+ if k >= 0.21: return "fair"
149
+ if k >= 0.00: return "slight"
150
+ return "poor"
151
+ ```
152
+
153
+ ## Reporting Disagreements
154
+
155
+ Always report the nature of disagreements:
156
+
157
+ ```markdown
158
+ ### Disagreement Analysis
159
+
160
+ Single disagreement (1/30): County fair organization
161
+ - Codex: SERVICE (provides entertainment to community)
162
+ - Claude: FELLOWSHIP (exists for member exhibitors)
163
+
164
+ **Interpretation:** Borderline case illustrating reasonable ambiguity
165
+ in the scheme, not systematic divergence.
166
+ ```
167
+
168
+ ## Anti-Patterns to Avoid
169
+
170
+ | Anti-Pattern | Problem | Fix |
171
+ |--------------|---------|-----|
172
+ | Same model for coding + validation | Circular | Use different model families |
173
+ | No reliability sample | Unverifiable | Always sample 30+ items |
174
+ | Reconciliation before kappa | Inflated agreement | Compute kappa on independent codes |
175
+ | Reporting only agreement % | Ignores chance | Always report Cohen's kappa |