flonat-research 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/domain-reviewer.md +336 -0
- package/.claude/agents/fixer.md +226 -0
- package/.claude/agents/paper-critic.md +370 -0
- package/.claude/agents/peer-reviewer.md +289 -0
- package/.claude/agents/proposal-reviewer.md +215 -0
- package/.claude/agents/referee2-reviewer.md +367 -0
- package/.claude/agents/references/journal-referee-profiles.md +354 -0
- package/.claude/agents/references/paper-critic/council-personas.md +77 -0
- package/.claude/agents/references/paper-critic/council-prompts.md +198 -0
- package/.claude/agents/references/peer-reviewer/report-template.md +199 -0
- package/.claude/agents/references/peer-reviewer/sa-prompts.md +260 -0
- package/.claude/agents/references/peer-reviewer/security-scan.md +188 -0
- package/.claude/agents/references/proposal-reviewer/report-template.md +144 -0
- package/.claude/agents/references/proposal-reviewer/sa-prompts.md +149 -0
- package/.claude/agents/references/referee-config.md +114 -0
- package/.claude/agents/references/referee2-reviewer/audit-checklists.md +287 -0
- package/.claude/agents/references/referee2-reviewer/report-template.md +334 -0
- package/.claude/rules/design-before-results.md +52 -0
- package/.claude/rules/ignore-agents-md.md +17 -0
- package/.claude/rules/ignore-gemini-md.md +17 -0
- package/.claude/rules/lean-claude-md.md +45 -0
- package/.claude/rules/learn-tags.md +99 -0
- package/.claude/rules/overleaf-separation.md +67 -0
- package/.claude/rules/plan-first.md +175 -0
- package/.claude/rules/read-docs-first.md +50 -0
- package/.claude/rules/scope-discipline.md +28 -0
- package/.claude/settings.json +125 -0
- package/.context/current-focus.md +33 -0
- package/.context/preferences/priorities.md +36 -0
- package/.context/preferences/task-naming.md +28 -0
- package/.context/profile.md +29 -0
- package/.context/projects/_index.md +41 -0
- package/.context/projects/papers/nudge-exp.md +22 -0
- package/.context/projects/papers/uncertainty.md +31 -0
- package/.context/resources/claude-scientific-writer-review.md +48 -0
- package/.context/resources/cunningham-multi-analyst-agents.md +104 -0
- package/.context/resources/cunningham-multilang-code-audit.md +62 -0
- package/.context/resources/google-ai-co-scientist-review.md +72 -0
- package/.context/resources/karpathy-llm-council-review.md +58 -0
- package/.context/resources/multi-coder-reliability-protocol.md +175 -0
- package/.context/resources/pedro-santanna-takeaways.md +96 -0
- package/.context/resources/venue-rankings/abs_ajg_2024.csv +1823 -0
- package/.context/resources/venue-rankings/abs_ajg_2024_econ.csv +356 -0
- package/.context/resources/venue-rankings/cabs_4_4star_theory.csv +40 -0
- package/.context/resources/venue-rankings/core_2026.csv +801 -0
- package/.context/resources/venue-rankings.md +147 -0
- package/.context/workflows/README.md +69 -0
- package/.context/workflows/daily-review.md +91 -0
- package/.context/workflows/meeting-actions.md +108 -0
- package/.context/workflows/replication-protocol.md +155 -0
- package/.context/workflows/weekly-review.md +113 -0
- package/.mcp-server-biblio/formatters.py +158 -0
- package/.mcp-server-biblio/pyproject.toml +11 -0
- package/.mcp-server-biblio/server.py +678 -0
- package/.mcp-server-biblio/sources/__init__.py +14 -0
- package/.mcp-server-biblio/sources/base.py +73 -0
- package/.mcp-server-biblio/sources/formatters.py +83 -0
- package/.mcp-server-biblio/sources/models.py +22 -0
- package/.mcp-server-biblio/sources/multi_source.py +243 -0
- package/.mcp-server-biblio/sources/openalex_source.py +183 -0
- package/.mcp-server-biblio/sources/scopus_source.py +309 -0
- package/.mcp-server-biblio/sources/wos_source.py +508 -0
- package/.mcp-server-biblio/uv.lock +896 -0
- package/.scripts/README.md +161 -0
- package/.scripts/ai_pattern_density.py +446 -0
- package/.scripts/conf +445 -0
- package/.scripts/config.py +122 -0
- package/.scripts/count_inventory.py +275 -0
- package/.scripts/daily_digest.py +288 -0
- package/.scripts/done +177 -0
- package/.scripts/extract_meeting_actions.py +223 -0
- package/.scripts/focus +176 -0
- package/.scripts/generate-codex-agents-md.py +217 -0
- package/.scripts/inbox +194 -0
- package/.scripts/notion_helpers.py +325 -0
- package/.scripts/openalex/query_helpers.py +306 -0
- package/.scripts/papers +227 -0
- package/.scripts/query +223 -0
- package/.scripts/session-history.py +201 -0
- package/.scripts/skill-health.py +516 -0
- package/.scripts/skill-log-miner.py +273 -0
- package/.scripts/sync-to-codex.sh +252 -0
- package/.scripts/task +213 -0
- package/.scripts/tasks +190 -0
- package/.scripts/week +206 -0
- package/CLAUDE.md +197 -0
- package/LICENSE +21 -0
- package/MEMORY.md +38 -0
- package/README.md +269 -0
- package/docs/agents.md +44 -0
- package/docs/bibliography-setup.md +55 -0
- package/docs/council-mode.md +36 -0
- package/docs/getting-started.md +245 -0
- package/docs/hooks.md +38 -0
- package/docs/mcp-servers.md +82 -0
- package/docs/notion-setup.md +109 -0
- package/docs/rules.md +33 -0
- package/docs/scripts.md +303 -0
- package/docs/setup-overview/setup-overview.pdf +0 -0
- package/docs/skills.md +70 -0
- package/docs/system.md +159 -0
- package/hooks/block-destructive-git.sh +66 -0
- package/hooks/context-monitor.py +114 -0
- package/hooks/postcompact-restore.py +157 -0
- package/hooks/precompact-autosave.py +181 -0
- package/hooks/promise-checker.sh +124 -0
- package/hooks/protect-source-files.sh +81 -0
- package/hooks/resume-context-loader.sh +53 -0
- package/hooks/startup-context-loader.sh +102 -0
- package/package.json +51 -0
- package/packages/cli-council/.github/workflows/claude-code-review.yml +44 -0
- package/packages/cli-council/.github/workflows/claude.yml +50 -0
- package/packages/cli-council/README.md +100 -0
- package/packages/cli-council/pyproject.toml +43 -0
- package/packages/cli-council/src/cli_council/__init__.py +19 -0
- package/packages/cli-council/src/cli_council/__main__.py +185 -0
- package/packages/cli-council/src/cli_council/backends/__init__.py +8 -0
- package/packages/cli-council/src/cli_council/backends/base.py +81 -0
- package/packages/cli-council/src/cli_council/backends/claude.py +25 -0
- package/packages/cli-council/src/cli_council/backends/codex.py +27 -0
- package/packages/cli-council/src/cli_council/backends/gemini.py +26 -0
- package/packages/cli-council/src/cli_council/checkpoint.py +212 -0
- package/packages/cli-council/src/cli_council/config.py +51 -0
- package/packages/cli-council/src/cli_council/council.py +391 -0
- package/packages/cli-council/src/cli_council/models.py +46 -0
- package/packages/llm-council/.github/workflows/claude-code-review.yml +44 -0
- package/packages/llm-council/.github/workflows/claude.yml +50 -0
- package/packages/llm-council/README.md +453 -0
- package/packages/llm-council/pyproject.toml +42 -0
- package/packages/llm-council/src/llm_council/__init__.py +23 -0
- package/packages/llm-council/src/llm_council/__main__.py +259 -0
- package/packages/llm-council/src/llm_council/checkpoint.py +193 -0
- package/packages/llm-council/src/llm_council/client.py +253 -0
- package/packages/llm-council/src/llm_council/config.py +232 -0
- package/packages/llm-council/src/llm_council/council.py +482 -0
- package/packages/llm-council/src/llm_council/models.py +46 -0
- package/packages/mcp-bibliography/MEMORY.md +31 -0
- package/packages/mcp-bibliography/_app.py +226 -0
- package/packages/mcp-bibliography/formatters.py +158 -0
- package/packages/mcp-bibliography/log/2026-03-13-2100.md +35 -0
- package/packages/mcp-bibliography/pyproject.toml +15 -0
- package/packages/mcp-bibliography/run.sh +20 -0
- package/packages/mcp-bibliography/scholarly_formatters.py +83 -0
- package/packages/mcp-bibliography/server.py +1857 -0
- package/packages/mcp-bibliography/tools/__init__.py +28 -0
- package/packages/mcp-bibliography/tools/_registry.py +19 -0
- package/packages/mcp-bibliography/tools/altmetric.py +107 -0
- package/packages/mcp-bibliography/tools/core.py +92 -0
- package/packages/mcp-bibliography/tools/dblp.py +52 -0
- package/packages/mcp-bibliography/tools/openalex.py +296 -0
- package/packages/mcp-bibliography/tools/opencitations.py +102 -0
- package/packages/mcp-bibliography/tools/openreview.py +179 -0
- package/packages/mcp-bibliography/tools/orcid.py +131 -0
- package/packages/mcp-bibliography/tools/scholarly.py +575 -0
- package/packages/mcp-bibliography/tools/unpaywall.py +63 -0
- package/packages/mcp-bibliography/tools/zenodo.py +123 -0
- package/packages/mcp-bibliography/uv.lock +711 -0
- package/scripts/setup.sh +143 -0
- package/skills/beamer-deck/SKILL.md +199 -0
- package/skills/beamer-deck/references/quality-rubric.md +54 -0
- package/skills/beamer-deck/references/review-prompts.md +106 -0
- package/skills/bib-validate/SKILL.md +261 -0
- package/skills/bib-validate/references/council-mode.md +34 -0
- package/skills/bib-validate/references/deep-verify.md +79 -0
- package/skills/bib-validate/references/fix-mode.md +36 -0
- package/skills/bib-validate/references/openalex-verification.md +45 -0
- package/skills/bib-validate/references/preprint-check.md +31 -0
- package/skills/bib-validate/references/ref-manager-crossref.md +41 -0
- package/skills/bib-validate/references/report-template.md +82 -0
- package/skills/code-archaeology/SKILL.md +141 -0
- package/skills/code-review/SKILL.md +265 -0
- package/skills/code-review/references/quality-rubric.md +67 -0
- package/skills/consolidate-memory/SKILL.md +208 -0
- package/skills/context-status/SKILL.md +126 -0
- package/skills/creation-guard/SKILL.md +230 -0
- package/skills/devils-advocate/SKILL.md +130 -0
- package/skills/devils-advocate/references/competing-hypotheses.md +83 -0
- package/skills/init-project/SKILL.md +115 -0
- package/skills/init-project-course/references/memory-and-settings.md +92 -0
- package/skills/init-project-course/references/organise-templates.md +94 -0
- package/skills/init-project-course/skill.md +147 -0
- package/skills/init-project-light/skill.md +139 -0
- package/skills/init-project-research/SKILL.md +368 -0
- package/skills/init-project-research/references/atlas-pipeline-sync.md +70 -0
- package/skills/init-project-research/references/atlas-schema.md +81 -0
- package/skills/init-project-research/references/confirmation-report.md +39 -0
- package/skills/init-project-research/references/domain-profile-template.md +104 -0
- package/skills/init-project-research/references/interview-round3.md +34 -0
- package/skills/init-project-research/references/literature-discovery.md +43 -0
- package/skills/init-project-research/references/scaffold-details.md +197 -0
- package/skills/init-project-research/templates/field-calibration.md +60 -0
- package/skills/init-project-research/templates/pipeline-manifest.md +63 -0
- package/skills/init-project-research/templates/run-all.sh +116 -0
- package/skills/init-project-research/templates/seed-files.md +337 -0
- package/skills/insights-deck/SKILL.md +151 -0
- package/skills/interview-me/SKILL.md +157 -0
- package/skills/latex/SKILL.md +141 -0
- package/skills/latex/references/latex-configs.md +183 -0
- package/skills/latex-autofix/SKILL.md +230 -0
- package/skills/latex-autofix/references/known-errors.md +183 -0
- package/skills/latex-autofix/references/quality-rubric.md +50 -0
- package/skills/latex-health-check/SKILL.md +161 -0
- package/skills/learn/SKILL.md +220 -0
- package/skills/learn/scripts/validate_skill.py +265 -0
- package/skills/lessons-learned/SKILL.md +201 -0
- package/skills/literature/SKILL.md +335 -0
- package/skills/literature/references/agent-templates.md +393 -0
- package/skills/literature/references/bibliometric-apis.md +44 -0
- package/skills/literature/references/cli-council-search.md +79 -0
- package/skills/literature/references/openalex-api-guide.md +371 -0
- package/skills/literature/references/openalex-common-queries.md +381 -0
- package/skills/literature/references/openalex-workflows.md +248 -0
- package/skills/literature/references/reference-manager-sync.md +36 -0
- package/skills/literature/references/scopus-api-guide.md +208 -0
- package/skills/literature/references/wos-api-guide.md +308 -0
- package/skills/multi-perspective/SKILL.md +311 -0
- package/skills/multi-perspective/references/computational-many-analysts.md +77 -0
- package/skills/pipeline-manifest/SKILL.md +226 -0
- package/skills/pre-submission-report/SKILL.md +153 -0
- package/skills/process-reviews/SKILL.md +244 -0
- package/skills/process-reviews/references/rr-routing.md +101 -0
- package/skills/project-deck/SKILL.md +87 -0
- package/skills/project-safety/SKILL.md +135 -0
- package/skills/proofread/SKILL.md +254 -0
- package/skills/proofread/references/quality-rubric.md +104 -0
- package/skills/python-env/SKILL.md +57 -0
- package/skills/quarto-deck/SKILL.md +226 -0
- package/skills/quarto-deck/references/markdown-format.md +143 -0
- package/skills/quarto-deck/references/quality-rubric.md +54 -0
- package/skills/save-context/SKILL.md +174 -0
- package/skills/session-log/SKILL.md +98 -0
- package/skills/shared/concept-validation-gate.md +161 -0
- package/skills/shared/council-protocol.md +265 -0
- package/skills/shared/distribution-diagnostics.md +164 -0
- package/skills/shared/engagement-stratified-sampling.md +218 -0
- package/skills/shared/escalation-protocol.md +74 -0
- package/skills/shared/external-audit-protocol.md +205 -0
- package/skills/shared/intercoder-reliability.md +256 -0
- package/skills/shared/mcp-degradation.md +81 -0
- package/skills/shared/method-probing-questions.md +163 -0
- package/skills/shared/multi-language-conventions.md +143 -0
- package/skills/shared/paid-api-safety.md +174 -0
- package/skills/shared/palettes.md +90 -0
- package/skills/shared/progressive-disclosure.md +92 -0
- package/skills/shared/project-documentation-content.md +443 -0
- package/skills/shared/project-documentation-format.md +281 -0
- package/skills/shared/project-documentation.md +100 -0
- package/skills/shared/publication-output.md +138 -0
- package/skills/shared/quality-scoring.md +70 -0
- package/skills/shared/reference-resolution.md +77 -0
- package/skills/shared/research-quality-rubric.md +165 -0
- package/skills/shared/rhetoric-principles.md +54 -0
- package/skills/shared/skill-design-patterns.md +272 -0
- package/skills/shared/skill-index.md +240 -0
- package/skills/shared/system-documentation.md +334 -0
- package/skills/shared/tikz-rules.md +402 -0
- package/skills/shared/validation-tiers.md +121 -0
- package/skills/shared/venue-guides/README.md +46 -0
- package/skills/shared/venue-guides/cell_press_style.md +483 -0
- package/skills/shared/venue-guides/conferences_formatting.md +564 -0
- package/skills/shared/venue-guides/cs_conference_style.md +463 -0
- package/skills/shared/venue-guides/examples/cell_summary_example.md +247 -0
- package/skills/shared/venue-guides/examples/medical_structured_abstract.md +313 -0
- package/skills/shared/venue-guides/examples/nature_abstract_examples.md +213 -0
- package/skills/shared/venue-guides/examples/neurips_introduction_example.md +245 -0
- package/skills/shared/venue-guides/journals_formatting.md +486 -0
- package/skills/shared/venue-guides/medical_journal_styles.md +535 -0
- package/skills/shared/venue-guides/ml_conference_style.md +556 -0
- package/skills/shared/venue-guides/nature_science_style.md +405 -0
- package/skills/shared/venue-guides/reviewer_expectations.md +417 -0
- package/skills/shared/venue-guides/venue_writing_styles.md +321 -0
- package/skills/split-pdf/SKILL.md +172 -0
- package/skills/split-pdf/methodology.md +48 -0
- package/skills/sync-notion/SKILL.md +93 -0
- package/skills/system-audit/SKILL.md +157 -0
- package/skills/system-audit/references/sub-agent-prompts.md +294 -0
- package/skills/task-management/SKILL.md +131 -0
- package/skills/update-focus/SKILL.md +204 -0
- package/skills/update-project-doc/SKILL.md +194 -0
- package/skills/validate-bib/SKILL.md +242 -0
- package/skills/validate-bib/references/council-mode.md +34 -0
- package/skills/validate-bib/references/deep-verify.md +71 -0
- package/skills/validate-bib/references/openalex-verification.md +45 -0
- package/skills/validate-bib/references/preprint-check.md +31 -0
- package/skills/validate-bib/references/report-template.md +62 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Task Naming Conventions
|
|
2
|
+
|
|
3
|
+
## Structure
|
|
4
|
+
|
|
5
|
+
`[Action verb] [Object] - [Brief context]`
|
|
6
|
+
|
|
7
|
+
## Action Verbs
|
|
8
|
+
|
|
9
|
+
| Verb | When to use |
|
|
10
|
+
|------|-------------|
|
|
11
|
+
| Write | Creating new content from scratch |
|
|
12
|
+
| Draft | First version, expecting revisions |
|
|
13
|
+
| Edit | Modifying existing content |
|
|
14
|
+
| Review | Reading and providing feedback |
|
|
15
|
+
| Read | Reading for understanding |
|
|
16
|
+
| Send | Communication tasks |
|
|
17
|
+
| Submit | Final submission |
|
|
18
|
+
| Prepare | Getting something ready |
|
|
19
|
+
| Research | Investigating a topic |
|
|
20
|
+
| Decide | Making a choice |
|
|
21
|
+
| Follow up | Checking on something sent |
|
|
22
|
+
|
|
23
|
+
## Examples
|
|
24
|
+
|
|
25
|
+
- "Draft methodology section - uncertainty paper"
|
|
26
|
+
- "Review co-author feedback - nudge-exp intro"
|
|
27
|
+
- "Submit ethics application - experiment 1"
|
|
28
|
+
- "Read Smith (2024) - lit review"
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Profile
|
|
2
|
+
|
|
3
|
+
<!-- Customise this with your own details. This file helps Claude understand
|
|
4
|
+
your background and calibrate its responses. -->
|
|
5
|
+
|
|
6
|
+
## Identity
|
|
7
|
+
|
|
8
|
+
- **Name:** [Your Name]
|
|
9
|
+
- **Role:** PhD Researcher
|
|
10
|
+
- **Institution:** UK University
|
|
11
|
+
|
|
12
|
+
## Supervisors
|
|
13
|
+
|
|
14
|
+
| Name | Institution | Role | Focus |
|
|
15
|
+
|------|-------------|------|-------|
|
|
16
|
+
| [Supervisor 1] | [University] | Primary | [Their focus] |
|
|
17
|
+
| [Supervisor 2] | [University] | Secondary | [Their focus] |
|
|
18
|
+
|
|
19
|
+
## Teaching
|
|
20
|
+
|
|
21
|
+
- Graduate TA for Research Methods (Autumn term)
|
|
22
|
+
- Workshop facilitator for Data Analysis in R
|
|
23
|
+
|
|
24
|
+
## Tools & Preferences
|
|
25
|
+
|
|
26
|
+
- **Writing:** LaTeX (papers), Markdown (notes)
|
|
27
|
+
- **Data analysis:** R (primary), Python (secondary)
|
|
28
|
+
- **Version control:** Git
|
|
29
|
+
- **Reference management:** Paperpile (synced to `paperpile.bib`)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Projects Index
|
|
2
|
+
|
|
3
|
+
> Overview of all active research projects. Updated as projects progress.
|
|
4
|
+
|
|
5
|
+
## Active Projects
|
|
6
|
+
|
|
7
|
+
| # | Short name | Title | Stage | Target | Co-authors |
|
|
8
|
+
|---|-----------|-------|-------|--------|------------|
|
|
9
|
+
| 1 | uncertainty | Decision Making Under Uncertainty | Drafting | Management Science | J. Smith |
|
|
10
|
+
| 2 | nudge-exp | Nudge Effectiveness in Org Settings | Literature Review | JDM | T. Brown |
|
|
11
|
+
| 3 | survey-methods | Survey Design for Behavioural Research | Idea | OBHDP | — |
|
|
12
|
+
|
|
13
|
+
## Project Details
|
|
14
|
+
|
|
15
|
+
### 1. uncertainty
|
|
16
|
+
|
|
17
|
+
- **Directory:** `~/Research/uncertainty/`
|
|
18
|
+
- **Paper:** `paper/` (Overleaf symlink)
|
|
19
|
+
- **Status:** Methodology and results drafted, introduction needs revision
|
|
20
|
+
- **Next:** Finish results tables, send to co-author for review
|
|
21
|
+
- **Pipeline stage:** Drafting
|
|
22
|
+
|
|
23
|
+
### 2. nudge-exp
|
|
24
|
+
|
|
25
|
+
- **Directory:** `~/Research/nudge-exp/`
|
|
26
|
+
- **Paper:** `paper/` (Overleaf symlink)
|
|
27
|
+
- **Status:** Literature review in progress, 40+ papers collected
|
|
28
|
+
- **Next:** Synthesise literature, draft theoretical framework
|
|
29
|
+
- **Pipeline stage:** Literature Review
|
|
30
|
+
|
|
31
|
+
### 3. survey-methods
|
|
32
|
+
|
|
33
|
+
- **Directory:** `~/Research/survey-methods/`
|
|
34
|
+
- **Paper:** Not started
|
|
35
|
+
- **Status:** Idea phase — exploring feasibility
|
|
36
|
+
- **Next:** Read 5 key papers, write 1-page research question memo
|
|
37
|
+
- **Pipeline stage:** Idea
|
|
38
|
+
|
|
39
|
+
## Archived Projects
|
|
40
|
+
|
|
41
|
+
None yet.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Paper: Nudge Effectiveness in Organisational Settings
|
|
2
|
+
|
|
3
|
+
## Metadata
|
|
4
|
+
|
|
5
|
+
| Field | Value |
|
|
6
|
+
|-------|-------|
|
|
7
|
+
| Short name | nudge-exp |
|
|
8
|
+
| Title | When Do Nudges Work? Context Effects in Organisational Decision Making |
|
|
9
|
+
| Authors | Alex Chen, Tom Brown |
|
|
10
|
+
| Target journal | Judgment and Decision Making (JDM) |
|
|
11
|
+
| Stage | Literature Review |
|
|
12
|
+
| Directory | `~/Research/nudge-exp/` |
|
|
13
|
+
|
|
14
|
+
## Research Question
|
|
15
|
+
|
|
16
|
+
Under what conditions do behavioural nudges improve decision quality in organisational contexts?
|
|
17
|
+
|
|
18
|
+
## Current Status
|
|
19
|
+
|
|
20
|
+
- Literature review in progress (40+ papers collected)
|
|
21
|
+
- Theoretical framework emerging
|
|
22
|
+
- Experiment not yet designed
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Paper: Decision Making Under Uncertainty
|
|
2
|
+
|
|
3
|
+
## Metadata
|
|
4
|
+
|
|
5
|
+
| Field | Value |
|
|
6
|
+
|-------|-------|
|
|
7
|
+
| Short name | uncertainty |
|
|
8
|
+
| Title | Decision Making Under Uncertainty: A Behavioural Approach |
|
|
9
|
+
| Authors | Alex Chen, Jane Smith |
|
|
10
|
+
| Target journal | Management Science |
|
|
11
|
+
| Stage | Drafting |
|
|
12
|
+
| Directory | `~/Research/uncertainty/` |
|
|
13
|
+
|
|
14
|
+
## Research Question
|
|
15
|
+
|
|
16
|
+
How do individuals' decision-making strategies change when uncertainty increases in organisational settings?
|
|
17
|
+
|
|
18
|
+
## Key Contributions
|
|
19
|
+
|
|
20
|
+
1. Novel experimental design for measuring decision quality under varying uncertainty
|
|
21
|
+
2. Theoretical framework linking uncertainty perception to strategy selection
|
|
22
|
+
3. Empirical evidence from a lab experiment with N=200 participants
|
|
23
|
+
|
|
24
|
+
## Current Status
|
|
25
|
+
|
|
26
|
+
- Introduction: First draft (needs revision based on co-author feedback)
|
|
27
|
+
- Literature review: Complete
|
|
28
|
+
- Methodology: Complete
|
|
29
|
+
- Results: In progress (Table 2 pending)
|
|
30
|
+
- Discussion: Not started
|
|
31
|
+
- Conclusion: Not started
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Review: K-Dense-AI/claude-scientific-writer
|
|
2
|
+
|
|
3
|
+
> Reviewed 2026-02-24. Repo: https://github.com/K-Dense-AI/claude-scientific-writer (856 stars)
|
|
4
|
+
|
|
5
|
+
## What It Is
|
|
6
|
+
|
|
7
|
+
A monolithic scientific document generation tool combining Claude with real-time literature search (Perplexity via OpenRouter) and AI diagram generation (Nano Banana Pro). Available as Claude Code plugin, CLI, or Python API. MIT-licensed.
|
|
8
|
+
|
|
9
|
+
## Architecture
|
|
10
|
+
|
|
11
|
+
- **24 skills**: writing, research-lookup, peer-review, citation-management, clinical-reports, research-grants, latex-posters, scientific-slides, hypothesis-generation, market-research-reports, scientific-schematics, infographics, literature-review, venue-templates, etc.
|
|
12
|
+
- **External APIs**: Perplexity Sonar Pro Search (literature), Nano Banana Pro (diagrams), Parallel Web API (general web search/extraction)
|
|
13
|
+
- **LaTeX-first** output with BibTeX citations by default
|
|
14
|
+
- **Multi-pass writing**: skeleton → research per section → write → verify citations → compile → PDF review via image conversion
|
|
15
|
+
- **Version management**: v1_draft.tex, v2_draft.tex, etc. — never overwrites previous versions
|
|
16
|
+
- All research results saved to `sources/` folder for auditability and context recovery
|
|
17
|
+
|
|
18
|
+
## Key Strengths
|
|
19
|
+
|
|
20
|
+
- **Citation verification loop**: research before writing, verify every BibTeX entry has complete metadata (DOI, volume, pages), web-search for missing fields
|
|
21
|
+
- **Peer review**: quantitative ScholarEval framework (8-dimension scoring)
|
|
22
|
+
- **Figure generation**: minimum counts per document type, mandatory graphical abstract
|
|
23
|
+
- **Full lifecycle**: hypothesis → writing → review → revision
|
|
24
|
+
- **Structured output pipeline** with progress tracking and summary reports
|
|
25
|
+
|
|
26
|
+
## Overlap with Our Setup
|
|
27
|
+
|
|
28
|
+
| Their Feature | Our Equivalent |
|
|
29
|
+
|---------------|---------------|
|
|
30
|
+
| `research-lookup` + `citation-management` | `/literature` skill + `packages/mcp-bibliography` |
|
|
31
|
+
| `peer-review` / `scholar-evaluation` | `referee2-reviewer`, `paper-critic`, `domain-reviewer` agents |
|
|
32
|
+
| `venue-templates` | Scout `venues.json` + framing workflow |
|
|
33
|
+
| `latex-posters` / `scientific-slides` | `/beamer-deck`, `/quarto-deck` |
|
|
34
|
+
|
|
35
|
+
## Key Differences from Our Approach
|
|
36
|
+
|
|
37
|
+
- **Monolithic vs modular**: one tool does everything; ours is skills + agents + standalone apps
|
|
38
|
+
- **External API-heavy**: requires Perplexity, Nano Banana Pro, Parallel Web API keys (more cost)
|
|
39
|
+
- **"Never stop" philosophy**: their CLAUDE.md says "NEVER ask 'would you like me to continue?'"; ours has plan-first discipline and phase boundaries
|
|
40
|
+
- **No novelty scoring or venue-specific framing**: that's unique to our Scout
|
|
41
|
+
- **Heavy CLAUDE.md**: ~500 lines of instructions; ours follows lean-claude-md rule
|
|
42
|
+
- **No design-before-results discipline**: no equivalent to our research design rules
|
|
43
|
+
|
|
44
|
+
## Ideas Worth Borrowing
|
|
45
|
+
|
|
46
|
+
1. **Citation metadata verification loop** — after BibTeX creation, systematically search for missing DOI/volume/pages. Could enhance `/bib-validate`.
|
|
47
|
+
2. **PDF review via image conversion** — convert PDF to images and visually inspect each page for formatting issues. Could add as a step in `/latex-autofix`.
3. **Progress logging per section** — timestamped logs of word count and citation count per section during writing. Lighter version could fit our session logging.
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# Scott Cunningham — Multi-Analyst Agent Designs & Non-Standard Errors
|
|
2
|
+
|
|
3
|
+
> Source: Substack posts 21, 25, 26, 27 (Feb–Mar 2026). Series on Claude Code for causal inference.
|
|
4
|
+
|
|
5
|
+
## Post 26: Computational Many-Analysts Design (DiD Part 2)
|
|
6
|
+
|
|
7
|
+
### Core Idea
|
|
8
|
+
|
|
9
|
+
Run N independent AI agents on the same dataset + estimator, isolating each in a temp directory with no shared memory. This approximates a many-analysts design (Silberzahn et al. 2018, Huntington-Klein et al. 2021, Menkveld et al. 2024 "non-standard errors") at near-zero marginal cost.
|
|
10
|
+
|
|
11
|
+
### Experiment
|
|
12
|
+
|
|
13
|
+
- 15 agents × 5 packages (csdid, csdid2, did, differences, diff-diff) × 3 languages
|
|
14
|
+
- Each launched via `claude -p` in isolated temp dirs, no shared history
|
|
15
|
+
- Instructions: Callaway & Sant'Anna estimator, universal base period, not-yet-treated controls
|
|
16
|
+
- **Primary discretionary node:** covariate selection for conditional parallel trends
|
|
17
|
+
|
|
18
|
+
### Key Findings
|
|
19
|
+
|
|
20
|
+
- **Structural decisions unanimous (15/15):** control group, base period, balanced cohorts, trimming
|
|
21
|
+
- **All variation in covariate selection:** log GDP (14/15), population (12/15), poverty (10/15), health spending (7/15), Bolsa Familia (2/15), geographic vars (1/15)
|
|
22
|
+
- Agents drew the confounder/mediator boundary differently — same reasoning, different thresholds
|
|
23
|
+
- All 15 chose doubly robust; Stata agents split on DRIPW vs DRIMP variant
|
|
24
|
+
|
|
25
|
+
### Relevance
|
|
26
|
+
|
|
27
|
+
- Demonstrates that agent-based many-analysts designs are feasible and cheap
|
|
28
|
+
- Covariate selection as the key discretionary node in DiD — directly measurable
|
|
29
|
+
- Forest plots of agent estimates could become a standard robustness diagnostic
|
|
30
|
+
- The spread across agents quantifies researcher degrees of freedom computationally
|
|
31
|
+
|
|
32
|
+
## Post 21: Attention, Verification, and Convex Costs
|
|
33
|
+
|
|
34
|
+
### Framework
|
|
35
|
+
|
|
36
|
+
Isoquants for cognitive work have flattened → human time ≈ machine time for many tasks → rational substitution toward cheaper machine time → reduced human attention despite more output.
|
|
37
|
+
|
|
38
|
+
### Key Claims
|
|
39
|
+
|
|
40
|
+
1. **5x productivity, >5x mess.** "Stock pollutants" (excess files, duplicate outputs, hard-coded results, branching pipelines) grow convex in productivity, not linearly.
|
|
41
|
+
2. **Three binding constraints in human-AI research:**
|
|
42
|
+
- Human verification (Karpathy: "the new skill is verification")
|
|
43
|
+
- Sustained attention (resist automation of the learning process itself)
|
|
44
|
+
- Congestion management (finding things in your own output)
|
|
45
|
+
3. **Legacy projects are harder.** New projects scaffold cleanly; revived R&Rs become "Frankenstein hodge-podge" of old and new organisation.
|
|
46
|
+
|
|
47
|
+
### Relevance
|
|
48
|
+
|
|
49
|
+
- Empirical account of human-AI collaboration friction from a power user
|
|
50
|
+
- The convex cost function is testable: does error rate grow faster than output rate?
|
|
51
|
+
- "Beautiful decks" as attention maintenance strategy — works at low volume, breaks at scale
|
|
52
|
+
- Directly maps to the user's research themes (human-AI collaboration, org behaviour)
|
|
53
|
+
|
|
54
|
+
## Post 25: OpenClaw Security & Anthropic's Response
|
|
55
|
+
|
|
56
|
+
### What Happened
|
|
57
|
+
|
|
58
|
+
OpenClaw (always-on WhatsApp AI agent, 230K GitHub stars) had critical security failures:
|
|
59
|
+
- No authentication by default; ~1000 open installations found via Shodan
|
|
60
|
+
- Prompt injection: embedded instructions in emails forwarded data to attackers
|
|
61
|
+
- 230+ malicious plugins in one week on unmoderated skill marketplace
|
|
62
|
+
- Cisco found a #1-ranked skill that was literal malware (curl exfiltration)
|
|
63
|
+
|
|
64
|
+
### Anthropic's Response
|
|
65
|
+
|
|
66
|
+
- Cowork (scheduled tasks) and Remote Control (phone → local machine)
|
|
67
|
+
- Key architectural differences: no inbound ports, short-lived credentials, sandboxed, TLS encryption
|
|
68
|
+
- Trade-off: more constrained but safer; computer must be awake for Cowork
|
|
69
|
+
|
|
70
|
+
### Relevance
|
|
71
|
+
|
|
72
|
+
- Case study in safety norms emerging in AI agent ecosystems
|
|
73
|
+
- The semantic attack surface (prompt = attack vector) vs traditional code exploits
|
|
74
|
+
- Anthropic's "safety as brand equity" thesis — early safety investment paying off in agent era
|
|
75
|
+
|
|
76
|
+
## Post 27: Research vs Publishing Economics
|
|
77
|
+
|
|
78
|
+
### Core Argument
|
|
79
|
+
|
|
80
|
+
AI collapses the cost of producing submission-quality manuscripts → 5x submissions → acceptance rates crater → journals earn windfall fees → referee system breaks → prisoner's dilemma.
|
|
81
|
+
|
|
82
|
+
### Numbers
|
|
83
|
+
|
|
84
|
+
- ~12,000 research-active economists, ~39,000 submissions/year currently
|
|
85
|
+
- At 5x: top-5 acceptance drops from 5% to 1%; 87 journals go from $6.2M to $31M in fees
|
|
86
|
+
- Referee need: 146K reports/year against ~54K realistic supply
|
|
87
|
+
- Individual cost of 3x scaling: ~$3,200/year (fees + Claude Max)
|
|
88
|
+
|
|
89
|
+
### Project APE (Zurich Social Catalyst Lab)
|
|
90
|
+
|
|
91
|
+
- 204 fully automated econ papers; 60 added in one week
|
|
92
|
+
- 4.7% win rate vs AER-equivalent in head-to-head, improving to 7.6% in latest cohort
|
|
93
|
+
- Goal: 1,000 papers
|
|
94
|
+
|
|
95
|
+
### Cunningham's Own Test
|
|
96
|
+
|
|
97
|
+
Fully automated a paper end-to-end: idea generation → shift-share IV → web data → analysis → writing → refine.ink review ($40-50) → revision → referee2 audit → cross-language code audit. Total: ~$100, few hours.
|
|
98
|
+
|
|
99
|
+
### Relevance
|
|
100
|
+
|
|
101
|
+
- The "binding constraint shifts from production to evaluation" thesis
|
|
102
|
+
- 75 working papers on a website = "lottery player" signal, not "serious researcher"
|
|
103
|
+
- Refine.ink as a verification service that gets paid multiple times per paper (polishing → submission → desk screen → R&R)
|
|
104
|
+
- For the user: the institutional response to AI-generated research volume is a research topic in itself (org behaviour, mechanism design)
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Scott Cunningham — Multi-Language Code Audits
|
|
2
|
+
|
|
3
|
+
> Source: Substack (Feb 2026). First in a series on Claude Code for causal inference pipelines.
|
|
4
|
+
|
|
5
|
+
## Core Idea
|
|
6
|
+
|
|
7
|
+
Frame LLM coding errors as **measurement error**: random, language-specific syntax mistakes that are independent across languages. If errors in R, Python, and Stata are independent:
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
P(all three wrong) = ε_R × ε_P × ε_S ≈ very small
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Therefore: replicate your entire data pipeline in 2-3 languages and compare outputs numerically (coefficients, test statistics, table values) to catch implementation bugs.
|
|
14
|
+
|
|
15
|
+
## Key Claims
|
|
16
|
+
|
|
17
|
+
1. **Hallucination as measurement error.** LLM code errors are random draws from a language-specific error distribution — analogous to classical measurement error (variable = true value + noise).
|
|
18
|
+
2. **Independence across languages.** Syntax-specific errors (Stata's missing-value trap, R's factor ordering, Python's 0-indexing) are plausibly independent because they stem from different language grammars.
|
|
19
|
+
3. **Full pipeline replication, not just code review.** Replicate cleaning, merging, and estimation end-to-end in 2+ languages. Compare outputs table-by-table, coefficient-by-coefficient.
|
|
20
|
+
|
|
21
|
+
## When It Works
|
|
22
|
+
|
|
23
|
+
- Deterministic computations: OLS, DiD, IV, F-tests, analytical standard errors, R-squared
|
|
24
|
+
- Data processing: cleaning, merging, variable construction
|
|
25
|
+
- Any pipeline where the same inputs should produce identical numerical outputs
|
|
26
|
+
|
|
27
|
+
## When It Doesn't Work
|
|
28
|
+
|
|
29
|
+
- Bootstrap (language-specific seeds)
|
|
30
|
+
- Simulation-based estimators (simulated MLE, method of simulated moments)
|
|
31
|
+
- Bayesian MCMC (Gibbs, HMC)
|
|
32
|
+
- EM algorithms with random starting points
|
|
33
|
+
- Machine learning (SGD, random forests, neural net initialisation)
|
|
34
|
+
|
|
35
|
+
## Limitation: Independence Breaks Down for Conceptual Errors
|
|
36
|
+
|
|
37
|
+
The independence assumption holds for *syntax-specific* bugs but NOT for *design errors* (wrong estimand, wrong merge logic, wrong identification strategy). Conceptual errors replicate across languages because they're language-agnostic. This approach catches implementation bugs, not design bugs.
|
|
38
|
+
|
|
39
|
+
## Illustrative Example: Stata Missing-Value Trap
|
|
40
|
+
|
|
41
|
+
```stata
|
|
42
|
+
* WRONG — also recodes missing values to 10
|
|
43
|
+
replace olddog = 10 if olddog > 10
|
|
44
|
+
|
|
45
|
+
* CORRECT — excludes missing values
|
|
46
|
+
replace olddog = 10 if olddog > 10 & olddog < .
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Claude Code knows the correct version (trained on Stata manuals + Nick Cox's listserv posts) but may randomly omit the missing-value guard. If you only run Stata, this propagates silently. If you also run R and Python (where `NA` handling differs), the discrepancy surfaces in the output comparison.
|
|
50
|
+
|
|
51
|
+
## Case Study
|
|
52
|
+
|
|
53
|
+
Callaway & Sant'Anna DiD applied to Brazilian deinstitutionalisation (CAPS) and homicides. Packages used:
|
|
54
|
+
- **Stata:** `csdid`, `csdid2`
|
|
55
|
+
- **R:** `did`
|
|
56
|
+
- **Python:** `differences` (Dionisi), `diff-diff` (Gerber)
|
|
57
|
+
|
|
58
|
+
## Relevance
|
|
59
|
+
|
|
60
|
+
- Complements existing `/code-review` skill (which already has cross-language verification as category 7/11) — but Cunningham's version is more aggressive: full pipeline replication, not just spot-checking
|
|
61
|
+
- Does NOT replace design-level audits (`/devils-advocate`, `design-before-results` rule) — those catch the conceptual errors this approach misses
|
|
62
|
+
- Most applicable to empirical papers with deterministic estimation pipelines
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# Paper Note: Towards an AI Co-Scientist (Google, Feb 2025)
|
|
2
|
+
|
|
3
|
+
> arXiv 2502.18864 | 81 pages | Saved at `to-sort/downloads/2502.18864-ai-co-scientist.pdf`
|
|
4
|
+
|
|
5
|
+
## Summary
|
|
6
|
+
|
|
7
|
+
Multi-agent system built on Gemini 2.0 for scientific hypothesis generation and refinement. Six specialized agents (Generation, Reflection, Ranking, Evolution, Proximity, Meta-review) orchestrated by a Supervisor agent. Applied to three biomedical domains with wet-lab validation.
|
|
8
|
+
|
|
9
|
+
## Architecture
|
|
10
|
+
|
|
11
|
+
- **Generation agent**: Literature search, simulated scientific debates (self-play), iterative assumption identification, research expansion
|
|
12
|
+
- **Reflection agent**: Multi-tier review (initial safety screen, full literature-grounded review, deep verification, observational review, recontextualized tournament review)
|
|
13
|
+
- **Ranking agent**: Pairwise tournament using Elo ratings. Evaluates on alignment, plausibility, novelty, testability, safety
|
|
14
|
+
- **Proximity agent**: Builds similarity graph over hypotheses for clustering and gap identification
|
|
15
|
+
- **Evolution agent**: Refines top-ranked hypotheses via enhancement, feasibility improvement, combination, simplification, out-of-box thinking
|
|
16
|
+
- **Meta-review agent**: Synthesizes patterns across all reviews, provides feedback to other agents' prompts (learning without backpropagation)
|
|
17
|
+
|
|
18
|
+
Key design: tournament-based evolution with Elo ratings replaces "pick best from list" with a scalable ranking mechanism. More compute = monotonically better results (test-time compute scaling).
|
|
19
|
+
|
|
20
|
+
## Key Results
|
|
21
|
+
|
|
22
|
+
| Metric | Result |
|
|
23
|
+
|--------|--------|
|
|
24
|
+
| GPQA diamond accuracy (top-1) | 78.4% |
|
|
25
|
+
| Expert preference ranking | 2.36/4 (best among all systems) |
|
|
26
|
+
| Expert novelty rating | 3.64/5 |
|
|
27
|
+
| Expert impact rating | 3.09/5 |
|
|
28
|
+
| Drug repurposing (AML) | 3/6 expert-selected drugs confirmed in vitro; KIRA6 is a genuine novel discovery |
|
|
29
|
+
| Liver fibrosis targets | 2/3 epigenetic modifiers showed anti-fibrotic activity |
|
|
30
|
+
| AMR (cf-PICIs) | Independently recapitulated an unpublished finding in 2 days vs. decade-long conventional programme |
|
|
31
|
+
|
|
32
|
+
Expert-in-the-loop results (Fig 6): AI-augmented expert hypotheses eventually surpass both pure-AI and pure-expert baselines.
|
|
33
|
+
|
|
34
|
+
## Limitations (Acknowledged)
|
|
35
|
+
|
|
36
|
+
- Open-access literature only (misses paywalled papers)
|
|
37
|
+
- No negative results data (published literature skews positive)
|
|
38
|
+
- No multimodal reasoning (can't read figures, charts, omics data)
|
|
39
|
+
- Elo is self-evaluated, not grounded in external truth
|
|
40
|
+
- Biomedical only (no social science, economics, or humanities validation)
|
|
41
|
+
- Small expert evaluation (11 goals, 7 experts)
|
|
42
|
+
|
|
43
|
+
## Relevance to the user's Research
|
|
44
|
+
|
|
45
|
+
### For human-AI collaboration research
|
|
46
|
+
- Concrete case study of human-AI complementarity (citable: Fig 6 shows AI+expert > either alone)
|
|
47
|
+
- Expert-in-the-loop design: scientists refine goals, provide manual reviews, contribute hypotheses that compete alongside AI-generated ones
|
|
48
|
+
- Gap in their framework: no MCDM perspective on how scientists should *decide* which AI-generated hypotheses to pursue (novelty vs. feasibility vs. resource cost vs. alignment)
|
|
49
|
+
|
|
50
|
+
### For multi-agent systems research
|
|
51
|
+
- Well-documented architecture with emergent capabilities through orchestration
|
|
52
|
+
- Tournament + meta-review loop is a novel coordination mechanism distinct from sequential pipelines or debate protocols
|
|
53
|
+
- Proximity graph for idea deduplication and exploration-space mapping
|
|
54
|
+
|
|
55
|
+
### Borrowed for Claude Code infrastructure
|
|
56
|
+
- **Multi-turn debate** added to `/devils-advocate` (3-round: critic -> defense -> adjudication)
|
|
57
|
+
|
|
58
|
+
### Not worth implementing
|
|
59
|
+
- Full tournament/Elo infrastructure (overkill for 6 projects vs. hundreds of hypotheses)
|
|
60
|
+
- Proximity graphs (not enough parallel outputs to need clustering)
|
|
61
|
+
- Evolution agent (existing review cycles already iterate)
|
|
62
|
+
|
|
63
|
+
## Citation
|
|
64
|
+
|
|
65
|
+
```bibtex
|
|
66
|
+
@article{gottweis2025towards,
|
|
67
|
+
title={Towards an AI Co-Scientist},
|
|
68
|
+
author={Gottweis, Juraj and others},
|
|
69
|
+
year={2025},
|
|
70
|
+
journal={arXiv preprint arXiv:2502.18864}
|
|
71
|
+
}
|
|
72
|
+
```
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# karpathy/llm-council — Review
|
|
2
|
+
|
|
3
|
+
> Reviewed 2026-02-25. Repo: https://github.com/karpathy/llm-council (Karpathy "Saturday hack")
|
|
4
|
+
|
|
5
|
+
## What It Does
|
|
6
|
+
|
|
7
|
+
Multi-model council that answers questions in 3 stages:
|
|
8
|
+
1. **Stage 1 — Individual responses**: Query all council members (4 models) in parallel via OpenRouter
|
|
9
|
+
2. **Stage 2 — Peer review**: Each model ranks all other responses (anonymised as "Response A/B/C/D"), parses `FINAL RANKING:` section
|
|
10
|
+
3. **Stage 3 — Chairman synthesis**: A designated chairman model synthesises all responses + rankings into a final answer
|
|
11
|
+
|
|
12
|
+
## Architecture
|
|
13
|
+
|
|
14
|
+
- **Backend**: FastAPI + httpx (async OpenRouter calls) + JSON file storage
|
|
15
|
+
- **Frontend**: React + Vite (separate app)
|
|
16
|
+
- **API**: OpenRouter only (same as our Scout migration)
|
|
17
|
+
- **Config**: `COUNCIL_MODELS` list + `CHAIRMAN_MODEL` in `config.py`
|
|
18
|
+
- **Streaming**: SSE endpoint streams stage completions to frontend
|
|
19
|
+
- Dependencies: `fastapi`, `httpx`, `pydantic`, `uvicorn`, `python-dotenv`
|
|
20
|
+
|
|
21
|
+
## Key Design Choices
|
|
22
|
+
|
|
23
|
+
- Anonymous peer review: models see "Response A/B/C" not model names — prevents brand-bias
|
|
24
|
+
- Parallel queries via `asyncio.gather` — all models queried simultaneously
|
|
25
|
+
- Aggregate rankings: Borda-count-style average position across all reviewers
|
|
26
|
+
- Chairman can be any model (default: Gemini 3 Pro)
|
|
27
|
+
- Regex parsing of `FINAL RANKING:` section — fragile but works
|
|
28
|
+
|
|
29
|
+
## What's Good
|
|
30
|
+
|
|
31
|
+
- Clean separation: `openrouter.py` (API client) → `council.py` (orchestration) → `main.py` (routes)
|
|
32
|
+
- Parallel execution throughout — fast despite 4+ model calls per stage
|
|
33
|
+
- SSE streaming so frontend shows progressive results
|
|
34
|
+
- Anonymisation prevents model-name bias in peer review
|
|
35
|
+
|
|
36
|
+
## Weaknesses
|
|
37
|
+
|
|
38
|
+
- No system prompts — all queries are bare user messages
|
|
39
|
+
- JSON file storage — no DB, no caching
|
|
40
|
+
- `FINAL RANKING:` parsing is fragile (regex on free-form text)
|
|
41
|
+
- No retry logic on model failures
|
|
42
|
+
- No cost tracking or token counting
|
|
43
|
+
- Fixed council membership — no per-query model selection
|
|
44
|
+
- No conversation context (each turn is independent, no history passed)
|
|
45
|
+
|
|
46
|
+
## Relevance for Scout
|
|
47
|
+
|
|
48
|
+
Core 3-stage pattern is directly reusable:
|
|
49
|
+
1. Query N models with the same prompt (research idea to evaluate)
|
|
50
|
+
2. Each model reviews/ranks the other responses (anonymised)
|
|
51
|
+
3. Chairman synthesises into a consensus evaluation
|
|
52
|
+
|
|
53
|
+
Key adaptations needed:
|
|
54
|
+
- Domain-specific system prompts (research novelty/framing context, not general Q&A)
|
|
55
|
+
- Use structured JSON output (Pydantic models) instead of free-form markdown
|
|
56
|
+
- Integrate with existing OpenRouter client (`LLMService`) rather than raw httpx
|
|
57
|
+
- Store results in SQLite (existing pattern) not JSON files
|
|
58
|
+
- HTMX partial rendering, not React SPA
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# Multi-Coder Reliability Protocol
|
|
2
|
+
|
|
3
|
+
> Source: weiai-wayne-xu/commDAAF — Nonprofit Mission Framing Study (March 2026)
|
|
4
|
+
> Problem solved: Establishing inter-coder reliability for AI-assisted content analysis without human validation
|
|
5
|
+
|
|
6
|
+
## Overview
|
|
7
|
+
|
|
8
|
+
Traditional content analysis requires human coders for reliability. This protocol uses **three independent AI models** to establish reliability without human intervention while avoiding "single-coder circularity."
|
|
9
|
+
|
|
10
|
+
## The 3-Model Protocol
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
┌─────────────┐ ┌─────────────┐
|
|
14
|
+
│ Coder A │ │ Coder B │
|
|
15
|
+
│ (Codex) │ │ (Gemini) │
|
|
16
|
+
│ Batch 1 │ │ Batch 2 │
|
|
17
|
+
└──────┬──────┘ └──────┬──────┘
|
|
18
|
+
│ │
|
|
19
|
+
└────────┬─────────┘
|
|
20
|
+
▼
|
|
21
|
+
┌─────────────┐
|
|
22
|
+
│ Coder C │
|
|
23
|
+
│ (Claude) │
|
|
24
|
+
│ 30-item │
|
|
25
|
+
│ reliability │
|
|
26
|
+
│ sample │
|
|
27
|
+
└─────────────┘
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Protocol Steps
|
|
31
|
+
|
|
32
|
+
### Step 1: Split Primary Coding
|
|
33
|
+
```python
|
|
34
|
+
# Divide dataset between two coders
|
|
35
|
+
batch_1 = missions[:len(missions)//2] # → Codex
|
|
36
|
+
batch_2 = missions[len(missions)//2:] # → Gemini
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Step 2: Code Independently
|
|
40
|
+
Each coder receives identical instructions but codes independently:
|
|
41
|
+
```python
|
|
42
|
+
codex_results = code_with_codex(batch_1, instructions)
|
|
43
|
+
gemini_results = code_with_gemini(batch_2, instructions)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Step 3: Sample for Reliability
|
|
47
|
+
```python
|
|
48
|
+
import random
|
|
49
|
+
random.seed(42) # Reproducible
|
|
50
|
+
|
|
51
|
+
# Sample from BOTH batches
|
|
52
|
+
combined = codex_results + gemini_results
|
|
53
|
+
reliability_sample = random.sample(combined, 30)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Step 4: Third-Model Validation
|
|
57
|
+
```python
|
|
58
|
+
# Claude codes the 30-item sample independently
|
|
59
|
+
claude_results = code_with_claude(reliability_sample, instructions)
|
|
60
|
+
|
|
61
|
+
# Compare against original coders
|
|
62
|
+
kappa_vs_codex = cohens_kappa(
|
|
63
|
+
[r for r in claude_results if r["original_coder"] == "codex"],
|
|
64
|
+
[r for r in codex_results if r["id"] in sample_ids]
|
|
65
|
+
)
|
|
66
|
+
kappa_vs_gemini = cohens_kappa(
|
|
67
|
+
[r for r in claude_results if r["original_coder"] == "gemini"],
|
|
68
|
+
[r for r in gemini_results if r["id"] in sample_ids]
|
|
69
|
+
)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Step 5: Report Both kappa Values
|
|
73
|
+
```markdown
|
|
74
|
+
### Inter-Coder Reliability
|
|
75
|
+
|
|
76
|
+
| Comparison | kappa | Agreement |
|
|
77
|
+
|------------|-------|-----------|
|
|
78
|
+
| Claude vs Codex | 0.94 | 97% |
|
|
79
|
+
| Claude vs Gemini | 0.92 | 96% |
|
|
80
|
+
| **Overall** | **0.935** | **96.7%** |
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Why This Works
|
|
84
|
+
|
|
85
|
+
| Concern | How Protocol Addresses It |
|
|
86
|
+
|---------|---------------------------|
|
|
87
|
+
| Single-model bias | Three different model families |
|
|
88
|
+
| Circular validation | Validator (Claude) codes the sample blind, never seeing the primary coders' labels |
|
|
89
|
+
| Provider collusion | Codex (OpenAI), Gemini (Google), Claude (Anthropic) |
|
|
90
|
+
| Replicability | Fixed random seed, identical instructions |
|
|
91
|
+
|
|
92
|
+
## Minimum Requirements
|
|
93
|
+
|
|
94
|
+
- **3 distinct models** from different providers
|
|
95
|
+
- **30+ items** in reliability sample
|
|
96
|
+
- **Fixed random seed** for reproducibility
|
|
97
|
+
- **Identical instructions** to all coders
|
|
98
|
+
- **kappa > 0.80** for acceptable reliability (Landis & Koch)
|
|
99
|
+
|
|
100
|
+
## Landis & Koch kappa Interpretation
|
|
101
|
+
|
|
102
|
+
| kappa | Interpretation |
|
|
103
|
+
|-------|----------------|
|
|
104
|
+
| < 0.00 | Poor |
|
|
105
|
+
| 0.00-0.20 | Slight |
|
|
106
|
+
| 0.21-0.40 | Fair |
|
|
107
|
+
| 0.41-0.60 | Moderate |
|
|
108
|
+
| 0.61-0.80 | Substantial |
|
|
109
|
+
| **0.81-1.00** | **Almost perfect** |
|
|
110
|
+
|
|
111
|
+
## Code Template
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from sklearn.metrics import cohen_kappa_score
|
|
115
|
+
import random
|
|
116
|
+
|
|
117
|
+
def compute_reliability(primary_results: list, validator_results: list) -> dict:
|
|
118
|
+
"""Compute inter-coder reliability between primary coder and validator."""
|
|
119
|
+
|
|
120
|
+
# Match items by ID
|
|
121
|
+
matched = []
|
|
122
|
+
for v in validator_results:
|
|
123
|
+
for p in primary_results:
|
|
124
|
+
if p["id"] == v["id"]:
|
|
125
|
+
matched.append((p["code"], v["code"]))
|
|
126
|
+
break
|
|
127
|
+
|
|
128
|
+
primary_codes = [m[0] for m in matched]
|
|
129
|
+
validator_codes = [m[1] for m in matched]
|
|
130
|
+
|
|
131
|
+
# Cohen's kappa
|
|
132
|
+
kappa = cohen_kappa_score(primary_codes, validator_codes)
|
|
133
|
+
|
|
134
|
+
# Simple agreement
|
|
135
|
+
agreement = sum(p == v for p, v in matched) / len(matched)
|
|
136
|
+
|
|
137
|
+
return {
|
|
138
|
+
"n_sample": len(matched),
|
|
139
|
+
"cohens_kappa": round(kappa, 3),
|
|
140
|
+
"simple_agreement": round(agreement * 100, 1),
|
|
141
|
+
"interpretation": interpret_kappa(kappa),
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
def interpret_kappa(k: float) -> str:
|
|
145
|
+
if k >= 0.81: return "almost perfect"
|
|
146
|
+
if k >= 0.61: return "substantial"
|
|
147
|
+
if k >= 0.41: return "moderate"
|
|
148
|
+
if k >= 0.21: return "fair"
|
|
149
|
+
if k >= 0.00: return "slight"
|
|
150
|
+
return "poor"
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Reporting Disagreements
|
|
154
|
+
|
|
155
|
+
Always report the nature of disagreements:
|
|
156
|
+
|
|
157
|
+
```markdown
|
|
158
|
+
### Disagreement Analysis
|
|
159
|
+
|
|
160
|
+
Single disagreement (1/30): County fair organization
|
|
161
|
+
- Codex: SERVICE (provides entertainment to community)
|
|
162
|
+
- Claude: FELLOWSHIP (exists for member exhibitors)
|
|
163
|
+
|
|
164
|
+
**Interpretation:** Borderline case illustrating reasonable ambiguity
|
|
165
|
+
in the scheme, not systematic divergence.
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## Anti-Patterns to Avoid
|
|
169
|
+
|
|
170
|
+
| Anti-Pattern | Problem | Fix |
|
|
171
|
+
|--------------|---------|-----|
|
|
172
|
+
| Same model for coding + validation | Circular | Use different model families |
|
|
173
|
+
| No reliability sample | Unverifiable | Always sample 30+ items |
|
|
174
|
+
| Reconciliation before kappa | Inflated agreement | Compute kappa on independent codes |
|
|
175
|
+
| Reporting only agreement % | Ignores chance | Always report Cohen's kappa |
|