flonat-research 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/domain-reviewer.md +336 -0
- package/.claude/agents/fixer.md +226 -0
- package/.claude/agents/paper-critic.md +370 -0
- package/.claude/agents/peer-reviewer.md +289 -0
- package/.claude/agents/proposal-reviewer.md +215 -0
- package/.claude/agents/referee2-reviewer.md +367 -0
- package/.claude/agents/references/journal-referee-profiles.md +354 -0
- package/.claude/agents/references/paper-critic/council-personas.md +77 -0
- package/.claude/agents/references/paper-critic/council-prompts.md +198 -0
- package/.claude/agents/references/peer-reviewer/report-template.md +199 -0
- package/.claude/agents/references/peer-reviewer/sa-prompts.md +260 -0
- package/.claude/agents/references/peer-reviewer/security-scan.md +188 -0
- package/.claude/agents/references/proposal-reviewer/report-template.md +144 -0
- package/.claude/agents/references/proposal-reviewer/sa-prompts.md +149 -0
- package/.claude/agents/references/referee-config.md +114 -0
- package/.claude/agents/references/referee2-reviewer/audit-checklists.md +287 -0
- package/.claude/agents/references/referee2-reviewer/report-template.md +334 -0
- package/.claude/rules/design-before-results.md +52 -0
- package/.claude/rules/ignore-agents-md.md +17 -0
- package/.claude/rules/ignore-gemini-md.md +17 -0
- package/.claude/rules/lean-claude-md.md +45 -0
- package/.claude/rules/learn-tags.md +99 -0
- package/.claude/rules/overleaf-separation.md +67 -0
- package/.claude/rules/plan-first.md +175 -0
- package/.claude/rules/read-docs-first.md +50 -0
- package/.claude/rules/scope-discipline.md +28 -0
- package/.claude/settings.json +125 -0
- package/.context/current-focus.md +33 -0
- package/.context/preferences/priorities.md +36 -0
- package/.context/preferences/task-naming.md +28 -0
- package/.context/profile.md +29 -0
- package/.context/projects/_index.md +41 -0
- package/.context/projects/papers/nudge-exp.md +22 -0
- package/.context/projects/papers/uncertainty.md +31 -0
- package/.context/resources/claude-scientific-writer-review.md +48 -0
- package/.context/resources/cunningham-multi-analyst-agents.md +104 -0
- package/.context/resources/cunningham-multilang-code-audit.md +62 -0
- package/.context/resources/google-ai-co-scientist-review.md +72 -0
- package/.context/resources/karpathy-llm-council-review.md +58 -0
- package/.context/resources/multi-coder-reliability-protocol.md +175 -0
- package/.context/resources/pedro-santanna-takeaways.md +96 -0
- package/.context/resources/venue-rankings/abs_ajg_2024.csv +1823 -0
- package/.context/resources/venue-rankings/abs_ajg_2024_econ.csv +356 -0
- package/.context/resources/venue-rankings/cabs_4_4star_theory.csv +40 -0
- package/.context/resources/venue-rankings/core_2026.csv +801 -0
- package/.context/resources/venue-rankings.md +147 -0
- package/.context/workflows/README.md +69 -0
- package/.context/workflows/daily-review.md +91 -0
- package/.context/workflows/meeting-actions.md +108 -0
- package/.context/workflows/replication-protocol.md +155 -0
- package/.context/workflows/weekly-review.md +113 -0
- package/.mcp-server-biblio/formatters.py +158 -0
- package/.mcp-server-biblio/pyproject.toml +11 -0
- package/.mcp-server-biblio/server.py +678 -0
- package/.mcp-server-biblio/sources/__init__.py +14 -0
- package/.mcp-server-biblio/sources/base.py +73 -0
- package/.mcp-server-biblio/sources/formatters.py +83 -0
- package/.mcp-server-biblio/sources/models.py +22 -0
- package/.mcp-server-biblio/sources/multi_source.py +243 -0
- package/.mcp-server-biblio/sources/openalex_source.py +183 -0
- package/.mcp-server-biblio/sources/scopus_source.py +309 -0
- package/.mcp-server-biblio/sources/wos_source.py +508 -0
- package/.mcp-server-biblio/uv.lock +896 -0
- package/.scripts/README.md +161 -0
- package/.scripts/ai_pattern_density.py +446 -0
- package/.scripts/conf +445 -0
- package/.scripts/config.py +122 -0
- package/.scripts/count_inventory.py +275 -0
- package/.scripts/daily_digest.py +288 -0
- package/.scripts/done +177 -0
- package/.scripts/extract_meeting_actions.py +223 -0
- package/.scripts/focus +176 -0
- package/.scripts/generate-codex-agents-md.py +217 -0
- package/.scripts/inbox +194 -0
- package/.scripts/notion_helpers.py +325 -0
- package/.scripts/openalex/query_helpers.py +306 -0
- package/.scripts/papers +227 -0
- package/.scripts/query +223 -0
- package/.scripts/session-history.py +201 -0
- package/.scripts/skill-health.py +516 -0
- package/.scripts/skill-log-miner.py +273 -0
- package/.scripts/sync-to-codex.sh +252 -0
- package/.scripts/task +213 -0
- package/.scripts/tasks +190 -0
- package/.scripts/week +206 -0
- package/CLAUDE.md +197 -0
- package/LICENSE +21 -0
- package/MEMORY.md +38 -0
- package/README.md +269 -0
- package/docs/agents.md +44 -0
- package/docs/bibliography-setup.md +55 -0
- package/docs/council-mode.md +36 -0
- package/docs/getting-started.md +245 -0
- package/docs/hooks.md +38 -0
- package/docs/mcp-servers.md +82 -0
- package/docs/notion-setup.md +109 -0
- package/docs/rules.md +33 -0
- package/docs/scripts.md +303 -0
- package/docs/setup-overview/setup-overview.pdf +0 -0
- package/docs/skills.md +70 -0
- package/docs/system.md +159 -0
- package/hooks/block-destructive-git.sh +66 -0
- package/hooks/context-monitor.py +114 -0
- package/hooks/postcompact-restore.py +157 -0
- package/hooks/precompact-autosave.py +181 -0
- package/hooks/promise-checker.sh +124 -0
- package/hooks/protect-source-files.sh +81 -0
- package/hooks/resume-context-loader.sh +53 -0
- package/hooks/startup-context-loader.sh +102 -0
- package/package.json +51 -0
- package/packages/cli-council/.github/workflows/claude-code-review.yml +44 -0
- package/packages/cli-council/.github/workflows/claude.yml +50 -0
- package/packages/cli-council/README.md +100 -0
- package/packages/cli-council/pyproject.toml +43 -0
- package/packages/cli-council/src/cli_council/__init__.py +19 -0
- package/packages/cli-council/src/cli_council/__main__.py +185 -0
- package/packages/cli-council/src/cli_council/backends/__init__.py +8 -0
- package/packages/cli-council/src/cli_council/backends/base.py +81 -0
- package/packages/cli-council/src/cli_council/backends/claude.py +25 -0
- package/packages/cli-council/src/cli_council/backends/codex.py +27 -0
- package/packages/cli-council/src/cli_council/backends/gemini.py +26 -0
- package/packages/cli-council/src/cli_council/checkpoint.py +212 -0
- package/packages/cli-council/src/cli_council/config.py +51 -0
- package/packages/cli-council/src/cli_council/council.py +391 -0
- package/packages/cli-council/src/cli_council/models.py +46 -0
- package/packages/llm-council/.github/workflows/claude-code-review.yml +44 -0
- package/packages/llm-council/.github/workflows/claude.yml +50 -0
- package/packages/llm-council/README.md +453 -0
- package/packages/llm-council/pyproject.toml +42 -0
- package/packages/llm-council/src/llm_council/__init__.py +23 -0
- package/packages/llm-council/src/llm_council/__main__.py +259 -0
- package/packages/llm-council/src/llm_council/checkpoint.py +193 -0
- package/packages/llm-council/src/llm_council/client.py +253 -0
- package/packages/llm-council/src/llm_council/config.py +232 -0
- package/packages/llm-council/src/llm_council/council.py +482 -0
- package/packages/llm-council/src/llm_council/models.py +46 -0
- package/packages/mcp-bibliography/MEMORY.md +31 -0
- package/packages/mcp-bibliography/_app.py +226 -0
- package/packages/mcp-bibliography/formatters.py +158 -0
- package/packages/mcp-bibliography/log/2026-03-13-2100.md +35 -0
- package/packages/mcp-bibliography/pyproject.toml +15 -0
- package/packages/mcp-bibliography/run.sh +20 -0
- package/packages/mcp-bibliography/scholarly_formatters.py +83 -0
- package/packages/mcp-bibliography/server.py +1857 -0
- package/packages/mcp-bibliography/tools/__init__.py +28 -0
- package/packages/mcp-bibliography/tools/_registry.py +19 -0
- package/packages/mcp-bibliography/tools/altmetric.py +107 -0
- package/packages/mcp-bibliography/tools/core.py +92 -0
- package/packages/mcp-bibliography/tools/dblp.py +52 -0
- package/packages/mcp-bibliography/tools/openalex.py +296 -0
- package/packages/mcp-bibliography/tools/opencitations.py +102 -0
- package/packages/mcp-bibliography/tools/openreview.py +179 -0
- package/packages/mcp-bibliography/tools/orcid.py +131 -0
- package/packages/mcp-bibliography/tools/scholarly.py +575 -0
- package/packages/mcp-bibliography/tools/unpaywall.py +63 -0
- package/packages/mcp-bibliography/tools/zenodo.py +123 -0
- package/packages/mcp-bibliography/uv.lock +711 -0
- package/scripts/setup.sh +143 -0
- package/skills/beamer-deck/SKILL.md +199 -0
- package/skills/beamer-deck/references/quality-rubric.md +54 -0
- package/skills/beamer-deck/references/review-prompts.md +106 -0
- package/skills/bib-validate/SKILL.md +261 -0
- package/skills/bib-validate/references/council-mode.md +34 -0
- package/skills/bib-validate/references/deep-verify.md +79 -0
- package/skills/bib-validate/references/fix-mode.md +36 -0
- package/skills/bib-validate/references/openalex-verification.md +45 -0
- package/skills/bib-validate/references/preprint-check.md +31 -0
- package/skills/bib-validate/references/ref-manager-crossref.md +41 -0
- package/skills/bib-validate/references/report-template.md +82 -0
- package/skills/code-archaeology/SKILL.md +141 -0
- package/skills/code-review/SKILL.md +265 -0
- package/skills/code-review/references/quality-rubric.md +67 -0
- package/skills/consolidate-memory/SKILL.md +208 -0
- package/skills/context-status/SKILL.md +126 -0
- package/skills/creation-guard/SKILL.md +230 -0
- package/skills/devils-advocate/SKILL.md +130 -0
- package/skills/devils-advocate/references/competing-hypotheses.md +83 -0
- package/skills/init-project/SKILL.md +115 -0
- package/skills/init-project-course/references/memory-and-settings.md +92 -0
- package/skills/init-project-course/references/organise-templates.md +94 -0
- package/skills/init-project-course/skill.md +147 -0
- package/skills/init-project-light/skill.md +139 -0
- package/skills/init-project-research/SKILL.md +368 -0
- package/skills/init-project-research/references/atlas-pipeline-sync.md +70 -0
- package/skills/init-project-research/references/atlas-schema.md +81 -0
- package/skills/init-project-research/references/confirmation-report.md +39 -0
- package/skills/init-project-research/references/domain-profile-template.md +104 -0
- package/skills/init-project-research/references/interview-round3.md +34 -0
- package/skills/init-project-research/references/literature-discovery.md +43 -0
- package/skills/init-project-research/references/scaffold-details.md +197 -0
- package/skills/init-project-research/templates/field-calibration.md +60 -0
- package/skills/init-project-research/templates/pipeline-manifest.md +63 -0
- package/skills/init-project-research/templates/run-all.sh +116 -0
- package/skills/init-project-research/templates/seed-files.md +337 -0
- package/skills/insights-deck/SKILL.md +151 -0
- package/skills/interview-me/SKILL.md +157 -0
- package/skills/latex/SKILL.md +141 -0
- package/skills/latex/references/latex-configs.md +183 -0
- package/skills/latex-autofix/SKILL.md +230 -0
- package/skills/latex-autofix/references/known-errors.md +183 -0
- package/skills/latex-autofix/references/quality-rubric.md +50 -0
- package/skills/latex-health-check/SKILL.md +161 -0
- package/skills/learn/SKILL.md +220 -0
- package/skills/learn/scripts/validate_skill.py +265 -0
- package/skills/lessons-learned/SKILL.md +201 -0
- package/skills/literature/SKILL.md +335 -0
- package/skills/literature/references/agent-templates.md +393 -0
- package/skills/literature/references/bibliometric-apis.md +44 -0
- package/skills/literature/references/cli-council-search.md +79 -0
- package/skills/literature/references/openalex-api-guide.md +371 -0
- package/skills/literature/references/openalex-common-queries.md +381 -0
- package/skills/literature/references/openalex-workflows.md +248 -0
- package/skills/literature/references/reference-manager-sync.md +36 -0
- package/skills/literature/references/scopus-api-guide.md +208 -0
- package/skills/literature/references/wos-api-guide.md +308 -0
- package/skills/multi-perspective/SKILL.md +311 -0
- package/skills/multi-perspective/references/computational-many-analysts.md +77 -0
- package/skills/pipeline-manifest/SKILL.md +226 -0
- package/skills/pre-submission-report/SKILL.md +153 -0
- package/skills/process-reviews/SKILL.md +244 -0
- package/skills/process-reviews/references/rr-routing.md +101 -0
- package/skills/project-deck/SKILL.md +87 -0
- package/skills/project-safety/SKILL.md +135 -0
- package/skills/proofread/SKILL.md +254 -0
- package/skills/proofread/references/quality-rubric.md +104 -0
- package/skills/python-env/SKILL.md +57 -0
- package/skills/quarto-deck/SKILL.md +226 -0
- package/skills/quarto-deck/references/markdown-format.md +143 -0
- package/skills/quarto-deck/references/quality-rubric.md +54 -0
- package/skills/save-context/SKILL.md +174 -0
- package/skills/session-log/SKILL.md +98 -0
- package/skills/shared/concept-validation-gate.md +161 -0
- package/skills/shared/council-protocol.md +265 -0
- package/skills/shared/distribution-diagnostics.md +164 -0
- package/skills/shared/engagement-stratified-sampling.md +218 -0
- package/skills/shared/escalation-protocol.md +74 -0
- package/skills/shared/external-audit-protocol.md +205 -0
- package/skills/shared/intercoder-reliability.md +256 -0
- package/skills/shared/mcp-degradation.md +81 -0
- package/skills/shared/method-probing-questions.md +163 -0
- package/skills/shared/multi-language-conventions.md +143 -0
- package/skills/shared/paid-api-safety.md +174 -0
- package/skills/shared/palettes.md +90 -0
- package/skills/shared/progressive-disclosure.md +92 -0
- package/skills/shared/project-documentation-content.md +443 -0
- package/skills/shared/project-documentation-format.md +281 -0
- package/skills/shared/project-documentation.md +100 -0
- package/skills/shared/publication-output.md +138 -0
- package/skills/shared/quality-scoring.md +70 -0
- package/skills/shared/reference-resolution.md +77 -0
- package/skills/shared/research-quality-rubric.md +165 -0
- package/skills/shared/rhetoric-principles.md +54 -0
- package/skills/shared/skill-design-patterns.md +272 -0
- package/skills/shared/skill-index.md +240 -0
- package/skills/shared/system-documentation.md +334 -0
- package/skills/shared/tikz-rules.md +402 -0
- package/skills/shared/validation-tiers.md +121 -0
- package/skills/shared/venue-guides/README.md +46 -0
- package/skills/shared/venue-guides/cell_press_style.md +483 -0
- package/skills/shared/venue-guides/conferences_formatting.md +564 -0
- package/skills/shared/venue-guides/cs_conference_style.md +463 -0
- package/skills/shared/venue-guides/examples/cell_summary_example.md +247 -0
- package/skills/shared/venue-guides/examples/medical_structured_abstract.md +313 -0
- package/skills/shared/venue-guides/examples/nature_abstract_examples.md +213 -0
- package/skills/shared/venue-guides/examples/neurips_introduction_example.md +245 -0
- package/skills/shared/venue-guides/journals_formatting.md +486 -0
- package/skills/shared/venue-guides/medical_journal_styles.md +535 -0
- package/skills/shared/venue-guides/ml_conference_style.md +556 -0
- package/skills/shared/venue-guides/nature_science_style.md +405 -0
- package/skills/shared/venue-guides/reviewer_expectations.md +417 -0
- package/skills/shared/venue-guides/venue_writing_styles.md +321 -0
- package/skills/split-pdf/SKILL.md +172 -0
- package/skills/split-pdf/methodology.md +48 -0
- package/skills/sync-notion/SKILL.md +93 -0
- package/skills/system-audit/SKILL.md +157 -0
- package/skills/system-audit/references/sub-agent-prompts.md +294 -0
- package/skills/task-management/SKILL.md +131 -0
- package/skills/update-focus/SKILL.md +204 -0
- package/skills/update-project-doc/SKILL.md +194 -0
- package/skills/validate-bib/SKILL.md +242 -0
- package/skills/validate-bib/references/council-mode.md +34 -0
- package/skills/validate-bib/references/deep-verify.md +71 -0
- package/skills/validate-bib/references/openalex-verification.md +45 -0
- package/skills/validate-bib/references/preprint-check.md +31 -0
- package/skills/validate-bib/references/report-template.md +62 -0
|
@@ -0,0 +1,482 @@
|
|
|
1
|
+
"""3-stage LLM Council orchestration.
|
|
2
|
+
|
|
3
|
+
Adapted from karpathy/llm-council:
|
|
4
|
+
Stage 1 -- Individual assessments (parallel, structured JSON or text)
|
|
5
|
+
Stage 2 -- Peer review (parallel, free-form text with FINAL RANKING)
|
|
6
|
+
Stage 3 -- Chairman synthesis (single model, structured JSON or text)
|
|
7
|
+
|
|
8
|
+
Supports checkpoint-based session resumption (inspired by Owlex) and
|
|
9
|
+
atomic file-based state (inspired by agents-council).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import asyncio
|
|
15
|
+
import json
|
|
16
|
+
import logging
|
|
17
|
+
import re
|
|
18
|
+
from collections import defaultdict
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from time import perf_counter
|
|
21
|
+
|
|
22
|
+
from llm_council.checkpoint import CouncilCheckpointer
|
|
23
|
+
from llm_council.client import LLMClient
|
|
24
|
+
from llm_council.config import AVAILABLE_MODELS, model_display_name
|
|
25
|
+
from llm_council.models import (
|
|
26
|
+
CouncilAssessment,
|
|
27
|
+
CouncilMeta,
|
|
28
|
+
CouncilPeerReview,
|
|
29
|
+
CouncilResult,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class CouncilService:
|
|
36
|
+
"""Orchestrates a multi-model council review."""
|
|
37
|
+
|
|
38
|
+
def __init__(self, llm: LLMClient, *, max_tokens: int | None = None) -> None:
|
|
39
|
+
self.llm = llm
|
|
40
|
+
self._max_tokens = max_tokens
|
|
41
|
+
|
|
42
|
+
async def run_council(
    self,
    system_prompt: str,
    user_msg: str,
    council_models: list[str],
    chairman_model: str,
    *,
    existing_result: dict | None = None,
    existing_model: str | None = None,
    stage2_system: str | None = None,
    stage3_prompt_builder: object | None = None,
    checkpoint_dir: str | Path | None = None,
    resume: bool = False,
) -> CouncilResult:
    """Run the full 3-stage council process.

    Parameters
    ----------
    system_prompt:
        The system prompt for Stage 1 assessments.
    user_msg:
        The user message for Stage 1 assessments.
    council_models:
        List of OpenRouter model IDs to query (3+ models).
    chairman_model:
        Model to use for the final synthesis.
    existing_result:
        If provided, reuse this as one of the Stage 1 assessments.
    existing_model:
        The model ID that produced ``existing_result``.
    stage2_system:
        Optional custom system prompt for Stage 2 peer review.
        Defaults to a generic meta-reviewer prompt.
    stage3_prompt_builder:
        Optional callable(assessments, peer_reviews, user_msg) -> str
        that builds a custom Stage 3 chairman prompt.  If None, uses
        the default synthesis prompt.
    checkpoint_dir:
        Directory for checkpoint files.  If provided, each stage's
        results are saved atomically for crash recovery and resumption.
    resume:
        If True and checkpoint_dir is provided, resume from the last
        completed stage of the most recent run.  When no earlier run
        exists, or that run has no completed stage, a fresh
        checkpointer is created instead.

    Returns
    -------
    CouncilResult
        The chairman's synthesis together with the Stage 1 assessments,
        the Stage 2 peer reviews and timing/ranking metadata.  If
        Stage 1 yields no assessments at all, an empty result (no final
        result, no reviews) is returned without running Stages 2 and 3.
    """
    t_total = perf_counter()

    # Set up checkpointing.  ``ckpt`` stays None when no directory is
    # given; ``resume_from`` is the number of stages already completed
    # by the run being resumed (0 = start from scratch).
    ckpt = None
    resume_from = 0
    if checkpoint_dir:
        checkpoint_path = Path(checkpoint_dir)
        if resume:
            probe = CouncilCheckpointer(checkpoint_path)
            latest_run = probe.find_latest_run()
            if latest_run:
                candidate = CouncilCheckpointer(checkpoint_path, run_id=latest_run)
                resume_from = candidate.last_completed_stage()
                if resume_from > 0:
                    ckpt = candidate
                    logger.info(
                        "Resuming run %s from stage %d",
                        latest_run, resume_from + 1,
                    )
        if ckpt is None:
            # Nothing usable to resume: start a fresh run.
            ckpt = CouncilCheckpointer(checkpoint_path)

    # Stage 1 -- load from checkpoint when resuming, otherwise compute.
    stage1_ms = 0
    stage1_loaded = False
    assessments: list[CouncilAssessment] = []
    if resume_from >= 1 and ckpt:
        saved = ckpt.load_stage1()
        if saved:
            assessments = [CouncilAssessment(**a) for a in saved]
            stage1_loaded = True
            logger.info("Stage 1: loaded %d assessments from checkpoint", len(assessments))

    if not stage1_loaded:
        t1 = perf_counter()
        assessments = await self._stage1_collect(
            system_prompt, user_msg, council_models,
            existing_result=existing_result,
            existing_model=existing_model,
        )
        stage1_ms = int((perf_counter() - t1) * 1000)

    if not assessments:
        return CouncilResult(
            final_result={},
            assessments=[],
            peer_reviews=[],
            meta=CouncilMeta(
                council_models=council_models,
                chairman_model=chairman_model,
                stage1_ms=stage1_ms,
                total_ms=int((perf_counter() - t_total) * 1000),
                reused_model=existing_model,
            ),
        )

    # Anonymise: "Assessment A", "Assessment B", ... in list order.
    for i, a in enumerate(assessments):
        a.label = f"Assessment {chr(65 + i)}"

    # Checkpoint Stage 1.  Keyed on whether the stage was actually
    # loaded (not on ``resume_from``) so that results recomputed after
    # an empty/unreadable checkpoint are persisted too.
    if ckpt and not stage1_loaded:
        ckpt.save_stage1(
            [a.model_dump() for a in assessments],
            [a.model for a in assessments],
        )
        pending = ckpt.pending_participants(
            council_models,
            [a.model for a in assessments],
        )
        if pending:
            logger.warning("Stage 1: pending models: %s", pending)

    # Stage 2
    stage2_ms = 0
    stage2_loaded = False
    label_to_model = {a.label: a.model for a in assessments}

    # Saved reviews refer to the saved assessments by label, so they
    # are only reused when Stage 1 itself came from the checkpoint.
    if resume_from >= 2 and ckpt and stage1_loaded:
        saved = ckpt.load_stage2()
        if saved:
            reviews_data, saved_rankings = saved
            peer_reviews = [CouncilPeerReview(**r) for r in reviews_data]
            aggregate_rankings = saved_rankings
            stage2_loaded = True
            logger.info("Stage 2: loaded %d reviews from checkpoint", len(peer_reviews))

    if not stage2_loaded:
        t2 = perf_counter()
        peer_reviews, label_to_model = await self._stage2_peer_review(
            system_prompt, user_msg, assessments, council_models,
            custom_system=stage2_system,
        )
        stage2_ms = int((perf_counter() - t2) * 1000)
        aggregate_rankings = self._calculate_aggregate_rankings(
            peer_reviews, label_to_model,
        )

    # Checkpoint Stage 2 (same "was it actually loaded" rule as Stage 1).
    if ckpt and not stage2_loaded:
        ckpt.save_stage2(
            [r.model_dump() for r in peer_reviews],
            [r.model for r in peer_reviews],
            aggregate_rankings=aggregate_rankings,
        )

    # Stage 3
    # NOTE(review): Stage 3 is always executed, even when the resumed
    # run reports it as completed -- confirm this is intended.
    t3 = perf_counter()
    final_result, stage3_fallback = await self._stage3_synthesise(
        system_prompt, user_msg, assessments, peer_reviews,
        chairman_model,
        custom_prompt_builder=stage3_prompt_builder,
    )
    stage3_ms = int((perf_counter() - t3) * 1000)

    # Checkpoint Stage 3
    if ckpt:
        ckpt.save_stage3(final_result, chairman_model)

    total_ms = int((perf_counter() - t_total) * 1000)

    return CouncilResult(
        final_result=final_result,
        assessments=assessments,
        peer_reviews=peer_reviews,
        meta=CouncilMeta(
            council_models=council_models,
            chairman_model=chairman_model,
            stage1_ms=stage1_ms,
            stage2_ms=stage2_ms,
            stage3_ms=stage3_ms,
            total_ms=total_ms,
            reused_model=existing_model,
            aggregate_rankings=aggregate_rankings,
            stage3_fallback=stage3_fallback,
        ),
    )
|
|
236
|
+
|
|
237
|
+
# ------------------------------------------------------------------
|
|
238
|
+
# Stage 1
|
|
239
|
+
# ------------------------------------------------------------------
|
|
240
|
+
|
|
241
|
+
async def _stage1_collect(
    self,
    system_prompt: str,
    user_msg: str,
    council_models: list[str],
    *,
    existing_result: dict | None = None,
    existing_model: str | None = None,
) -> list[CouncilAssessment]:
    """Collect the Stage 1 assessments, querying all models concurrently.

    Parameters
    ----------
    system_prompt, user_msg:
        Prompt pair sent unchanged to every queried model.
    council_models:
        Model IDs that make up the council.
    existing_result, existing_model:
        When both are truthy, ``existing_result`` is used as the
        assessment of ``existing_model`` and that model is not queried
        again.  In every other case all council models are queried.

    Returns
    -------
    list[CouncilAssessment]
        The reused assessment (if any) first, followed by the successful
        responses in ``council_models`` order.  Models whose call raised
        are logged and omitted.
    """
    assessments: list[CouncilAssessment] = []

    # One flag drives both the reuse and the query filter, so a model is
    # never dropped from the council without being either reused or
    # queried (e.g. when ``existing_result`` is an empty dict).
    reuse_existing = bool(existing_result and existing_model)
    if reuse_existing:
        assessments.append(CouncilAssessment(
            model=existing_model,
            model_name=model_display_name(existing_model),
            result_json=existing_result,
        ))

    models_to_query = [
        m for m in council_models
        if not (reuse_existing and m == existing_model)
    ]

    if not models_to_query:
        return assessments

    async def _query_one(model_id: str) -> CouncilAssessment | None:
        # Best-effort per model: one failure must not sink the council.
        try:
            result = await self.llm.chat_json(
                system_prompt, user_msg,
                model=model_id, max_tokens=self._max_tokens,
            )
            return CouncilAssessment(
                model=model_id,
                model_name=model_display_name(model_id),
                result_json=result,
            )
        except Exception:
            logger.exception("Council Stage 1: model %s failed", model_id)
            return None

    results = await asyncio.gather(*(_query_one(m) for m in models_to_query))
    assessments.extend(r for r in results if r is not None)

    logger.info(
        "Council Stage 1: %d/%d models responded",
        len(assessments), len(council_models),
    )
    return assessments
|
|
294
|
+
|
|
295
|
+
# ------------------------------------------------------------------
|
|
296
|
+
# Stage 2
|
|
297
|
+
# ------------------------------------------------------------------
|
|
298
|
+
|
|
299
|
+
async def _stage2_peer_review(
    self,
    system_prompt: str,
    user_msg: str,
    assessments: list[CouncilAssessment],
    council_models: list[str],
    *,
    custom_system: str | None = None,
) -> tuple[list[CouncilPeerReview], dict[str, str]]:
    """Have the council models review the anonymised Stage 1 assessments.

    Parameters
    ----------
    system_prompt:
        Accepted for signature symmetry with the other stages; not used
        by this method.
    user_msg:
        Original user message; only its first 3000 characters are
        embedded in the review prompt.
    assessments:
        Labelled Stage 1 assessments (``label`` must already be set).
    council_models:
        Council model IDs.  Only models that produced an assessment act
        as reviewers; if none did, every council model reviews.
    custom_system:
        Optional replacement for the default meta-reviewer system prompt.

    Returns
    -------
    tuple[list[CouncilPeerReview], dict[str, str]]
        The successful reviews (failed models are logged and omitted)
        and the mapping from assessment label to model ID.
    """
    assessments_text = "\n\n---\n\n".join(
        f"**{a.label}:**\n```json\n{json.dumps(a.result_json, indent=2)}\n```"
        for a in assessments
    )

    label_to_model = {a.label: a.model for a in assessments}

    review_prompt = f"""You are reviewing multiple assessments of the same question/task.

The original question/context given to all assessors:
{user_msg[:3000]}

Here are the anonymised assessments:

{assessments_text}

Your task:
1. Evaluate each assessment individually. What does it do well? What does it miss or get wrong?
2. Identify specific areas of AGREEMENT across assessments.
3. Identify specific areas of DISAGREEMENT and explain which position you find more convincing and why.
4. Provide a final ranking from best to worst.

IMPORTANT: Your final ranking MUST be formatted EXACTLY as follows:
- Start with the line "FINAL RANKING:" (all caps, with colon)
- Then list the assessments from best to worst as a numbered list
- Each line should be: number, period, space, then ONLY the assessment label (e.g., "1. Assessment A")

Now provide your evaluation and ranking:"""

    review_system = custom_system or (
        "You are an expert meta-reviewer. You evaluate and compare "
        "multiple independent assessments, identifying strengths, "
        "weaknesses, agreements, and disagreements."
    )

    # Build the assessor set once instead of once per council model.
    assessor_models = {a.model for a in assessments}
    models_to_review = [m for m in council_models if m in assessor_models]
    if not models_to_review:
        models_to_review = council_models

    async def _review_one(model_id: str) -> CouncilPeerReview | None:
        # Best-effort per reviewer: a failing model is logged and skipped.
        try:
            text = await self.llm.chat_text(
                review_system, review_prompt,
                model=model_id, max_tokens=self._max_tokens,
            )
            parsed = self._parse_ranking_from_text(text)
            return CouncilPeerReview(
                model=model_id,
                model_name=model_display_name(model_id),
                review_text=text,
                parsed_ranking=parsed,
            )
        except Exception:
            logger.exception("Council Stage 2: model %s failed", model_id)
            return None

    tasks = [_review_one(m) for m in models_to_review]
    results = await asyncio.gather(*tasks)

    reviews = [r for r in results if r is not None]
    logger.info(
        "Council Stage 2: %d/%d models reviewed",
        len(reviews), len(models_to_review),
    )
    return reviews, label_to_model
|
|
373
|
+
|
|
374
|
+
# ------------------------------------------------------------------
|
|
375
|
+
# Stage 3
|
|
376
|
+
# ------------------------------------------------------------------
|
|
377
|
+
|
|
378
|
+
async def _stage3_synthesise(
|
|
379
|
+
self,
|
|
380
|
+
system_prompt: str,
|
|
381
|
+
user_msg: str,
|
|
382
|
+
assessments: list[CouncilAssessment],
|
|
383
|
+
peer_reviews: list[CouncilPeerReview],
|
|
384
|
+
chairman_model: str,
|
|
385
|
+
*,
|
|
386
|
+
custom_prompt_builder: object | None = None,
|
|
387
|
+
) -> tuple[dict, bool]:
|
|
388
|
+
if custom_prompt_builder and callable(custom_prompt_builder):
|
|
389
|
+
chairman_prompt = custom_prompt_builder(assessments, peer_reviews, user_msg)
|
|
390
|
+
else:
|
|
391
|
+
assessments_text = "\n\n".join(
|
|
392
|
+
f"**{a.label}** (by {a.model_name}):\n"
|
|
393
|
+
f"```json\n{json.dumps(a.result_json, indent=2)}\n```"
|
|
394
|
+
for a in assessments
|
|
395
|
+
)
|
|
396
|
+
|
|
397
|
+
reviews_text = "\n\n".join(
|
|
398
|
+
f"**Review by {r.model_name}:**\n{r.review_text}"
|
|
399
|
+
for r in peer_reviews
|
|
400
|
+
)
|
|
401
|
+
|
|
402
|
+
chairman_prompt = f"""You are the Chairman of an LLM Council. Multiple AI models have independently assessed the same question, and then peer-reviewed each other's assessments.
|
|
403
|
+
|
|
404
|
+
ORIGINAL CONTEXT:
|
|
405
|
+
{user_msg[:3000]}
|
|
406
|
+
|
|
407
|
+
STAGE 1 -- Individual Assessments:
|
|
408
|
+
{assessments_text}
|
|
409
|
+
|
|
410
|
+
STAGE 2 -- Peer Reviews:
|
|
411
|
+
{reviews_text}
|
|
412
|
+
|
|
413
|
+
Your task as Chairman:
|
|
414
|
+
1. Consider all individual assessments and their insights
|
|
415
|
+
2. Consider the peer reviews and what they reveal about quality and disagreements
|
|
416
|
+
3. Identify areas of strong consensus vs. genuine disagreement
|
|
417
|
+
4. Synthesise a SINGLE, comprehensive answer that represents the council's collective wisdom
|
|
418
|
+
|
|
419
|
+
Where the council agrees, reflect that consensus. Where they disagree, use your judgment to select the most well-reasoned position and explain why.
|
|
420
|
+
|
|
421
|
+
You MUST respond with valid JSON matching the EXACT SAME SCHEMA as the individual assessments above. Respond ONLY with valid JSON."""
|
|
422
|
+
|
|
423
|
+
try:
|
|
424
|
+
result = await self.llm.chat_json(
|
|
425
|
+
system_prompt, chairman_prompt,
|
|
426
|
+
model=chairman_model, max_tokens=self._max_tokens,
|
|
427
|
+
)
|
|
428
|
+
return result, False
|
|
429
|
+
except Exception:
|
|
430
|
+
logger.exception("Council Stage 3: chairman %s failed", chairman_model)
|
|
431
|
+
if assessments:
|
|
432
|
+
return assessments[0].result_json, True
|
|
433
|
+
return {}, True
|
|
434
|
+
|
|
435
|
+
# ------------------------------------------------------------------
|
|
436
|
+
# Ranking utilities
|
|
437
|
+
# ------------------------------------------------------------------
|
|
438
|
+
|
|
439
|
+
@staticmethod
|
|
440
|
+
def _parse_ranking_from_text(text: str) -> list[str]:
|
|
441
|
+
if "FINAL RANKING:" in text:
|
|
442
|
+
parts = text.split("FINAL RANKING:")
|
|
443
|
+
if len(parts) >= 2:
|
|
444
|
+
ranking_section = parts[1]
|
|
445
|
+
numbered = re.findall(
|
|
446
|
+
r"\d+\.\s*Assessment [A-Z]", ranking_section,
|
|
447
|
+
)
|
|
448
|
+
if numbered:
|
|
449
|
+
return [
|
|
450
|
+
re.search(r"Assessment [A-Z]", m).group()
|
|
451
|
+
for m in numbered
|
|
452
|
+
]
|
|
453
|
+
matches = re.findall(r"Assessment [A-Z]", ranking_section)
|
|
454
|
+
return matches
|
|
455
|
+
|
|
456
|
+
return re.findall(r"Assessment [A-Z]", text)
|
|
457
|
+
|
|
458
|
+
@staticmethod
|
|
459
|
+
def _calculate_aggregate_rankings(
|
|
460
|
+
peer_reviews: list[CouncilPeerReview],
|
|
461
|
+
label_to_model: dict[str, str],
|
|
462
|
+
) -> list[dict]:
|
|
463
|
+
positions: dict[str, list[int]] = defaultdict(list)
|
|
464
|
+
|
|
465
|
+
for review in peer_reviews:
|
|
466
|
+
for pos, label in enumerate(review.parsed_ranking, start=1):
|
|
467
|
+
if label in label_to_model:
|
|
468
|
+
positions[label].append(pos)
|
|
469
|
+
|
|
470
|
+
aggregate = []
|
|
471
|
+
for label, pos_list in positions.items():
|
|
472
|
+
avg = sum(pos_list) / len(pos_list)
|
|
473
|
+
aggregate.append({
|
|
474
|
+
"label": label,
|
|
475
|
+
"model": label_to_model.get(label, "unknown"),
|
|
476
|
+
"model_name": model_display_name(label_to_model.get(label, "")),
|
|
477
|
+
"average_rank": round(avg, 2),
|
|
478
|
+
"rankings_count": len(pos_list),
|
|
479
|
+
})
|
|
480
|
+
|
|
481
|
+
aggregate.sort(key=lambda x: x["average_rank"])
|
|
482
|
+
return aggregate
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Pydantic models for council deliberation results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class CouncilAssessment(BaseModel):
    """One council member's individual assessment (Stage 1).

    Holds the raw JSON a single model produced, plus the identifiers needed
    to attribute it and to refer to it anonymously in later stages.
    """

    model: str  # OpenRouter model ID
    model_name: str  # human-readable name
    result_json: dict  # raw JSON response (schema depends on consumer)
    label: str = ""  # anonymised label, e.g. "Assessment A"; empty until assigned
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CouncilPeerReview(BaseModel):
    """One council member's peer review of all assessments (Stage 2).

    Stores the reviewer's full reply together with the ranking extracted
    from it.
    """

    model: str  # model ID of the reviewer
    model_name: str  # human-readable name of the reviewer
    review_text: str  # free-form evaluation + disagreement analysis
    # Assessment labels in ranked order, best first; empty when none were parsed.
    parsed_ranking: list[str] = Field(default_factory=list)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class CouncilMeta(BaseModel):
    """Metadata for a council run.

    Every field except ``council_models`` and ``chairman_model`` has a
    default, so a partially populated instance can be created up front.
    """

    council_models: list[str]  # model IDs taking part in the council
    chairman_model: str  # model ID used for the Stage 3 synthesis
    # NOTE(review): the *_ms fields are presumably elapsed times in
    # milliseconds per stage and overall -- confirm against the caller.
    stage1_ms: int = 0
    stage2_ms: int = 0
    stage3_ms: int = 0
    total_ms: int = 0
    # NOTE(review): meaning not visible in this module -- presumably a model
    # whose earlier result was reused instead of re-queried; confirm.
    reused_model: str | None = None
    # NOTE(review): presumably the per-label rows produced by the ranking
    # aggregation step; confirm the expected keys against the caller.
    aggregate_rankings: list[dict] = Field(default_factory=list)
    # NOTE(review): presumably True when Stage 3 fell back to a Stage 1
    # answer because the chairman call failed -- confirm against the caller.
    stage3_fallback: bool = False
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class CouncilResult(BaseModel):
    """Full council deliberation result.

    Bundles the final answer with the intermediate Stage 1 and Stage 2
    artefacts and the run metadata.
    """

    final_result: dict  # synthesised response (schema depends on consumer)
    assessments: list[CouncilAssessment]  # Stage 1 outputs, one per council member
    peer_reviews: list[CouncilPeerReview]  # Stage 2 outputs, one per reviewer
    meta: CouncilMeta  # run metadata for this deliberation
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Bibliography MCP — Known Issues & Learnings
|
|
2
|
+
|
|
3
|
+
## Known Issues
|
|
4
|
+
|
|
5
|
+
### S2 `scholarly_paper_detail` returns wrong journal names
|
|
6
|
+
|
|
7
|
+
Semantic Scholar maps journal abbreviations incorrectly for some management journals:
|
|
8
|
+
|
|
9
|
+
| S2 returns | Actual journal |
|
|
10
|
+
|-----------|---------------|
|
|
11
|
+
| Southern Medical Journal | Strategic Management Journal |
|
|
12
|
+
| Quality Engineering | Journal of Management Studies |
|
|
13
|
+
|
|
14
|
+
**Impact:** Both the BibTeX generated from the `citationStyles` field and the `venue` metadata carry the wrong journal name.
|
|
15
|
+
**Mitigation:** Always cross-check journal names against CrossRef DOI lookup or `scholarly_verify_dois` results.
|
|
16
|
+
|
|
17
|
+
### `scholarly_search` unreliable for specific known papers
|
|
18
|
+
|
|
19
|
+
`scholarly_search` returns noisy, irrelevant results when asked for a specific paper by author + title + year (e.g., queries for management papers return medical/biology results).
|
|
20
|
+
|
|
21
|
+
**Impact:** Cannot reliably find seminal papers in management/org theory.
|
|
22
|
+
**Mitigation:** Use CrossRef API directly for targeted lookups:
|
|
23
|
+
```bash
|
|
24
|
+
curl -sL "https://api.crossref.org/works?query.bibliographic=URL-encoded+title+author&rows=3"
|
|
25
|
+
```
|
|
26
|
+
Reserve `scholarly_search` for broad topic discovery only.
|
|
27
|
+
|
|
28
|
+
## Citations
|
|
29
|
+
|
|
30
|
+
[LEARN:citation] S2 "Southern Medical Journal" → Strategic Management Journal (SMJ abbreviation collision)
|
|
31
|
+
[LEARN:citation] S2 "Quality Engineering" → Journal of Management Studies (JMS abbreviation collision)
|