feed-the-machine 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. package/bin/generate-manifest.mjs +253 -0
  2. package/bin/install.mjs +134 -4
  3. package/docs/HOOKS.md +243 -0
  4. package/docs/INBOX.md +233 -0
  5. package/ftm/SKILL.md +34 -0
  6. package/ftm-audit/SKILL.md +69 -0
  7. package/ftm-brainstorm/SKILL.md +51 -0
  8. package/ftm-browse/SKILL.md +39 -0
  9. package/ftm-capture/SKILL.md +370 -0
  10. package/ftm-capture.yml +4 -0
  11. package/ftm-codex-gate/SKILL.md +59 -0
  12. package/ftm-config/SKILL.md +35 -0
  13. package/ftm-council/SKILL.md +56 -0
  14. package/ftm-dashboard/SKILL.md +163 -0
  15. package/ftm-debug/SKILL.md +84 -0
  16. package/ftm-diagram/SKILL.md +44 -0
  17. package/ftm-executor/SKILL.md +97 -0
  18. package/ftm-git/SKILL.md +60 -0
  19. package/ftm-inbox/backend/__init__.py +0 -0
  20. package/ftm-inbox/backend/__pycache__/main.cpython-314.pyc +0 -0
  21. package/ftm-inbox/backend/adapters/__init__.py +0 -0
  22. package/ftm-inbox/backend/adapters/_retry.py +64 -0
  23. package/ftm-inbox/backend/adapters/base.py +230 -0
  24. package/ftm-inbox/backend/adapters/freshservice.py +104 -0
  25. package/ftm-inbox/backend/adapters/gmail.py +125 -0
  26. package/ftm-inbox/backend/adapters/jira.py +136 -0
  27. package/ftm-inbox/backend/adapters/registry.py +192 -0
  28. package/ftm-inbox/backend/adapters/slack.py +110 -0
  29. package/ftm-inbox/backend/db/__init__.py +0 -0
  30. package/ftm-inbox/backend/db/connection.py +54 -0
  31. package/ftm-inbox/backend/db/schema.py +78 -0
  32. package/ftm-inbox/backend/executor/__init__.py +7 -0
  33. package/ftm-inbox/backend/executor/engine.py +149 -0
  34. package/ftm-inbox/backend/executor/step_runner.py +98 -0
  35. package/ftm-inbox/backend/main.py +103 -0
  36. package/ftm-inbox/backend/models/__init__.py +1 -0
  37. package/ftm-inbox/backend/models/unified_task.py +36 -0
  38. package/ftm-inbox/backend/planner/__init__.py +6 -0
  39. package/ftm-inbox/backend/planner/__pycache__/__init__.cpython-314.pyc +0 -0
  40. package/ftm-inbox/backend/planner/__pycache__/generator.cpython-314.pyc +0 -0
  41. package/ftm-inbox/backend/planner/__pycache__/schema.cpython-314.pyc +0 -0
  42. package/ftm-inbox/backend/planner/generator.py +127 -0
  43. package/ftm-inbox/backend/planner/schema.py +34 -0
  44. package/ftm-inbox/backend/requirements.txt +5 -0
  45. package/ftm-inbox/backend/routes/__init__.py +0 -0
  46. package/ftm-inbox/backend/routes/__pycache__/plan.cpython-314.pyc +0 -0
  47. package/ftm-inbox/backend/routes/execute.py +186 -0
  48. package/ftm-inbox/backend/routes/health.py +52 -0
  49. package/ftm-inbox/backend/routes/inbox.py +68 -0
  50. package/ftm-inbox/backend/routes/plan.py +271 -0
  51. package/ftm-inbox/bin/launchagent.mjs +91 -0
  52. package/ftm-inbox/bin/setup.mjs +188 -0
  53. package/ftm-inbox/bin/start.sh +10 -0
  54. package/ftm-inbox/bin/status.sh +17 -0
  55. package/ftm-inbox/bin/stop.sh +8 -0
  56. package/ftm-inbox/config.example.yml +55 -0
  57. package/ftm-inbox/package-lock.json +2898 -0
  58. package/ftm-inbox/package.json +26 -0
  59. package/ftm-inbox/postcss.config.js +6 -0
  60. package/ftm-inbox/src/app.css +199 -0
  61. package/ftm-inbox/src/app.html +18 -0
  62. package/ftm-inbox/src/lib/api.ts +166 -0
  63. package/ftm-inbox/src/lib/components/ExecutionLog.svelte +81 -0
  64. package/ftm-inbox/src/lib/components/InboxFeed.svelte +143 -0
  65. package/ftm-inbox/src/lib/components/PlanStep.svelte +271 -0
  66. package/ftm-inbox/src/lib/components/PlanView.svelte +206 -0
  67. package/ftm-inbox/src/lib/components/StreamPanel.svelte +99 -0
  68. package/ftm-inbox/src/lib/components/TaskCard.svelte +190 -0
  69. package/ftm-inbox/src/lib/components/ui/EmptyState.svelte +63 -0
  70. package/ftm-inbox/src/lib/components/ui/KawaiiCard.svelte +86 -0
  71. package/ftm-inbox/src/lib/components/ui/PillButton.svelte +106 -0
  72. package/ftm-inbox/src/lib/components/ui/StatusBadge.svelte +67 -0
  73. package/ftm-inbox/src/lib/components/ui/StreamDrawer.svelte +149 -0
  74. package/ftm-inbox/src/lib/components/ui/ThemeToggle.svelte +80 -0
  75. package/ftm-inbox/src/lib/theme.ts +47 -0
  76. package/ftm-inbox/src/routes/+layout.svelte +76 -0
  77. package/ftm-inbox/src/routes/+page.svelte +401 -0
  78. package/ftm-inbox/static/favicon.png +0 -0
  79. package/ftm-inbox/svelte.config.js +12 -0
  80. package/ftm-inbox/tailwind.config.ts +63 -0
  81. package/ftm-inbox/tsconfig.json +13 -0
  82. package/ftm-inbox/vite.config.ts +6 -0
  83. package/ftm-intent/SKILL.md +44 -0
  84. package/ftm-manifest.json +3794 -0
  85. package/ftm-map/SKILL.md +259 -0
  86. package/ftm-map/scripts/db.py +391 -0
  87. package/ftm-map/scripts/index.py +341 -0
  88. package/ftm-map/scripts/parser.py +455 -0
  89. package/ftm-map/scripts/queries/.gitkeep +0 -0
  90. package/ftm-map/scripts/queries/javascript-tags.scm +23 -0
  91. package/ftm-map/scripts/queries/python-tags.scm +17 -0
  92. package/ftm-map/scripts/queries/typescript-tags.scm +29 -0
  93. package/ftm-map/scripts/query.py +149 -0
  94. package/ftm-map/scripts/requirements.txt +2 -0
  95. package/ftm-map/scripts/setup-hooks.sh +27 -0
  96. package/ftm-map/scripts/setup.sh +45 -0
  97. package/ftm-map/scripts/test_db.py +124 -0
  98. package/ftm-map/scripts/test_parser.py +106 -0
  99. package/ftm-map/scripts/test_query.py +66 -0
  100. package/ftm-map/scripts/tests/fixtures/__init__.py +0 -0
  101. package/ftm-map/scripts/tests/fixtures/sample_project/api.ts +16 -0
  102. package/ftm-map/scripts/tests/fixtures/sample_project/auth.py +15 -0
  103. package/ftm-map/scripts/tests/fixtures/sample_project/utils.js +16 -0
  104. package/ftm-map/scripts/views.py +545 -0
  105. package/ftm-mind/SKILL.md +173 -66
  106. package/ftm-pause/SKILL.md +43 -0
  107. package/ftm-researcher/SKILL.md +275 -0
  108. package/ftm-researcher/evals/agent-diversity.yaml +17 -0
  109. package/ftm-researcher/evals/synthesis-quality.yaml +12 -0
  110. package/ftm-researcher/evals/trigger-accuracy.yaml +39 -0
  111. package/ftm-researcher/references/adaptive-search.md +116 -0
  112. package/ftm-researcher/references/agent-prompts.md +193 -0
  113. package/ftm-researcher/references/council-integration.md +193 -0
  114. package/ftm-researcher/references/output-format.md +203 -0
  115. package/ftm-researcher/references/synthesis-pipeline.md +165 -0
  116. package/ftm-researcher/scripts/score_credibility.py +234 -0
  117. package/ftm-researcher/scripts/validate_research.py +92 -0
  118. package/ftm-resume/SKILL.md +47 -0
  119. package/ftm-retro/SKILL.md +54 -0
  120. package/ftm-routine/SKILL.md +170 -0
  121. package/ftm-state/blackboard/capabilities.json +5 -0
  122. package/ftm-state/blackboard/capabilities.schema.json +27 -0
  123. package/ftm-upgrade/SKILL.md +41 -0
  124. package/ftm-upgrade/scripts/check-version.sh +1 -1
  125. package/ftm-upgrade/scripts/upgrade.sh +1 -1
  126. package/hooks/ftm-blackboard-enforcer.sh +94 -0
  127. package/hooks/ftm-discovery-reminder.sh +90 -0
  128. package/hooks/ftm-drafts-gate.sh +61 -0
  129. package/hooks/ftm-event-logger.mjs +107 -0
  130. package/hooks/ftm-map-autodetect.sh +79 -0
  131. package/hooks/ftm-pending-sync-check.sh +22 -0
  132. package/hooks/ftm-plan-gate.sh +96 -0
  133. package/hooks/ftm-post-commit-trigger.sh +57 -0
  134. package/hooks/settings-template.json +81 -0
  135. package/install.sh +140 -11
  136. package/package.json +12 -2
@@ -0,0 +1,203 @@
1
+ # Output Format Specification
2
+
3
+ ## Structured JSON Artifact
4
+
5
+ This is the primary output for skill-to-skill consumption (ftm-brainstorm, ftm-executor, etc.).
6
+
7
+ ### Schema
8
+
9
+ ```json
10
+ {
11
+ "query": "original research question",
12
+ "mode": "quick | standard | deep",
13
+ "timestamp": "ISO-8601",
14
+ "waves_completed": 1,
15
+ "agents_dispatched": 7,
16
+ "council_used": true,
17
+ "duration_ms": 12345,
18
+
19
+ "findings": [
20
+ {
21
+ "id": "f-001",
22
+ "claim": "one-sentence factual claim",
23
+ "evidence": "2-3 sentence supporting detail",
24
+ "source_url": "https://...",
25
+ "source_type": "primary | peer_reviewed | official_docs | code_repo | qa_site | news | blog | forum | codebase",
26
+ "confidence": 0.85,
27
+ "credibility_score": 0.78,
28
+ "trust_level": "high | moderate | low | verify",
29
+ "agent_role": "web_surveyor | academic_scout | github_miner | competitive_analyst | stack_overflow_digger | codebase_analyst | historical_investigator",
30
+ "wave": 1,
31
+ "corroborated": true,
32
+ "circular_sourcing": false
33
+ }
34
+ ],
35
+
36
+ "disagreement_map": {
37
+ "consensus": [
38
+ {
39
+ "claim": "...",
40
+ "supporting_agents": ["web_surveyor", "github_miner", "academic_scout"],
41
+ "source_count": 5,
42
+ "source_diversity": 3,
43
+ "council_verdict": "agreed",
44
+ "confidence": 0.92
45
+ }
46
+ ],
47
+ "contested": [
48
+ {
49
+ "claim_a": "...",
50
+ "claim_b": "...",
51
+ "agents_for_a": ["web_surveyor"],
52
+ "agents_for_b": ["competitive_analyst"],
53
+ "council_verdict": "contested",
54
+ "provider_positions": {
55
+ "claude": "a",
56
+ "codex": "b",
57
+ "gemini": "a"
58
+ },
59
+ "rank_winner": "a",
60
+ "judge_rationale": "..."
61
+ }
62
+ ],
63
+ "unique_insights": [
64
+ {
65
+ "claim": "...",
66
+ "agent_role": "historical_investigator",
67
+ "confidence": 0.6,
68
+ "note": "Single source — may be high-value insight or hallucination"
69
+ }
70
+ ],
71
+ "refuted": [
72
+ {
73
+ "claim": "...",
74
+ "rejection_reason": "Council unanimously rejected — evidence traces to a single unreliable blog post",
75
+ "original_agent": "web_surveyor"
76
+ }
77
+ ]
78
+ },
79
+
80
+ "metadata": {
81
+ "sources_total": 34,
82
+ "sources_high_trust": 12,
83
+ "sources_moderate_trust": 15,
84
+ "sources_low_trust": 7,
85
+ "circular_sourcing_detected": 2,
86
+ "agent_performance": {
87
+ "web_surveyor": {"findings": 6, "avg_credibility": 0.65},
88
+ "academic_scout": {"findings": 4, "avg_credibility": 0.88}
89
+ }
90
+ }
91
+ }
92
+ ```
93
+
94
+ ### Field Descriptions
95
+
96
+ | Field | Type | Required | Description |
97
+ |---|---|---|---|
98
+ | query | string | yes | The original research question |
99
+ | mode | enum | yes | quick, standard, or deep |
100
+ | timestamp | ISO-8601 | yes | When the research completed |
101
+ | waves_completed | integer | yes | 1 for quick/standard, 1-2 for deep |
102
+ | agents_dispatched | integer | yes | Total agents spawned across all waves |
103
+ | council_used | boolean | yes | Whether ftm-council was invoked |
104
+ | duration_ms | integer | yes | Total execution time |
105
+ | findings | array | yes | All individual findings |
106
+ | disagreement_map | object | standard/deep | The 4-tier reconciled output |
107
+ | metadata | object | yes | Aggregate statistics |
108
+
109
+ ### Finding ID Convention
110
+
111
+ Finding IDs follow the pattern `f-NNN` where NNN is a zero-padded sequential number. IDs are stable within a session — if the user drills down on finding #3, it remains f-003 even after new findings are added.
112
+
113
+ ---
114
+
115
+ ## Markdown Rendering Template
116
+
117
+ For user display:
118
+
119
+ ```markdown
120
+ # Research: [query]
121
+
122
+ **Mode:** [mode] | **Agents:** [count] | **Sources:** [total] | **Duration:** [time]
123
+
124
+ ---
125
+
126
+ ## Consensus Findings
127
+
128
+ [For each consensus claim:]
129
+ **[N].** [claim] ([confidence]% confidence)
130
+ - *Evidence:* [key evidence summary]
131
+ - *Sources:* [source count] across [diversity] types — [top source URL]
132
+ - *Agreed by:* [agent list]
133
+
134
+ ---
135
+
136
+ ## Contested — Where Agents Disagreed
137
+
138
+ [For each contested pair:]
139
+ **[N].** [topic of disagreement]
140
+
141
+ | Position A | Position B |
142
+ |---|---|
143
+ | [claim_a] | [claim_b] |
144
+ | Supported by: [agents_for_a] | Supported by: [agents_for_b] |
145
+ | [evidence summary] | [evidence summary] |
146
+
147
+ *Ranking:* [winner] — [judge rationale summary]
148
+ *Council:* [provider positions if available]
149
+
150
+ ---
151
+
152
+ ## Unique Insights — Unverified but Interesting
153
+
154
+ [For each unique insight:]
155
+ - **[claim]** (from [agent_role], [confidence]% confidence)
156
+ - [note about single-source status]
157
+
158
+ ---
159
+
160
+ ## Refuted — What We Ruled Out
161
+
162
+ [For each refuted claim:]
163
+ - ~~[claim]~~ — [rejection_reason]
164
+
165
+ ---
166
+
167
+ ## Source Summary
168
+
169
+ | Source Type | Count | Avg Credibility |
170
+ |---|---|---|
171
+ | [type] | [count] | [avg score] |
172
+
173
+ ---
174
+
175
+ *What's next? You can:*
176
+ - *"dig deeper on #N"* — spawn targeted agents on a specific finding
177
+ - *"I disagree with #N"* — find counter-evidence
178
+ - *"focus on [angle]"* — reshape and re-run with new weights
179
+ - *"council #N"* — route a specific claim to ftm-council
180
+ - *"compare A vs B"* — spawn comparison agent
181
+ - *"done"* — finalize the research session
182
+ ```
183
+
184
+ ---
185
+
186
+ ## Conversational Iteration Protocol
187
+
188
+ After presenting results, the skill enters iteration mode. Each user command triggers a specific action:
189
+
190
+ | User Command | Action | Updates |
191
+ |---|---|---|
192
+ | "dig deeper on #N" | Spawn 3 targeted agents on finding N's topic | Add new findings, re-render |
193
+ | "I disagree with #N" / "I think X is wrong because Y" | Spawn counter-evidence agents + update findings | May move claims between tiers |
194
+ | "focus on [angle]" | Reshape subtopics, re-dispatch with new weights | Full re-run with angle bias |
195
+ | "council #N" | Route specific finding to ftm-council | Update council_verdict for that claim |
196
+ | "more on [agent]'s findings" | Re-dispatch that agent with broader query | Add new findings from that domain |
197
+ | "compare A vs B" | Spawn comparison agent with both findings as context | Add comparison analysis |
198
+ | "done" | Finalize, write blackboard, emit events | Session ends |
199
+
200
+ Each iteration:
201
+ 1. Updates the JSON artifact (new findings get new IDs, tiers may change)
202
+ 2. Re-renders the markdown with changes highlighted
203
+ 3. Preserves the full conversation history for ftm-pause/resume
@@ -0,0 +1,165 @@
1
+ # Synthesis Pipeline
2
+
3
+ 5-phase pipeline that takes raw findings from finder agents and produces a structured disagreement map.
4
+
5
+ ---
6
+
7
+ ## Phase 1: Normalize & Deduplicate
8
+
9
+ Input: Raw findings from all finder agents (7 agents x 3-8 findings each = 21-56 findings)
10
+
11
+ Steps:
12
+ 1. Flatten all findings into a single list
13
+ 2. Group by semantic similarity (same claim from different agents)
14
+ 3. For each group:
15
+ - Merge into a single canonical claim
16
+ - Track which agents found it (agent_count)
17
+ - Track source type diversity (source_diversity_score = unique source types / total sources)
18
+ - Flag circular sourcing: if all sources in a group cite the same original source, mark as circular=true
19
+ 4. Output: unique_claims[] sorted by agent_count DESC, source_diversity_score DESC
20
+
21
+ ### Semantic Similarity Heuristics
22
+
23
+ Two claims are considered semantically similar when:
24
+ - They make the same factual assertion about the same subject, even with different wording
25
+ - One is a subset of the other (e.g., "X uses Y" vs "X uses Y for Z")
26
+ - They cite the same source for the same conclusion
27
+
28
+ Two claims are NOT similar when:
29
+ - They address different aspects of the same topic
30
+ - They reach different conclusions about the same subject
31
+ - One is general and the other is specific with additional qualifying conditions
32
+
33
+ When merging, keep the most specific version as the canonical claim.
34
+
35
+ ---
36
+
37
+ ## Phase 2: Adversarial Review (ftm-council)
38
+
39
+ Input: Top claims from Phase 1 (all claims with agent_count >= 2, plus any high-confidence unique claims with confidence > 0.8)
40
+
41
+ Council invocation:
42
+ - Send claims as a structured prompt to ftm-council
43
+ - Ask: "Evaluate each claim. For each: Is the evidence sufficient? What would make this wrong? Are there alternative explanations? Rate confidence 0-1."
44
+ - Council runs Claude + Codex + Gemini independently, then reconciles
45
+
46
+ Output: claims[] with council_verdict (agreed | contested | insufficient_evidence), provider_disagreements[]
47
+
48
+ ### FALLBACK (if Codex/Gemini unavailable):
49
+
50
+ Spawn 2 standalone agents on the review model:
51
+
52
+ **Devil's Advocate:** "Your job is to find reasons each claim is WRONG. Search for counter-evidence, flag single-source claims, identify logical gaps."
53
+
54
+ **Edge Case Hunter:** "Your job is to find where each claim BREAKS. Scaling limits, security concerns, accessibility gaps, failure modes under load."
55
+
56
+ Both receive all claims and return challenge_findings[]
57
+
58
+ ---
59
+
60
+ ## Phase 3: Pairwise Rank (for contested claims)
61
+
62
+ Input: Claims marked as "contested" by council
63
+
64
+ For each pair of conflicting claims:
65
+ - LLM-as-judge prompt: "Given research question Q, Claim A says [X] with evidence [E1]. Claim B says [Y] with evidence [E2]. Which claim is better supported? Why? Consider: source authority, evidence specificity, logical coherence, relevance to the question."
66
+ - Tournament bracket: winners advance, losers are demoted to "minority view"
67
+
68
+ Output: ranked_claims[] with rank_position, judge_rationale
69
+
70
+ ### Ranking Criteria (in priority order)
71
+
72
+ 1. **Source authority**: Primary sources and peer-reviewed research outweigh blog posts and forum answers
73
+ 2. **Evidence specificity**: Concrete data points (benchmarks, case studies with numbers) outweigh general assertions
74
+ 3. **Logical coherence**: Claims with clear causal reasoning outweigh correlational arguments
75
+ 4. **Relevance to question**: Claims that directly address the research question outweigh tangentially related findings
76
+ 5. **Recency**: For fast-moving topics, newer evidence outweighs older evidence (all else equal)
77
+
78
+ ---
79
+
80
+ ## Phase 4: Reconcile — Disagreement Map
81
+
82
+ Input: All processed claims (normalized, council-reviewed, ranked)
83
+
84
+ The Reconciler agent produces structured output in 4 tiers:
85
+
86
+ ### Tier 1: Consensus Claims
87
+ 3+ agents agree, council agreed, multiple source types.
88
+ - Highest confidence. Present as established findings.
89
+ - Include: canonical claim, supporting agents, source count, source diversity, council verdict, confidence score
90
+
91
+ ### Tier 2: Contested Claims
92
+ Council disagreed, or pairwise ranking was close.
93
+ - Present BOTH sides with the specific disagreement.
94
+ - Include: claim_a, claim_b, agents_for_a, agents_for_b, council positions, rank winner, judge rationale
95
+
96
+ ### Tier 3: Unique Insights
97
+ Found by 1 agent only, not contradicted.
98
+ - High value OR hallucination — flag for user judgment.
99
+ - Include: claim, agent_role, confidence, source, note flagging single-source status
100
+
101
+ ### Tier 4: Refuted Claims
102
+ Council rejected, or pairwise loser with low evidence.
103
+ - Still present briefly — knowing what's wrong is valuable.
104
+ - Include: claim, rejection_reason, original_agent
105
+
106
+ ---
107
+
108
+ ## Phase 5: Render
109
+
110
+ Produce both:
111
+ - **Structured JSON artifact** (see output-format.md for schema)
112
+ - **Rendered markdown** for user display (see output-format.md for template)
113
+
114
+ The JSON artifact is the primary output for skill-to-skill consumption. The markdown is for human reading.
115
+
116
+ ---
117
+
118
+ ## Reconciler Agent Prompt
119
+
120
+ ```
121
+ You are the Reconciler — the final judge in a multi-agent research pipeline.
122
+ You receive findings from 7 research agents that have been normalized,
123
+ deduplicated, and adversarially reviewed.
124
+
125
+ Your job is NOT to average or blend. Your job is to JUDGE:
126
+ - Which claims are strong? (multiple independent sources, council agreement)
127
+ - Which claims are contested? (present both sides; record the pairwise rank_winner, but never drop or blend away the losing side)
128
+ - Which claims are unique insights? (valuable if true, flag for verification)
129
+ - Which claims should be rejected? (weak evidence, circular sourcing, council rejection)
130
+
131
+ Produce a structured disagreement map, not a smooth summary.
132
+ The user should see WHERE agents agreed, WHERE they disagreed, and WHY.
133
+
134
+ INPUT:
135
+ - normalized_claims: [list of deduplicated claims with agent_count and source_diversity]
136
+ - council_verdicts: [list of claims with agreed/contested/insufficient verdicts]
137
+ - pairwise_rankings: [list of contested claim pairs with winners and rationale]
138
+ - credibility_scores: [list of claims with scored credibility from score_credibility.py]
139
+
140
+ OUTPUT FORMAT:
141
+ Return a JSON object with these exact keys:
142
+ {
143
+ "consensus": [{ claim, supporting_agents, source_count, source_diversity, council_verdict, confidence }],
144
+ "contested": [{ claim_a, claim_b, agents_for_a, agents_for_b, council_verdict, provider_positions, rank_winner, judge_rationale }],
145
+ "unique_insights": [{ claim, agent_role, confidence, note }],
146
+ "refuted": [{ claim, rejection_reason, original_agent }]
147
+ }
148
+
149
+ RULES:
150
+ - A claim needs 3+ agents AND council agreement to be consensus
151
+ - A claim with 2 agents but council agreement goes to consensus with a "moderate confidence" flag
152
+ - A claim with council disagreement ALWAYS goes to contested, even if 5 agents agree
153
+ - A single-agent claim with confidence > 0.8 goes to unique_insights
154
+ - A single-agent claim with confidence <= 0.5 goes to refuted
155
+ - Everything else goes to unique_insights with appropriate flagging
156
+ - NEVER merge contested claims into a smooth middle ground — preserve the disagreement
157
+ ```
158
+
159
+ ---
160
+
161
+ ## Pipeline Skip Rules
162
+
163
+ - **Quick mode**: Skip Phases 2, 3, 4. Orchestrator does a single-pass synthesis directly from normalized findings.
164
+ - **Standard mode**: Skip Phase 2 (council). Run Phases 1, 3, 4, 5.
165
+ - **Deep mode**: Run all 5 phases.
@@ -0,0 +1,234 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Source credibility scoring for ftm-researcher findings.
4
+
5
+ Scoring dimensions:
6
+ - Source type weight (35%): primary > codebase > peer_reviewed > official_docs > code_repo > qa_site > news > blog > forum
7
+ - Recency (20%): decay based on age for fast-moving topics
8
+ - Expertise signals (25%): domain authority of the source URL (author credentials are not inspected)
9
+ - Bias detection (20%): sensationalism penalties, balanced language bonuses
10
+
11
+ Additional flags:
12
+ - Corroboration bonus: +0.15 if independently found by 2+ agents from different source types
13
+ - Circular sourcing: flag and apply a -0.20 penalty when several findings share the same source domain and the same first 50 characters of the claim
14
+ """
15
+ import json
16
+ import sys
17
+ import re
18
+ from datetime import datetime
19
+ from urllib.parse import urlparse
20
+
21
# Base weight for each recognised ``source_type`` value (higher = more trusted).
SOURCE_WEIGHTS = dict(
    primary=1.0,
    peer_reviewed=0.9,
    official_docs=0.85,
    code_repo=0.8,
    qa_site=0.65,
    news=0.6,
    blog=0.4,
    forum=0.25,
    codebase=0.95,  # local codebase findings are high-trust
)

# Domains treated as high authority.
HIGH_AUTHORITY = {
    # research publishers
    "arxiv.org",
    "nature.com",
    "science.org",
    "acm.org",
    "ieee.org",
    # code hosting and language/platform documentation
    "github.com",
    "docs.python.org",
    "developer.mozilla.org",
    # vendor documentation
    "platform.openai.com",
    "docs.anthropic.com",
    "cloud.google.com",
    "aws.amazon.com",
    "learn.microsoft.com",
}

# Domains treated as moderate authority (community sites and tech press).
MODERATE_AUTHORITY = {
    "stackoverflow.com",
    "stackexchange.com",
    "reddit.com",
    "news.ycombinator.com",
    "techcrunch.com",
    "arstechnica.com",
    "thenewstack.io",
    "infoq.com",
    "dev.to",
}

# Regex fragments whose presence signals sensational wording.
SENSATIONAL_PATTERNS = [
    r"you won't believe",
    r"shocking",
    r"mind-blowing",
    r"game.?changer",
    r"revolutionary",
    r"incredible",
    r"amazing breakthrough",
]

# Regex fragments whose presence signals balanced, hedged wording.
BALANCED_PATTERNS = [
    r"however",
    r"on the other hand",
    r"trade-?off",
    r"limitation",
    r"caveat",
    r"although",
    r"despite",
    r"conversely",
]
59
+
60
+
61
def score_source_type(finding: dict) -> float:
    """Return the base weight for the finding's ``source_type``.

    A finding with no ``source_type`` key is looked up as ``"blog"``; a type
    that is not present in ``SOURCE_WEIGHTS`` scores 0.4.
    """
    declared_type = finding.get("source_type", "blog")
    weight = SOURCE_WEIGHTS.get(declared_type, 0.4)
    return weight
63
+
64
+
65
def score_recency(finding: dict, fast_moving: bool = True, *, current_year: "int | None" = None) -> float:
    """Score a finding by the age of its source.

    The source year is taken from the first ``/20xx/`` path segment of
    ``source_url`` or, failing that, from the first standalone ``20xx``
    mentioned in ``evidence`` (only 2010-2029 are recognised).

    Args:
        finding: Finding dict; ``source_url`` and ``evidence`` are read and
            may be missing or ``None``.
        fast_moving: Use the aggressive decay table (tech, AI, ...) when
            true, the gentle one otherwise.
        current_year: Year to measure age against. Defaults to the current
            local year; pass it explicitly for reproducible scoring.

    Returns:
        A score in (0, 1]; 0.7 when no year can be found.
    """
    # ``or ""`` also covers an explicit JSON null, which would crash re.search.
    url = finding.get("source_url") or ""
    evidence = finding.get("evidence") or ""
    if current_year is None:
        current_year = datetime.now().year

    # Prefer a year embedded in the URL path (common in blog/paper URLs),
    # then fall back to a year mentioned in the evidence text.
    year_match = re.search(r'/(20[12]\d)/', url) or re.search(r'\b(20[12]\d)\b', evidence)
    if year_match is None:
        # No date info: return neutral.
        return 0.7

    # A year in the future (forecasts, pre-dated publications) must not be
    # treated as ancient: clamp so it scores like a current-year source.
    age = max(0, current_year - int(year_match.group(1)))

    if fast_moving:
        # Aggressive decay for fast-moving topics.
        decay_map = {0: 1.0, 1: 0.85, 2: 0.65, 3: 0.45, 4: 0.30}
        return decay_map.get(age, 0.2)

    # Gentle decay for stable topics.
    decay_map = {0: 1.0, 1: 0.95, 2: 0.85, 3: 0.75, 4: 0.65, 5: 0.55}
    return decay_map.get(age, 0.4)
91
+
92
+
93
def score_domain_authority(finding: dict) -> float:
    """Score the authority of the host in the finding's ``source_url``.

    Returns:
        0.9 for a host in ``HIGH_AUTHORITY``, 0.7 for one in
        ``MODERATE_AUTHORITY``, 0.85 for ``.edu`` / ``.gov`` hosts and 0.55
        for any other host. Without a URL the score is 0.95 for
        ``codebase`` findings and 0.5 otherwise; an unparseable URL scores
        0.5.
    """
    url = finding.get("source_url", "")
    if not url:
        return 0.95 if finding.get("source_type") == "codebase" else 0.5

    try:
        # ``hostname`` (unlike ``netloc``) is lower-cased and excludes any
        # port or userinfo, so "https://GitHub.com:443/x" still matches.
        host = (urlparse(url).hostname or "").removeprefix("www.")
    except (ValueError, TypeError, AttributeError):
        # Malformed URL or a non-string value.
        return 0.5

    def listed_in(domains) -> bool:
        # Match the registered domain itself or any subdomain of it, so
        # "math.stackexchange.com" counts as "stackexchange.com". The
        # leading dot prevents "notgithub.com" from matching "github.com".
        return any(host == d or host.endswith("." + d) for d in domains)

    if listed_in(HIGH_AUTHORITY):
        return 0.9
    if listed_in(MODERATE_AUTHORITY):
        return 0.7
    # Academic and government hosts.
    if host.endswith((".edu", ".gov")):
        return 0.85
    return 0.55
115
+
116
+
117
def score_bias(finding: dict) -> float:
    """Score the wording of a finding for sensationalism versus balance.

    Starting from a 0.7 baseline, each pattern in ``SENSATIONAL_PATTERNS``
    found in the lower-cased evidence + claim text subtracts 0.1 and each
    pattern in ``BALANCED_PATTERNS`` found adds 0.05. The result is clamped
    to [0.1, 1.0].
    """
    haystack = (finding.get("evidence", "") + " " + finding.get("claim", "")).lower()

    total = 0.7  # baseline
    adjustments = (
        (SENSATIONAL_PATTERNS, -0.1),  # penalise sensationalism
        (BALANCED_PATTERNS, 0.05),     # reward balanced language
    )
    for patterns, delta in adjustments:
        for regex in patterns:
            if re.search(regex, haystack):
                total += delta

    if total < 0.1:
        return 0.1
    if total > 1.0:
        return 1.0
    return total
134
+
135
+
136
def detect_circular_sourcing(findings: list) -> list:
    """Return the indices of findings that look circularly sourced.

    Heuristic: findings are grouped by source domain (``www.`` stripped)
    together with the first 50 characters of the claim; every member of a
    group with more than one finding is flagged. Findings without a
    ``source_url``, or with a URL that cannot be parsed, are never flagged.

    Args:
        findings: Finding dicts; ``source_url`` and ``claim`` are read.

    Returns:
        Flagged indices into ``findings``, in ascending order.
    """
    groups = {}
    for index, finding in enumerate(findings):
        url = finding.get("source_url") or ""
        if not url:
            continue
        try:
            domain = urlparse(url).netloc.lower().removeprefix("www.")
        except (ValueError, TypeError, AttributeError):
            # A malformed URL must not abort scoring of the whole batch.
            continue
        # ``or ""`` covers an explicit null claim, which cannot be sliced.
        claim_key = (finding.get("claim") or "")[:50]
        groups.setdefault(f"{domain}:{claim_key}", []).append(index)

    flagged = []
    for members in groups.values():
        if len(members) > 1:
            flagged.extend(members)
    # Sorted so the output is deterministic rather than set-iteration order.
    return sorted(flagged)
154
+
155
+
156
def score_findings(findings: list, fast_moving: bool = True) -> list:
    """Score every finding for credibility and return them best-first.

    The composite score is a weighted sum of four sub-scores (source type
    35%, recency 20%, domain authority 25%, bias 20%), plus 0.15 when the
    exact same claim text was reported by at least two agents across at
    least two source types, minus 0.2 when ``detect_circular_sourcing``
    flags the finding. The result is clamped to [0, 1] and rounded to three
    decimals.

    Args:
        findings: Finding dicts. They are not mutated; each output entry is
            a copy with the scoring keys added.
        fast_moving: Forwarded to ``score_recency`` to choose the decay
            table. The default matches the previous fixed behaviour.

    Returns:
        A new list sorted by ``credibility_score`` descending. Each entry
        carries ``credibility_score``, ``score_breakdown``,
        ``circular_sourcing``, ``corroborated`` and ``trust_level``.
    """
    # A set keeps the per-finding membership test O(1).
    circular = set(detect_circular_sourcing(findings))

    # Agent and source-type agreement per claim (simplified: exact match).
    support = {}
    for f in findings:
        claim = f.get("claim", "")
        if not claim:
            # Findings with no claim text must not corroborate each other.
            continue
        entry = support.setdefault(claim, {"agents": set(), "source_types": set()})
        entry["agents"].add(f.get("agent_role", "unknown"))
        entry["source_types"].add(f.get("source_type", ""))

    scored = []
    for i, f in enumerate(findings):
        type_score = score_source_type(f)
        recency_score = score_recency(f, fast_moving)
        authority_score = score_domain_authority(f)
        bias_score = score_bias(f)

        # Weighted composite.
        composite = (
            type_score * 0.35 +
            recency_score * 0.20 +
            authority_score * 0.25 +
            bias_score * 0.20
        )

        entry = support.get(f.get("claim", ""))
        corroborated = entry is not None and len(entry["agents"]) >= 2

        # Corroboration bonus additionally requires source-type diversity.
        if corroborated and len(entry["source_types"]) >= 2:
            composite += 0.15

        # Circular sourcing penalty.
        is_circular = i in circular
        if is_circular:
            composite -= 0.2

        # Classify from the rounded value so the reported score and the
        # trust label never disagree at a threshold (0.7496 would otherwise
        # print as 0.75 yet be labelled "moderate").
        credibility = round(max(0.0, min(1.0, composite)), 3)
        if credibility >= 0.75:
            trust_level = "high"
        elif credibility >= 0.55:
            trust_level = "moderate"
        elif credibility >= 0.35:
            trust_level = "low"
        else:
            trust_level = "verify"

        scored.append({
            **f,
            "credibility_score": credibility,
            "score_breakdown": {
                "source_type": round(type_score, 3),
                "recency": round(recency_score, 3),
                "domain_authority": round(authority_score, 3),
                "bias": round(bias_score, 3),
            },
            "circular_sourcing": is_circular,
            "corroborated": corroborated,
            "trust_level": trust_level,
        })

    return sorted(scored, key=lambda x: x["credibility_score"], reverse=True)
219
+
220
+
221
def main():
    """Score the findings file named on the command line and print JSON.

    Reads ``sys.argv[1]`` as a JSON document, passes it to
    ``score_findings`` and writes the scored list to stdout. Exits with
    status 1 and a usage message on stderr when no path is given.
    """
    if len(sys.argv) < 2:
        print("Usage: score_credibility.py <findings.json>", file=sys.stderr)
        sys.exit(1)

    # JSON text is UTF-8; without an explicit encoding open() falls back to
    # the locale codec and can mis-decode non-ASCII claims or evidence.
    with open(sys.argv[1], encoding="utf-8") as f:
        findings = json.load(f)

    scored = score_findings(findings)
    print(json.dumps(scored, indent=2))


if __name__ == "__main__":
    main()
@@ -0,0 +1,92 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Validates ftm-researcher output for completeness and quality.
4
+
5
+ Checks:
6
+ 1. All required fields present in each finding
7
+ 2. Source URLs are non-empty for non-codebase findings
8
+ 3. Confidence scores in valid range
9
+ 4. Disagreement map has all 4 tiers
10
+ 5. No placeholder text (TBD, TODO, FIXME, "placeholder", "lorem ipsum"; case-insensitive)
11
+ 6. Minimum finding count per mode (quick: 3, standard: 10, deep: 15)
12
+ 7. Source diversity: at least 3 different source types represented
13
+ 8. No duplicate claims (exact match)
14
+ """
15
+ import json
16
+ import sys
17
+
18
REQUIRED_FINDING_FIELDS = ["claim", "source_type", "confidence", "agent_role"]
REQUIRED_MAP_TIERS = ["consensus", "contested", "unique_insights", "refuted"]
PLACEHOLDER_PATTERNS = ["TBD", "TODO", "FIXME", "placeholder", "lorem ipsum"]
MIN_FINDINGS = {"quick": 3, "standard": 10, "deep": 15}


def validate(output: dict) -> dict:
    """Check a research artifact for completeness and report the problems.

    Errors (make the artifact invalid): a finding that is not an object, a
    missing or blank required field, a confidence that is not a number in
    [0, 1], placeholder text, duplicate claims (exact match) and, for
    ``standard`` / ``deep`` mode, a missing disagreement-map tier.
    Warnings: too few findings for the mode, a non-codebase finding without
    ``source_url`` and fewer than three distinct source types.

    Args:
        output: The parsed artifact. ``mode`` defaults to ``"standard"``;
            ``findings`` and ``disagreement_map`` may be missing or null.

    Returns:
        ``{"errors": [...], "warnings": [...], "valid": bool}`` where
        ``valid`` is true exactly when ``errors`` is empty.
    """
    errors = []
    warnings = []

    mode = output.get("mode", "standard")
    # ``or`` also covers an explicit JSON null for either key.
    findings = output.get("findings") or []
    disagreement_map = output.get("disagreement_map") or {}

    # Check minimum findings.
    min_count = MIN_FINDINGS.get(mode, 10)
    if len(findings) < min_count:
        warnings.append(f"Only {len(findings)} findings for {mode} mode (expected >= {min_count})")

    source_types = set()
    seen_claims = set()
    duplicate_claims = set()

    for i, f in enumerate(findings):
        if not isinstance(f, dict):
            errors.append(f"Finding {i}: expected an object, got {type(f).__name__}")
            continue

        # Required fields. A numeric zero is a real value (confidence 0.0),
        # so only null and empty strings/containers count as missing.
        for field in REQUIRED_FINDING_FIELDS:
            value = f.get(field)
            if value is None or (isinstance(value, (str, list, dict)) and not value):
                errors.append(f"Finding {i}: missing required field '{field}'")

        # Source URL required for non-codebase findings.
        source_type = f.get("source_type")
        if source_type != "codebase" and not f.get("source_url"):
            warnings.append(f"Finding {i}: no source_url for {source_type} source")
        if isinstance(source_type, str) and source_type:
            source_types.add(source_type)

        # Confidence range. A null/absent value was already reported above.
        conf = f.get("confidence")
        if conf is not None:
            if isinstance(conf, bool) or not isinstance(conf, (int, float)):
                errors.append(f"Finding {i}: confidence {conf!r} is not a number")
            elif not (0.0 <= conf <= 1.0):
                errors.append(f"Finding {i}: confidence {conf} out of range [0, 1]")

        # Placeholder detection: case-insensitive substring search over the
        # whole serialised finding, keys and URLs included.
        text = json.dumps(f).lower()
        for p in PLACEHOLDER_PATTERNS:
            if p.lower() in text:
                errors.append(f"Finding {i}: contains placeholder text '{p}'")

        # Single-pass duplicate tracking; blank claims are reported as
        # missing above, not as duplicates of each other.
        claim = f.get("claim")
        if isinstance(claim, str) and claim:
            if claim in seen_claims:
                duplicate_claims.add(claim)
            else:
                seen_claims.add(claim)

    # Source diversity.
    if len(source_types) < 3:
        warnings.append(f"Only {len(source_types)} source types (expected >= 3)")

    # Duplicate detection, listed in sorted order so the message is stable.
    if duplicate_claims:
        listing = ", ".join(repr(c) for c in sorted(duplicate_claims))
        errors.append(f"Duplicate claims found: {{{listing}}}")

    # Disagreement map tiers.
    if mode in ("standard", "deep"):
        for tier in REQUIRED_MAP_TIERS:
            if tier not in disagreement_map:
                errors.append(f"Disagreement map missing tier: {tier}")

    return {"errors": errors, "warnings": warnings, "valid": len(errors) == 0}
76
+
77
+
78
def main():
    """Validate the artifact named on the command line and print the report.

    Reads ``sys.argv[1]`` as JSON, runs ``validate`` on it and prints the
    result as JSON. Exits with status 0 when the artifact is valid and 1
    otherwise, including when no path is given.
    """
    if len(sys.argv) < 2:
        print("Usage: validate_research.py <output.json>", file=sys.stderr)
        sys.exit(1)

    # JSON text is UTF-8; without an explicit encoding open() falls back to
    # the locale codec and can mis-decode non-ASCII claims or evidence.
    with open(sys.argv[1], encoding="utf-8") as f:
        output = json.load(f)

    result = validate(output)
    print(json.dumps(result, indent=2))
    sys.exit(0 if result["valid"] else 1)


if __name__ == "__main__":
    main()