tokens-for-good 0.4.13 → 0.4.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/pipeline/01-research/PROMPT.md +24 -15
- package/src/api-client.js +9 -1
- package/src/api-client.test.js +89 -0
- package/src/mcp-server.js +7 -3
package/package.json
CHANGED
|
package/pipeline/01-research/PROMPT.md
CHANGED
|
@@ -114,23 +114,32 @@ Documented program changes based on evidence. "They adapted" needs specifics: wh
|
|
|
114
114
|
|
|
115
115
|
#### EVIDENCE TABLE
|
|
116
116
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
|
117
|
+
The score is computed deterministically by code from this table, not by you. Leave a row blank if you didn't find supporting evidence — a blank row is the honest answer when the org doesn't have that thing. Inventing evidence will lower the org's real score and may get the report rejected.
|
|
118
|
+
|
|
119
|
+
**What each row asks for** (read this before filling in the table):
|
|
120
|
+
|
|
121
|
+
- **a1** — A stated condition-level outcome goal (health, income, wellbeing, food security, survival). NOT activity-level ("train X people") and NOT access-level ("expand financial access") — those are intermediary.
|
|
122
|
+
- **a2** — A number or percentage attached to the goal in a1 (e.g. "reduce stunting by 30%"). The same quote as a1 is fine if it already contains the number.
|
|
123
|
+
- **a3** — A target population AND a target year for the goal (e.g. "children under 5 in Ghana by 2030"). Same quote as a1/a2 is fine if it covers both.
|
|
124
|
+
- **b** — An intermediate outcome the org MEASURED, with a number (e.g. "78% of trained CHWs retained at 24 months").
|
|
125
|
+
- **c** — An ultimate outcome the org MEASURED, with a number (e.g. "27% reduction in under-five mortality").
|
|
126
|
+
- **d** — A documented program change the org made BASED ON outcome data. The quote should make both the data and the change concrete (e.g. "In 2022 we moved to blended training after retention dropped to 45%").
|
|
127
|
+
- **e** — An intermediate result measured with a comparison or control group. Name the design (RCT, quasi-experimental, matched comparison). Before/after alone does not count.
|
|
128
|
+
- **f** — An ultimate result measured with a comparison or control group. Same design rules as e.
|
|
129
|
+
|
|
130
|
+
| Row | Quote (verbatim from the cited page) | Source URL | Source name |
|
|
131
|
+
|-----|--------------------------------------|------------|-------------|
|
|
132
|
+
| a1 | | | |
|
|
133
|
+
| a2 | | | |
|
|
134
|
+
| a3 | | | |
|
|
135
|
+
| b | | | |
|
|
136
|
+
| c | | | |
|
|
137
|
+
| d | | | |
|
|
138
|
+
| e | | | |
|
|
139
|
+
| f | | | |
|
|
131
140
|
|
|
132
141
|
**Rules for the EVIDENCE TABLE:**
|
|
133
|
-
- The quoted text must appear verbatim on the cited page.
|
|
142
|
+
- The quoted text must appear verbatim on the cited page. A separate fact-check pass verifies your quotes against the page bodies after submission; invented or paraphrased quotes get the report flagged.
|
|
134
143
|
- Use the real URL of the specific page that contains the quote. Not the org homepage. Not `example.com`.
|
|
135
144
|
- A blank row is the correct answer when the evidence doesn't exist. Do not invent.
|
|
136
145
|
- One row, one quote, one URL. Don't bundle two facts under one citation.
|
package/src/api-client.js
CHANGED
|
@@ -61,10 +61,18 @@ export class ApiClient {
|
|
|
61
61
|
}
|
|
62
62
|
|
|
63
63
|
async submitReport(claimId, reportMarkdown, tokenUsage = null, metrics = null, modelUsed = null, promptVersion = null, disagreementRows = null) {
|
|
64
|
+
// The MCP tool surface accepts `estimated_tokens` as a plain number, but
|
|
65
|
+
// the API validates `token_usage` as `nullable|array` and reads
|
|
66
|
+
// `token_usage.total_tokens` for leaderboard accounting. Wrap a bare
|
|
67
|
+
// number into the shape the server expects so MCP submits don't 422.
|
|
68
|
+
const normalizedTokenUsage = typeof tokenUsage === 'number'
|
|
69
|
+
? { total_tokens: tokenUsage }
|
|
70
|
+
: tokenUsage;
|
|
71
|
+
|
|
64
72
|
return this.request('POST', '/research/submit', {
|
|
65
73
|
claim_id: claimId,
|
|
66
74
|
report_markdown: reportMarkdown,
|
|
67
|
-
token_usage: tokenUsage,
|
|
75
|
+
token_usage: normalizedTokenUsage,
|
|
68
76
|
metrics: metrics,
|
|
69
77
|
model_used: modelUsed,
|
|
70
78
|
prompt_version: promptVersion,
|
|
package/src/api-client.test.js
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
// Regression tests for ApiClient. The MCP layer passes `estimated_tokens` as
|
|
2
|
+
// a plain number; the Laravel API validates token_usage as `nullable|array`
|
|
3
|
+
// and reads `token_usage.total_tokens` for leaderboard accounting. If we
|
|
4
|
+
// stop normalizing the shape, every MCP submit silently 422s.
|
|
5
|
+
|
|
6
|
+
import { test } from 'node:test';
|
|
7
|
+
import assert from 'node:assert/strict';
|
|
8
|
+
import { ApiClient } from './api-client.js';
|
|
9
|
+
|
|
10
|
+
function withMockFetch(fn) {
|
|
11
|
+
const original = globalThis.fetch;
|
|
12
|
+
const calls = [];
|
|
13
|
+
globalThis.fetch = async (url, opts) => {
|
|
14
|
+
calls.push({ url, opts });
|
|
15
|
+
return new Response(JSON.stringify({ success: true, org_name: 'Test' }), {
|
|
16
|
+
status: 200,
|
|
17
|
+
headers: { 'Content-Type': 'application/json' },
|
|
18
|
+
});
|
|
19
|
+
};
|
|
20
|
+
return fn(calls).finally(() => {
|
|
21
|
+
globalThis.fetch = original;
|
|
22
|
+
});
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
test('submitReport wraps a numeric token_usage into {total_tokens: N}', async () => {
|
|
26
|
+
await withMockFetch(async (calls) => {
|
|
27
|
+
const client = new ApiClient('tfg_test_key');
|
|
28
|
+
await client.submitReport('claim-uuid', 'report body', 12345);
|
|
29
|
+
|
|
30
|
+
const body = JSON.parse(calls[0].opts.body);
|
|
31
|
+
assert.deepEqual(
|
|
32
|
+
body.token_usage,
|
|
33
|
+
{ total_tokens: 12345 },
|
|
34
|
+
'numeric token usage must be wrapped so Laravel `nullable|array` accepts it'
|
|
35
|
+
);
|
|
36
|
+
});
|
|
37
|
+
});
|
|
38
|
+
|
|
39
|
+
test('submitReport passes an array-shaped token_usage through untouched', async () => {
|
|
40
|
+
await withMockFetch(async (calls) => {
|
|
41
|
+
const client = new ApiClient('tfg_test_key');
|
|
42
|
+
const usage = { total_tokens: 42, input_tokens: 30, output_tokens: 12 };
|
|
43
|
+
await client.submitReport('claim-uuid', 'report body', usage);
|
|
44
|
+
|
|
45
|
+
const body = JSON.parse(calls[0].opts.body);
|
|
46
|
+
assert.deepEqual(body.token_usage, usage);
|
|
47
|
+
});
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
test('submitReport leaves null token_usage as null', async () => {
|
|
51
|
+
await withMockFetch(async (calls) => {
|
|
52
|
+
const client = new ApiClient('tfg_test_key');
|
|
53
|
+
await client.submitReport('claim-uuid', 'report body', null);
|
|
54
|
+
|
|
55
|
+
const body = JSON.parse(calls[0].opts.body);
|
|
56
|
+
assert.equal(body.token_usage, null);
|
|
57
|
+
});
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
test('submitReport forwards disagreement_rows and prompt_version', async () => {
|
|
61
|
+
await withMockFetch(async (calls) => {
|
|
62
|
+
const client = new ApiClient('tfg_test_key');
|
|
63
|
+
await client.submitReport(
|
|
64
|
+
'claim-uuid',
|
|
65
|
+
'report body',
|
|
66
|
+
100,
|
|
67
|
+
null,
|
|
68
|
+
'sonnet-4-6',
|
|
69
|
+
'v3',
|
|
70
|
+
['a1', 'b']
|
|
71
|
+
);
|
|
72
|
+
|
|
73
|
+
const body = JSON.parse(calls[0].opts.body);
|
|
74
|
+
assert.equal(body.prompt_version, 'v3');
|
|
75
|
+
assert.deepEqual(body.disagreement_rows, ['a1', 'b']);
|
|
76
|
+
});
|
|
77
|
+
});
|
|
78
|
+
|
|
79
|
+
test('request() returns null on 204 No Content (consolidation queue empty)', async () => {
|
|
80
|
+
const original = globalThis.fetch;
|
|
81
|
+
globalThis.fetch = async () => new Response(null, { status: 204 });
|
|
82
|
+
try {
|
|
83
|
+
const client = new ApiClient('tfg_test_key');
|
|
84
|
+
const result = await client.getNextConsolidation();
|
|
85
|
+
assert.equal(result, null);
|
|
86
|
+
} finally {
|
|
87
|
+
globalThis.fetch = original;
|
|
88
|
+
}
|
|
89
|
+
});
|
package/src/mcp-server.js
CHANGED
|
@@ -143,12 +143,13 @@ server.tool('submit_report', 'Submit a completed research report (or a consolida
|
|
|
143
143
|
report_markdown: z.string().describe('The full research report in markdown'),
|
|
144
144
|
estimated_tokens: z.number().describe('Estimated total tokens used: count web searches (~1K each), web fetches (~2-5K each), report output (~4 tokens/word), plus ~10K overhead'),
|
|
145
145
|
model_used: z.string().optional().describe('The model that generated this report'),
|
|
146
|
+
prompt_version: z.string().optional().describe('Methodology version: "v3" for the EVIDENCE TABLE flow (default), "v2" for the legacy scorecard flow.'),
|
|
146
147
|
disagreement_rows: z.array(z.enum(['a1', 'a2', 'a3', 'b', 'c', 'd', 'e', 'f'])).optional().describe('Consolidation-only: EVIDENCE TABLE row keys where the two researchers materially disagreed. >=3 auto-triggers a 3rd researcher.'),
|
|
147
|
-
}, async ({ claim_id, report_markdown, estimated_tokens, model_used, disagreement_rows }) => {
|
|
148
|
+
}, async ({ claim_id, report_markdown, estimated_tokens, model_used, prompt_version, disagreement_rows }) => {
|
|
148
149
|
if (!client) return { content: [{ type: 'text', text: 'Error: TFG_API_KEY not set.' }] };
|
|
149
150
|
|
|
150
151
|
try {
|
|
151
|
-
const result = await client.submitReport(claim_id, report_markdown, estimated_tokens, null, model_used,
|
|
152
|
+
const result = await client.submitReport(claim_id, report_markdown, estimated_tokens, null, model_used, prompt_version, disagreement_rows);
|
|
152
153
|
markContributed();
|
|
153
154
|
|
|
154
155
|
// One-off users: first successful submit completes their initial setup,
|
|
@@ -278,8 +279,11 @@ server.tool('my_impact', 'See your personal contribution stats, tier, and histor
|
|
|
278
279
|
const result = await client.getImpact();
|
|
279
280
|
const c = result.contributor;
|
|
280
281
|
|
|
282
|
+
// Older server builds omit github_handle from the impact response — fall
|
|
283
|
+
// back to display_name so we never print "@undefined".
|
|
284
|
+
const who = c.github_handle ? `@${c.github_handle}` : (c.display_name || 'you');
|
|
281
285
|
return {
|
|
282
|
-
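content: [{ type: 'text', text: `Your Impact (@${c.github_handle}):\n\nTier: ${c.tier}\nOrgs researched: ${c.total_orgs}\nAcceptance rate: ${c.acceptance_rate}%\nAutomation: ${c.has_schedule ? 'Active' : 'Not set up'}\n\nRecent:\n${result.claims?.slice(0, 5).map(cl => ` ${cl.organization?.name || 'Unknown'} - ${cl.status}`).join('\n') || 'None'}` }],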
|
|
286
|
+
content: [{ type: 'text', text: `Your Impact (${who}):\n\nTier: ${c.tier}\nOrgs researched: ${c.total_orgs}\nAcceptance rate: ${c.acceptance_rate}%\nAutomation: ${c.has_schedule ? 'Active' : 'Not set up'}\n\nRecent:\n${result.claims?.slice(0, 5).map(cl => ` ${cl.organization?.name || 'Unknown'} - ${cl.status}`).join('\n') || 'None'}` }],
|
|
283
287
|
};
|
|
284
288
|
} catch (err) {
|
|
285
289
|
return { content: [{ type: 'text', text: `Error: ${err.message}` }] };
|