@botlearn/academic-search 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +35 -0
- package/knowledge/anti-patterns.md +88 -0
- package/knowledge/best-practices.md +165 -0
- package/knowledge/domain.md +293 -0
- package/manifest.json +28 -0
- package/package.json +38 -0
- package/skill.md +56 -0
- package/strategies/main.md +134 -0
- package/tests/benchmark.json +476 -0
- package/tests/smoke.json +54 -0
|
@@ -0,0 +1,476 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "0.0.1",
|
|
3
|
+
"dimension": "information-retrieval",
|
|
4
|
+
"tasks": [
|
|
5
|
+
{
|
|
6
|
+
"id": "bench-easy-01",
|
|
7
|
+
"difficulty": "easy",
|
|
8
|
+
"description": "Find a specific well-known paper by topic and approximate details",
|
|
9
|
+
"input": "Find the original Transformer paper — the one that introduced the 'Attention Is All You Need' architecture. Give me the full citation, where it was published, its citation count, and a one-paragraph summary of its main contribution.",
|
|
10
|
+
"rubric": [
|
|
11
|
+
{
|
|
12
|
+
"criterion": "Identification Accuracy",
|
|
13
|
+
"weight": 0.4,
|
|
14
|
+
"scoring": {
|
|
15
|
+
"5": "Correctly identifies Vaswani et al. (2017), 'Attention Is All You Need', NeurIPS 2017 (published as NIPS 2017; either venue name acceptable); provides arXiv ID (1706.03762) and/or DOI",
|
|
16
|
+
"3": "Identifies the correct paper but with minor metadata errors (wrong year, incomplete author list)",
|
|
17
|
+
"1": "Returns a related but different paper on attention mechanisms",
|
|
18
|
+
"0": "Cannot find the paper or returns irrelevant results"
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"criterion": "Bibliographic Completeness",
|
|
23
|
+
"weight": 0.3,
|
|
24
|
+
"scoring": {
|
|
25
|
+
"5": "Full citation: all 8 authors, year (2017), venue (NeurIPS / NIPS), citation count from Semantic Scholar, arXiv link, publication status (peer-reviewed)",
|
|
26
|
+
"3": "Partial citation with title, first author, and year but missing some metadata",
|
|
27
|
+
"1": "Title and URL only",
|
|
28
|
+
"0": "Incomplete or fabricated citation"
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"criterion": "Summary Quality",
|
|
33
|
+
"weight": 0.3,
|
|
34
|
+
"scoring": {
|
|
35
|
+
"5": "Accurate summary covering: self-attention mechanism replacing recurrence/convolution, multi-head attention, positional encoding, and its impact on NLP/sequence modeling",
|
|
36
|
+
"3": "Correct but superficial summary mentioning attention but lacking architectural details",
|
|
37
|
+
"1": "Vague or partially incorrect summary",
|
|
38
|
+
"0": "No summary or fabricated claims"
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
],
|
|
42
|
+
"expectedScoreWithout": 45,
|
|
43
|
+
"expectedScoreWith": 85
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"id": "bench-easy-02",
|
|
47
|
+
"difficulty": "easy",
|
|
48
|
+
"description": "Find recent papers on a well-defined topic within a single database",
|
|
49
|
+
"input": "Search arXiv for the 5 most recent papers on diffusion models for image generation submitted in 2024 or later. List them with titles, authors, submission dates, and arXiv IDs.",
|
|
50
|
+
"rubric": [
|
|
51
|
+
{
|
|
52
|
+
"criterion": "Database Targeting",
|
|
53
|
+
"weight": 0.3,
|
|
54
|
+
"scoring": {
|
|
55
|
+
"5": "Uses arXiv API with appropriate query: cat:cs.CV, ti:/abs: prefixes for 'diffusion model', sortBy=submittedDate, date constraint for 2024+",
|
|
56
|
+
"3": "Searches arXiv but with suboptimal query construction (e.g., no category filter)",
|
|
57
|
+
"1": "Searches Google Scholar or Semantic Scholar instead of arXiv as requested",
|
|
58
|
+
"0": "No database-specific query"
|
|
59
|
+
}
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
"criterion": "Result Accuracy",
|
|
63
|
+
"weight": 0.4,
|
|
64
|
+
"scoring": {
|
|
65
|
+
"5": "Returns 5 real arXiv papers on diffusion models for image generation, all from 2024+; arXiv IDs are valid; dates are correct",
|
|
66
|
+
"3": "Returns 5 papers but 1-2 are not specifically about diffusion models for images, or dates are slightly off",
|
|
67
|
+
"1": "Fewer than 3 relevant papers or some entries appear fabricated",
|
|
68
|
+
"0": "Results are not from arXiv or are fabricated"
|
|
69
|
+
}
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
"criterion": "Metadata Quality",
|
|
73
|
+
"weight": 0.3,
|
|
74
|
+
"scoring": {
|
|
75
|
+
"5": "Each paper has: complete title, full author list, exact submission date, valid arXiv ID (YYMM.NNNNN format)",
|
|
76
|
+
"3": "Title and arXiv ID present but author list is incomplete or dates are approximate",
|
|
77
|
+
"1": "Only titles provided; missing IDs and dates",
|
|
78
|
+
"0": "Incomplete or fabricated metadata"
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
],
|
|
82
|
+
"expectedScoreWithout": 35,
|
|
83
|
+
"expectedScoreWith": 80
|
|
84
|
+
},
|
|
85
|
+
{
|
|
86
|
+
"id": "bench-easy-03",
|
|
87
|
+
"difficulty": "easy",
|
|
88
|
+
"description": "Find papers by a specific author on a known topic",
|
|
89
|
+
"input": "Find papers by Yann LeCun on self-supervised learning published since 2020. Include citation counts and venues.",
|
|
90
|
+
"rubric": [
|
|
91
|
+
{
|
|
92
|
+
"criterion": "Author Identification",
|
|
93
|
+
"weight": 0.3,
|
|
94
|
+
"scoring": {
|
|
95
|
+
"5": "Correctly searches for Yann LeCun using author-specific query syntax across databases; uses au: prefix on arXiv, author search on Semantic Scholar",
|
|
96
|
+
"3": "Searches for the author but only on one database",
|
|
97
|
+
"1": "Searches for the topic but does not properly filter by author",
|
|
98
|
+
"0": "No author filtering applied"
|
|
99
|
+
}
|
|
100
|
+
},
|
|
101
|
+
{
|
|
102
|
+
"criterion": "Result Relevance",
|
|
103
|
+
"weight": 0.4,
|
|
104
|
+
"scoring": {
|
|
105
|
+
"5": "Returns papers where LeCun is an author (not just cited); all papers are about self-supervised learning; all from 2020+; includes key works like the JEPA paper or VICReg",
|
|
106
|
+
"3": "Most papers are by LeCun on self-supervised learning but 1-2 are off-topic or misattributed",
|
|
107
|
+
"1": "Mix of relevant and irrelevant papers; author filtering is loose",
|
|
108
|
+
"0": "Papers are not by LeCun or not about self-supervised learning"
|
|
109
|
+
}
|
|
110
|
+
},
|
|
111
|
+
{
|
|
112
|
+
"criterion": "Citation & Venue Data",
|
|
113
|
+
"weight": 0.3,
|
|
114
|
+
"scoring": {
|
|
115
|
+
"5": "Each paper includes citation count (from Semantic Scholar or Google Scholar), venue name, and publication status; citation counts are plausible",
|
|
116
|
+
"3": "Citation counts provided for most papers but venues missing for some",
|
|
117
|
+
"1": "No citation counts; only titles and years",
|
|
118
|
+
"0": "Missing or fabricated citation data"
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
],
|
|
122
|
+
"expectedScoreWithout": 35,
|
|
123
|
+
"expectedScoreWith": 75
|
|
124
|
+
},
|
|
125
|
+
{
|
|
126
|
+
"id": "bench-med-01",
|
|
127
|
+
"difficulty": "medium",
|
|
128
|
+
"description": "Cross-database literature search on an interdisciplinary topic",
|
|
129
|
+
"input": "I'm starting a research project on using graph neural networks for drug discovery. Find the top 5 most relevant papers that cover: (1) GNN architectures for molecular property prediction, (2) applications to virtual screening or lead optimization, and (3) benchmark datasets used in this area. I need papers from top venues with citation analysis.",
|
|
130
|
+
"rubric": [
|
|
131
|
+
{
|
|
132
|
+
"criterion": "Multi-Aspect Coverage",
|
|
133
|
+
"weight": 0.3,
|
|
134
|
+
"scoring": {
|
|
135
|
+
"5": "Returns papers covering all 3 requested aspects: GNN architectures (e.g., SchNet, DimeNet, GemNet), virtual screening applications, and benchmark datasets (e.g., MoleculeNet, ZINC); at least 1 paper per aspect",
|
|
136
|
+
"3": "Covers 2 of 3 aspects; one aspect underrepresented",
|
|
137
|
+
"1": "Only covers GNN architectures without drug discovery application or benchmarks",
|
|
138
|
+
"0": "Results are generic GNN papers without drug discovery focus"
|
|
139
|
+
}
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
"criterion": "Cross-Database Search",
|
|
143
|
+
"weight": 0.2,
|
|
144
|
+
"scoring": {
|
|
145
|
+
"5": "Searches arXiv (cs.LG + q-bio.QM categories), Semantic Scholar (fieldsOfStudy: Computer Science + Biology), and Google Scholar; deduplicates across databases",
|
|
146
|
+
"3": "Searches 2 databases with reasonable queries",
|
|
147
|
+
"1": "Single database search only",
|
|
148
|
+
"0": "No academic database search"
|
|
149
|
+
}
|
|
150
|
+
},
|
|
151
|
+
{
|
|
152
|
+
"criterion": "Venue Quality & Citation Analysis",
|
|
153
|
+
"weight": 0.25,
|
|
154
|
+
"scoring": {
|
|
155
|
+
"5": "Papers from top venues (NeurIPS, ICML, Nature Machine Intelligence, JCIM, Bioinformatics); includes citation counts, influential citation counts, and citation velocity for trend analysis",
|
|
156
|
+
"3": "Good venues but citation analysis is limited to raw counts only",
|
|
157
|
+
"1": "Mixed venue quality; no citation analysis",
|
|
158
|
+
"0": "Low-quality venues or no venue information"
|
|
159
|
+
}
|
|
160
|
+
},
|
|
161
|
+
{
|
|
162
|
+
"criterion": "Synthesis & Research Guidance",
|
|
163
|
+
"weight": 0.25,
|
|
164
|
+
"scoring": {
|
|
165
|
+
"5": "Provides thematic grouping of papers, identifies the current state-of-the-art, notes key open challenges, and suggests a reading order for someone starting in this field",
|
|
166
|
+
"3": "Brief per-paper summaries but no cross-paper synthesis",
|
|
167
|
+
"1": "Paper list without context or guidance",
|
|
168
|
+
"0": "No synthesis"
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
],
|
|
172
|
+
"expectedScoreWithout": 25,
|
|
173
|
+
"expectedScoreWith": 70
|
|
174
|
+
},
|
|
175
|
+
{
|
|
176
|
+
"id": "bench-med-02",
|
|
177
|
+
"difficulty": "medium",
|
|
178
|
+
"description": "Literature review requiring citation graph traversal",
|
|
179
|
+
"input": "I found the paper 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding' (Devlin et al., 2019) very useful. Find 5 papers that directly build on BERT's approach — specifically papers that propose improvements to BERT's pre-training objectives or architecture. I want the papers that cite BERT and made the biggest impact. Exclude papers that merely use BERT as a baseline without modifying it.",
|
|
180
|
+
"rubric": [
|
|
181
|
+
{
|
|
182
|
+
"criterion": "Citation Graph Usage",
|
|
183
|
+
"weight": 0.3,
|
|
184
|
+
"scoring": {
|
|
185
|
+
"5": "Uses Semantic Scholar citation API to find papers citing BERT; filters by influential citation count; correctly identifies papers that extend (not just use) BERT's architecture or pre-training",
|
|
186
|
+
"3": "Finds papers related to BERT but through keyword search rather than citation graph traversal",
|
|
187
|
+
"1": "Returns general BERT-related papers without distinguishing extensions from applications",
|
|
188
|
+
"0": "No citation graph analysis"
|
|
189
|
+
}
|
|
190
|
+
},
|
|
191
|
+
{
|
|
192
|
+
"criterion": "Result Relevance & Filtering",
|
|
193
|
+
"weight": 0.35,
|
|
194
|
+
"scoring": {
|
|
195
|
+
"5": "All 5 papers propose modifications to BERT's pre-training or architecture (e.g., RoBERTa, ALBERT, DeBERTa, SpanBERT, ELECTRA); correctly excludes papers that merely fine-tune BERT on downstream tasks",
|
|
196
|
+
"3": "3-4 papers are BERT extensions; 1-2 are BERT applications rather than modifications",
|
|
197
|
+
"1": "Mix of BERT-related papers without clear distinction between extensions and applications",
|
|
198
|
+
"0": "Results are generic NLP papers or not about BERT improvements"
|
|
199
|
+
}
|
|
200
|
+
},
|
|
201
|
+
{
|
|
202
|
+
"criterion": "Impact Assessment",
|
|
203
|
+
"weight": 0.2,
|
|
204
|
+
"scoring": {
|
|
205
|
+
"5": "Papers are ranked by actual impact: citation count, influential citations, adoption in the field; explains what each paper changed about BERT and why it mattered",
|
|
206
|
+
"3": "Citation counts provided but no analysis of relative impact or contribution type",
|
|
207
|
+
"1": "No impact ranking or assessment",
|
|
208
|
+
"0": "Impact data is missing or fabricated"
|
|
209
|
+
}
|
|
210
|
+
},
|
|
211
|
+
{
|
|
212
|
+
"criterion": "Completeness",
|
|
213
|
+
"weight": 0.15,
|
|
214
|
+
"scoring": {
|
|
215
|
+
"5": "Full bibliographic metadata for all 5 papers; open-access links; clear description of what each paper modified about BERT",
|
|
216
|
+
"3": "Metadata mostly complete but missing details on specific modifications",
|
|
217
|
+
"1": "Titles and authors only",
|
|
218
|
+
"0": "Incomplete results"
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
],
|
|
222
|
+
"expectedScoreWithout": 25,
|
|
223
|
+
"expectedScoreWith": 70
|
|
224
|
+
},
|
|
225
|
+
{
|
|
226
|
+
"id": "bench-med-03",
|
|
227
|
+
"difficulty": "medium",
|
|
228
|
+
"description": "Search requiring publication status verification and open-access resolution",
|
|
229
|
+
"input": "Find 5 papers on federated learning for healthcare data published in peer-reviewed journals (not preprints or conference papers). For each paper, I need the journal name, impact factor or h5-index, DOI, and whether it's open access. If a paper is behind a paywall, find the open-access version if one exists.",
|
|
230
|
+
"rubric": [
|
|
231
|
+
{
|
|
232
|
+
"criterion": "Publication Status Filtering",
|
|
233
|
+
"weight": 0.3,
|
|
234
|
+
"scoring": {
|
|
235
|
+
"5": "All 5 papers are from peer-reviewed journals (not conferences or arXiv); correctly identifies and filters out preprints; uses publicationTypes filter on Semantic Scholar or equivalent",
|
|
236
|
+
"3": "4 papers are from journals; 1 is a conference paper or preprint incorrectly included",
|
|
237
|
+
"1": "Mix of journals, conferences, and preprints without distinction",
|
|
238
|
+
"0": "No publication status filtering"
|
|
239
|
+
}
|
|
240
|
+
},
|
|
241
|
+
{
|
|
242
|
+
"criterion": "Venue Metadata Quality",
|
|
243
|
+
"weight": 0.25,
|
|
244
|
+
"scoring": {
|
|
245
|
+
"5": "Each paper includes: journal name, impact factor or h5-index, ISSN or journal ranking; journals are relevant to healthcare AI (e.g., Nature Medicine, JAMIA, npj Digital Medicine, IEEE JBHI)",
|
|
246
|
+
"3": "Journal names provided but missing impact metrics for some entries",
|
|
247
|
+
"1": "Some journal names missing; no impact metrics",
|
|
248
|
+
"0": "No venue metadata"
|
|
249
|
+
}
|
|
250
|
+
},
|
|
251
|
+
{
|
|
252
|
+
"criterion": "Open Access Resolution",
|
|
253
|
+
"weight": 0.25,
|
|
254
|
+
"scoring": {
|
|
255
|
+
"5": "For each paper: DOI provided, open-access status checked via Semantic Scholar openAccessPdf field; for paywalled papers, checks arXiv preprint version, PubMed Central, or author's repository; provides direct PDF links where available",
|
|
256
|
+
"3": "DOIs provided; open-access status noted but no effort to find alternative access for paywalled papers",
|
|
257
|
+
"1": "DOIs for some papers; no open-access analysis",
|
|
258
|
+
"0": "No DOIs or access information"
|
|
259
|
+
}
|
|
260
|
+
},
|
|
261
|
+
{
|
|
262
|
+
"criterion": "Topical Relevance",
|
|
263
|
+
"weight": 0.2,
|
|
264
|
+
"scoring": {
|
|
265
|
+
"5": "All 5 papers are specifically about federated learning applied to healthcare/medical data; covers relevant aspects (privacy, clinical outcomes, multi-site studies)",
|
|
266
|
+
"3": "Most papers are relevant but 1-2 are about federated learning in general or healthcare ML without federated component",
|
|
267
|
+
"1": "Generic federated learning or healthcare ML papers",
|
|
268
|
+
"0": "Off-topic results"
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
],
|
|
272
|
+
"expectedScoreWithout": 20,
|
|
273
|
+
"expectedScoreWith": 65
|
|
274
|
+
},
|
|
275
|
+
{
|
|
276
|
+
"id": "bench-med-04",
|
|
277
|
+
"difficulty": "medium",
|
|
278
|
+
"description": "Comparative literature search requiring multi-method analysis",
|
|
279
|
+
"input": "Compare the main approaches to text-to-image generation in the academic literature: GANs vs. diffusion models vs. autoregressive models. Find 2 representative papers for each approach (6 total) and analyze which approach is currently dominant based on benchmark results (FID scores on standard datasets) and citation trends.",
|
|
280
|
+
"rubric": [
|
|
281
|
+
{
|
|
282
|
+
"criterion": "Approach Coverage",
|
|
283
|
+
"weight": 0.3,
|
|
284
|
+
"scoring": {
|
|
285
|
+
"5": "Returns exactly 2 representative papers per approach (6 total): GANs (e.g., AttnGAN, StackGAN, GigaGAN), diffusion (e.g., DALL-E 2, Stable Diffusion, Imagen), autoregressive (e.g., DALL-E, Parti, CogView); papers are well-chosen representatives",
|
|
286
|
+
"3": "Covers all 3 approaches but paper selection is uneven or suboptimal (e.g., 3 diffusion papers, 1 GAN, 2 autoregressive)",
|
|
287
|
+
"1": "Covers only 2 of 3 approaches or returns generic papers",
|
|
288
|
+
"0": "Does not distinguish between approaches"
|
|
289
|
+
}
|
|
290
|
+
},
|
|
291
|
+
{
|
|
292
|
+
"criterion": "Quantitative Comparison",
|
|
293
|
+
"weight": 0.3,
|
|
294
|
+
"scoring": {
|
|
295
|
+
"5": "Compares approaches using standard metrics (FID, CLIP score, IS) on common benchmarks (COCO, ImageNet); identifies which approach achieves state-of-the-art; notes the evolution over time",
|
|
296
|
+
"3": "Mentions metrics but without systematic comparison across approaches",
|
|
297
|
+
"1": "Qualitative comparison only; no benchmark numbers",
|
|
298
|
+
"0": "No comparative analysis"
|
|
299
|
+
}
|
|
300
|
+
},
|
|
301
|
+
{
|
|
302
|
+
"criterion": "Trend Analysis",
|
|
303
|
+
"weight": 0.2,
|
|
304
|
+
"scoring": {
|
|
305
|
+
"5": "Analyzes citation trends showing the shift from GANs to diffusion models; notes publication volume and venue trends; provides evidence for which approach is currently dominant",
|
|
306
|
+
"3": "Notes that diffusion models are recent trend but without citation data to support",
|
|
307
|
+
"1": "No trend analysis; static comparison only",
|
|
308
|
+
"0": "Incorrect trend assessment"
|
|
309
|
+
}
|
|
310
|
+
},
|
|
311
|
+
{
|
|
312
|
+
"criterion": "Paper Quality & Metadata",
|
|
313
|
+
"weight": 0.2,
|
|
314
|
+
"scoring": {
|
|
315
|
+
"5": "All 6 papers are from top venues with full metadata; citation counts and publication years allow trend visualization",
|
|
316
|
+
"3": "Good papers but incomplete metadata for trend analysis",
|
|
317
|
+
"1": "Mixed quality papers; some metadata missing",
|
|
318
|
+
"0": "Low-quality sources or fabricated papers"
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
],
|
|
322
|
+
"expectedScoreWithout": 25,
|
|
323
|
+
"expectedScoreWith": 70
|
|
324
|
+
},
|
|
325
|
+
{
|
|
326
|
+
"id": "bench-hard-01",
|
|
327
|
+
"difficulty": "hard",
|
|
328
|
+
"description": "Systematic literature search on a niche interdisciplinary topic",
|
|
329
|
+
"input": "Conduct a mini systematic literature review on the use of causal inference methods in natural language processing. I need: (1) a clear search strategy documented with queries used, (2) papers categorized by causal method type (do-calculus, instrumental variables, propensity score matching, counterfactual reasoning), (3) the NLP tasks these methods are applied to, and (4) an assessment of whether results show causal methods outperform correlation-based approaches. Target the top 5 most impactful papers.",
|
|
330
|
+
"rubric": [
|
|
331
|
+
{
|
|
332
|
+
"criterion": "Search Strategy Documentation",
|
|
333
|
+
"weight": 0.2,
|
|
334
|
+
"scoring": {
|
|
335
|
+
"5": "Documents the exact queries used for each database (arXiv: field prefixes + category codes; Semantic Scholar: query + filters; Google Scholar: operators); explains inclusion/exclusion criteria; notes how many results were screened vs. selected",
|
|
336
|
+
"3": "Mentions databases searched but doesn't document exact queries or screening process",
|
|
337
|
+
"1": "No documentation of search methodology",
|
|
338
|
+
"0": "Unclear how papers were found"
|
|
339
|
+
}
|
|
340
|
+
},
|
|
341
|
+
{
|
|
342
|
+
"criterion": "Methodological Categorization",
|
|
343
|
+
"weight": 0.3,
|
|
344
|
+
"scoring": {
|
|
345
|
+
"5": "Papers are correctly categorized by causal method: identifies which use do-calculus/SCMs, instrumental variables, propensity scores, counterfactual reasoning, or other causal frameworks; explains each method briefly",
|
|
346
|
+
"3": "Papers are grouped but categorization is imprecise or misses some method types",
|
|
347
|
+
"1": "Papers listed without methodological categorization",
|
|
348
|
+
"0": "No categorization or incorrect method identification"
|
|
349
|
+
}
|
|
350
|
+
},
|
|
351
|
+
{
|
|
352
|
+
"criterion": "NLP Task Mapping",
|
|
353
|
+
"weight": 0.2,
|
|
354
|
+
"scoring": {
|
|
355
|
+
"5": "Clearly maps each paper to the NLP task it addresses (e.g., text classification, NLI, bias mitigation, question answering, summarization); identifies which tasks have the most causal inference research",
|
|
356
|
+
"3": "NLP tasks mentioned but mapping is incomplete",
|
|
357
|
+
"1": "Tasks not clearly identified",
|
|
358
|
+
"0": "No task mapping"
|
|
359
|
+
}
|
|
360
|
+
},
|
|
361
|
+
{
|
|
362
|
+
"criterion": "Evidence Synthesis",
|
|
363
|
+
"weight": 0.3,
|
|
364
|
+
"scoring": {
|
|
365
|
+
"5": "Synthesizes findings across papers: reports whether causal methods outperform baselines (with specific metrics), identifies conditions where causal approaches are most beneficial, notes limitations and open challenges; provides a clear conclusion",
|
|
366
|
+
"3": "Summarizes individual paper findings but doesn't synthesize across papers",
|
|
367
|
+
"1": "Abstracts only; no comparative analysis",
|
|
368
|
+
"0": "No synthesis"
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
],
|
|
372
|
+
"expectedScoreWithout": 15,
|
|
373
|
+
"expectedScoreWith": 60
|
|
374
|
+
},
|
|
375
|
+
{
|
|
376
|
+
"id": "bench-hard-02",
|
|
377
|
+
"difficulty": "hard",
|
|
378
|
+
"description": "Search requiring quality assessment of contradictory findings in the literature",
|
|
379
|
+
"input": "There is an ongoing debate about whether scaling language models (more parameters, more data) is sufficient for achieving reasoning capabilities, or whether architectural innovations are needed. Find the 5 most important papers representing both sides of this debate. For each paper, assess the strength of its evidence (methodology, experimental rigor, reproducibility). Provide a balanced synthesis of the current state of the debate.",
|
|
380
|
+
"rubric": [
|
|
381
|
+
{
|
|
382
|
+
"criterion": "Debate Coverage Balance",
|
|
383
|
+
"weight": 0.25,
|
|
384
|
+
"scoring": {
|
|
385
|
+
"5": "Returns papers from both sides: 2-3 pro-scaling papers (e.g., Kaplan et al. scaling laws, GPT-4 technical report, Chinchilla) and 2-3 pro-architecture papers (e.g., chain-of-thought, Mamba, retrieval-augmented approaches); balance is fair",
|
|
386
|
+
"3": "Both sides represented but imbalanced (e.g., 4 papers from one side, 1 from the other)",
|
|
387
|
+
"1": "Only one side of the debate represented",
|
|
388
|
+
"0": "Papers are not about the scaling vs. architecture debate"
|
|
389
|
+
}
|
|
390
|
+
},
|
|
391
|
+
{
|
|
392
|
+
"criterion": "Evidence Quality Assessment",
|
|
393
|
+
"weight": 0.3,
|
|
394
|
+
"scoring": {
|
|
395
|
+
"5": "For each paper: evaluates methodology (controlled experiments vs. observational), identifies baseline comparisons, notes whether claims are well-supported by data, flags potential confounds (e.g., training data contamination, compute budget differences); rates evidence strength",
|
|
396
|
+
"3": "Notes methodology for some papers but assessment is surface-level",
|
|
397
|
+
"1": "No evidence quality assessment; takes paper claims at face value",
|
|
398
|
+
"0": "Mischaracterizes paper evidence"
|
|
399
|
+
}
|
|
400
|
+
},
|
|
401
|
+
{
|
|
402
|
+
"criterion": "Synthesis & Nuance",
|
|
403
|
+
"weight": 0.3,
|
|
404
|
+
"scoring": {
|
|
405
|
+
"5": "Provides nuanced synthesis: acknowledges that both sides have valid evidence, identifies conditions where scaling helps vs. where architecture matters, notes that the debate may be a false dichotomy, discusses implications for future research",
|
|
406
|
+
"3": "Presents both sides but synthesis lacks nuance or makes oversimplified conclusions",
|
|
407
|
+
"1": "Summaries of individual papers without connecting them",
|
|
408
|
+
"0": "Biased or incoherent synthesis"
|
|
409
|
+
}
|
|
410
|
+
},
|
|
411
|
+
{
|
|
412
|
+
"criterion": "Paper Quality & Selection",
|
|
413
|
+
"weight": 0.15,
|
|
414
|
+
"scoring": {
|
|
415
|
+
"5": "Papers are from top venues, highly cited, and represent key milestones in the debate; selection demonstrates deep understanding of the research landscape",
|
|
416
|
+
"3": "Good papers but misses some key contributions to the debate",
|
|
417
|
+
"1": "Mixed quality; includes minor papers while missing landmark ones",
|
|
418
|
+
"0": "Poor paper selection"
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
],
|
|
422
|
+
"expectedScoreWithout": 15,
|
|
423
|
+
"expectedScoreWith": 60
|
|
424
|
+
},
|
|
425
|
+
{
|
|
426
|
+
"id": "bench-hard-03",
|
|
427
|
+
"difficulty": "hard",
|
|
428
|
+
"description": "Cross-lingual and cross-domain academic search with methodology assessment",
|
|
429
|
+
"input": "I need to understand the current state of research on AI-generated text detection. Find 5 key papers covering: (1) detection methods (watermarking, statistical tests, trained classifiers), (2) adversarial attacks that evade detection, and (3) evaluation benchmarks. For each paper, critically assess whether the methodology would generalize to newer models like GPT-4 and Claude. Identify the biggest gaps in the current detection literature.",
|
|
430
|
+
"rubric": [
|
|
431
|
+
{
|
|
432
|
+
"criterion": "Topic Coverage",
|
|
433
|
+
"weight": 0.25,
|
|
434
|
+
"scoring": {
|
|
435
|
+
"5": "Papers cover all 3 requested subtopics: detection methods (watermarking like Kirchenbauer et al., statistical like DetectGPT, classifiers like GPTZero papers), adversarial evasion techniques, and evaluation benchmarks; at least 1 paper per subtopic",
|
|
436
|
+
"3": "Covers 2 of 3 subtopics well; one subtopic underrepresented",
|
|
437
|
+
"1": "Only covers detection methods without adversarial or benchmark perspectives",
|
|
438
|
+
"0": "Off-topic or insufficient coverage"
|
|
439
|
+
}
|
|
440
|
+
},
|
|
441
|
+
{
|
|
442
|
+
"criterion": "Generalizability Assessment",
|
|
443
|
+
"weight": 0.3,
|
|
444
|
+
"scoring": {
|
|
445
|
+
"5": "For each paper, critically evaluates whether findings generalize to newer models: identifies which methods are model-specific (e.g., watermarking requires model access) vs. model-agnostic; discusses known failures on GPT-4-class models; distinguishes white-box from black-box approaches",
|
|
446
|
+
"3": "Mentions generalizability concerns but assessment is not paper-specific",
|
|
447
|
+
"1": "No generalizability analysis; assumes all methods work on any model",
|
|
448
|
+
"0": "Incorrect assessment of generalizability"
|
|
449
|
+
}
|
|
450
|
+
},
|
|
451
|
+
{
|
|
452
|
+
"criterion": "Gap Identification",
|
|
453
|
+
"weight": 0.25,
|
|
454
|
+
"scoring": {
|
|
455
|
+
"5": "Identifies specific research gaps: detection for paraphrased AI text, cross-lingual detection, detection in mixed human-AI text, benchmark standardization, real-world deployment challenges; supported by evidence from the surveyed papers",
|
|
456
|
+
"3": "Identifies 1-2 gaps but analysis is superficial",
|
|
457
|
+
"1": "Generic statement about needing more research without specific gaps",
|
|
458
|
+
"0": "No gap identification"
|
|
459
|
+
}
|
|
460
|
+
},
|
|
461
|
+
{
|
|
462
|
+
"criterion": "Search Quality & Bibliographic Rigor",
|
|
463
|
+
"weight": 0.2,
|
|
464
|
+
"scoring": {
|
|
465
|
+
"5": "Multi-database search with deduplication; all papers are real with complete metadata; includes mix of peer-reviewed and significant preprints; notes publication status",
|
|
466
|
+
"3": "Good papers but search limited to one database; metadata mostly complete",
|
|
467
|
+
"1": "Papers may be fabricated or metadata is incomplete",
|
|
468
|
+
"0": "Poor search quality or fabricated results"
|
|
469
|
+
}
|
|
470
|
+
}
|
|
471
|
+
],
|
|
472
|
+
"expectedScoreWithout": 15,
|
|
473
|
+
"expectedScoreWith": 60
|
|
474
|
+
}
|
|
475
|
+
]
|
|
476
|
+
}
|
package/tests/smoke.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "0.0.1",
|
|
3
|
+
"timeout": 60,
|
|
4
|
+
"tasks": [
|
|
5
|
+
{
|
|
6
|
+
"id": "smoke-01",
|
|
7
|
+
"description": "Find top academic papers on a well-defined AI topic with multi-database search and citation analysis",
|
|
8
|
+
"input": "Find the top 5 most influential recent papers on retrieval-augmented generation (RAG) for reducing hallucination in large language models. I need papers from 2023 or later, with citation counts, publication venues, and a brief synthesis of how the field is evolving. Prioritize peer-reviewed work but include significant preprints.",
|
|
9
|
+
"rubric": [
|
|
10
|
+
{
|
|
11
|
+
"criterion": "Query Construction & Database Coverage",
|
|
12
|
+
"weight": 0.25,
|
|
13
|
+
"scoring": {
|
|
14
|
+
"5": "Constructs database-specific queries for arXiv (using cat:cs.CL, ti:/abs: prefixes), Semantic Scholar (with fieldsOfStudy, year filter, citation fields), and Google Scholar; uses appropriate academic terminology (RAG, hallucination, LLM, faithfulness)",
|
|
15
|
+
"3": "Searches 2 databases with reasonable queries but misses database-specific optimizations (e.g., no category codes on arXiv)",
|
|
16
|
+
"1": "Searches only one database or uses generic web-search-style queries",
|
|
17
|
+
"0": "No academic database search attempted; treats as a general web search"
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"criterion": "Result Relevance & Quality",
|
|
22
|
+
"weight": 0.3,
|
|
23
|
+
"scoring": {
|
|
24
|
+
"5": "All 5 papers directly address RAG for hallucination reduction; includes mix of foundational work and recent advances; papers are from reputable venues (ACL, EMNLP, NeurIPS, ICML, ICLR, or high-citation arXiv preprints)",
|
|
25
|
+
"3": "3-4 papers are directly relevant; 1-2 are tangentially related (e.g., general RAG without hallucination focus, or general hallucination without RAG)",
|
|
26
|
+
"1": "Fewer than 3 relevant papers; includes non-academic sources or off-topic papers",
|
|
27
|
+
"0": "Results are not academic papers or do not address the topic"
|
|
28
|
+
}
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"criterion": "Bibliographic Completeness & Citation Analysis",
|
|
32
|
+
"weight": 0.25,
|
|
33
|
+
"scoring": {
|
|
34
|
+
"5": "Each paper includes: full title, authors, year, venue, publication status (peer-reviewed/preprint), citation count, DOI or arXiv ID, and open-access link; citation velocity or influential citation count noted",
|
|
35
|
+
"3": "Papers have title, authors, and year but missing venue, citation data, or identifiers for some entries",
|
|
36
|
+
"1": "Only titles and URLs provided; no citation analysis",
|
|
37
|
+
"0": "Incomplete or fabricated bibliographic information"
|
|
38
|
+
}
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"criterion": "Synthesis & Research Narrative",
|
|
42
|
+
"weight": 0.2,
|
|
43
|
+
"scoring": {
|
|
44
|
+
"5": "Provides a synthesis connecting the 5 papers: identifies thematic clusters (e.g., retrieval methods, evaluation approaches), notes consensus findings, highlights open questions, and suggests a reading order",
|
|
45
|
+
"3": "Brief summary of each paper but no cross-paper synthesis or thematic grouping",
|
|
46
|
+
"1": "Raw list of papers with abstracts but no synthesis",
|
|
47
|
+
"0": "No synthesis or narrative connecting results"
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
],
|
|
51
|
+
"passThreshold": 60
|
|
52
|
+
}
|
|
53
|
+
]
|
|
54
|
+
}
|