datamaestro-text 2026.2.2__py3-none-any.whl → 2026.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/config/com/github/aagohary/canard.py +27 -24
- datamaestro_text/config/com/github/apple/ml-qrecc.py +30 -25
- datamaestro_text/config/com/github/ikat.py +76 -61
- datamaestro_text/config/com/github/prdwb/orconvqa.py +41 -37
- datamaestro_text/config/com/microsoft/msmarco/passage.py +278 -207
- datamaestro_text/config/com/oscar-corpus.py +13 -10
- datamaestro_text/config/com/sentiment140.py +17 -12
- datamaestro_text/config/com/smashwords/bookcorpus.py +13 -10
- datamaestro_text/config/edu/stanford/aclimdb.py +14 -9
- datamaestro_text/config/edu/stanford/glove.py +66 -31
- datamaestro_text/config/edu/upenn/ldc/aquaint.py +35 -17
- datamaestro_text/config/fr/granddebat.py +57 -48
- datamaestro_text/config/gov/nist/ir/covid.py +61 -50
- datamaestro_text/config/gov/nist/trec/adhoc.py +395 -255
- datamaestro_text/config/gov/nist/trec/tipster.py +170 -64
- datamaestro_text/config/io/github/thunlp/fewrel.py +20 -15
- datamaestro_text/config/io/metamind/research/wikitext.py +50 -32
- datamaestro_text/config/org/grouplens/movielens.py +28 -37
- datamaestro_text/config/org/universaldependencies/french.py +16 -11
- datamaestro_text/test/test_documented.py +2 -2
- datamaestro_text/version.py +2 -2
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/METADATA +2 -2
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/RECORD +26 -26
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/WHEEL +0 -0
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -4,12 +4,12 @@ See [https://trec.nist.gov/data/test_coll.html](https://trec.nist.gov/data/test_
|
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
6
|
from datamaestro.download import reference
|
|
7
|
-
from datamaestro.download.single import
|
|
7
|
+
from datamaestro.download.single import FileDownloader, ConcatDownloader
|
|
8
8
|
from datamaestro.download.links import links
|
|
9
9
|
from datamaestro.stream import TransformList
|
|
10
10
|
from datamaestro.stream.compress import Gunzip
|
|
11
11
|
from datamaestro.stream.lines import Replace, Filter
|
|
12
|
-
from datamaestro.definitions import dataset
|
|
12
|
+
from datamaestro.definitions import Dataset, dataset
|
|
13
13
|
|
|
14
14
|
from datamaestro_text.data.ir.trec import (
|
|
15
15
|
TipsterCollection,
|
|
@@ -19,368 +19,508 @@ from datamaestro_text.data.ir.trec import (
|
|
|
19
19
|
from datamaestro_text.data.ir import Adhoc
|
|
20
20
|
|
|
21
21
|
from . import tipster
|
|
22
|
-
from datamaestro_text.config.edu.upenn.ldc.aquaint import
|
|
22
|
+
from datamaestro_text.config.edu.upenn.ldc.aquaint import Aquaint
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
# --- TREC 1 (1992)
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
@
|
|
29
|
-
|
|
30
|
-
ap88=tipster.ap88.path,
|
|
31
|
-
ap89=tipster.ap89.path,
|
|
32
|
-
fr88=tipster.fr88.path,
|
|
33
|
-
fr89=tipster.fr89.path,
|
|
34
|
-
wsj87=tipster.wsj87.path,
|
|
35
|
-
wsj88=tipster.wsj88.path,
|
|
36
|
-
wsj89=tipster.wsj89.path,
|
|
37
|
-
wsj90=tipster.wsj90.path,
|
|
38
|
-
wsj91=tipster.wsj91.path,
|
|
39
|
-
wsj92=tipster.wsj92.path,
|
|
40
|
-
ziff1=tipster.ziff1.path,
|
|
41
|
-
ziff2=tipster.ziff2.path,
|
|
42
|
-
)
|
|
43
|
-
@dataset(TipsterCollection, id="1.documents")
|
|
44
|
-
def trec1_documents(documents):
|
|
28
|
+
@dataset(id=".1.documents")
|
|
29
|
+
class Trec1Documents(Dataset):
|
|
45
30
|
"""TREC-1 to TREC-3 documents (TIPSTER volumes 1 and 2)"""
|
|
46
|
-
return {"path": documents}
|
|
47
|
-
|
|
48
31
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
32
|
+
DOCUMENTS = links(
|
|
33
|
+
"documents",
|
|
34
|
+
ap88=tipster.Ap88,
|
|
35
|
+
ap89=tipster.Ap89,
|
|
36
|
+
fr88=tipster.Fr88,
|
|
37
|
+
fr89=tipster.Fr89,
|
|
38
|
+
wsj87=tipster.Wsj87,
|
|
39
|
+
wsj88=tipster.Wsj88,
|
|
40
|
+
wsj89=tipster.Wsj89,
|
|
41
|
+
wsj90=tipster.Wsj90,
|
|
42
|
+
wsj91=tipster.Wsj91,
|
|
43
|
+
wsj92=tipster.Wsj92,
|
|
44
|
+
ziff1=tipster.Ziff1,
|
|
45
|
+
ziff2=tipster.Ziff2,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
def config(self) -> TipsterCollection:
|
|
49
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataset(id=".1.topics", url="")
|
|
53
|
+
class Trec1Topics(Dataset):
|
|
54
|
+
FILE = FileDownloader(
|
|
55
|
+
"topics.sgml",
|
|
56
|
+
"http://trec.nist.gov/data/topics_eng/topics.51-100.gz",
|
|
57
|
+
transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
def config(self) -> TrecTopics:
|
|
61
|
+
return TrecTopics.C(path=self.FILE.path, parts=["desc"])
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataset(id=".1.assessments")
|
|
65
|
+
class Trec1Assessments(Dataset):
|
|
66
|
+
FILE = ConcatDownloader(
|
|
67
|
+
"assessments.qrels",
|
|
68
|
+
"http://trec.nist.gov/data/qrels_eng/qrels.51-100.disk1.disk2.parts1-5.tar.gz",
|
|
69
|
+
transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
def config(self) -> TrecAdhocAssessments:
|
|
73
|
+
return TrecAdhocAssessments.C(path=self.FILE.path)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataset(id=".1")
|
|
77
|
+
class Trec1(Dataset):
|
|
78
|
+
"Ad-hoc task of TREC 1 (1992)"
|
|
57
79
|
|
|
80
|
+
DOCUMENTS = reference(varname="documents", reference=Trec1Documents)
|
|
81
|
+
TOPICS = reference(varname="topics", reference=Trec1Topics)
|
|
82
|
+
ASSESSMENTS = reference(varname="assessments", reference=Trec1Assessments)
|
|
58
83
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
)
|
|
64
|
-
|
|
65
|
-
def trec1_assessments(assessments):
|
|
66
|
-
return {"path": assessments}
|
|
84
|
+
def config(self) -> Adhoc:
|
|
85
|
+
return Adhoc.C(
|
|
86
|
+
documents=self.DOCUMENTS.prepare(),
|
|
87
|
+
topics=self.TOPICS.prepare(),
|
|
88
|
+
assessments=self.ASSESSMENTS.prepare(),
|
|
89
|
+
)
|
|
67
90
|
|
|
68
91
|
|
|
69
|
-
|
|
70
|
-
@reference("topics", trec1_topics)
|
|
71
|
-
@reference("assessments", trec1_assessments)
|
|
72
|
-
@dataset(Adhoc, id="1")
|
|
73
|
-
def trec1(documents, topics, assessments):
|
|
74
|
-
"Ad-hoc task of TREC 1 (1992)"
|
|
75
|
-
return {"documents": documents, "topics": topics, "assessments": assessments}
|
|
92
|
+
# --- TREC 2 (1993)
|
|
76
93
|
|
|
77
94
|
|
|
78
|
-
|
|
95
|
+
@dataset(id=".2.topics")
|
|
96
|
+
class Trec2Topics(Dataset):
|
|
97
|
+
FILE = FileDownloader(
|
|
98
|
+
"topics.sgml",
|
|
99
|
+
"http://trec.nist.gov/data/topics_eng/topics.101-150.gz",
|
|
100
|
+
transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
|
|
101
|
+
)
|
|
79
102
|
|
|
103
|
+
def config(self) -> TrecTopics:
|
|
104
|
+
return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
|
|
80
105
|
|
|
81
|
-
@filedownloader(
|
|
82
|
-
"topics.sgml",
|
|
83
|
-
"http://trec.nist.gov/data/topics_eng/topics.101-150.gz",
|
|
84
|
-
transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
|
|
85
|
-
)
|
|
86
|
-
@dataset(TrecTopics, id="2.topics")
|
|
87
|
-
def trec2_topics(topics):
|
|
88
|
-
return {"path": topics, "parts": ["title", "desc"]}
|
|
89
106
|
|
|
107
|
+
@dataset(id=".2.assessments")
|
|
108
|
+
class Trec2Assessments(Dataset):
|
|
109
|
+
FILE = ConcatDownloader(
|
|
110
|
+
"assessments.qrels",
|
|
111
|
+
"http://trec.nist.gov/data/qrels_eng/qrels.101-150.disk1.disk2.parts1-5.tar.gz",
|
|
112
|
+
transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
|
|
113
|
+
)
|
|
90
114
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
"http://trec.nist.gov/data/qrels_eng/qrels.101-150.disk1.disk2.parts1-5.tar.gz",
|
|
94
|
-
transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
|
|
95
|
-
)
|
|
96
|
-
@dataset(TrecAdhocAssessments, id="2.assessments")
|
|
97
|
-
def trec2_assessments(assessments):
|
|
98
|
-
return {"path": assessments}
|
|
115
|
+
def config(self) -> TrecAdhocAssessments:
|
|
116
|
+
return TrecAdhocAssessments.C(path=self.FILE.path)
|
|
99
117
|
|
|
100
118
|
|
|
101
|
-
@
|
|
102
|
-
|
|
103
|
-
@reference("assessments", trec2_assessments)
|
|
104
|
-
@dataset(Adhoc, id="2")
|
|
105
|
-
def trec2(documents, topics, assessments):
|
|
119
|
+
@dataset(id=".2")
|
|
120
|
+
class Trec2(Dataset):
|
|
106
121
|
"Ad-hoc task of TREC 2 (1993)"
|
|
107
|
-
|
|
122
|
+
|
|
123
|
+
DOCUMENTS = reference(varname="documents", reference=Trec1Documents)
|
|
124
|
+
TOPICS = reference(varname="topics", reference=Trec2Topics)
|
|
125
|
+
ASSESSMENTS = reference(varname="assessments", reference=Trec2Assessments)
|
|
126
|
+
|
|
127
|
+
def config(self) -> Adhoc:
|
|
128
|
+
return Adhoc.C(
|
|
129
|
+
documents=self.DOCUMENTS.prepare(),
|
|
130
|
+
topics=self.TOPICS.prepare(),
|
|
131
|
+
assessments=self.ASSESSMENTS.prepare(),
|
|
132
|
+
)
|
|
108
133
|
|
|
109
134
|
|
|
110
135
|
# --- TREC 3 (1994)
|
|
111
136
|
|
|
112
137
|
|
|
113
|
-
@
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
138
|
+
@dataset(id=".3.topics")
|
|
139
|
+
class Trec3Topics(Dataset):
|
|
140
|
+
FILE = FileDownloader(
|
|
141
|
+
"topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.151-200.gz"
|
|
142
|
+
)
|
|
117
143
|
|
|
144
|
+
def config(self) -> TrecTopics:
|
|
145
|
+
return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
|
|
118
146
|
|
|
119
|
-
@concatdownload(
|
|
120
|
-
"assessments.qrels",
|
|
121
|
-
"http://trec.nist.gov/data/qrels_eng/qrels.151-200.201-250.disks1-3.all.tar.gz",
|
|
122
|
-
transforms=TransformList(Gunzip(), Filter(r"^(1\d\d|200)\s")),
|
|
123
|
-
)
|
|
124
|
-
@dataset(TrecAdhocAssessments, id="3.assessments")
|
|
125
|
-
def trec3_assessments(assessments):
|
|
126
|
-
return {"path": assessments}
|
|
127
147
|
|
|
148
|
+
@dataset(id=".3.assessments")
|
|
149
|
+
class Trec3Assessments(Dataset):
|
|
150
|
+
FILE = ConcatDownloader(
|
|
151
|
+
"assessments.qrels",
|
|
152
|
+
"http://trec.nist.gov/data/qrels_eng/qrels.151-200.201-250.disks1-3.all.tar.gz",
|
|
153
|
+
transforms=TransformList(Gunzip(), Filter(r"^(1\d\d|200)\s")),
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
def config(self) -> TrecAdhocAssessments:
|
|
157
|
+
return TrecAdhocAssessments.C(path=self.FILE.path)
|
|
128
158
|
|
|
129
|
-
|
|
130
|
-
@
|
|
131
|
-
|
|
132
|
-
@dataset(Adhoc, id="3")
|
|
133
|
-
def trec3(documents, topics, assessments):
|
|
159
|
+
|
|
160
|
+
@dataset(id=".3")
|
|
161
|
+
class Trec3(Dataset):
|
|
134
162
|
"Ad-hoc task of TREC 3 (1994)"
|
|
135
|
-
|
|
163
|
+
|
|
164
|
+
DOCUMENTS = reference(varname="documents", reference=Trec1Documents)
|
|
165
|
+
TOPICS = reference(varname="topics", reference=Trec3Topics)
|
|
166
|
+
ASSESSMENTS = reference(varname="assessments", reference=Trec3Assessments)
|
|
167
|
+
|
|
168
|
+
def config(self) -> Adhoc:
|
|
169
|
+
return Adhoc.C(
|
|
170
|
+
documents=self.DOCUMENTS.prepare(),
|
|
171
|
+
topics=self.TOPICS.prepare(),
|
|
172
|
+
assessments=self.ASSESSMENTS.prepare(),
|
|
173
|
+
)
|
|
136
174
|
|
|
137
175
|
|
|
138
176
|
# --- TREC 4 (1995)
|
|
139
177
|
|
|
140
178
|
|
|
141
|
-
@
|
|
142
|
-
|
|
143
|
-
ap88=tipster.ap88.path,
|
|
144
|
-
ap89=tipster.ap89.path,
|
|
145
|
-
ap90=tipster.ap90.path,
|
|
146
|
-
fr88=tipster.fr88.path,
|
|
147
|
-
sjm1=tipster.sjm1.path,
|
|
148
|
-
wsj90=tipster.wsj90.path,
|
|
149
|
-
wsj91=tipster.wsj91.path,
|
|
150
|
-
wsj92=tipster.wsj92.path,
|
|
151
|
-
ziff2=tipster.ziff2.path,
|
|
152
|
-
ziff3=tipster.ziff3.path,
|
|
153
|
-
)
|
|
154
|
-
@dataset(TipsterCollection, id="4.documents")
|
|
155
|
-
def trec4_documents(documents):
|
|
179
|
+
@dataset(id=".4.documents")
|
|
180
|
+
class Trec4Documents(Dataset):
|
|
156
181
|
"""TREC-4 documents"""
|
|
157
|
-
return {"path": documents}
|
|
158
182
|
|
|
183
|
+
DOCUMENTS = links(
|
|
184
|
+
"documents",
|
|
185
|
+
ap88=tipster.Ap88,
|
|
186
|
+
ap89=tipster.Ap89,
|
|
187
|
+
ap90=tipster.Ap90,
|
|
188
|
+
fr88=tipster.Fr88,
|
|
189
|
+
sjm1=tipster.Sjm1,
|
|
190
|
+
wsj90=tipster.Wsj90,
|
|
191
|
+
wsj91=tipster.Wsj91,
|
|
192
|
+
wsj92=tipster.Wsj92,
|
|
193
|
+
ziff2=tipster.Ziff2,
|
|
194
|
+
ziff3=tipster.Ziff3,
|
|
195
|
+
)
|
|
159
196
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
def trec4_topics(topics):
|
|
163
|
-
return {"path": topics, "parts": ["title", "desc"]}
|
|
197
|
+
def config(self) -> TipsterCollection:
|
|
198
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
164
199
|
|
|
165
200
|
|
|
166
|
-
@
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
201
|
+
@dataset(id=".4.topics")
|
|
202
|
+
class Trec4Topics(Dataset):
|
|
203
|
+
FILE = FileDownloader(
|
|
204
|
+
"topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.201-250.gz"
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
def config(self) -> TrecTopics:
|
|
208
|
+
return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
|
|
173
209
|
|
|
174
210
|
|
|
175
|
-
@
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
211
|
+
@dataset(id=".4.assessments")
|
|
212
|
+
class Trec4Assessments(Dataset):
|
|
213
|
+
FILE = ConcatDownloader(
|
|
214
|
+
"assessments.qrels",
|
|
215
|
+
"http://trec.nist.gov/data/qrels_eng/qrels.201-250.disk2.disk3.parts1-5.tar.gz",
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
def config(self) -> TrecAdhocAssessments:
|
|
219
|
+
return TrecAdhocAssessments.C(path=self.FILE.path)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
@dataset(id=".4")
|
|
223
|
+
class Trec4(Dataset):
|
|
180
224
|
"Ad-hoc task of TREC 4 (1995)"
|
|
181
|
-
|
|
225
|
+
|
|
226
|
+
DOCUMENTS = reference(varname="documents", reference=Trec4Documents)
|
|
227
|
+
TOPICS = reference(varname="topics", reference=Trec4Topics)
|
|
228
|
+
ASSESSMENTS = reference(varname="assessments", reference=Trec4Assessments)
|
|
229
|
+
|
|
230
|
+
def config(self) -> Adhoc:
|
|
231
|
+
return Adhoc.C(
|
|
232
|
+
documents=self.DOCUMENTS.prepare(),
|
|
233
|
+
topics=self.TOPICS.prepare(),
|
|
234
|
+
assessments=self.ASSESSMENTS.prepare(),
|
|
235
|
+
)
|
|
182
236
|
|
|
183
237
|
|
|
184
238
|
# --- TREC 5 (1996)
|
|
185
239
|
|
|
186
240
|
|
|
187
|
-
@
|
|
188
|
-
|
|
189
|
-
ap88=tipster.ap88.path,
|
|
190
|
-
cr1=tipster.cr1.path,
|
|
191
|
-
fr88=tipster.fr88.path,
|
|
192
|
-
fr94=tipster.fr94.path,
|
|
193
|
-
ft1=tipster.ft1.path,
|
|
194
|
-
wsj90=tipster.wsj90.path,
|
|
195
|
-
wsj91=tipster.wsj91.path,
|
|
196
|
-
wsj9=tipster.wsj92.path,
|
|
197
|
-
ziff2=tipster.ziff2.path,
|
|
198
|
-
)
|
|
199
|
-
@dataset(TipsterCollection, id="5.documents")
|
|
200
|
-
def trec5_documents(documents):
|
|
241
|
+
@dataset(id=".5.documents")
|
|
242
|
+
class Trec5Documents(Dataset):
|
|
201
243
|
"""TREC-5 documents"""
|
|
202
|
-
return {"path": documents}
|
|
203
244
|
|
|
245
|
+
DOCUMENTS = links(
|
|
246
|
+
"documents",
|
|
247
|
+
ap88=tipster.Ap88,
|
|
248
|
+
cr1=tipster.Cr1,
|
|
249
|
+
fr88=tipster.Fr88,
|
|
250
|
+
fr94=tipster.Fr94,
|
|
251
|
+
ft1=tipster.Ft1,
|
|
252
|
+
wsj90=tipster.Wsj90,
|
|
253
|
+
wsj91=tipster.Wsj91,
|
|
254
|
+
wsj9=tipster.Wsj92,
|
|
255
|
+
ziff2=tipster.Ziff2,
|
|
256
|
+
)
|
|
204
257
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
def trec5_topics(topics):
|
|
208
|
-
return {"path": topics, "parts": ["title", "desc"]}
|
|
258
|
+
def config(self) -> TipsterCollection:
|
|
259
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
209
260
|
|
|
210
261
|
|
|
211
|
-
@
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
262
|
+
@dataset(id=".5.topics")
|
|
263
|
+
class Trec5Topics(Dataset):
|
|
264
|
+
FILE = FileDownloader(
|
|
265
|
+
"topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.251-300.gz"
|
|
266
|
+
)
|
|
267
|
+
|
|
268
|
+
def config(self) -> TrecTopics:
|
|
269
|
+
return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
|
|
218
270
|
|
|
219
271
|
|
|
220
|
-
@
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
272
|
+
@dataset(id=".5.qrels")
|
|
273
|
+
class Trec5Assessments(Dataset):
|
|
274
|
+
FILE = ConcatDownloader(
|
|
275
|
+
"assessments.qrels",
|
|
276
|
+
url="http://trec.nist.gov/data/qrels_eng/qrels.251-300.parts1-5.tar.gz",
|
|
277
|
+
)
|
|
278
|
+
|
|
279
|
+
def config(self) -> TrecAdhocAssessments:
|
|
280
|
+
return TrecAdhocAssessments.C(path=self.FILE.path)
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
@dataset(id=".5")
|
|
284
|
+
class Trec5(Dataset):
|
|
225
285
|
"Ad-hoc task of TREC 5 (1996)"
|
|
226
|
-
|
|
286
|
+
|
|
287
|
+
DOCUMENTS = reference(varname="documents", reference=Trec5Documents)
|
|
288
|
+
TOPICS = reference(varname="topics", reference=Trec5Topics)
|
|
289
|
+
ASSESSMENTS = reference(varname="assessments", reference=Trec5Assessments)
|
|
290
|
+
|
|
291
|
+
def config(self) -> Adhoc:
|
|
292
|
+
return Adhoc.C(
|
|
293
|
+
documents=self.DOCUMENTS.prepare(),
|
|
294
|
+
topics=self.TOPICS.prepare(),
|
|
295
|
+
assessments=self.ASSESSMENTS.prepare(),
|
|
296
|
+
)
|
|
227
297
|
|
|
228
298
|
|
|
229
299
|
# -- TREC 6 (1997)
|
|
230
300
|
|
|
231
301
|
|
|
232
|
-
@
|
|
233
|
-
|
|
234
|
-
cr1=tipster.cr1.path,
|
|
235
|
-
fbis1=tipster.fbis1.path,
|
|
236
|
-
fr94=tipster.fr94.path,
|
|
237
|
-
ft1=tipster.ft1.path,
|
|
238
|
-
la8990=tipster.la8990.path,
|
|
239
|
-
)
|
|
240
|
-
@dataset(TipsterCollection, id="6.documents")
|
|
241
|
-
def trec6_documents(documents):
|
|
302
|
+
@dataset(id=".6.documents")
|
|
303
|
+
class Trec6Documents(Dataset):
|
|
242
304
|
"""TREC-6 documents"""
|
|
243
|
-
return {"path": documents}
|
|
244
305
|
|
|
306
|
+
DOCUMENTS = links(
|
|
307
|
+
"documents",
|
|
308
|
+
cr1=tipster.Cr1,
|
|
309
|
+
fbis1=tipster.Fbis1,
|
|
310
|
+
fr94=tipster.Fr94,
|
|
311
|
+
ft1=tipster.Ft1,
|
|
312
|
+
la8990=tipster.La8990,
|
|
313
|
+
)
|
|
245
314
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
def trec6_topics(topics):
|
|
249
|
-
return {"path": topics, "parts": ["title", "desc"]}
|
|
315
|
+
def config(self) -> TipsterCollection:
|
|
316
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
250
317
|
|
|
251
318
|
|
|
252
|
-
@
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
319
|
+
@dataset(id=".6.topics")
|
|
320
|
+
class Trec6Topics(Dataset):
|
|
321
|
+
FILE = FileDownloader(
|
|
322
|
+
"topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.301-350.gz"
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
def config(self) -> TrecTopics:
|
|
326
|
+
return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
|
|
259
327
|
|
|
260
328
|
|
|
261
|
-
@
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
329
|
+
@dataset(id=".6.qrels")
|
|
330
|
+
class Trec6Assessments(Dataset):
|
|
331
|
+
FILE = ConcatDownloader(
|
|
332
|
+
"assessments.qrels",
|
|
333
|
+
url="http://trec.nist.gov/data/qrels_eng/qrels.trec6.adhoc.parts1-5.tar.gz",
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
def config(self) -> TrecAdhocAssessments:
|
|
337
|
+
return TrecAdhocAssessments.C(path=self.FILE.path)
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
@dataset(id=".6")
|
|
341
|
+
class Trec6(Dataset):
|
|
266
342
|
"Ad-hoc task of TREC 6 (1997)"
|
|
267
|
-
|
|
343
|
+
|
|
344
|
+
DOCUMENTS = reference(varname="documents", reference=Trec6Documents)
|
|
345
|
+
TOPICS = reference(varname="topics", reference=Trec6Topics)
|
|
346
|
+
ASSESSMENTS = reference(varname="assessments", reference=Trec6Assessments)
|
|
347
|
+
|
|
348
|
+
def config(self) -> Adhoc:
|
|
349
|
+
return Adhoc.C(
|
|
350
|
+
documents=self.DOCUMENTS.prepare(),
|
|
351
|
+
topics=self.TOPICS.prepare(),
|
|
352
|
+
assessments=self.ASSESSMENTS.prepare(),
|
|
353
|
+
)
|
|
268
354
|
|
|
269
355
|
|
|
270
356
|
# --- TREC 7 (1998)
|
|
271
357
|
|
|
272
358
|
|
|
273
|
-
@
|
|
274
|
-
|
|
275
|
-
fbis1=tipster.fbis1.path,
|
|
276
|
-
fr94=tipster.fr94.path,
|
|
277
|
-
ft1=tipster.ft1.path,
|
|
278
|
-
la8990=tipster.la8990.path,
|
|
279
|
-
)
|
|
280
|
-
@dataset(TipsterCollection, id="7.documents")
|
|
281
|
-
def trec7_documents(documents):
|
|
359
|
+
@dataset(id=".7.documents")
|
|
360
|
+
class Trec7Documents(Dataset):
|
|
282
361
|
"""TREC-7 documents"""
|
|
283
|
-
return {"path": documents}
|
|
284
362
|
|
|
363
|
+
DOCUMENTS = links(
|
|
364
|
+
"documents",
|
|
365
|
+
fbis1=tipster.Fbis1,
|
|
366
|
+
fr94=tipster.Fr94,
|
|
367
|
+
ft1=tipster.Ft1,
|
|
368
|
+
la8990=tipster.La8990,
|
|
369
|
+
)
|
|
285
370
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
def trec7_topics(topics):
|
|
289
|
-
return {"path": topics, "parts": ["title", "desc"]}
|
|
371
|
+
def config(self) -> TipsterCollection:
|
|
372
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
290
373
|
|
|
291
374
|
|
|
292
|
-
@
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
375
|
+
@dataset(id=".7.topics")
|
|
376
|
+
class Trec7Topics(Dataset):
|
|
377
|
+
FILE = FileDownloader(
|
|
378
|
+
"topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.351-400.gz"
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
def config(self) -> TrecTopics:
|
|
382
|
+
return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
|
|
299
383
|
|
|
300
384
|
|
|
301
|
-
@
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
385
|
+
@dataset(id=".7.qrels")
|
|
386
|
+
class Trec7Assessments(Dataset):
|
|
387
|
+
FILE = ConcatDownloader(
|
|
388
|
+
"assessments.qrels",
|
|
389
|
+
url="http://trec.nist.gov/data/qrels_eng/qrels.trec7.adhoc.parts1-5.tar.gz",
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
def config(self) -> TrecAdhocAssessments:
|
|
393
|
+
return TrecAdhocAssessments.C(path=self.FILE.path)
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
@dataset(id=".7")
|
|
397
|
+
class Trec7(Dataset):
|
|
306
398
|
"Ad-hoc task of TREC 7 (1998)"
|
|
307
|
-
|
|
399
|
+
|
|
400
|
+
DOCUMENTS = reference(varname="documents", reference=Trec7Documents)
|
|
401
|
+
TOPICS = reference(varname="topics", reference=Trec7Topics)
|
|
402
|
+
ASSESSMENTS = reference(varname="assessments", reference=Trec7Assessments)
|
|
403
|
+
|
|
404
|
+
def config(self) -> Adhoc:
|
|
405
|
+
return Adhoc.C(
|
|
406
|
+
documents=self.DOCUMENTS.prepare(),
|
|
407
|
+
topics=self.TOPICS.prepare(),
|
|
408
|
+
assessments=self.ASSESSMENTS.prepare(),
|
|
409
|
+
)
|
|
308
410
|
|
|
309
411
|
|
|
310
412
|
# --- TREC 8 (1999)
|
|
311
413
|
|
|
312
414
|
|
|
313
|
-
@
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
415
|
+
@dataset(id=".8.topics")
|
|
416
|
+
class Trec8Topics(Dataset):
|
|
417
|
+
FILE = FileDownloader(
|
|
418
|
+
"topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.401-450.gz"
|
|
419
|
+
)
|
|
317
420
|
|
|
421
|
+
def config(self) -> TrecTopics:
|
|
422
|
+
return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
|
|
318
423
|
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
424
|
+
|
|
425
|
+
@dataset(id=".8.qrels")
|
|
426
|
+
class Trec8Assessments(Dataset):
|
|
427
|
+
FILE = ConcatDownloader(
|
|
428
|
+
"assessments.qrels",
|
|
429
|
+
url="https://trec.nist.gov/data/qrels_eng/qrels.trec8.adhoc.parts1-5.tar.gz",
|
|
430
|
+
)
|
|
431
|
+
|
|
432
|
+
def config(self) -> TrecAdhocAssessments:
|
|
433
|
+
return TrecAdhocAssessments.C(path=self.FILE.path)
|
|
326
434
|
|
|
327
435
|
|
|
328
|
-
@
|
|
329
|
-
|
|
330
|
-
@reference("assessments", trec8_assessments)
|
|
331
|
-
@dataset(Adhoc, id="8")
|
|
332
|
-
def trec8(documents, topics, assessments):
|
|
436
|
+
@dataset(id=".8")
|
|
437
|
+
class Trec8(Dataset):
|
|
333
438
|
"Ad-hoc task of TREC 8 (1999)"
|
|
334
|
-
|
|
439
|
+
|
|
440
|
+
DOCUMENTS = reference(varname="documents", reference=Trec7Documents)
|
|
441
|
+
TOPICS = reference(varname="topics", reference=Trec8Topics)
|
|
442
|
+
ASSESSMENTS = reference(varname="assessments", reference=Trec8Assessments)
|
|
443
|
+
|
|
444
|
+
def config(self) -> Adhoc:
|
|
445
|
+
return Adhoc.C(
|
|
446
|
+
documents=self.DOCUMENTS.prepare(),
|
|
447
|
+
topics=self.TOPICS.prepare(),
|
|
448
|
+
assessments=self.ASSESSMENTS.prepare(),
|
|
449
|
+
)
|
|
335
450
|
|
|
336
451
|
|
|
337
452
|
# --- TREC Robust (2004)
|
|
338
453
|
|
|
339
454
|
|
|
340
|
-
@
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
return {"path": topics, "parts": ["title", "desc"]}
|
|
455
|
+
@dataset(id=".robust.2004.topics")
|
|
456
|
+
class Robust2004Topics(Dataset):
|
|
457
|
+
FILE = FileDownloader("topics", "http://trec.nist.gov/data/robust/04.testset.gz")
|
|
344
458
|
|
|
459
|
+
def config(self) -> TrecTopics:
|
|
460
|
+
return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
|
|
345
461
|
|
|
346
|
-
@filedownloader(
|
|
347
|
-
"assessments.qrels", "http://trec.nist.gov/data/robust/qrels.robust2004.txt"
|
|
348
|
-
)
|
|
349
|
-
@dataset(TrecAdhocAssessments, id="robust.2004.qrels")
|
|
350
|
-
def robust2004_assessments(assessments):
|
|
351
|
-
return {"path": assessments}
|
|
352
462
|
|
|
463
|
+
@dataset(id=".robust.2004.qrels")
|
|
464
|
+
class Robust2004Assessments(Dataset):
|
|
465
|
+
FILE = FileDownloader(
|
|
466
|
+
"assessments.qrels", "http://trec.nist.gov/data/robust/qrels.robust2004.txt"
|
|
467
|
+
)
|
|
468
|
+
|
|
469
|
+
def config(self) -> TrecAdhocAssessments:
|
|
470
|
+
return TrecAdhocAssessments.C(path=self.FILE.path)
|
|
353
471
|
|
|
354
|
-
|
|
355
|
-
@
|
|
356
|
-
|
|
357
|
-
@dataset(Adhoc, id="robust.2004")
|
|
358
|
-
def robust2004(documents, topics, assessments):
|
|
472
|
+
|
|
473
|
+
@dataset(id=".robust.2004")
|
|
474
|
+
class Robust2004(Dataset):
|
|
359
475
|
"Ad-hoc task of TREC Robust (2004)"
|
|
360
|
-
|
|
476
|
+
|
|
477
|
+
DOCUMENTS = reference(varname="documents", reference=Trec7Documents)
|
|
478
|
+
TOPICS = reference(varname="topics", reference=Robust2004Topics)
|
|
479
|
+
ASSESSMENTS = reference(varname="assessments", reference=Robust2004Assessments)
|
|
480
|
+
|
|
481
|
+
def config(self) -> Adhoc:
|
|
482
|
+
return Adhoc.C(
|
|
483
|
+
documents=self.DOCUMENTS.prepare(),
|
|
484
|
+
topics=self.TOPICS.prepare(),
|
|
485
|
+
assessments=self.ASSESSMENTS.prepare(),
|
|
486
|
+
)
|
|
361
487
|
|
|
362
488
|
|
|
363
489
|
# --- TREC Robust (2005)
|
|
364
490
|
|
|
365
491
|
|
|
366
|
-
@
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
492
|
+
@dataset(id=".robust.2005.topics")
|
|
493
|
+
class Robust2005Topics(Dataset):
|
|
494
|
+
FILE = FileDownloader(
|
|
495
|
+
"topics", "http://trec.nist.gov/data/robust/05/05.50.topics.txt"
|
|
496
|
+
)
|
|
370
497
|
|
|
498
|
+
def config(self) -> TrecTopics:
|
|
499
|
+
return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
|
|
371
500
|
|
|
372
|
-
@filedownloader(
|
|
373
|
-
"assessments.qrels", url="http://trec.nist.gov/data/robust/05/TREC2005.qrels.txt"
|
|
374
|
-
)
|
|
375
|
-
@dataset(TrecAdhocAssessments, id="robust.2005.qrels")
|
|
376
|
-
def robust2005_assessments(assessments):
|
|
377
|
-
return {"path": assessments}
|
|
378
501
|
|
|
502
|
+
@dataset(id=".robust.2005.qrels")
|
|
503
|
+
class Robust2005Assessments(Dataset):
|
|
504
|
+
FILE = FileDownloader(
|
|
505
|
+
"assessments.qrels",
|
|
506
|
+
url="http://trec.nist.gov/data/robust/05/TREC2005.qrels.txt",
|
|
507
|
+
)
|
|
379
508
|
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
509
|
+
def config(self) -> TrecAdhocAssessments:
|
|
510
|
+
return TrecAdhocAssessments.C(path=self.FILE.path)
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
@dataset(id=".robust.2005")
|
|
514
|
+
class Robust2005(Dataset):
|
|
385
515
|
"Ad-hoc task of TREC Robust (2005)"
|
|
386
|
-
|
|
516
|
+
|
|
517
|
+
DOCUMENTS = reference(varname="documents", reference=Aquaint)
|
|
518
|
+
TOPICS = reference(varname="topics", reference=Robust2005Topics)
|
|
519
|
+
ASSESSMENTS = reference(varname="assessments", reference=Robust2005Assessments)
|
|
520
|
+
|
|
521
|
+
def config(self) -> Adhoc:
|
|
522
|
+
return Adhoc.C(
|
|
523
|
+
documents=self.DOCUMENTS.prepare(),
|
|
524
|
+
topics=self.TOPICS.prepare(),
|
|
525
|
+
assessments=self.ASSESSMENTS.prepare(),
|
|
526
|
+
)
|