datamaestro-text 2026.1.1__py3-none-any.whl → 2026.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. datamaestro_text/__init__.py +1 -1
  2. datamaestro_text/config/com/github/aagohary/canard.py +27 -24
  3. datamaestro_text/config/com/github/apple/ml-qrecc.py +30 -25
  4. datamaestro_text/config/com/github/ikat.py +76 -62
  5. datamaestro_text/config/com/github/prdwb/orconvqa.py +41 -37
  6. datamaestro_text/config/com/microsoft/msmarco/passage.py +278 -207
  7. datamaestro_text/config/com/oscar-corpus.py +13 -10
  8. datamaestro_text/config/com/sentiment140.py +17 -12
  9. datamaestro_text/config/com/smashwords/bookcorpus.py +13 -10
  10. datamaestro_text/config/edu/stanford/aclimdb.py +14 -9
  11. datamaestro_text/config/edu/stanford/glove.py +66 -32
  12. datamaestro_text/config/edu/upenn/ldc/aquaint.py +35 -17
  13. datamaestro_text/config/fr/granddebat.py +57 -48
  14. datamaestro_text/config/gov/nist/ir/covid.py +62 -52
  15. datamaestro_text/config/gov/nist/trec/adhoc.py +395 -255
  16. datamaestro_text/config/gov/nist/trec/tipster.py +170 -64
  17. datamaestro_text/config/io/github/thunlp/fewrel.py +20 -15
  18. datamaestro_text/config/io/metamind/research/wikitext.py +51 -33
  19. datamaestro_text/config/org/grouplens/movielens.py +28 -37
  20. datamaestro_text/config/org/universaldependencies/french.py +16 -11
  21. datamaestro_text/data/conversation/__init__.py +6 -6
  22. datamaestro_text/data/conversation/base.py +2 -2
  23. datamaestro_text/data/conversation/canard.py +3 -4
  24. datamaestro_text/data/conversation/ikat.py +0 -1
  25. datamaestro_text/data/conversation/orconvqa.py +3 -3
  26. datamaestro_text/data/embeddings.py +1 -0
  27. datamaestro_text/data/ir/__init__.py +1 -1
  28. datamaestro_text/data/ir/base.py +1 -1
  29. datamaestro_text/data/ir/data.py +1 -1
  30. datamaestro_text/data/ir/formats.py +2 -1
  31. datamaestro_text/data/ir/stores.py +1 -1
  32. datamaestro_text/data/text.py +1 -0
  33. datamaestro_text/datasets/__init__.py +1 -0
  34. datamaestro_text/datasets/irds/data.py +1 -6
  35. datamaestro_text/download/tmdb.py +0 -1
  36. datamaestro_text/test/test_documented.py +2 -2
  37. datamaestro_text/transforms/ir/__init__.py +12 -13
  38. datamaestro_text/utils/shuffle.py +1 -1
  39. datamaestro_text/version.py +2 -2
  40. {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/METADATA +2 -8
  41. {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/RECORD +44 -43
  42. {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/WHEEL +0 -0
  43. {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/entry_points.txt +0 -0
  44. {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -4,12 +4,12 @@ See [https://trec.nist.gov/data/test_coll.html](https://trec.nist.gov/data/test_
4
4
  """
5
5
 
6
6
  from datamaestro.download import reference
7
- from datamaestro.download.single import filedownloader, concatdownload
7
+ from datamaestro.download.single import FileDownloader, ConcatDownloader
8
8
  from datamaestro.download.links import links
9
9
  from datamaestro.stream import TransformList
10
10
  from datamaestro.stream.compress import Gunzip
11
11
  from datamaestro.stream.lines import Replace, Filter
12
- from datamaestro.definitions import dataset
12
+ from datamaestro.definitions import Dataset, dataset
13
13
 
14
14
  from datamaestro_text.data.ir.trec import (
15
15
  TipsterCollection,
@@ -19,368 +19,508 @@ from datamaestro_text.data.ir.trec import (
19
19
  from datamaestro_text.data.ir import Adhoc
20
20
 
21
21
  from . import tipster
22
- from datamaestro_text.config.edu.upenn.ldc.aquaint import aquaint
22
+ from datamaestro_text.config.edu.upenn.ldc.aquaint import Aquaint
23
23
 
24
24
 
25
25
  # --- TREC 1 (1992)
26
26
 
27
27
 
28
- @links(
29
- "documents",
30
- ap88=tipster.ap88.path,
31
- ap89=tipster.ap89.path,
32
- fr88=tipster.fr88.path,
33
- fr89=tipster.fr89.path,
34
- wsj87=tipster.wsj87.path,
35
- wsj88=tipster.wsj88.path,
36
- wsj89=tipster.wsj89.path,
37
- wsj90=tipster.wsj90.path,
38
- wsj91=tipster.wsj91.path,
39
- wsj92=tipster.wsj92.path,
40
- ziff1=tipster.ziff1.path,
41
- ziff2=tipster.ziff2.path,
42
- )
43
- @dataset(TipsterCollection, id="1.documents")
44
- def trec1_documents(documents):
28
+ @dataset(id=".1.documents")
29
+ class Trec1Documents(Dataset):
45
30
  """TREC-1 to TREC-3 documents (TIPSTER volumes 1 and 2)"""
46
- return {"path": documents}
47
-
48
31
 
49
- @filedownloader(
50
- "topics.sgml",
51
- "http://trec.nist.gov/data/topics_eng/topics.51-100.gz",
52
- transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
53
- )
54
- @dataset(TrecTopics, id="1.topics", url="")
55
- def trec1_topics(topics):
56
- return {"path": topics, "parts": ["desc"]}
32
+ DOCUMENTS = links(
33
+ "documents",
34
+ ap88=tipster.Ap88,
35
+ ap89=tipster.Ap89,
36
+ fr88=tipster.Fr88,
37
+ fr89=tipster.Fr89,
38
+ wsj87=tipster.Wsj87,
39
+ wsj88=tipster.Wsj88,
40
+ wsj89=tipster.Wsj89,
41
+ wsj90=tipster.Wsj90,
42
+ wsj91=tipster.Wsj91,
43
+ wsj92=tipster.Wsj92,
44
+ ziff1=tipster.Ziff1,
45
+ ziff2=tipster.Ziff2,
46
+ )
47
+
48
+ def config(self) -> TipsterCollection:
49
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
50
+
51
+
52
+ @dataset(id=".1.topics", url="")
53
+ class Trec1Topics(Dataset):
54
+ FILE = FileDownloader(
55
+ "topics.sgml",
56
+ "http://trec.nist.gov/data/topics_eng/topics.51-100.gz",
57
+ transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
58
+ )
59
+
60
+ def config(self) -> TrecTopics:
61
+ return TrecTopics.C(path=self.FILE.path, parts=["desc"])
62
+
63
+
64
+ @dataset(id=".1.assessments")
65
+ class Trec1Assessments(Dataset):
66
+ FILE = ConcatDownloader(
67
+ "assessments.qrels",
68
+ "http://trec.nist.gov/data/qrels_eng/qrels.51-100.disk1.disk2.parts1-5.tar.gz",
69
+ transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
70
+ )
71
+
72
+ def config(self) -> TrecAdhocAssessments:
73
+ return TrecAdhocAssessments.C(path=self.FILE.path)
74
+
75
+
76
+ @dataset(id=".1")
77
+ class Trec1(Dataset):
78
+ "Ad-hoc task of TREC 1 (1992)"
57
79
 
80
+ DOCUMENTS = reference(varname="documents", reference=Trec1Documents)
81
+ TOPICS = reference(varname="topics", reference=Trec1Topics)
82
+ ASSESSMENTS = reference(varname="assessments", reference=Trec1Assessments)
58
83
 
59
- @concatdownload(
60
- "assessments.qrels",
61
- "http://trec.nist.gov/data/qrels_eng/qrels.51-100.disk1.disk2.parts1-5.tar.gz",
62
- transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
63
- )
64
- @dataset(TrecAdhocAssessments, id="1.assessments")
65
- def trec1_assessments(assessments):
66
- return {"path": assessments}
84
+ def config(self) -> Adhoc:
85
+ return Adhoc.C(
86
+ documents=self.DOCUMENTS.prepare(),
87
+ topics=self.TOPICS.prepare(),
88
+ assessments=self.ASSESSMENTS.prepare(),
89
+ )
67
90
 
68
91
 
69
- @reference("documents", trec1_documents)
70
- @reference("topics", trec1_topics)
71
- @reference("assessments", trec1_assessments)
72
- @dataset(Adhoc, id="1")
73
- def trec1(documents, topics, assessments):
74
- "Ad-hoc task of TREC 1 (1992)"
75
- return {"documents": documents, "topics": topics, "assessments": assessments}
92
+ # --- TREC 2 (1993)
76
93
 
77
94
 
78
- # --- TREC 2 (1993)
95
+ @dataset(id=".2.topics")
96
+ class Trec2Topics(Dataset):
97
+ FILE = FileDownloader(
98
+ "topics.sgml",
99
+ "http://trec.nist.gov/data/topics_eng/topics.101-150.gz",
100
+ transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
101
+ )
79
102
 
103
+ def config(self) -> TrecTopics:
104
+ return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
80
105
 
81
- @filedownloader(
82
- "topics.sgml",
83
- "http://trec.nist.gov/data/topics_eng/topics.101-150.gz",
84
- transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
85
- )
86
- @dataset(TrecTopics, id="2.topics")
87
- def trec2_topics(topics):
88
- return {"path": topics, "parts": ["title", "desc"]}
89
106
 
107
+ @dataset(id=".2.assessments")
108
+ class Trec2Assessments(Dataset):
109
+ FILE = ConcatDownloader(
110
+ "assessments.qrels",
111
+ "http://trec.nist.gov/data/qrels_eng/qrels.101-150.disk1.disk2.parts1-5.tar.gz",
112
+ transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
113
+ )
90
114
 
91
- @concatdownload(
92
- "assessments.qrels",
93
- "http://trec.nist.gov/data/qrels_eng/qrels.101-150.disk1.disk2.parts1-5.tar.gz",
94
- transforms=TransformList(Gunzip(), Replace(r"Number:(\s+)0", r"Number: \1")),
95
- )
96
- @dataset(TrecAdhocAssessments, id="2.assessments")
97
- def trec2_assessments(assessments):
98
- return {"path": assessments}
115
+ def config(self) -> TrecAdhocAssessments:
116
+ return TrecAdhocAssessments.C(path=self.FILE.path)
99
117
 
100
118
 
101
- @reference("documents", trec1_documents)
102
- @reference("topics", trec2_topics)
103
- @reference("assessments", trec2_assessments)
104
- @dataset(Adhoc, id="2")
105
- def trec2(documents, topics, assessments):
119
+ @dataset(id=".2")
120
+ class Trec2(Dataset):
106
121
  "Ad-hoc task of TREC 2 (1993)"
107
- return {"documents": documents, "topics": topics, "assessments": assessments}
122
+
123
+ DOCUMENTS = reference(varname="documents", reference=Trec1Documents)
124
+ TOPICS = reference(varname="topics", reference=Trec2Topics)
125
+ ASSESSMENTS = reference(varname="assessments", reference=Trec2Assessments)
126
+
127
+ def config(self) -> Adhoc:
128
+ return Adhoc.C(
129
+ documents=self.DOCUMENTS.prepare(),
130
+ topics=self.TOPICS.prepare(),
131
+ assessments=self.ASSESSMENTS.prepare(),
132
+ )
108
133
 
109
134
 
110
135
  # --- TREC 3 (1994)
111
136
 
112
137
 
113
- @filedownloader("topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.151-200.gz")
114
- @dataset(TrecTopics, id="3.topics")
115
- def trec3_topics(topics):
116
- return {"path": topics, "parts": ["title", "desc"]}
138
+ @dataset(id=".3.topics")
139
+ class Trec3Topics(Dataset):
140
+ FILE = FileDownloader(
141
+ "topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.151-200.gz"
142
+ )
117
143
 
144
+ def config(self) -> TrecTopics:
145
+ return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
118
146
 
119
- @concatdownload(
120
- "assessments.qrels",
121
- "http://trec.nist.gov/data/qrels_eng/qrels.151-200.201-250.disks1-3.all.tar.gz",
122
- transforms=TransformList(Gunzip(), Filter(r"^(1\d\d|200)\s")),
123
- )
124
- @dataset(TrecAdhocAssessments, id="3.assessments")
125
- def trec3_assessments(assessments):
126
- return {"path": assessments}
127
147
 
148
+ @dataset(id=".3.assessments")
149
+ class Trec3Assessments(Dataset):
150
+ FILE = ConcatDownloader(
151
+ "assessments.qrels",
152
+ "http://trec.nist.gov/data/qrels_eng/qrels.151-200.201-250.disks1-3.all.tar.gz",
153
+ transforms=TransformList(Gunzip(), Filter(r"^(1\d\d|200)\s")),
154
+ )
155
+
156
+ def config(self) -> TrecAdhocAssessments:
157
+ return TrecAdhocAssessments.C(path=self.FILE.path)
128
158
 
129
- @reference("documents", trec1_documents)
130
- @reference("topics", trec3_topics)
131
- @reference("assessments", trec3_assessments)
132
- @dataset(Adhoc, id="3")
133
- def trec3(documents, topics, assessments):
159
+
160
+ @dataset(id=".3")
161
+ class Trec3(Dataset):
134
162
  "Ad-hoc task of TREC 3 (1994)"
135
- return {"documents": documents, "topics": topics, "assessments": assessments}
163
+
164
+ DOCUMENTS = reference(varname="documents", reference=Trec1Documents)
165
+ TOPICS = reference(varname="topics", reference=Trec3Topics)
166
+ ASSESSMENTS = reference(varname="assessments", reference=Trec3Assessments)
167
+
168
+ def config(self) -> Adhoc:
169
+ return Adhoc.C(
170
+ documents=self.DOCUMENTS.prepare(),
171
+ topics=self.TOPICS.prepare(),
172
+ assessments=self.ASSESSMENTS.prepare(),
173
+ )
136
174
 
137
175
 
138
176
  # --- TREC 4 (1995)
139
177
 
140
178
 
141
- @links(
142
- "documents",
143
- ap88=tipster.ap88.path,
144
- ap89=tipster.ap89.path,
145
- ap90=tipster.ap90.path,
146
- fr88=tipster.fr88.path,
147
- sjm1=tipster.sjm1.path,
148
- wsj90=tipster.wsj90.path,
149
- wsj91=tipster.wsj91.path,
150
- wsj92=tipster.wsj92.path,
151
- ziff2=tipster.ziff2.path,
152
- ziff3=tipster.ziff3.path,
153
- )
154
- @dataset(TipsterCollection, id="4.documents")
155
- def trec4_documents(documents):
179
+ @dataset(id=".4.documents")
180
+ class Trec4Documents(Dataset):
156
181
  """TREC-4 documents"""
157
- return {"path": documents}
158
182
 
183
+ DOCUMENTS = links(
184
+ "documents",
185
+ ap88=tipster.Ap88,
186
+ ap89=tipster.Ap89,
187
+ ap90=tipster.Ap90,
188
+ fr88=tipster.Fr88,
189
+ sjm1=tipster.Sjm1,
190
+ wsj90=tipster.Wsj90,
191
+ wsj91=tipster.Wsj91,
192
+ wsj92=tipster.Wsj92,
193
+ ziff2=tipster.Ziff2,
194
+ ziff3=tipster.Ziff3,
195
+ )
159
196
 
160
- @filedownloader("topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.201-250.gz")
161
- @dataset(TrecTopics, id="4.topics")
162
- def trec4_topics(topics):
163
- return {"path": topics, "parts": ["title", "desc"]}
197
+ def config(self) -> TipsterCollection:
198
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
164
199
 
165
200
 
166
- @concatdownload(
167
- "assessments.qrels",
168
- "http://trec.nist.gov/data/qrels_eng/qrels.201-250.disk2.disk3.parts1-5.tar.gz",
169
- )
170
- @dataset(TrecAdhocAssessments, id="4.assessments")
171
- def trec4_assessments(assessments):
172
- return {"path": assessments}
201
+ @dataset(id=".4.topics")
202
+ class Trec4Topics(Dataset):
203
+ FILE = FileDownloader(
204
+ "topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.201-250.gz"
205
+ )
206
+
207
+ def config(self) -> TrecTopics:
208
+ return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
173
209
 
174
210
 
175
- @reference("documents", trec4_documents)
176
- @reference("topics", trec4_topics)
177
- @reference("assessments", trec4_assessments)
178
- @dataset(Adhoc, id="4")
179
- def trec4(documents, topics, assessments):
211
+ @dataset(id=".4.assessments")
212
+ class Trec4Assessments(Dataset):
213
+ FILE = ConcatDownloader(
214
+ "assessments.qrels",
215
+ "http://trec.nist.gov/data/qrels_eng/qrels.201-250.disk2.disk3.parts1-5.tar.gz",
216
+ )
217
+
218
+ def config(self) -> TrecAdhocAssessments:
219
+ return TrecAdhocAssessments.C(path=self.FILE.path)
220
+
221
+
222
+ @dataset(id=".4")
223
+ class Trec4(Dataset):
180
224
  "Ad-hoc task of TREC 4 (1995)"
181
- return {"documents": documents, "topics": topics, "assessments": assessments}
225
+
226
+ DOCUMENTS = reference(varname="documents", reference=Trec4Documents)
227
+ TOPICS = reference(varname="topics", reference=Trec4Topics)
228
+ ASSESSMENTS = reference(varname="assessments", reference=Trec4Assessments)
229
+
230
+ def config(self) -> Adhoc:
231
+ return Adhoc.C(
232
+ documents=self.DOCUMENTS.prepare(),
233
+ topics=self.TOPICS.prepare(),
234
+ assessments=self.ASSESSMENTS.prepare(),
235
+ )
182
236
 
183
237
 
184
238
  # --- TREC 5 (1995)
185
239
 
186
240
 
187
- @links(
188
- "documents",
189
- ap88=tipster.ap88.path,
190
- cr1=tipster.cr1.path,
191
- fr88=tipster.fr88.path,
192
- fr94=tipster.fr94.path,
193
- ft1=tipster.ft1.path,
194
- wsj90=tipster.wsj90.path,
195
- wsj91=tipster.wsj91.path,
196
- wsj9=tipster.wsj92.path,
197
- ziff2=tipster.ziff2.path,
198
- )
199
- @dataset(TipsterCollection, id="5.documents")
200
- def trec5_documents(documents):
241
+ @dataset(id=".5.documents")
242
+ class Trec5Documents(Dataset):
201
243
  """TREC-5 documents"""
202
- return {"path": documents}
203
244
 
245
+ DOCUMENTS = links(
246
+ "documents",
247
+ ap88=tipster.Ap88,
248
+ cr1=tipster.Cr1,
249
+ fr88=tipster.Fr88,
250
+ fr94=tipster.Fr94,
251
+ ft1=tipster.Ft1,
252
+ wsj90=tipster.Wsj90,
253
+ wsj91=tipster.Wsj91,
254
+ wsj9=tipster.Wsj92,
255
+ ziff2=tipster.Ziff2,
256
+ )
204
257
 
205
- @filedownloader("topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.251-300.gz")
206
- @dataset(TrecTopics, id="5.topics")
207
- def trec5_topics(topics):
208
- return {"path": topics, "parts": ["title", "desc"]}
258
+ def config(self) -> TipsterCollection:
259
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
209
260
 
210
261
 
211
- @concatdownload(
212
- "assessments.qrels",
213
- url="http://trec.nist.gov/data/qrels_eng/qrels.251-300.parts1-5.tar.gz",
214
- )
215
- @dataset(TrecAdhocAssessments, id="5.qrels")
216
- def trec5_assessments(assessments):
217
- return {"path": assessments}
262
+ @dataset(id=".5.topics")
263
+ class Trec5Topics(Dataset):
264
+ FILE = FileDownloader(
265
+ "topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.251-300.gz"
266
+ )
267
+
268
+ def config(self) -> TrecTopics:
269
+ return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
218
270
 
219
271
 
220
- @reference("documents", trec5_documents)
221
- @reference("topics", trec5_topics)
222
- @reference("assessments", trec5_assessments)
223
- @dataset(Adhoc, id="5")
224
- def trec5(documents, topics, assessments):
272
+ @dataset(id=".5.qrels")
273
+ class Trec5Assessments(Dataset):
274
+ FILE = ConcatDownloader(
275
+ "assessments.qrels",
276
+ url="http://trec.nist.gov/data/qrels_eng/qrels.251-300.parts1-5.tar.gz",
277
+ )
278
+
279
+ def config(self) -> TrecAdhocAssessments:
280
+ return TrecAdhocAssessments.C(path=self.FILE.path)
281
+
282
+
283
+ @dataset(id=".5")
284
+ class Trec5(Dataset):
225
285
  "Ad-hoc task of TREC 5 (1996)"
226
- return {"documents": documents, "topics": topics, "assessments": assessments}
286
+
287
+ DOCUMENTS = reference(varname="documents", reference=Trec5Documents)
288
+ TOPICS = reference(varname="topics", reference=Trec5Topics)
289
+ ASSESSMENTS = reference(varname="assessments", reference=Trec5Assessments)
290
+
291
+ def config(self) -> Adhoc:
292
+ return Adhoc.C(
293
+ documents=self.DOCUMENTS.prepare(),
294
+ topics=self.TOPICS.prepare(),
295
+ assessments=self.ASSESSMENTS.prepare(),
296
+ )
227
297
 
228
298
 
229
299
  # --- TREC 6 (1997)
230
300
 
231
301
 
232
- @links(
233
- "documents",
234
- cr1=tipster.cr1.path,
235
- fbis1=tipster.fbis1.path,
236
- fr94=tipster.fr94.path,
237
- ft1=tipster.ft1.path,
238
- la8990=tipster.la8990.path,
239
- )
240
- @dataset(TipsterCollection, id="6.documents")
241
- def trec6_documents(documents):
302
+ @dataset(id=".6.documents")
303
+ class Trec6Documents(Dataset):
242
304
  """TREC-6 documents"""
243
- return {"path": documents}
244
305
 
306
+ DOCUMENTS = links(
307
+ "documents",
308
+ cr1=tipster.Cr1,
309
+ fbis1=tipster.Fbis1,
310
+ fr94=tipster.Fr94,
311
+ ft1=tipster.Ft1,
312
+ la8990=tipster.La8990,
313
+ )
245
314
 
246
- @filedownloader("topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.301-350.gz")
247
- @dataset(TrecTopics, id="6.topics")
248
- def trec6_topics(topics):
249
- return {"path": topics, "parts": ["title", "desc"]}
315
+ def config(self) -> TipsterCollection:
316
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
250
317
 
251
318
 
252
- @concatdownload(
253
- "assessments.qrels",
254
- url="http://trec.nist.gov/data/qrels_eng/qrels.trec6.adhoc.parts1-5.tar.gz",
255
- )
256
- @dataset(TrecAdhocAssessments, id="6.qrels")
257
- def trec6_assessments(assessments):
258
- return {"path": assessments}
319
+ @dataset(id=".6.topics")
320
+ class Trec6Topics(Dataset):
321
+ FILE = FileDownloader(
322
+ "topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.301-350.gz"
323
+ )
324
+
325
+ def config(self) -> TrecTopics:
326
+ return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
259
327
 
260
328
 
261
- @reference("documents", trec6_documents)
262
- @reference("topics", trec6_topics)
263
- @reference("assessments", trec6_assessments)
264
- @dataset(Adhoc, id="6")
265
- def trec6(documents, topics, assessments):
329
+ @dataset(id=".6.qrels")
330
+ class Trec6Assessments(Dataset):
331
+ FILE = ConcatDownloader(
332
+ "assessments.qrels",
333
+ url="http://trec.nist.gov/data/qrels_eng/qrels.trec6.adhoc.parts1-5.tar.gz",
334
+ )
335
+
336
+ def config(self) -> TrecAdhocAssessments:
337
+ return TrecAdhocAssessments.C(path=self.FILE.path)
338
+
339
+
340
+ @dataset(id=".6")
341
+ class Trec6(Dataset):
266
342
  "Ad-hoc task of TREC 6 (1997)"
267
- return {"documents": documents, "topics": topics, "assessments": assessments}
343
+
344
+ DOCUMENTS = reference(varname="documents", reference=Trec6Documents)
345
+ TOPICS = reference(varname="topics", reference=Trec6Topics)
346
+ ASSESSMENTS = reference(varname="assessments", reference=Trec6Assessments)
347
+
348
+ def config(self) -> Adhoc:
349
+ return Adhoc.C(
350
+ documents=self.DOCUMENTS.prepare(),
351
+ topics=self.TOPICS.prepare(),
352
+ assessments=self.ASSESSMENTS.prepare(),
353
+ )
268
354
 
269
355
 
270
356
  # --- TREC 7 (1998)
271
357
 
272
358
 
273
- @links(
274
- "documents",
275
- fbis1=tipster.fbis1.path,
276
- fr94=tipster.fr94.path,
277
- ft1=tipster.ft1.path,
278
- la8990=tipster.la8990.path,
279
- )
280
- @dataset(TipsterCollection, id="7.documents")
281
- def trec7_documents(documents):
359
+ @dataset(id=".7.documents")
360
+ class Trec7Documents(Dataset):
282
361
  """TREC-7 documents"""
283
- return {"path": documents}
284
362
 
363
+ DOCUMENTS = links(
364
+ "documents",
365
+ fbis1=tipster.Fbis1,
366
+ fr94=tipster.Fr94,
367
+ ft1=tipster.Ft1,
368
+ la8990=tipster.La8990,
369
+ )
285
370
 
286
- @filedownloader("topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.351-400.gz")
287
- @dataset(TrecTopics, id="7.topics")
288
- def trec7_topics(topics):
289
- return {"path": topics, "parts": ["title", "desc"]}
371
+ def config(self) -> TipsterCollection:
372
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
290
373
 
291
374
 
292
- @concatdownload(
293
- "assessments.qrels",
294
- url="http://trec.nist.gov/data/qrels_eng/qrels.trec7.adhoc.parts1-5.tar.gz",
295
- )
296
- @dataset(TrecAdhocAssessments, id="7.qrels")
297
- def trec7_assessments(assessments):
298
- return {"path": assessments}
375
+ @dataset(id=".7.topics")
376
+ class Trec7Topics(Dataset):
377
+ FILE = FileDownloader(
378
+ "topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.351-400.gz"
379
+ )
380
+
381
+ def config(self) -> TrecTopics:
382
+ return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
299
383
 
300
384
 
301
- @reference("documents", trec7_documents)
302
- @reference("topics", trec7_topics)
303
- @reference("assessments", trec7_assessments)
304
- @dataset(Adhoc, id="7")
305
- def trec7(documents, topics, assessments):
385
+ @dataset(id=".7.qrels")
386
+ class Trec7Assessments(Dataset):
387
+ FILE = ConcatDownloader(
388
+ "assessments.qrels",
389
+ url="http://trec.nist.gov/data/qrels_eng/qrels.trec7.adhoc.parts1-5.tar.gz",
390
+ )
391
+
392
+ def config(self) -> TrecAdhocAssessments:
393
+ return TrecAdhocAssessments.C(path=self.FILE.path)
394
+
395
+
396
+ @dataset(id=".7")
397
+ class Trec7(Dataset):
306
398
  "Ad-hoc task of TREC 7 (1998)"
307
- return {"documents": documents, "topics": topics, "assessments": assessments}
399
+
400
+ DOCUMENTS = reference(varname="documents", reference=Trec7Documents)
401
+ TOPICS = reference(varname="topics", reference=Trec7Topics)
402
+ ASSESSMENTS = reference(varname="assessments", reference=Trec7Assessments)
403
+
404
+ def config(self) -> Adhoc:
405
+ return Adhoc.C(
406
+ documents=self.DOCUMENTS.prepare(),
407
+ topics=self.TOPICS.prepare(),
408
+ assessments=self.ASSESSMENTS.prepare(),
409
+ )
308
410
 
309
411
 
310
412
  # --- TREC 8 (1999)
311
413
 
312
414
 
313
- @filedownloader("topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.401-450.gz")
314
- @dataset(TrecTopics, id="8.topics")
315
- def trec8_topics(topics):
316
- return {"path": topics, "parts": ["title", "desc"]}
415
+ @dataset(id=".8.topics")
416
+ class Trec8Topics(Dataset):
417
+ FILE = FileDownloader(
418
+ "topics.sgml", "http://trec.nist.gov/data/topics_eng/topics.401-450.gz"
419
+ )
317
420
 
421
+ def config(self) -> TrecTopics:
422
+ return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
318
423
 
319
- @concatdownload(
320
- "assessments.qrels",
321
- url="https://trec.nist.gov/data/qrels_eng/qrels.trec8.adhoc.parts1-5.tar.gz",
322
- )
323
- @dataset(TrecAdhocAssessments, id="8.qrels")
324
- def trec8_assessments(assessments):
325
- return {"path": assessments}
424
+
425
+ @dataset(id=".8.qrels")
426
+ class Trec8Assessments(Dataset):
427
+ FILE = ConcatDownloader(
428
+ "assessments.qrels",
429
+ url="https://trec.nist.gov/data/qrels_eng/qrels.trec8.adhoc.parts1-5.tar.gz",
430
+ )
431
+
432
+ def config(self) -> TrecAdhocAssessments:
433
+ return TrecAdhocAssessments.C(path=self.FILE.path)
326
434
 
327
435
 
328
- @reference("documents", trec7_documents)
329
- @reference("topics", trec8_topics)
330
- @reference("assessments", trec8_assessments)
331
- @dataset(Adhoc, id="8")
332
- def trec8(documents, topics, assessments):
436
+ @dataset(id=".8")
437
+ class Trec8(Dataset):
333
438
  "Ad-hoc task of TREC 8 (1999)"
334
- return {"documents": documents, "topics": topics, "assessments": assessments}
439
+
440
+ DOCUMENTS = reference(varname="documents", reference=Trec7Documents)
441
+ TOPICS = reference(varname="topics", reference=Trec8Topics)
442
+ ASSESSMENTS = reference(varname="assessments", reference=Trec8Assessments)
443
+
444
+ def config(self) -> Adhoc:
445
+ return Adhoc.C(
446
+ documents=self.DOCUMENTS.prepare(),
447
+ topics=self.TOPICS.prepare(),
448
+ assessments=self.ASSESSMENTS.prepare(),
449
+ )
335
450
 
336
451
 
337
452
  # --- TREC Robust (2004)
338
453
 
339
454
 
340
- @filedownloader("topics", "http://trec.nist.gov/data/robust/04.testset.gz")
341
- @dataset(TrecTopics, id="robust.2004.topics")
342
- def robust2004_topics(topics):
343
- return {"path": topics, "parts": ["title", "desc"]}
455
+ @dataset(id=".robust.2004.topics")
456
+ class Robust2004Topics(Dataset):
457
+ FILE = FileDownloader("topics", "http://trec.nist.gov/data/robust/04.testset.gz")
344
458
 
459
+ def config(self) -> TrecTopics:
460
+ return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
345
461
 
346
- @filedownloader(
347
- "assessments.qrels", "http://trec.nist.gov/data/robust/qrels.robust2004.txt"
348
- )
349
- @dataset(TrecAdhocAssessments, id="robust.2004.qrels")
350
- def robust2004_assessments(assessments):
351
- return {"path": assessments}
352
462
 
463
+ @dataset(id=".robust.2004.qrels")
464
+ class Robust2004Assessments(Dataset):
465
+ FILE = FileDownloader(
466
+ "assessments.qrels", "http://trec.nist.gov/data/robust/qrels.robust2004.txt"
467
+ )
468
+
469
+ def config(self) -> TrecAdhocAssessments:
470
+ return TrecAdhocAssessments.C(path=self.FILE.path)
353
471
 
354
- @reference("documents", trec7_documents)
355
- @reference("topics", robust2004_topics)
356
- @reference("assessments", robust2004_assessments)
357
- @dataset(Adhoc, id="robust.2004")
358
- def robust2004(documents, topics, assessments):
472
+
473
+ @dataset(id=".robust.2004")
474
+ class Robust2004(Dataset):
359
475
  "Ad-hoc task of TREC Robust (2004)"
360
- return {"documents": documents, "topics": topics, "assessments": assessments}
476
+
477
+ DOCUMENTS = reference(varname="documents", reference=Trec7Documents)
478
+ TOPICS = reference(varname="topics", reference=Robust2004Topics)
479
+ ASSESSMENTS = reference(varname="assessments", reference=Robust2004Assessments)
480
+
481
+ def config(self) -> Adhoc:
482
+ return Adhoc.C(
483
+ documents=self.DOCUMENTS.prepare(),
484
+ topics=self.TOPICS.prepare(),
485
+ assessments=self.ASSESSMENTS.prepare(),
486
+ )
361
487
 
362
488
 
363
489
  # --- TREC Robust (2005)
364
490
 
365
491
 
366
- @filedownloader("topics", "http://trec.nist.gov/data/robust/05/05.50.topics.txt")
367
- @dataset(TrecTopics, id="robust.2005.topics")
368
- def robust2005_topics(topics):
369
- return {"path": topics, "parts": ["title", "desc"]}
492
+ @dataset(id=".robust.2005.topics")
493
+ class Robust2005Topics(Dataset):
494
+ FILE = FileDownloader(
495
+ "topics", "http://trec.nist.gov/data/robust/05/05.50.topics.txt"
496
+ )
370
497
 
498
+ def config(self) -> TrecTopics:
499
+ return TrecTopics.C(path=self.FILE.path, parts=["title", "desc"])
371
500
 
372
- @filedownloader(
373
- "assessments.qrels", url="http://trec.nist.gov/data/robust/05/TREC2005.qrels.txt"
374
- )
375
- @dataset(TrecAdhocAssessments, id="robust.2005.qrels")
376
- def robust2005_assessments(assessments):
377
- return {"path": assessments}
378
501
 
502
+ @dataset(id=".robust.2005.qrels")
503
+ class Robust2005Assessments(Dataset):
504
+ FILE = FileDownloader(
505
+ "assessments.qrels",
506
+ url="http://trec.nist.gov/data/robust/05/TREC2005.qrels.txt",
507
+ )
379
508
 
380
- @reference("documents", aquaint)
381
- @reference("topics", robust2005_topics)
382
- @reference("assessments", robust2005_assessments)
383
- @dataset(Adhoc, id="robust.2005")
384
- def robust2005(documents, topics, assessments):
509
+ def config(self) -> TrecAdhocAssessments:
510
+ return TrecAdhocAssessments.C(path=self.FILE.path)
511
+
512
+
513
+ @dataset(id=".robust.2005")
514
+ class Robust2005(Dataset):
385
515
  "Ad-hoc task of TREC Robust (2005)"
386
- return {"documents": documents, "topics": topics, "assessments": assessments}
516
+
517
+ DOCUMENTS = reference(varname="documents", reference=Aquaint)
518
+ TOPICS = reference(varname="topics", reference=Robust2005Topics)
519
+ ASSESSMENTS = reference(varname="assessments", reference=Robust2005Assessments)
520
+
521
+ def config(self) -> Adhoc:
522
+ return Adhoc.C(
523
+ documents=self.DOCUMENTS.prepare(),
524
+ topics=self.TOPICS.prepare(),
525
+ assessments=self.ASSESSMENTS.prepare(),
526
+ )