datamaestro-text 2026.2.2__py3-none-any.whl → 2026.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. datamaestro_text/config/com/github/aagohary/canard.py +27 -24
  2. datamaestro_text/config/com/github/apple/ml-qrecc.py +30 -25
  3. datamaestro_text/config/com/github/ikat.py +76 -61
  4. datamaestro_text/config/com/github/prdwb/orconvqa.py +41 -37
  5. datamaestro_text/config/com/microsoft/msmarco/passage.py +278 -207
  6. datamaestro_text/config/com/oscar-corpus.py +13 -10
  7. datamaestro_text/config/com/sentiment140.py +17 -12
  8. datamaestro_text/config/com/smashwords/bookcorpus.py +13 -10
  9. datamaestro_text/config/edu/stanford/aclimdb.py +14 -9
  10. datamaestro_text/config/edu/stanford/glove.py +66 -31
  11. datamaestro_text/config/edu/upenn/ldc/aquaint.py +35 -17
  12. datamaestro_text/config/fr/granddebat.py +57 -48
  13. datamaestro_text/config/gov/nist/ir/covid.py +61 -50
  14. datamaestro_text/config/gov/nist/trec/adhoc.py +395 -255
  15. datamaestro_text/config/gov/nist/trec/tipster.py +170 -64
  16. datamaestro_text/config/io/github/thunlp/fewrel.py +20 -15
  17. datamaestro_text/config/io/metamind/research/wikitext.py +50 -32
  18. datamaestro_text/config/org/grouplens/movielens.py +28 -37
  19. datamaestro_text/config/org/universaldependencies/french.py +16 -11
  20. datamaestro_text/test/test_documented.py +2 -2
  21. datamaestro_text/version.py +2 -2
  22. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/METADATA +2 -2
  23. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/RECORD +26 -26
  24. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/WHEEL +0 -0
  25. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/entry_points.txt +0 -0
  26. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -10,10 +10,10 @@ See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/mi
10
10
 
11
11
  from datamaestro.annotations.agreement import useragreement
12
12
  from datamaestro.data import Folder
13
- from datamaestro.download.single import filedownloader
13
+ from datamaestro.download.single import FileDownloader
14
14
  from datamaestro.download import reference
15
- from datamaestro.definitions import datatasks, datatags, dataset
16
- from datamaestro.download.archive import tardownloader
15
+ from datamaestro.definitions import Dataset, datatasks, datatags, dataset
16
+ from datamaestro.download.archive import TarDownloader
17
17
  from datamaestro_text.data.ir import RerankAdhoc, Adhoc, TrainingTripletsLines
18
18
  from datamaestro_text.data.ir.csv import (
19
19
  Topics,
@@ -39,144 +39,172 @@ http://www.msmarco.org/dataset.aspx""",
39
39
  # TODO: Not ideal since it would be better to have small versions right away
40
40
  # instead of downloading again the MS Marco Collection
41
41
  @lua
42
- @tardownloader(
43
- "data",
44
- url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
45
- checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
46
- )
47
42
  @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
48
- def collection_etc(data) -> Folder:
43
+ class CollectionEtc(Dataset):
49
44
  """Documents and some more files"""
50
- return Folder.C(path=data)
45
+
46
+ DATA = TarDownloader(
47
+ "data",
48
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
49
+ checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
50
+ )
51
+
52
+ def config(self) -> Folder:
53
+ return Folder.C(path=self.DATA.path)
51
54
 
52
55
 
53
56
  @lua
54
- @reference("data", collection_etc)
55
- @dataset(Documents, size="2.9GB")
56
- def collection(data):
57
+ @dataset(size="2.9GB")
58
+ class Collection(Dataset):
57
59
  """MS-Marco documents
58
60
 
59
61
  This file contains each passage in the larger MSMARCO dataset.
60
62
 
61
- Format is TSV (PID \t Passage)"""
62
- return {"path": data.path / "collection.tsv"}
63
+ Format is TSV (PID \\t Passage)"""
64
+
65
+ DATA = reference(varname="data", reference=CollectionEtc)
66
+
67
+ def config(self) -> Documents:
68
+ return Documents.C(path=self.DATA.prepare().path / "collection.tsv")
63
69
 
64
70
 
65
71
  # --- Train
66
72
 
67
73
 
68
74
  @lua
69
- @tardownloader(
70
- "run",
71
- url="https://msmarco.blob.core.windows.net/msmarcoranking/top1000.train.tar.gz",
72
- checker=HashCheck("d99fdbd5b2ea84af8aa23194a3263052", md5),
73
- )
74
- @dataset(AdhocRunWithText, size="2.5GB")
75
- def train_run(run):
75
+ @dataset(size="2.5GB")
76
+ class TrainRun(Dataset):
76
77
  """
77
78
 
78
79
  TSV format: qid, pid, query, passage
79
80
  """
80
- return {"path": run / "top1000.train.tsv"}
81
+
82
+ RUN = TarDownloader(
83
+ "run",
84
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/top1000.train.tar.gz",
85
+ checker=HashCheck("d99fdbd5b2ea84af8aa23194a3263052", md5),
86
+ )
87
+
88
+ def config(self) -> AdhocRunWithText:
89
+ return AdhocRunWithText.C(path=self.RUN.path / "top1000.train.tsv")
81
90
 
82
91
 
83
92
  @lua
84
- @tardownloader(
85
- "queries",
86
- url="https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz",
87
- files=["queries.train.tsv"],
88
- checker=HashCheck("c177b2795d5f2dcc524cf00fcd973be1", md5),
89
- )
90
- @dataset(Topics)
91
- def train_queries(queries):
92
- return {"path": queries / "queries.train.tsv"}
93
+ @dataset()
94
+ class TrainQueries(Dataset):
95
+ QUERIES = TarDownloader(
96
+ "queries",
97
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz",
98
+ files=["queries.train.tsv"],
99
+ checker=HashCheck("c177b2795d5f2dcc524cf00fcd973be1", md5),
100
+ )
101
+
102
+ def config(self) -> Topics:
103
+ return Topics.C(path=self.QUERIES.path / "queries.train.tsv")
93
104
 
94
105
 
95
106
  @lua
96
- @filedownloader(
97
- "qrels.tsv",
98
- url="https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv",
99
- checker=HashCheck("733fb9fe12d93e497f7289409316eccf", md5),
100
- )
101
- @dataset(TrecAdhocAssessments, size="10.1MB")
102
- def train_qrels(qrels):
103
- return {"path": qrels}
107
+ @dataset(size="10.1MB")
108
+ class TrainQrels(Dataset):
109
+ QRELS = FileDownloader(
110
+ "qrels.tsv",
111
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv",
112
+ checker=HashCheck("733fb9fe12d93e497f7289409316eccf", md5),
113
+ )
114
+
115
+ def config(self) -> TrecAdhocAssessments:
116
+ return TrecAdhocAssessments.C(path=self.QRELS.path)
104
117
 
105
118
 
106
119
  @lua
107
- @reference("collection", collection)
108
- @reference("topics", train_queries)
109
- @reference("qrels", train_qrels)
110
120
  @datatasks("information retrieval", "passage retrieval")
111
- @dataset(Adhoc, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
112
- def train(topics, qrels, collection):
121
+ @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
122
+ class Train(Dataset):
113
123
  """MS-Marco train dataset"""
114
- return {
115
- "documents": collection,
116
- "topics": topics,
117
- "assessments": qrels,
118
- }
124
+
125
+ COLLECTION = reference(varname="collection", reference=Collection)
126
+ TOPICS = reference(varname="topics", reference=TrainQueries)
127
+ QRELS = reference(varname="qrels", reference=TrainQrels)
128
+
129
+ def config(self) -> Adhoc:
130
+ return Adhoc.C(
131
+ documents=self.COLLECTION.prepare(),
132
+ topics=self.TOPICS.prepare(),
133
+ assessments=self.QRELS.prepare(),
134
+ )
119
135
 
120
136
 
121
137
  @lua
122
- @reference("train", train)
123
- @reference("run", train_run)
124
138
  @datatasks("information retrieval", "passage retrieval")
125
- @dataset(RerankAdhoc, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
126
- def train_withrun(train, run):
139
+ @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
140
+ class TrainWithrun(Dataset):
127
141
  """MSMarco train dataset, including the top-1000 to documents to re-rank"""
128
- return {**train.__arguments__(), "run": run}
142
+
143
+ TRAIN = reference(varname="train", reference=Train)
144
+ RUN = reference(varname="run", reference=TrainRun)
145
+
146
+ def config(self) -> RerankAdhoc:
147
+ train = self.TRAIN.prepare()
148
+ return RerankAdhoc.C(**train.__arguments__(), run=self.RUN.prepare())
129
149
 
130
150
 
131
151
  # Training triplets
132
152
 
133
153
 
134
- @filedownloader(
135
- "triples.tsv",
136
- size=1_841_693_309,
137
- url="https://msmarco.blob.core.windows.net/msmarcoranking/qidpidtriples.train.full.2.tsv.gz",
138
- checker=HashCheck("4e58f45f82f3fe99e3239ecffd8ed371", md5),
139
- )
140
154
  @dataset(
141
- TrainingTripletsLines,
142
155
  url="https://github.com/microsoft/MSMARCO-Passage-Ranking",
143
156
  size="5.7GB",
144
157
  )
145
- def train_idtriples(triples):
158
+ class TrainIdtriples(Dataset):
146
159
  """Full training triples (query, positive passage, negative passage) with IDs"""
147
- return {"path": triples, "doc_ids": True, "topic_ids": True}
160
+
161
+ TRIPLES = FileDownloader(
162
+ "triples.tsv",
163
+ size=1_841_693_309,
164
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/qidpidtriples.train.full.2.tsv.gz",
165
+ checker=HashCheck("4e58f45f82f3fe99e3239ecffd8ed371", md5),
166
+ )
167
+
168
+ def config(self) -> TrainingTripletsLines:
169
+ return TrainingTripletsLines.C(
170
+ path=self.TRIPLES.path, doc_ids=True, topic_ids=True
171
+ )
148
172
 
149
173
 
150
- @filedownloader(
151
- "triples.tsv",
152
- size=7_930_881_353,
153
- url="https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.small.tar.gz",
154
- checker=HashCheck("c13bf99ff23ca691105ad12eab837f84", md5),
155
- )
156
174
  @dataset(
157
- TrainingTripletsLines,
158
175
  url="https://github.com/microsoft/MSMARCO-Passage-Ranking",
159
176
  size="27.1GB",
160
177
  )
161
- def train_texttriples_small(triples):
178
+ class TrainTexttriplesSmall(Dataset):
162
179
  """Small training triples (query, positive passage, negative passage) with text"""
163
- return {"path": triples}
180
+
181
+ TRIPLES = FileDownloader(
182
+ "triples.tsv",
183
+ size=7_930_881_353,
184
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.small.tar.gz",
185
+ checker=HashCheck("c13bf99ff23ca691105ad12eab837f84", md5),
186
+ )
187
+
188
+ def config(self) -> TrainingTripletsLines:
189
+ return TrainingTripletsLines.C(path=self.TRIPLES.path)
164
190
 
165
191
 
166
- @filedownloader(
167
- "triples.tsv",
168
- size=77_877_731_328,
169
- url="https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.full.tar.gz",
170
- checker=HashCheck("8d509d484ea1971e792b812ae4800c6f", md5),
171
- )
172
192
  @dataset(
173
- TrainingTripletsLines,
174
193
  url="https://github.com/microsoft/MSMARCO-Passage-Ranking",
175
194
  size="272.2GB",
176
195
  )
177
- def train_texttriples_full(triples):
196
+ class TrainTexttripleFull(Dataset):
178
197
  """Full training triples (query, positive passage, negative passage) with text"""
179
- return {"path": triples}
198
+
199
+ TRIPLES = FileDownloader(
200
+ "triples.tsv",
201
+ size=77_877_731_328,
202
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.full.tar.gz",
203
+ checker=HashCheck("8d509d484ea1971e792b812ae4800c6f", md5),
204
+ )
205
+
206
+ def config(self) -> TrainingTripletsLines:
207
+ return TrainingTripletsLines.C(path=self.TRIPLES.path)
180
208
 
181
209
 
182
210
  # ---
@@ -185,73 +213,88 @@ def train_texttriples_full(triples):
185
213
 
186
214
 
187
215
  @lua
188
- @tardownloader(
189
- "queries",
190
- url="https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz",
191
- files=["queries.dev.tsv"],
192
- checker=HashCheck("c177b2795d5f2dcc524cf00fcd973be1", md5),
193
- )
194
- @dataset(Topics)
195
- def dev_queries(queries):
196
- return {"path": queries / "queries.dev.tsv"}
216
+ @dataset()
217
+ class DevQueries(Dataset):
218
+ QUERIES = TarDownloader(
219
+ "queries",
220
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz",
221
+ files=["queries.dev.tsv"],
222
+ checker=HashCheck("c177b2795d5f2dcc524cf00fcd973be1", md5),
223
+ )
224
+
225
+ def config(self) -> Topics:
226
+ return Topics.C(path=self.QUERIES.path / "queries.dev.tsv")
197
227
 
198
228
 
199
229
  @lua
200
- @tardownloader(
201
- "run",
202
- url="https://msmarco.blob.core.windows.net/msmarcoranking/top1000.dev.tar.gz",
203
- checker=HashCheck("8c140662bdf123a98fbfe3bb174c5831", md5),
204
- )
205
- @dataset(AdhocRunWithText)
206
- def dev_run(run):
207
- return {"path": run / "top1000.eval.tsv"}
230
+ @dataset()
231
+ class DevRun(Dataset):
232
+ RUN = TarDownloader(
233
+ "run",
234
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/top1000.dev.tar.gz",
235
+ checker=HashCheck("8c140662bdf123a98fbfe3bb174c5831", md5),
236
+ )
237
+
238
+ def config(self) -> AdhocRunWithText:
239
+ return AdhocRunWithText.C(path=self.RUN.path / "top1000.eval.tsv")
208
240
 
209
241
 
210
242
  @lua
211
- @filedownloader(
212
- "qrels.tsv",
213
- url="https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv",
214
- checker=HashCheck("9157ccaeaa8227f91722ba5770787b16", md5),
215
- )
216
- @dataset(TrecAdhocAssessments)
217
- def dev_qrels(qrels):
218
- return {"path": qrels}
243
+ @dataset()
244
+ class DevQrels(Dataset):
245
+ QRELS = FileDownloader(
246
+ "qrels.tsv",
247
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv",
248
+ checker=HashCheck("9157ccaeaa8227f91722ba5770787b16", md5),
249
+ )
250
+
251
+ def config(self) -> TrecAdhocAssessments:
252
+ return TrecAdhocAssessments.C(path=self.QRELS.path)
219
253
 
220
254
 
221
255
  @lua
222
- @reference("collection", collection)
223
- @reference("topics", dev_queries)
224
- @reference("qrels", dev_qrels)
225
256
  @datatasks("information retrieval", "passage retrieval")
226
- @dataset(Adhoc, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
227
- def dev(topics, qrels, collection):
257
+ @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
258
+ class Dev(Dataset):
228
259
  """MS-Marco dev dataset"""
229
- return {
230
- "documents": collection,
231
- "topics": topics,
232
- "assessments": qrels,
233
- }
260
+
261
+ COLLECTION = reference(varname="collection", reference=Collection)
262
+ TOPICS = reference(varname="topics", reference=DevQueries)
263
+ QRELS = reference(varname="qrels", reference=DevQrels)
264
+
265
+ def config(self) -> Adhoc:
266
+ return Adhoc.C(
267
+ documents=self.COLLECTION.prepare(),
268
+ topics=self.TOPICS.prepare(),
269
+ assessments=self.QRELS.prepare(),
270
+ )
234
271
 
235
272
 
236
273
  @lua
237
- @reference("dev", dev)
238
- @reference("run", dev_run)
239
274
  @datatasks("information retrieval", "passage retrieval")
240
- @dataset(RerankAdhoc, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
241
- def dev_withrun(dev, run):
275
+ @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
276
+ class DevWithrun(Dataset):
242
277
  """MSMarco dev dataset, including the top-1000 to documents to re-rank"""
243
- return {**dev.__arguments__(), "run": run}
278
+
279
+ DEV = reference(varname="dev", reference=Dev)
280
+ RUN = reference(varname="run", reference=DevRun)
281
+
282
+ def config(self) -> RerankAdhoc:
283
+ dev = self.DEV.prepare()
284
+ return RerankAdhoc.C(**dev.__arguments__(), run=self.RUN.prepare())
244
285
 
245
286
 
246
287
  @lua
247
- @tardownloader(
248
- "run",
249
- url="https://msmarco.blob.core.windows.net/msmarcoranking/top1000.eval.tar.gz",
250
- checker=HashCheck("73778cd99f6e0632d12d0b5731b20a02", md5),
251
- )
252
- @dataset(AdhocRunWithText)
253
- def eval_withrun(run):
254
- return {"path": run / "top1000.eval.tsv"}
288
+ @dataset()
289
+ class EvalWithrun(Dataset):
290
+ RUN = TarDownloader(
291
+ "run",
292
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/top1000.eval.tar.gz",
293
+ checker=HashCheck("73778cd99f6e0632d12d0b5731b20a02", md5),
294
+ )
295
+
296
+ def config(self) -> AdhocRunWithText:
297
+ return AdhocRunWithText.C(path=self.RUN.path / "top1000.eval.tsv")
255
298
 
256
299
 
257
300
  # ---
@@ -260,32 +303,44 @@ def eval_withrun(run):
260
303
  # ---
261
304
 
262
305
 
263
- @reference("data", collection_etc)
264
- @dataset(Topics, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
265
- def dev_small_queries(data):
266
- return {"path": data.path / "queries.dev.small.tsv"}
306
+ @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
307
+ class DevSmallQueries(Dataset):
308
+ DATA = reference(varname="data", reference=CollectionEtc)
267
309
 
310
+ def config(self) -> Topics:
311
+ return Topics.C(path=self.DATA.prepare().path / "queries.dev.small.tsv")
268
312
 
269
- @reference("data", collection_etc)
270
- @dataset(
271
- TrecAdhocAssessments, url="https://github.com/microsoft/MSMARCO-Passage-Ranking"
272
- )
273
- def dev_small_qrels(data):
274
- return {"path": data.path / "qrels.dev.small.tsv"}
275
313
 
314
+ @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
315
+ class DevSmallQrels(Dataset):
316
+ DATA = reference(varname="data", reference=CollectionEtc)
317
+
318
+ def config(self) -> TrecAdhocAssessments:
319
+ return TrecAdhocAssessments.C(
320
+ path=self.DATA.prepare().path / "qrels.dev.small.tsv"
321
+ )
276
322
 
277
- @reference("topics", dev_small_queries)
278
- @reference("qrels", dev_small_qrels)
279
- @reference("collection", collection)
280
- @dataset(Adhoc, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
281
- def dev_small(collection, topics, qrels):
282
- return {"documents": collection, "topics": topics, "assessments": qrels}
283
323
 
324
+ @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
325
+ class DevSmall(Dataset):
326
+ TOPICS = reference(varname="topics", reference=DevSmallQueries)
327
+ QRELS = reference(varname="qrels", reference=DevSmallQrels)
328
+ COLLECTION = reference(varname="collection", reference=Collection)
284
329
 
285
- @reference("data", collection_etc)
286
- @dataset(Topics, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
287
- def eval_queries_small(data):
288
- return {"path": data.path / "queries.eval.small.tsv"}
330
+ def config(self) -> Adhoc:
331
+ return Adhoc.C(
332
+ documents=self.COLLECTION.prepare(),
333
+ topics=self.TOPICS.prepare(),
334
+ assessments=self.QRELS.prepare(),
335
+ )
336
+
337
+
338
+ @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
339
+ class EvalQueriesSmall(Dataset):
340
+ DATA = reference(varname="data", reference=CollectionEtc)
341
+
342
+ def config(self) -> Topics:
343
+ return Topics.C(path=self.DATA.prepare().path / "queries.eval.small.tsv")
289
344
 
290
345
 
291
346
  # ---
@@ -294,63 +349,74 @@ def eval_queries_small(data):
294
349
 
295
350
 
296
351
  @lua
297
- @filedownloader(
298
- "queries.tsv",
299
- url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz",
300
- checker=HashCheck("756e60d714cee28d3b552289d6272f1d", md5),
301
- )
302
- @dataset(Topics)
303
- def trec2019_test_queries(queries):
304
- return {"path": queries}
352
+ @dataset()
353
+ class Trec2019TestQueries(Dataset):
354
+ QUERIES = FileDownloader(
355
+ "queries.tsv",
356
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz",
357
+ checker=HashCheck("756e60d714cee28d3b552289d6272f1d", md5),
358
+ )
359
+
360
+ def config(self) -> Topics:
361
+ return Topics.C(path=self.QUERIES.path)
305
362
 
306
363
 
307
364
  @lua
308
- @filedownloader(
309
- "run.tsv",
310
- url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz",
311
- checker=HashCheck("ec9e012746aa9763c7ff10b3336a3ce1", md5),
312
- )
313
- @dataset(AdhocRunWithText)
314
- def trec2019_test_run(run):
315
- return {"path": run / "top1000.eval.tsv"}
365
+ @dataset()
366
+ class Trec2019TestRun(Dataset):
367
+ RUN = FileDownloader(
368
+ "run.tsv",
369
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz",
370
+ checker=HashCheck("ec9e012746aa9763c7ff10b3336a3ce1", md5),
371
+ )
372
+
373
+ def config(self) -> AdhocRunWithText:
374
+ return AdhocRunWithText.C(path=self.RUN.path / "top1000.eval.tsv")
316
375
 
317
376
 
318
377
  @lua
319
- @filedownloader(
320
- "qrels.tsv",
321
- url="https://trec.nist.gov/data/deep/2019qrels-pass.txt",
322
- checker=HashCheck("2f4be390198da108f6845c822e5ada14", md5),
323
- )
324
- @dataset(TrecAdhocAssessments)
325
- def trec2019_test_qrels(qrels):
326
- return {"path": qrels}
378
+ @dataset()
379
+ class Trec2019TestQrels(Dataset):
380
+ QRELS = FileDownloader(
381
+ "qrels.tsv",
382
+ url="https://trec.nist.gov/data/deep/2019qrels-pass.txt",
383
+ checker=HashCheck("2f4be390198da108f6845c822e5ada14", md5),
384
+ )
385
+
386
+ def config(self) -> TrecAdhocAssessments:
387
+ return TrecAdhocAssessments.C(path=self.QRELS.path)
327
388
 
328
389
 
329
390
  @lua
330
- @reference("collection", collection)
331
- @reference("topics", trec2019_test_queries)
332
- @reference("qrels", trec2019_test_qrels)
333
391
  @datatasks("information retrieval", "passage retrieval")
334
- @dataset(Adhoc, url="https://microsoft.github.io/msmarco/TREC-Deep-Learning-2019.html")
335
- def trec2019_test(topics, qrels, collection):
392
+ @dataset(url="https://microsoft.github.io/msmarco/TREC-Deep-Learning-2019.html")
393
+ class Trec2019Test(Dataset):
336
394
  "TREC Deep Learning (2019)"
337
- return {
338
- "documents": collection,
339
- "topics": topics,
340
- "assessments": qrels,
341
- }
395
+
396
+ COLLECTION = reference(varname="collection", reference=Collection)
397
+ TOPICS = reference(varname="topics", reference=Trec2019TestQueries)
398
+ QRELS = reference(varname="qrels", reference=Trec2019TestQrels)
399
+
400
+ def config(self) -> Adhoc:
401
+ return Adhoc.C(
402
+ documents=self.COLLECTION.prepare(),
403
+ topics=self.TOPICS.prepare(),
404
+ assessments=self.QRELS.prepare(),
405
+ )
342
406
 
343
407
 
344
408
  @lua
345
- @reference("trec2019", trec2019_test)
346
- @reference("run", trec2019_test_run)
347
409
  @datatasks("information retrieval", "passage retrieval")
348
- @dataset(
349
- RerankAdhoc, url="https://microsoft.github.io/msmarco/TREC-Deep-Learning-2019.html"
350
- )
351
- def trec2019_test_withrun(trec2019, run):
410
+ @dataset(url="https://microsoft.github.io/msmarco/TREC-Deep-Learning-2019.html")
411
+ class Trec2019TestWithrun(Dataset):
352
412
  """TREC Deep Learning (2019), including the top-1000 to documents to re-rank"""
353
- return {**trec2019.__arguments__(), "run": run}
413
+
414
+ TREC2019 = reference(varname="trec2019", reference=Trec2019Test)
415
+ RUN = reference(varname="run", reference=Trec2019TestRun)
416
+
417
+ def config(self) -> RerankAdhoc:
418
+ trec2019 = self.TREC2019.prepare()
419
+ return RerankAdhoc.C(**trec2019.__arguments__(), run=self.RUN.prepare())
354
420
 
355
421
 
356
422
  # ---
@@ -359,33 +425,38 @@ def trec2019_test_withrun(trec2019, run):
359
425
 
360
426
 
361
427
  @lua
362
- @filedownloader(
363
- "queries.tsv",
364
- url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz",
365
- checker=HashCheck("00a406fb0d14ed3752d70d1e4eb98600", md5),
366
- )
367
- @dataset(Topics, size="12K")
368
- def trec2020_test_queries(queries):
428
+ @dataset(size="12K")
429
+ class Trec2020TestQueries(Dataset):
369
430
  """TREC Deep Learning 2019 (topics)
370
431
 
371
432
  Topics of the TREC 2019 MS-Marco Deep Learning track"""
372
- return {"path": queries}
433
+
434
+ QUERIES = FileDownloader(
435
+ "queries.tsv",
436
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz",
437
+ checker=HashCheck("00a406fb0d14ed3752d70d1e4eb98600", md5),
438
+ )
439
+
440
+ def config(self) -> Topics:
441
+ return Topics.C(path=self.QUERIES.path)
373
442
 
374
443
 
375
444
  @lua
376
445
  @datatasks("information retrieval", "passage retrieval")
377
446
  @datatags("reranking")
378
- @filedownloader(
379
- "run.tsv",
380
- url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2020-top1000.tsv.gz",
381
- checker=HashCheck("aa6fbc51d66bd1dc745964c0e140a727", md5),
382
- )
383
447
  @dataset(
384
- AdhocRunWithText,
385
448
  url="https://microsoft.github.io/msmarco/TREC-Deep-Learning-2020.html",
386
449
  )
387
- def trec2020_test_run(run):
450
+ class Trec2020TestRun(Dataset):
388
451
  """TREC Deep Learning (2020)
389
452
 
390
453
  Set of query/passages for the passage re-ranking task re-rank (TREC 2020)"""
391
- return {"path": run / "top1000.eval.tsv"}
454
+
455
+ RUN = FileDownloader(
456
+ "run.tsv",
457
+ url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2020-top1000.tsv.gz",
458
+ checker=HashCheck("aa6fbc51d66bd1dc745964c0e140a727", md5),
459
+ )
460
+
461
+ def config(self) -> AdhocRunWithText:
462
+ return AdhocRunWithText.C(path=self.RUN.path / "top1000.eval.tsv")
@@ -1,20 +1,23 @@
1
- from datamaestro.definitions import dataset
2
- from datamaestro.download.single import filedownloader
1
+ from datamaestro.definitions import Dataset, dataset
2
+ from datamaestro.download.single import FileDownloader
3
3
  from datamaestro_text.data.text import TextFile
4
4
  from datamaestro.utils import HashCheck
5
5
 
6
6
 
7
- @filedownloader(
8
- "file",
9
- "https://oscar-public.huma-num.fr/shuffled/en_dedup.txt.gz",
10
- checker=HashCheck("5c906ede3c5265f8934b62c275a754bc"),
11
- )
12
- @dataset(TextFile, url="https://oscar-corpus.com/", size="2.3T")
13
- def english(file):
7
+ @dataset(url="https://oscar-corpus.com/", size="2.3T")
8
+ class English(Dataset):
14
9
  """Huge French corpus from INRIA
15
10
 
16
11
  OSCAR or Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus
17
12
  obtained by language classification and filtering of the Common Crawl corpus using
18
13
  the goclassy architecture.
19
14
  """
20
- return {"path": file}
15
+
16
+ FILE = FileDownloader(
17
+ "file",
18
+ "https://oscar-public.huma-num.fr/shuffled/en_dedup.txt.gz",
19
+ checker=HashCheck("5c906ede3c5265f8934b62c275a754bc"),
20
+ )
21
+
22
+ def config(self) -> TextFile:
23
+ return TextFile.C(path=self.FILE.path)