datamaestro-text 2026.1.1__py3-none-any.whl → 2026.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/__init__.py +1 -1
- datamaestro_text/config/com/github/aagohary/canard.py +27 -24
- datamaestro_text/config/com/github/apple/ml-qrecc.py +30 -25
- datamaestro_text/config/com/github/ikat.py +76 -62
- datamaestro_text/config/com/github/prdwb/orconvqa.py +41 -37
- datamaestro_text/config/com/microsoft/msmarco/passage.py +278 -207
- datamaestro_text/config/com/oscar-corpus.py +13 -10
- datamaestro_text/config/com/sentiment140.py +17 -12
- datamaestro_text/config/com/smashwords/bookcorpus.py +13 -10
- datamaestro_text/config/edu/stanford/aclimdb.py +14 -9
- datamaestro_text/config/edu/stanford/glove.py +66 -32
- datamaestro_text/config/edu/upenn/ldc/aquaint.py +35 -17
- datamaestro_text/config/fr/granddebat.py +57 -48
- datamaestro_text/config/gov/nist/ir/covid.py +62 -52
- datamaestro_text/config/gov/nist/trec/adhoc.py +395 -255
- datamaestro_text/config/gov/nist/trec/tipster.py +170 -64
- datamaestro_text/config/io/github/thunlp/fewrel.py +20 -15
- datamaestro_text/config/io/metamind/research/wikitext.py +51 -33
- datamaestro_text/config/org/grouplens/movielens.py +28 -37
- datamaestro_text/config/org/universaldependencies/french.py +16 -11
- datamaestro_text/data/conversation/__init__.py +6 -6
- datamaestro_text/data/conversation/base.py +2 -2
- datamaestro_text/data/conversation/canard.py +3 -4
- datamaestro_text/data/conversation/ikat.py +0 -1
- datamaestro_text/data/conversation/orconvqa.py +3 -3
- datamaestro_text/data/embeddings.py +1 -0
- datamaestro_text/data/ir/__init__.py +1 -1
- datamaestro_text/data/ir/base.py +1 -1
- datamaestro_text/data/ir/data.py +1 -1
- datamaestro_text/data/ir/formats.py +2 -1
- datamaestro_text/data/ir/stores.py +1 -1
- datamaestro_text/data/text.py +1 -0
- datamaestro_text/datasets/__init__.py +1 -0
- datamaestro_text/datasets/irds/data.py +1 -6
- datamaestro_text/download/tmdb.py +0 -1
- datamaestro_text/test/test_documented.py +2 -2
- datamaestro_text/transforms/ir/__init__.py +12 -13
- datamaestro_text/utils/shuffle.py +1 -1
- datamaestro_text/version.py +2 -2
- {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/METADATA +2 -8
- {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/RECORD +44 -43
- {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/WHEEL +0 -0
- {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -10,10 +10,10 @@ See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/mi
|
|
|
10
10
|
|
|
11
11
|
from datamaestro.annotations.agreement import useragreement
|
|
12
12
|
from datamaestro.data import Folder
|
|
13
|
-
from datamaestro.download.single import
|
|
13
|
+
from datamaestro.download.single import FileDownloader
|
|
14
14
|
from datamaestro.download import reference
|
|
15
|
-
from datamaestro.definitions import datatasks, datatags, dataset
|
|
16
|
-
from datamaestro.download.archive import
|
|
15
|
+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
|
|
16
|
+
from datamaestro.download.archive import TarDownloader
|
|
17
17
|
from datamaestro_text.data.ir import RerankAdhoc, Adhoc, TrainingTripletsLines
|
|
18
18
|
from datamaestro_text.data.ir.csv import (
|
|
19
19
|
Topics,
|
|
@@ -39,144 +39,172 @@ http://www.msmarco.org/dataset.aspx""",
|
|
|
39
39
|
# TODO: Not ideal since it would be better to have small versions right away
|
|
40
40
|
# instead of downloading again the MS Marco Collection
|
|
41
41
|
@lua
|
|
42
|
-
@tardownloader(
|
|
43
|
-
"data",
|
|
44
|
-
url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
|
|
45
|
-
checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
|
|
46
|
-
)
|
|
47
42
|
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
48
|
-
|
|
43
|
+
class CollectionEtc(Dataset):
|
|
49
44
|
"""Documents and some more files"""
|
|
50
|
-
|
|
45
|
+
|
|
46
|
+
DATA = TarDownloader(
|
|
47
|
+
"data",
|
|
48
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
|
|
49
|
+
checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
def config(self) -> Folder:
|
|
53
|
+
return Folder.C(path=self.DATA.path)
|
|
51
54
|
|
|
52
55
|
|
|
53
56
|
@lua
|
|
54
|
-
@
|
|
55
|
-
|
|
56
|
-
def collection(data):
|
|
57
|
+
@dataset(size="2.9GB")
|
|
58
|
+
class Collection(Dataset):
|
|
57
59
|
"""MS-Marco documents
|
|
58
60
|
|
|
59
61
|
This file contains each passage in the larger MSMARCO dataset.
|
|
60
62
|
|
|
61
|
-
Format is TSV (PID
|
|
62
|
-
|
|
63
|
+
Format is TSV (PID \\t Passage)"""
|
|
64
|
+
|
|
65
|
+
DATA = reference(varname="data", reference=CollectionEtc)
|
|
66
|
+
|
|
67
|
+
def config(self) -> Documents:
|
|
68
|
+
return Documents.C(path=self.DATA.prepare().path / "collection.tsv")
|
|
63
69
|
|
|
64
70
|
|
|
65
71
|
# --- Train
|
|
66
72
|
|
|
67
73
|
|
|
68
74
|
@lua
|
|
69
|
-
@
|
|
70
|
-
|
|
71
|
-
url="https://msmarco.blob.core.windows.net/msmarcoranking/top1000.train.tar.gz",
|
|
72
|
-
checker=HashCheck("d99fdbd5b2ea84af8aa23194a3263052", md5),
|
|
73
|
-
)
|
|
74
|
-
@dataset(AdhocRunWithText, size="2.5GB")
|
|
75
|
-
def train_run(run):
|
|
75
|
+
@dataset(size="2.5GB")
|
|
76
|
+
class TrainRun(Dataset):
|
|
76
77
|
"""
|
|
77
78
|
|
|
78
79
|
TSV format: qid, pid, query, passage
|
|
79
80
|
"""
|
|
80
|
-
|
|
81
|
+
|
|
82
|
+
RUN = TarDownloader(
|
|
83
|
+
"run",
|
|
84
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/top1000.train.tar.gz",
|
|
85
|
+
checker=HashCheck("d99fdbd5b2ea84af8aa23194a3263052", md5),
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
def config(self) -> AdhocRunWithText:
|
|
89
|
+
return AdhocRunWithText.C(path=self.RUN.path / "top1000.train.tsv")
|
|
81
90
|
|
|
82
91
|
|
|
83
92
|
@lua
|
|
84
|
-
@
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
+
@dataset()
|
|
94
|
+
class TrainQueries(Dataset):
|
|
95
|
+
QUERIES = TarDownloader(
|
|
96
|
+
"queries",
|
|
97
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz",
|
|
98
|
+
files=["queries.train.tsv"],
|
|
99
|
+
checker=HashCheck("c177b2795d5f2dcc524cf00fcd973be1", md5),
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
def config(self) -> Topics:
|
|
103
|
+
return Topics.C(path=self.QUERIES.path / "queries.train.tsv")
|
|
93
104
|
|
|
94
105
|
|
|
95
106
|
@lua
|
|
96
|
-
@
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
107
|
+
@dataset(size="10.1MB")
|
|
108
|
+
class TrainQrels(Dataset):
|
|
109
|
+
QRELS = FileDownloader(
|
|
110
|
+
"qrels.tsv",
|
|
111
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv",
|
|
112
|
+
checker=HashCheck("733fb9fe12d93e497f7289409316eccf", md5),
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
def config(self) -> TrecAdhocAssessments:
|
|
116
|
+
return TrecAdhocAssessments.C(path=self.QRELS.path)
|
|
104
117
|
|
|
105
118
|
|
|
106
119
|
@lua
|
|
107
|
-
@reference("collection", collection)
|
|
108
|
-
@reference("topics", train_queries)
|
|
109
|
-
@reference("qrels", train_qrels)
|
|
110
120
|
@datatasks("information retrieval", "passage retrieval")
|
|
111
|
-
@dataset(
|
|
112
|
-
|
|
121
|
+
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
122
|
+
class Train(Dataset):
|
|
113
123
|
"""MS-Marco train dataset"""
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
124
|
+
|
|
125
|
+
COLLECTION = reference(varname="collection", reference=Collection)
|
|
126
|
+
TOPICS = reference(varname="topics", reference=TrainQueries)
|
|
127
|
+
QRELS = reference(varname="qrels", reference=TrainQrels)
|
|
128
|
+
|
|
129
|
+
def config(self) -> Adhoc:
|
|
130
|
+
return Adhoc.C(
|
|
131
|
+
documents=self.COLLECTION.prepare(),
|
|
132
|
+
topics=self.TOPICS.prepare(),
|
|
133
|
+
assessments=self.QRELS.prepare(),
|
|
134
|
+
)
|
|
119
135
|
|
|
120
136
|
|
|
121
137
|
@lua
|
|
122
|
-
@reference("train", train)
|
|
123
|
-
@reference("run", train_run)
|
|
124
138
|
@datatasks("information retrieval", "passage retrieval")
|
|
125
|
-
@dataset(
|
|
126
|
-
|
|
139
|
+
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
140
|
+
class TrainWithrun(Dataset):
|
|
127
141
|
"""MSMarco train dataset, including the top-1000 to documents to re-rank"""
|
|
128
|
-
|
|
142
|
+
|
|
143
|
+
TRAIN = reference(varname="train", reference=Train)
|
|
144
|
+
RUN = reference(varname="run", reference=TrainRun)
|
|
145
|
+
|
|
146
|
+
def config(self) -> RerankAdhoc:
|
|
147
|
+
train = self.TRAIN.prepare()
|
|
148
|
+
return RerankAdhoc.C(**train.__arguments__(), run=self.RUN.prepare())
|
|
129
149
|
|
|
130
150
|
|
|
131
151
|
# Training triplets
|
|
132
152
|
|
|
133
153
|
|
|
134
|
-
@filedownloader(
|
|
135
|
-
"triples.tsv",
|
|
136
|
-
size=1_841_693_309,
|
|
137
|
-
url="https://msmarco.blob.core.windows.net/msmarcoranking/qidpidtriples.train.full.2.tsv.gz",
|
|
138
|
-
checker=HashCheck("4e58f45f82f3fe99e3239ecffd8ed371", md5),
|
|
139
|
-
)
|
|
140
154
|
@dataset(
|
|
141
|
-
TrainingTripletsLines,
|
|
142
155
|
url="https://github.com/microsoft/MSMARCO-Passage-Ranking",
|
|
143
156
|
size="5.7GB",
|
|
144
157
|
)
|
|
145
|
-
|
|
158
|
+
class TrainIdtriples(Dataset):
|
|
146
159
|
"""Full training triples (query, positive passage, negative passage) with IDs"""
|
|
147
|
-
|
|
160
|
+
|
|
161
|
+
TRIPLES = FileDownloader(
|
|
162
|
+
"triples.tsv",
|
|
163
|
+
size=1_841_693_309,
|
|
164
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/qidpidtriples.train.full.2.tsv.gz",
|
|
165
|
+
checker=HashCheck("4e58f45f82f3fe99e3239ecffd8ed371", md5),
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
def config(self) -> TrainingTripletsLines:
|
|
169
|
+
return TrainingTripletsLines.C(
|
|
170
|
+
path=self.TRIPLES.path, doc_ids=True, topic_ids=True
|
|
171
|
+
)
|
|
148
172
|
|
|
149
173
|
|
|
150
|
-
@filedownloader(
|
|
151
|
-
"triples.tsv",
|
|
152
|
-
size=7_930_881_353,
|
|
153
|
-
url="https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.small.tar.gz",
|
|
154
|
-
checker=HashCheck("c13bf99ff23ca691105ad12eab837f84", md5),
|
|
155
|
-
)
|
|
156
174
|
@dataset(
|
|
157
|
-
TrainingTripletsLines,
|
|
158
175
|
url="https://github.com/microsoft/MSMARCO-Passage-Ranking",
|
|
159
176
|
size="27.1GB",
|
|
160
177
|
)
|
|
161
|
-
|
|
178
|
+
class TrainTexttriplesSmall(Dataset):
|
|
162
179
|
"""Small training triples (query, positive passage, negative passage) with text"""
|
|
163
|
-
|
|
180
|
+
|
|
181
|
+
TRIPLES = FileDownloader(
|
|
182
|
+
"triples.tsv",
|
|
183
|
+
size=7_930_881_353,
|
|
184
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.small.tar.gz",
|
|
185
|
+
checker=HashCheck("c13bf99ff23ca691105ad12eab837f84", md5),
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
def config(self) -> TrainingTripletsLines:
|
|
189
|
+
return TrainingTripletsLines.C(path=self.TRIPLES.path)
|
|
164
190
|
|
|
165
191
|
|
|
166
|
-
@filedownloader(
|
|
167
|
-
"triples.tsv",
|
|
168
|
-
size=77_877_731_328,
|
|
169
|
-
url="https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.full.tar.gz",
|
|
170
|
-
checker=HashCheck("8d509d484ea1971e792b812ae4800c6f", md5),
|
|
171
|
-
)
|
|
172
192
|
@dataset(
|
|
173
|
-
TrainingTripletsLines,
|
|
174
193
|
url="https://github.com/microsoft/MSMARCO-Passage-Ranking",
|
|
175
194
|
size="272.2GB",
|
|
176
195
|
)
|
|
177
|
-
|
|
196
|
+
class TrainTexttripleFull(Dataset):
|
|
178
197
|
"""Full training triples (query, positive passage, negative passage) with text"""
|
|
179
|
-
|
|
198
|
+
|
|
199
|
+
TRIPLES = FileDownloader(
|
|
200
|
+
"triples.tsv",
|
|
201
|
+
size=77_877_731_328,
|
|
202
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.full.tar.gz",
|
|
203
|
+
checker=HashCheck("8d509d484ea1971e792b812ae4800c6f", md5),
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
def config(self) -> TrainingTripletsLines:
|
|
207
|
+
return TrainingTripletsLines.C(path=self.TRIPLES.path)
|
|
180
208
|
|
|
181
209
|
|
|
182
210
|
# ---
|
|
@@ -185,73 +213,88 @@ def train_texttriples_full(triples):
|
|
|
185
213
|
|
|
186
214
|
|
|
187
215
|
@lua
|
|
188
|
-
@
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
216
|
+
@dataset()
|
|
217
|
+
class DevQueries(Dataset):
|
|
218
|
+
QUERIES = TarDownloader(
|
|
219
|
+
"queries",
|
|
220
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz",
|
|
221
|
+
files=["queries.dev.tsv"],
|
|
222
|
+
checker=HashCheck("c177b2795d5f2dcc524cf00fcd973be1", md5),
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
def config(self) -> Topics:
|
|
226
|
+
return Topics.C(path=self.QUERIES.path / "queries.dev.tsv")
|
|
197
227
|
|
|
198
228
|
|
|
199
229
|
@lua
|
|
200
|
-
@
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
230
|
+
@dataset()
|
|
231
|
+
class DevRun(Dataset):
|
|
232
|
+
RUN = TarDownloader(
|
|
233
|
+
"run",
|
|
234
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/top1000.dev.tar.gz",
|
|
235
|
+
checker=HashCheck("8c140662bdf123a98fbfe3bb174c5831", md5),
|
|
236
|
+
)
|
|
237
|
+
|
|
238
|
+
def config(self) -> AdhocRunWithText:
|
|
239
|
+
return AdhocRunWithText.C(path=self.RUN.path / "top1000.eval.tsv")
|
|
208
240
|
|
|
209
241
|
|
|
210
242
|
@lua
|
|
211
|
-
@
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
243
|
+
@dataset()
|
|
244
|
+
class DevQrels(Dataset):
|
|
245
|
+
QRELS = FileDownloader(
|
|
246
|
+
"qrels.tsv",
|
|
247
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/qrels.dev.tsv",
|
|
248
|
+
checker=HashCheck("9157ccaeaa8227f91722ba5770787b16", md5),
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
def config(self) -> TrecAdhocAssessments:
|
|
252
|
+
return TrecAdhocAssessments.C(path=self.QRELS.path)
|
|
219
253
|
|
|
220
254
|
|
|
221
255
|
@lua
|
|
222
|
-
@reference("collection", collection)
|
|
223
|
-
@reference("topics", dev_queries)
|
|
224
|
-
@reference("qrels", dev_qrels)
|
|
225
256
|
@datatasks("information retrieval", "passage retrieval")
|
|
226
|
-
@dataset(
|
|
227
|
-
|
|
257
|
+
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
258
|
+
class Dev(Dataset):
|
|
228
259
|
"""MS-Marco dev dataset"""
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
260
|
+
|
|
261
|
+
COLLECTION = reference(varname="collection", reference=Collection)
|
|
262
|
+
TOPICS = reference(varname="topics", reference=DevQueries)
|
|
263
|
+
QRELS = reference(varname="qrels", reference=DevQrels)
|
|
264
|
+
|
|
265
|
+
def config(self) -> Adhoc:
|
|
266
|
+
return Adhoc.C(
|
|
267
|
+
documents=self.COLLECTION.prepare(),
|
|
268
|
+
topics=self.TOPICS.prepare(),
|
|
269
|
+
assessments=self.QRELS.prepare(),
|
|
270
|
+
)
|
|
234
271
|
|
|
235
272
|
|
|
236
273
|
@lua
|
|
237
|
-
@reference("dev", dev)
|
|
238
|
-
@reference("run", dev_run)
|
|
239
274
|
@datatasks("information retrieval", "passage retrieval")
|
|
240
|
-
@dataset(
|
|
241
|
-
|
|
275
|
+
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
276
|
+
class DevWithrun(Dataset):
|
|
242
277
|
"""MSMarco dev dataset, including the top-1000 to documents to re-rank"""
|
|
243
|
-
|
|
278
|
+
|
|
279
|
+
DEV = reference(varname="dev", reference=Dev)
|
|
280
|
+
RUN = reference(varname="run", reference=DevRun)
|
|
281
|
+
|
|
282
|
+
def config(self) -> RerankAdhoc:
|
|
283
|
+
dev = self.DEV.prepare()
|
|
284
|
+
return RerankAdhoc.C(**dev.__arguments__(), run=self.RUN.prepare())
|
|
244
285
|
|
|
245
286
|
|
|
246
287
|
@lua
|
|
247
|
-
@
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
288
|
+
@dataset()
|
|
289
|
+
class EvalWithrun(Dataset):
|
|
290
|
+
RUN = TarDownloader(
|
|
291
|
+
"run",
|
|
292
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/top1000.eval.tar.gz",
|
|
293
|
+
checker=HashCheck("73778cd99f6e0632d12d0b5731b20a02", md5),
|
|
294
|
+
)
|
|
295
|
+
|
|
296
|
+
def config(self) -> AdhocRunWithText:
|
|
297
|
+
return AdhocRunWithText.C(path=self.RUN.path / "top1000.eval.tsv")
|
|
255
298
|
|
|
256
299
|
|
|
257
300
|
# ---
|
|
@@ -260,32 +303,44 @@ def eval_withrun(run):
|
|
|
260
303
|
# ---
|
|
261
304
|
|
|
262
305
|
|
|
263
|
-
@
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
return {"path": data.path / "queries.dev.small.tsv"}
|
|
306
|
+
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
307
|
+
class DevSmallQueries(Dataset):
|
|
308
|
+
DATA = reference(varname="data", reference=CollectionEtc)
|
|
267
309
|
|
|
310
|
+
def config(self) -> Topics:
|
|
311
|
+
return Topics.C(path=self.DATA.prepare().path / "queries.dev.small.tsv")
|
|
268
312
|
|
|
269
|
-
@reference("data", collection_etc)
|
|
270
|
-
@dataset(
|
|
271
|
-
TrecAdhocAssessments, url="https://github.com/microsoft/MSMARCO-Passage-Ranking"
|
|
272
|
-
)
|
|
273
|
-
def dev_small_qrels(data):
|
|
274
|
-
return {"path": data.path / "qrels.dev.small.tsv"}
|
|
275
313
|
|
|
314
|
+
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
315
|
+
class DevSmallQrels(Dataset):
|
|
316
|
+
DATA = reference(varname="data", reference=CollectionEtc)
|
|
317
|
+
|
|
318
|
+
def config(self) -> TrecAdhocAssessments:
|
|
319
|
+
return TrecAdhocAssessments.C(
|
|
320
|
+
path=self.DATA.prepare().path / "qrels.dev.small.tsv"
|
|
321
|
+
)
|
|
276
322
|
|
|
277
|
-
@reference("topics", dev_small_queries)
|
|
278
|
-
@reference("qrels", dev_small_qrels)
|
|
279
|
-
@reference("collection", collection)
|
|
280
|
-
@dataset(Adhoc, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
281
|
-
def dev_small(collection, topics, qrels):
|
|
282
|
-
return {"documents": collection, "topics": topics, "assessments": qrels}
|
|
283
323
|
|
|
324
|
+
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
325
|
+
class DevSmall(Dataset):
|
|
326
|
+
TOPICS = reference(varname="topics", reference=DevSmallQueries)
|
|
327
|
+
QRELS = reference(varname="qrels", reference=DevSmallQrels)
|
|
328
|
+
COLLECTION = reference(varname="collection", reference=Collection)
|
|
284
329
|
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
330
|
+
def config(self) -> Adhoc:
|
|
331
|
+
return Adhoc.C(
|
|
332
|
+
documents=self.COLLECTION.prepare(),
|
|
333
|
+
topics=self.TOPICS.prepare(),
|
|
334
|
+
assessments=self.QRELS.prepare(),
|
|
335
|
+
)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
339
|
+
class EvalQueriesSmall(Dataset):
|
|
340
|
+
DATA = reference(varname="data", reference=CollectionEtc)
|
|
341
|
+
|
|
342
|
+
def config(self) -> Topics:
|
|
343
|
+
return Topics.C(path=self.DATA.prepare().path / "queries.eval.small.tsv")
|
|
289
344
|
|
|
290
345
|
|
|
291
346
|
# ---
|
|
@@ -294,63 +349,74 @@ def eval_queries_small(data):
|
|
|
294
349
|
|
|
295
350
|
|
|
296
351
|
@lua
|
|
297
|
-
@
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
352
|
+
@dataset()
|
|
353
|
+
class Trec2019TestQueries(Dataset):
|
|
354
|
+
QUERIES = FileDownloader(
|
|
355
|
+
"queries.tsv",
|
|
356
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz",
|
|
357
|
+
checker=HashCheck("756e60d714cee28d3b552289d6272f1d", md5),
|
|
358
|
+
)
|
|
359
|
+
|
|
360
|
+
def config(self) -> Topics:
|
|
361
|
+
return Topics.C(path=self.QUERIES.path)
|
|
305
362
|
|
|
306
363
|
|
|
307
364
|
@lua
|
|
308
|
-
@
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
365
|
+
@dataset()
|
|
366
|
+
class Trec2019TestRun(Dataset):
|
|
367
|
+
RUN = FileDownloader(
|
|
368
|
+
"run.tsv",
|
|
369
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz",
|
|
370
|
+
checker=HashCheck("ec9e012746aa9763c7ff10b3336a3ce1", md5),
|
|
371
|
+
)
|
|
372
|
+
|
|
373
|
+
def config(self) -> AdhocRunWithText:
|
|
374
|
+
return AdhocRunWithText.C(path=self.RUN.path / "top1000.eval.tsv")
|
|
316
375
|
|
|
317
376
|
|
|
318
377
|
@lua
|
|
319
|
-
@
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
378
|
+
@dataset()
|
|
379
|
+
class Trec2019TestQrels(Dataset):
|
|
380
|
+
QRELS = FileDownloader(
|
|
381
|
+
"qrels.tsv",
|
|
382
|
+
url="https://trec.nist.gov/data/deep/2019qrels-pass.txt",
|
|
383
|
+
checker=HashCheck("2f4be390198da108f6845c822e5ada14", md5),
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
def config(self) -> TrecAdhocAssessments:
|
|
387
|
+
return TrecAdhocAssessments.C(path=self.QRELS.path)
|
|
327
388
|
|
|
328
389
|
|
|
329
390
|
@lua
|
|
330
|
-
@reference("collection", collection)
|
|
331
|
-
@reference("topics", trec2019_test_queries)
|
|
332
|
-
@reference("qrels", trec2019_test_qrels)
|
|
333
391
|
@datatasks("information retrieval", "passage retrieval")
|
|
334
|
-
@dataset(
|
|
335
|
-
|
|
392
|
+
@dataset(url="https://microsoft.github.io/msmarco/TREC-Deep-Learning-2019.html")
|
|
393
|
+
class Trec2019Test(Dataset):
|
|
336
394
|
"TREC Deep Learning (2019)"
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
395
|
+
|
|
396
|
+
COLLECTION = reference(varname="collection", reference=Collection)
|
|
397
|
+
TOPICS = reference(varname="topics", reference=Trec2019TestQueries)
|
|
398
|
+
QRELS = reference(varname="qrels", reference=Trec2019TestQrels)
|
|
399
|
+
|
|
400
|
+
def config(self) -> Adhoc:
|
|
401
|
+
return Adhoc.C(
|
|
402
|
+
documents=self.COLLECTION.prepare(),
|
|
403
|
+
topics=self.TOPICS.prepare(),
|
|
404
|
+
assessments=self.QRELS.prepare(),
|
|
405
|
+
)
|
|
342
406
|
|
|
343
407
|
|
|
344
408
|
@lua
|
|
345
|
-
@reference("trec2019", trec2019_test)
|
|
346
|
-
@reference("run", trec2019_test_run)
|
|
347
409
|
@datatasks("information retrieval", "passage retrieval")
|
|
348
|
-
@dataset(
|
|
349
|
-
|
|
350
|
-
)
|
|
351
|
-
def trec2019_test_withrun(trec2019, run):
|
|
410
|
+
@dataset(url="https://microsoft.github.io/msmarco/TREC-Deep-Learning-2019.html")
|
|
411
|
+
class Trec2019TestWithrun(Dataset):
|
|
352
412
|
"""TREC Deep Learning (2019), including the top-1000 to documents to re-rank"""
|
|
353
|
-
|
|
413
|
+
|
|
414
|
+
TREC2019 = reference(varname="trec2019", reference=Trec2019Test)
|
|
415
|
+
RUN = reference(varname="run", reference=Trec2019TestRun)
|
|
416
|
+
|
|
417
|
+
def config(self) -> RerankAdhoc:
|
|
418
|
+
trec2019 = self.TREC2019.prepare()
|
|
419
|
+
return RerankAdhoc.C(**trec2019.__arguments__(), run=self.RUN.prepare())
|
|
354
420
|
|
|
355
421
|
|
|
356
422
|
# ---
|
|
@@ -359,33 +425,38 @@ def trec2019_test_withrun(trec2019, run):
|
|
|
359
425
|
|
|
360
426
|
|
|
361
427
|
@lua
|
|
362
|
-
@
|
|
363
|
-
|
|
364
|
-
url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz",
|
|
365
|
-
checker=HashCheck("00a406fb0d14ed3752d70d1e4eb98600", md5),
|
|
366
|
-
)
|
|
367
|
-
@dataset(Topics, size="12K")
|
|
368
|
-
def trec2020_test_queries(queries):
|
|
428
|
+
@dataset(size="12K")
|
|
429
|
+
class Trec2020TestQueries(Dataset):
|
|
369
430
|
"""TREC Deep Learning 2019 (topics)
|
|
370
431
|
|
|
371
432
|
Topics of the TREC 2019 MS-Marco Deep Learning track"""
|
|
372
|
-
|
|
433
|
+
|
|
434
|
+
QUERIES = FileDownloader(
|
|
435
|
+
"queries.tsv",
|
|
436
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2020-queries.tsv.gz",
|
|
437
|
+
checker=HashCheck("00a406fb0d14ed3752d70d1e4eb98600", md5),
|
|
438
|
+
)
|
|
439
|
+
|
|
440
|
+
def config(self) -> Topics:
|
|
441
|
+
return Topics.C(path=self.QUERIES.path)
|
|
373
442
|
|
|
374
443
|
|
|
375
444
|
@lua
|
|
376
445
|
@datatasks("information retrieval", "passage retrieval")
|
|
377
446
|
@datatags("reranking")
|
|
378
|
-
@filedownloader(
|
|
379
|
-
"run.tsv",
|
|
380
|
-
url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2020-top1000.tsv.gz",
|
|
381
|
-
checker=HashCheck("aa6fbc51d66bd1dc745964c0e140a727", md5),
|
|
382
|
-
)
|
|
383
447
|
@dataset(
|
|
384
|
-
AdhocRunWithText,
|
|
385
448
|
url="https://microsoft.github.io/msmarco/TREC-Deep-Learning-2020.html",
|
|
386
449
|
)
|
|
387
|
-
|
|
450
|
+
class Trec2020TestRun(Dataset):
|
|
388
451
|
"""TREC Deep Learning (2020)
|
|
389
452
|
|
|
390
453
|
Set of query/passages for the passage re-ranking task re-rank (TREC 2020)"""
|
|
391
|
-
|
|
454
|
+
|
|
455
|
+
RUN = FileDownloader(
|
|
456
|
+
"run.tsv",
|
|
457
|
+
url="https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2020-top1000.tsv.gz",
|
|
458
|
+
checker=HashCheck("aa6fbc51d66bd1dc745964c0e140a727", md5),
|
|
459
|
+
)
|
|
460
|
+
|
|
461
|
+
def config(self) -> AdhocRunWithText:
|
|
462
|
+
return AdhocRunWithText.C(path=self.RUN.path / "top1000.eval.tsv")
|
|
@@ -1,20 +1,23 @@
|
|
|
1
|
-
from datamaestro.definitions import
|
|
2
|
-
from datamaestro.download.single import
|
|
1
|
+
from datamaestro.definitions import Dataset, dataset
|
|
2
|
+
from datamaestro.download.single import FileDownloader
|
|
3
3
|
from datamaestro_text.data.text import TextFile
|
|
4
4
|
from datamaestro.utils import HashCheck
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
@
|
|
8
|
-
|
|
9
|
-
"https://oscar-public.huma-num.fr/shuffled/en_dedup.txt.gz",
|
|
10
|
-
checker=HashCheck("5c906ede3c5265f8934b62c275a754bc"),
|
|
11
|
-
)
|
|
12
|
-
@dataset(TextFile, url="https://oscar-corpus.com/", size="2.3T")
|
|
13
|
-
def english(file):
|
|
7
|
+
@dataset(url="https://oscar-corpus.com/", size="2.3T")
|
|
8
|
+
class English(Dataset):
|
|
14
9
|
"""Huge French corpus from INRIA
|
|
15
10
|
|
|
16
11
|
OSCAR or Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus
|
|
17
12
|
obtained by language classification and filtering of the Common Crawl corpus using
|
|
18
13
|
the goclassy architecture.
|
|
19
14
|
"""
|
|
20
|
-
|
|
15
|
+
|
|
16
|
+
FILE = FileDownloader(
|
|
17
|
+
"file",
|
|
18
|
+
"https://oscar-public.huma-num.fr/shuffled/en_dedup.txt.gz",
|
|
19
|
+
checker=HashCheck("5c906ede3c5265f8934b62c275a754bc"),
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
def config(self) -> TextFile:
|
|
23
|
+
return TextFile.C(path=self.FILE.path)
|