datamaestro-text 2026.1.1__py3-none-any.whl → 2026.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/__init__.py +1 -1
- datamaestro_text/config/com/github/aagohary/canard.py +27 -24
- datamaestro_text/config/com/github/apple/ml-qrecc.py +30 -25
- datamaestro_text/config/com/github/ikat.py +76 -62
- datamaestro_text/config/com/github/prdwb/orconvqa.py +41 -37
- datamaestro_text/config/com/microsoft/msmarco/passage.py +278 -207
- datamaestro_text/config/com/oscar-corpus.py +13 -10
- datamaestro_text/config/com/sentiment140.py +17 -12
- datamaestro_text/config/com/smashwords/bookcorpus.py +13 -10
- datamaestro_text/config/edu/stanford/aclimdb.py +14 -9
- datamaestro_text/config/edu/stanford/glove.py +66 -32
- datamaestro_text/config/edu/upenn/ldc/aquaint.py +35 -17
- datamaestro_text/config/fr/granddebat.py +57 -48
- datamaestro_text/config/gov/nist/ir/covid.py +62 -52
- datamaestro_text/config/gov/nist/trec/adhoc.py +395 -255
- datamaestro_text/config/gov/nist/trec/tipster.py +170 -64
- datamaestro_text/config/io/github/thunlp/fewrel.py +20 -15
- datamaestro_text/config/io/metamind/research/wikitext.py +51 -33
- datamaestro_text/config/org/grouplens/movielens.py +28 -37
- datamaestro_text/config/org/universaldependencies/french.py +16 -11
- datamaestro_text/data/conversation/__init__.py +6 -6
- datamaestro_text/data/conversation/base.py +2 -2
- datamaestro_text/data/conversation/canard.py +3 -4
- datamaestro_text/data/conversation/ikat.py +0 -1
- datamaestro_text/data/conversation/orconvqa.py +3 -3
- datamaestro_text/data/embeddings.py +1 -0
- datamaestro_text/data/ir/__init__.py +1 -1
- datamaestro_text/data/ir/base.py +1 -1
- datamaestro_text/data/ir/data.py +1 -1
- datamaestro_text/data/ir/formats.py +2 -1
- datamaestro_text/data/ir/stores.py +1 -1
- datamaestro_text/data/text.py +1 -0
- datamaestro_text/datasets/__init__.py +1 -0
- datamaestro_text/datasets/irds/data.py +1 -6
- datamaestro_text/download/tmdb.py +0 -1
- datamaestro_text/test/test_documented.py +2 -2
- datamaestro_text/transforms/ir/__init__.py +12 -13
- datamaestro_text/utils/shuffle.py +1 -1
- datamaestro_text/version.py +2 -2
- {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/METADATA +2 -8
- {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/RECORD +44 -43
- {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/WHEEL +0 -0
- {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -20,165 +20,271 @@ See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data
|
|
|
20
20
|
from datamaestro_text.data.ir.trec import TipsterCollection
|
|
21
21
|
from datamaestro.download.links import linkfolder
|
|
22
22
|
from datamaestro.definitions import (
|
|
23
|
+
Dataset,
|
|
23
24
|
dataset,
|
|
24
25
|
)
|
|
25
26
|
from datamaestro.context import DatafolderPath
|
|
26
27
|
|
|
27
28
|
# Store meta-information
|
|
28
|
-
TIPSTER = dataset(
|
|
29
|
+
TIPSTER = dataset(url="https://catalog.ldc.upenn.edu/LDC93T3A")
|
|
29
30
|
|
|
30
31
|
|
|
31
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/AP")])
|
|
32
32
|
@TIPSTER
|
|
33
|
-
|
|
33
|
+
class Ap88(Dataset):
|
|
34
34
|
"""Associated Press document collection (1988)"""
|
|
35
|
-
|
|
35
|
+
|
|
36
|
+
DOCUMENTS = linkfolder(
|
|
37
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/AP")]
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
def config(self) -> TipsterCollection:
|
|
41
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
36
42
|
|
|
37
43
|
|
|
38
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/AP")])
|
|
39
44
|
@TIPSTER
|
|
40
|
-
|
|
45
|
+
class Ap89(Dataset):
|
|
41
46
|
"""Associated Press document collection (1989)"""
|
|
42
|
-
|
|
47
|
+
|
|
48
|
+
DOCUMENTS = linkfolder(
|
|
49
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/AP")]
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
def config(self) -> TipsterCollection:
|
|
53
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
43
54
|
|
|
44
55
|
|
|
45
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk3/AP")])
|
|
46
56
|
@TIPSTER
|
|
47
|
-
|
|
57
|
+
class Ap90(Dataset):
|
|
48
58
|
"""Associated Press document collection (1990)"""
|
|
49
|
-
|
|
59
|
+
|
|
60
|
+
DOCUMENTS = linkfolder(
|
|
61
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk3/AP")]
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
def config(self) -> TipsterCollection:
|
|
65
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
50
66
|
|
|
51
67
|
|
|
52
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/DOE")])
|
|
53
68
|
@TIPSTER
|
|
54
|
-
|
|
69
|
+
class Doe1(Dataset):
|
|
55
70
|
"""Department of Energy documents"""
|
|
56
|
-
|
|
71
|
+
|
|
72
|
+
DOCUMENTS = linkfolder(
|
|
73
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/DOE")]
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
def config(self) -> TipsterCollection:
|
|
77
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
57
78
|
|
|
58
79
|
|
|
59
80
|
# --- Wall Street Journal (1987-92)
|
|
60
81
|
|
|
61
82
|
|
|
62
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/WSJ/1987")])
|
|
63
83
|
@TIPSTER
|
|
64
|
-
|
|
84
|
+
class Wsj87(Dataset):
|
|
65
85
|
"""Wall Street Journal (1987)"""
|
|
66
|
-
|
|
86
|
+
|
|
87
|
+
DOCUMENTS = linkfolder(
|
|
88
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/WSJ/1987")]
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
def config(self) -> TipsterCollection:
|
|
92
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
67
93
|
|
|
68
94
|
|
|
69
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/WSJ/1988")])
|
|
70
95
|
@TIPSTER
|
|
71
|
-
|
|
96
|
+
class Wsj88(Dataset):
|
|
72
97
|
"""Wall Street Journal (1988)"""
|
|
73
|
-
|
|
98
|
+
|
|
99
|
+
DOCUMENTS = linkfolder(
|
|
100
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/WSJ/1988")]
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
def config(self) -> TipsterCollection:
|
|
104
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
74
105
|
|
|
75
106
|
|
|
76
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/WSJ/1989")])
|
|
77
107
|
@TIPSTER
|
|
78
|
-
|
|
108
|
+
class Wsj89(Dataset):
|
|
79
109
|
"""Wall Street Journal (1989)"""
|
|
80
|
-
|
|
110
|
+
|
|
111
|
+
DOCUMENTS = linkfolder(
|
|
112
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/WSJ/1989")]
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
def config(self) -> TipsterCollection:
|
|
116
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
81
117
|
|
|
82
118
|
|
|
83
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/WSJ/1990")])
|
|
84
119
|
@TIPSTER
|
|
85
|
-
|
|
120
|
+
class Wsj90(Dataset):
|
|
86
121
|
"""Wall Street Journal (1990)"""
|
|
87
|
-
|
|
122
|
+
|
|
123
|
+
DOCUMENTS = linkfolder(
|
|
124
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/WSJ/1990")]
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
def config(self) -> TipsterCollection:
|
|
128
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
88
129
|
|
|
89
130
|
|
|
90
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/WSJ/1991")])
|
|
91
131
|
@TIPSTER
|
|
92
|
-
|
|
132
|
+
class Wsj91(Dataset):
|
|
93
133
|
"""Wall Street Journal (1991)"""
|
|
94
|
-
|
|
134
|
+
|
|
135
|
+
DOCUMENTS = linkfolder(
|
|
136
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/WSJ/1991")]
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
def config(self) -> TipsterCollection:
|
|
140
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
95
141
|
|
|
96
142
|
|
|
97
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/WSJ/1992")])
|
|
98
143
|
@TIPSTER
|
|
99
|
-
|
|
144
|
+
class Wsj92(Dataset):
|
|
100
145
|
"""Wall Street Journal (1992)"""
|
|
101
|
-
|
|
146
|
+
|
|
147
|
+
DOCUMENTS = linkfolder(
|
|
148
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/WSJ/1992")]
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
def config(self) -> TipsterCollection:
|
|
152
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
102
153
|
|
|
103
154
|
|
|
104
155
|
# --- Federal Register (1988-89)
|
|
105
156
|
|
|
106
157
|
|
|
107
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/FR")])
|
|
108
158
|
@TIPSTER
|
|
109
|
-
|
|
159
|
+
class Fr88(Dataset):
|
|
110
160
|
"""Federal Register (1988)"""
|
|
111
|
-
|
|
161
|
+
|
|
162
|
+
DOCUMENTS = linkfolder(
|
|
163
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/FR")]
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
def config(self) -> TipsterCollection:
|
|
167
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
112
168
|
|
|
113
169
|
|
|
114
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/FR")])
|
|
115
170
|
@TIPSTER
|
|
116
|
-
|
|
171
|
+
class Fr89(Dataset):
|
|
117
172
|
"""Federal Register (1989)"""
|
|
118
|
-
|
|
173
|
+
|
|
174
|
+
DOCUMENTS = linkfolder(
|
|
175
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/FR")]
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
def config(self) -> TipsterCollection:
|
|
179
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
119
180
|
|
|
120
181
|
|
|
121
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk4/FR94")])
|
|
122
182
|
@TIPSTER
|
|
123
|
-
|
|
183
|
+
class Fr94(Dataset):
|
|
124
184
|
"""Federal Register (1994)"""
|
|
125
|
-
|
|
185
|
+
|
|
186
|
+
DOCUMENTS = linkfolder(
|
|
187
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk4/FR94")]
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
def config(self) -> TipsterCollection:
|
|
191
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
126
192
|
|
|
127
193
|
|
|
128
194
|
# # ZIFF (1988-92)
|
|
129
195
|
|
|
130
196
|
|
|
131
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/ZIFF")])
|
|
132
197
|
@TIPSTER
|
|
133
|
-
|
|
198
|
+
class Ziff1(Dataset):
|
|
134
199
|
"""Information from the Computer Select disks (1989-90)"""
|
|
135
|
-
|
|
200
|
+
|
|
201
|
+
DOCUMENTS = linkfolder(
|
|
202
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/ZIFF")]
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
def config(self) -> TipsterCollection:
|
|
206
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
136
207
|
|
|
137
208
|
|
|
138
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/ZIFF")])
|
|
139
209
|
@TIPSTER
|
|
140
|
-
|
|
210
|
+
class Ziff2(Dataset):
|
|
141
211
|
"""Information from the Computer Select disks (1989-90)"""
|
|
142
|
-
|
|
212
|
+
|
|
213
|
+
DOCUMENTS = linkfolder(
|
|
214
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/ZIFF")]
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
def config(self) -> TipsterCollection:
|
|
218
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
143
219
|
|
|
144
220
|
|
|
145
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk3/ZIFF")])
|
|
146
221
|
@TIPSTER
|
|
147
|
-
|
|
222
|
+
class Ziff3(Dataset):
|
|
148
223
|
"""Information from the Computer Select disks (1990-91)"""
|
|
149
|
-
|
|
224
|
+
|
|
225
|
+
DOCUMENTS = linkfolder(
|
|
226
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk3/ZIFF")]
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
def config(self) -> TipsterCollection:
|
|
230
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
150
231
|
|
|
151
232
|
|
|
152
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk3/SJM")])
|
|
153
233
|
@TIPSTER
|
|
154
|
-
|
|
234
|
+
class Sjm1(Dataset):
|
|
155
235
|
"""San Jose Mercury News (1991)"""
|
|
156
|
-
|
|
236
|
+
|
|
237
|
+
DOCUMENTS = linkfolder(
|
|
238
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk3/SJM")]
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
def config(self) -> TipsterCollection:
|
|
242
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
157
243
|
|
|
158
244
|
|
|
159
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk4/CR")])
|
|
160
245
|
@TIPSTER
|
|
161
|
-
|
|
246
|
+
class Cr1(Dataset):
|
|
162
247
|
"""TODO"""
|
|
163
|
-
|
|
248
|
+
|
|
249
|
+
DOCUMENTS = linkfolder(
|
|
250
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk4/CR")]
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
def config(self) -> TipsterCollection:
|
|
254
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
164
255
|
|
|
165
256
|
|
|
166
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk4/FT")])
|
|
167
257
|
@TIPSTER
|
|
168
|
-
|
|
258
|
+
class Ft1(Dataset):
|
|
169
259
|
"""Financial Times"""
|
|
170
|
-
|
|
260
|
+
|
|
261
|
+
DOCUMENTS = linkfolder(
|
|
262
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk4/FT")]
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
def config(self) -> TipsterCollection:
|
|
266
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
171
267
|
|
|
172
268
|
|
|
173
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk5/FBIS")])
|
|
174
269
|
@TIPSTER
|
|
175
|
-
|
|
270
|
+
class Fbis1(Dataset):
|
|
176
271
|
"""Foreign Broadcast Information Service (1996)"""
|
|
177
|
-
|
|
272
|
+
|
|
273
|
+
DOCUMENTS = linkfolder(
|
|
274
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk5/FBIS")]
|
|
275
|
+
)
|
|
276
|
+
|
|
277
|
+
def config(self) -> TipsterCollection:
|
|
278
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
178
279
|
|
|
179
280
|
|
|
180
|
-
@linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk5/LATIMES")])
|
|
181
281
|
@TIPSTER
|
|
182
|
-
|
|
282
|
+
class La8990(Dataset):
|
|
183
283
|
"""Los Angeles Times (1989-90)"""
|
|
184
|
-
|
|
284
|
+
|
|
285
|
+
DOCUMENTS = linkfolder(
|
|
286
|
+
"documents", [DatafolderPath("gov.nist.trec.tipster", "Disk5/LATIMES")]
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
def config(self) -> TipsterCollection:
|
|
290
|
+
return TipsterCollection.C(path=self.DOCUMENTS.path)
|
|
@@ -2,28 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
from datamaestro.data import File
|
|
4
4
|
from datamaestro.data.ml import Supervised
|
|
5
|
-
from datamaestro.definitions import datatasks, datatags, dataset
|
|
6
|
-
from datamaestro.download.single import
|
|
5
|
+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
|
|
6
|
+
from datamaestro.download.single import FileDownloader
|
|
7
7
|
from datamaestro.utils import HashCheck
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
@datatags("unsupervised")
|
|
11
11
|
@datatasks("information extraction")
|
|
12
|
-
@filedownloader(
|
|
13
|
-
"train.json",
|
|
14
|
-
"https://github.com/thunlp/FewRel/raw/master/data/train_wiki.json",
|
|
15
|
-
checker=HashCheck("5e663e9c3f1bfbdb2de72696e9504fd7"),
|
|
16
|
-
)
|
|
17
|
-
@filedownloader(
|
|
18
|
-
"validation.json",
|
|
19
|
-
"https://github.com/thunlp/FewRel/raw/master/data/val_wiki.json",
|
|
20
|
-
checker=HashCheck("3f25573428c0332cb64b367a275ab0c7"),
|
|
21
|
-
)
|
|
22
12
|
@dataset(
|
|
23
|
-
Supervised,
|
|
24
13
|
url="https://thunlp.github.io/1/fewrel1.html",
|
|
25
14
|
)
|
|
26
|
-
|
|
15
|
+
class V1(Dataset):
|
|
27
16
|
"""FewRel 1.0 - a Few-shot Relation classification dataset
|
|
28
17
|
|
|
29
18
|
FewRel is a Few-shot Relation classification dataset, which features 70, 000 natural
|
|
@@ -32,4 +21,20 @@ def v1(train, validation):
|
|
|
32
21
|
Only the train and validation dataset are available. The test set is hidden
|
|
33
22
|
for the leaderboard.
|
|
34
23
|
"""
|
|
35
|
-
|
|
24
|
+
|
|
25
|
+
TRAIN = FileDownloader(
|
|
26
|
+
"train.json",
|
|
27
|
+
"https://github.com/thunlp/FewRel/raw/master/data/train_wiki.json",
|
|
28
|
+
checker=HashCheck("5e663e9c3f1bfbdb2de72696e9504fd7"),
|
|
29
|
+
)
|
|
30
|
+
VALIDATION = FileDownloader(
|
|
31
|
+
"validation.json",
|
|
32
|
+
"https://github.com/thunlp/FewRel/raw/master/data/val_wiki.json",
|
|
33
|
+
checker=HashCheck("3f25573428c0332cb64b367a275ab0c7"),
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
def config(self) -> Supervised:
|
|
37
|
+
return Supervised.C(
|
|
38
|
+
train=File.C(path=self.TRAIN.path),
|
|
39
|
+
validation=File.C(path=self.VALIDATION.path),
|
|
40
|
+
)
|
|
@@ -1,14 +1,24 @@
|
|
|
1
|
-
from datamaestro.data import
|
|
1
|
+
from datamaestro.data import File
|
|
2
2
|
from datamaestro.definitions import (
|
|
3
|
+
Dataset,
|
|
3
4
|
datatasks,
|
|
4
5
|
datatags,
|
|
5
6
|
dataset,
|
|
6
7
|
metadataset,
|
|
7
8
|
)
|
|
8
|
-
from datamaestro.download.archive import
|
|
9
|
+
from datamaestro.download.archive import ZipDownloader
|
|
9
10
|
from datamaestro_text.data.text import TrainingText
|
|
10
11
|
|
|
11
12
|
|
|
13
|
+
def _wikitext(data, type):
|
|
14
|
+
"""Helper to build a TrainingText from data path and type."""
|
|
15
|
+
return TrainingText.C(
|
|
16
|
+
train=File.C(path=data / ("wiki.train.%s" % type)),
|
|
17
|
+
validation=File.C(path=data / ("wiki.valid.%s" % type)),
|
|
18
|
+
test=File.C(path=data / ("wiki.test.%s" % type)),
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
12
22
|
@datatags("text")
|
|
13
23
|
@datatasks("language modeling")
|
|
14
24
|
@metadataset(TrainingText)
|
|
@@ -29,44 +39,52 @@ def WikiText(data, type):
|
|
|
29
39
|
|
|
30
40
|
https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
|
|
31
41
|
"""
|
|
32
|
-
return
|
|
33
|
-
"train": File.C(path=data / ("wiki.train.%s" % type)),
|
|
34
|
-
"validation": File.C(path=data / ("wiki.valid.%s" % type)),
|
|
35
|
-
"test": File.C(path=data / ("wiki.test.%s" % type)),
|
|
36
|
-
}
|
|
42
|
+
return _wikitext(data, type)
|
|
37
43
|
|
|
38
44
|
|
|
39
|
-
@
|
|
40
|
-
|
|
41
|
-
)
|
|
42
|
-
@dataset(WikiText, id="2.tokens")
|
|
43
|
-
def wikitext_2_words(data):
|
|
45
|
+
@dataset(WikiText, id=".2.tokens")
|
|
46
|
+
class Wikitext2Words(Dataset):
|
|
44
47
|
"""The small wikitext corpus, already tokenized"""
|
|
45
|
-
return WikiText(data, "tokens")
|
|
46
48
|
|
|
49
|
+
DATA = ZipDownloader(
|
|
50
|
+
"data",
|
|
51
|
+
"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip",
|
|
52
|
+
)
|
|
47
53
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
@dataset(WikiText, id="2.raw")
|
|
53
|
-
|
|
54
|
+
def config(self) -> TrainingText:
|
|
55
|
+
return _wikitext(self.DATA.path, "tokens")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataset(WikiText, id=".2.raw")
|
|
59
|
+
class Wikitext2Raw(Dataset):
|
|
54
60
|
"""The small wikitext corpus (raw data)"""
|
|
55
|
-
return WikiText(data, "raw")
|
|
56
61
|
|
|
62
|
+
DATA = ZipDownloader(
|
|
63
|
+
"data",
|
|
64
|
+
"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip",
|
|
65
|
+
)
|
|
57
66
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
)
|
|
61
|
-
@dataset(WikiText, id="103.tokens")
|
|
62
|
-
def wikitext_103_words(data):
|
|
63
|
-
return WikiText(data, "tokens")
|
|
67
|
+
def config(self) -> TrainingText:
|
|
68
|
+
return _wikitext(self.DATA.path, "raw")
|
|
64
69
|
|
|
65
70
|
|
|
66
|
-
@
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
71
|
+
@dataset(WikiText, id=".103.tokens")
|
|
72
|
+
class Wikitext103Words(Dataset):
|
|
73
|
+
DATA = ZipDownloader(
|
|
74
|
+
"data",
|
|
75
|
+
"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip",
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
def config(self) -> TrainingText:
|
|
79
|
+
return _wikitext(self.DATA.path, "tokens")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataset(WikiText, id=".103.raw")
|
|
83
|
+
class Wikitext103Raw(Dataset):
|
|
84
|
+
DATA = ZipDownloader(
|
|
85
|
+
"data",
|
|
86
|
+
"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip",
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
def config(self) -> TrainingText:
|
|
90
|
+
return _wikitext(self.DATA.path, "raw")
|
|
@@ -1,53 +1,44 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
# - Collaborative Filtering
|
|
4
|
-
|
|
5
|
-
# download:
|
|
6
|
-
# handler: /archive:Zip
|
|
7
|
-
# url: http://files.grouplens.org/datasets/movielens/ml-20m.zip
|
|
8
|
-
# size: 190M
|
|
9
|
-
# checksum: cd245b17a1ae2cc31bb14903e1204af3
|
|
10
|
-
# ...
|
|
11
|
-
# ---
|
|
12
|
-
# id: tmdb
|
|
13
|
-
# description: TMDB (The Movie database) download for MovieLens movies
|
|
14
|
-
# download:
|
|
15
|
-
# handler: tmdb:MovieLens
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
from datamaestro.definitions import dataset
|
|
19
|
-
from datamaestro.download.archive import zipdownloader
|
|
1
|
+
from datamaestro.definitions import Dataset, dataset
|
|
2
|
+
from datamaestro.download.archive import ZipDownloader
|
|
20
3
|
import datamaestro.data.csv as csv
|
|
21
4
|
from datamaestro_text.data.recommendation import Movielens
|
|
22
5
|
|
|
23
6
|
|
|
24
|
-
@zipdownloader(
|
|
25
|
-
"ds", "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
|
|
26
|
-
)
|
|
27
7
|
@dataset(url="https://grouplens.org/datasets/movielens/latest/", timestamp=True)
|
|
28
|
-
|
|
8
|
+
class Small(Dataset):
|
|
29
9
|
"""MovieLens (small dataset)
|
|
30
10
|
|
|
31
11
|
100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users (as of 9/2018)
|
|
32
12
|
"""
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
"
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
13
|
+
|
|
14
|
+
DS = ZipDownloader(
|
|
15
|
+
"ds", "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
def config(self) -> Movielens:
|
|
19
|
+
return Movielens.C(
|
|
20
|
+
ratings=csv.Generic.C(path=self.DS.path / "ratings.csv", names_row=0),
|
|
21
|
+
links=csv.Generic.C(path=self.DS.path / "links.csv", names_row=0),
|
|
22
|
+
movies=csv.Generic.C(path=self.DS.path / "movies.csv", names_row=0),
|
|
23
|
+
tags=csv.Generic.C(path=self.DS.path / "tags.csv", names_row=0),
|
|
24
|
+
)
|
|
39
25
|
|
|
40
26
|
|
|
41
|
-
@zipdownloader("ds", "http://files.grouplens.org/datasets/movielens/ml-latest.zip")
|
|
42
27
|
@dataset(url="https://grouplens.org/datasets/movielens/latest/", timestamp=True)
|
|
43
|
-
|
|
28
|
+
class Full(Dataset):
|
|
44
29
|
"""MovieLens (full dataset)
|
|
45
30
|
|
|
46
31
|
27,000,000 ratings and 1,100,000 tag applications applied to 58,000 movies by 280,000 users (as of 9/2018)
|
|
47
32
|
"""
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
"
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
33
|
+
|
|
34
|
+
DS = ZipDownloader(
|
|
35
|
+
"ds", "http://files.grouplens.org/datasets/movielens/ml-latest.zip"
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
def config(self) -> Movielens:
|
|
39
|
+
return Movielens.C(
|
|
40
|
+
ratings=csv.Generic.C(path=self.DS.path / "ratings.csv", names_row=0),
|
|
41
|
+
links=csv.Generic.C(path=self.DS.path / "links.csv", names_row=0),
|
|
42
|
+
movies=csv.Generic.C(path=self.DS.path / "movies.csv", names_row=0),
|
|
43
|
+
tags=csv.Generic.C(path=self.DS.path / "tags.csv", names_row=0),
|
|
44
|
+
)
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
from datamaestro.definitions import dataset
|
|
1
|
+
from datamaestro.definitions import Dataset, dataset
|
|
2
2
|
from datamaestro.data.ml import Supervised
|
|
3
3
|
from datamaestro_text.data.tagging import CoNLL_U
|
|
4
|
-
from datamaestro.download.archive import
|
|
4
|
+
from datamaestro.download.archive import ZipDownloader
|
|
5
|
+
|
|
5
6
|
|
|
6
7
|
# --- gsd
|
|
7
8
|
|
|
@@ -22,22 +23,26 @@ from datamaestro.download.archive import zipdownloader
|
|
|
22
23
|
# path: fr_gsd-ud-test.conllu
|
|
23
24
|
|
|
24
25
|
|
|
25
|
-
@zipdownloader(
|
|
26
|
-
"ds", "https://codeload.github.com/UniversalDependencies/UD_French-GSD/zip/master"
|
|
27
|
-
)
|
|
28
26
|
@dataset(url="https://github.com/UniversalDependencies/UD_French-GSD")
|
|
29
|
-
|
|
27
|
+
class Gsd(Dataset):
|
|
30
28
|
"""French GSD
|
|
31
29
|
|
|
32
30
|
The UD_French-GSD was converted in 2015 from the content head version of the
|
|
33
31
|
universal dependency treebank v2.0 (https://github.com/ryanmcd/uni-dep-tb). It
|
|
34
32
|
is updated since 2015 independently from the previous source.
|
|
35
33
|
"""
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
"
|
|
39
|
-
"
|
|
40
|
-
|
|
34
|
+
|
|
35
|
+
DS = ZipDownloader(
|
|
36
|
+
"ds",
|
|
37
|
+
"https://codeload.github.com/UniversalDependencies/UD_French-GSD/zip/master",
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
def config(self) -> Supervised:
|
|
41
|
+
return Supervised.C(
|
|
42
|
+
train=CoNLL_U.C(path=self.DS.path / "fr_gsd-ud-train.conllu"),
|
|
43
|
+
test=CoNLL_U.C(path=self.DS.path / "fr_gsd-ud-dev.conllu"),
|
|
44
|
+
validation=CoNLL_U.C(path=self.DS.path / "fr_gsd-ud-test.conllu"),
|
|
45
|
+
)
|
|
41
46
|
|
|
42
47
|
|
|
43
48
|
# --- partut
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from .base import (
|
|
2
|
-
AnswerEntry,
|
|
3
|
-
ConversationDataset,
|
|
4
|
-
ConversationHistory,
|
|
5
|
-
ConversationHistoryItem,
|
|
6
|
-
DecontextualizedItem,
|
|
7
|
-
EntryType,
|
|
2
|
+
AnswerEntry as AnswerEntry,
|
|
3
|
+
ConversationDataset as ConversationDataset,
|
|
4
|
+
ConversationHistory as ConversationHistory,
|
|
5
|
+
ConversationHistoryItem as ConversationHistoryItem,
|
|
6
|
+
DecontextualizedItem as DecontextualizedItem,
|
|
7
|
+
EntryType as EntryType,
|
|
8
8
|
)
|
|
@@ -2,13 +2,13 @@ from abc import ABC, abstractmethod
|
|
|
2
2
|
from enum import Enum
|
|
3
3
|
from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
|
|
4
4
|
from experimaestro import Param
|
|
5
|
-
from typing import Dict,
|
|
5
|
+
from typing import Dict, Iterator, List, Optional, Sequence, Tuple
|
|
6
6
|
from attr import define
|
|
7
7
|
from datamaestro.record import record_type
|
|
8
8
|
from datamaestro.data import Base
|
|
9
9
|
from datamaestro.record import Record, Item
|
|
10
10
|
from datamaestro_text.data.ir import TopicRecord, Topics
|
|
11
|
-
from datamaestro_text.utils.iter import FactoryIterable, LazyList
|
|
11
|
+
from datamaestro_text.utils.iter import FactoryIterable, LazyList
|
|
12
12
|
|
|
13
13
|
# ---- Basic types
|
|
14
14
|
|