datamaestro-text 2026.2.2__py3-none-any.whl → 2026.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. datamaestro_text/config/com/github/aagohary/canard.py +27 -24
  2. datamaestro_text/config/com/github/apple/ml-qrecc.py +30 -25
  3. datamaestro_text/config/com/github/ikat.py +76 -61
  4. datamaestro_text/config/com/github/prdwb/orconvqa.py +41 -37
  5. datamaestro_text/config/com/microsoft/msmarco/passage.py +278 -207
  6. datamaestro_text/config/com/oscar-corpus.py +13 -10
  7. datamaestro_text/config/com/sentiment140.py +17 -12
  8. datamaestro_text/config/com/smashwords/bookcorpus.py +13 -10
  9. datamaestro_text/config/edu/stanford/aclimdb.py +14 -9
  10. datamaestro_text/config/edu/stanford/glove.py +66 -31
  11. datamaestro_text/config/edu/upenn/ldc/aquaint.py +35 -17
  12. datamaestro_text/config/fr/granddebat.py +57 -48
  13. datamaestro_text/config/gov/nist/ir/covid.py +61 -50
  14. datamaestro_text/config/gov/nist/trec/adhoc.py +395 -255
  15. datamaestro_text/config/gov/nist/trec/tipster.py +170 -64
  16. datamaestro_text/config/io/github/thunlp/fewrel.py +20 -15
  17. datamaestro_text/config/io/metamind/research/wikitext.py +50 -32
  18. datamaestro_text/config/org/grouplens/movielens.py +28 -37
  19. datamaestro_text/config/org/universaldependencies/french.py +16 -11
  20. datamaestro_text/test/test_documented.py +2 -2
  21. datamaestro_text/version.py +2 -2
  22. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/METADATA +2 -2
  23. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/RECORD +26 -26
  24. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/WHEEL +0 -0
  25. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/entry_points.txt +0 -0
  26. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -20,165 +20,271 @@ See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data
20
20
  from datamaestro_text.data.ir.trec import TipsterCollection
21
21
  from datamaestro.download.links import linkfolder
22
22
  from datamaestro.definitions import (
23
+ Dataset,
23
24
  dataset,
24
25
  )
25
26
  from datamaestro.context import DatafolderPath
26
27
 
27
28
  # Store meta-information
28
- TIPSTER = dataset(TipsterCollection, url="https://catalog.ldc.upenn.edu/LDC93T3A")
29
+ TIPSTER = dataset(url="https://catalog.ldc.upenn.edu/LDC93T3A")
29
30
 
30
31
 
31
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/AP")])
32
32
  @TIPSTER
33
- def ap88(documents):
33
+ class Ap88(Dataset):
34
34
  """Associated Press document collection (1988)"""
35
- return {"path": documents}
35
+
36
+ DOCUMENTS = linkfolder(
37
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/AP")]
38
+ )
39
+
40
+ def config(self) -> TipsterCollection:
41
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
36
42
 
37
43
 
38
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/AP")])
39
44
  @TIPSTER
40
- def ap89(documents):
45
+ class Ap89(Dataset):
41
46
  """Associated Press document collection (1989)"""
42
- return {"path": documents}
47
+
48
+ DOCUMENTS = linkfolder(
49
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/AP")]
50
+ )
51
+
52
+ def config(self) -> TipsterCollection:
53
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
43
54
 
44
55
 
45
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk3/AP")])
46
56
  @TIPSTER
47
- def ap90(documents):
57
+ class Ap90(Dataset):
48
58
  """Associated Press document collection (1990)"""
49
- return {"path": documents}
59
+
60
+ DOCUMENTS = linkfolder(
61
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk3/AP")]
62
+ )
63
+
64
+ def config(self) -> TipsterCollection:
65
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
50
66
 
51
67
 
52
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/DOE")])
53
68
  @TIPSTER
54
- def doe1(documents):
69
+ class Doe1(Dataset):
55
70
  """Department of Energy documents"""
56
- return {"path": documents}
71
+
72
+ DOCUMENTS = linkfolder(
73
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/DOE")]
74
+ )
75
+
76
+ def config(self) -> TipsterCollection:
77
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
57
78
 
58
79
 
59
80
  # --- Wall Street Journal (1987-92)
60
81
 
61
82
 
62
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/WSJ/1987")])
63
83
  @TIPSTER
64
- def wsj87(documents):
84
+ class Wsj87(Dataset):
65
85
  """Wall Street Journal (1987)"""
66
- return {"path": documents}
86
+
87
+ DOCUMENTS = linkfolder(
88
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/WSJ/1987")]
89
+ )
90
+
91
+ def config(self) -> TipsterCollection:
92
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
67
93
 
68
94
 
69
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/WSJ/1988")])
70
95
  @TIPSTER
71
- def wsj88(documents):
96
+ class Wsj88(Dataset):
72
97
  """Wall Street Journal (1988)"""
73
- return {"path": documents}
98
+
99
+ DOCUMENTS = linkfolder(
100
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/WSJ/1988")]
101
+ )
102
+
103
+ def config(self) -> TipsterCollection:
104
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
74
105
 
75
106
 
76
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/WSJ/1989")])
77
107
  @TIPSTER
78
- def wsj89(documents):
108
+ class Wsj89(Dataset):
79
109
  """Wall Street Journal (1989)"""
80
- return {"path": documents}
110
+
111
+ DOCUMENTS = linkfolder(
112
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/WSJ/1989")]
113
+ )
114
+
115
+ def config(self) -> TipsterCollection:
116
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
81
117
 
82
118
 
83
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/WSJ/1990")])
84
119
  @TIPSTER
85
- def wsj90(documents):
120
+ class Wsj90(Dataset):
86
121
  """Wall Street Journal (1990)"""
87
- return {"path": documents}
122
+
123
+ DOCUMENTS = linkfolder(
124
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/WSJ/1990")]
125
+ )
126
+
127
+ def config(self) -> TipsterCollection:
128
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
88
129
 
89
130
 
90
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/WSJ/1991")])
91
131
  @TIPSTER
92
- def wsj91(documents):
132
+ class Wsj91(Dataset):
93
133
  """Wall Street Journal (1991)"""
94
- return {"path": documents}
134
+
135
+ DOCUMENTS = linkfolder(
136
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/WSJ/1991")]
137
+ )
138
+
139
+ def config(self) -> TipsterCollection:
140
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
95
141
 
96
142
 
97
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/WSJ/1992")])
98
143
  @TIPSTER
99
- def wsj92(documents):
144
+ class Wsj92(Dataset):
100
145
  """Wall Street Journal (1992)"""
101
- return {"path": documents}
146
+
147
+ DOCUMENTS = linkfolder(
148
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/WSJ/1992")]
149
+ )
150
+
151
+ def config(self) -> TipsterCollection:
152
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
102
153
 
103
154
 
104
155
  # --- Federal Register (1988-89)
105
156
 
106
157
 
107
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/FR")])
108
158
  @TIPSTER
109
- def fr88(documents):
159
+ class Fr88(Dataset):
110
160
  """Federal Register (1988)"""
111
- return {"path": documents}
161
+
162
+ DOCUMENTS = linkfolder(
163
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/FR")]
164
+ )
165
+
166
+ def config(self) -> TipsterCollection:
167
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
112
168
 
113
169
 
114
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/FR")])
115
170
  @TIPSTER
116
- def fr89(documents):
171
+ class Fr89(Dataset):
117
172
  """Federal Register (1989)"""
118
- return {"path": documents}
173
+
174
+ DOCUMENTS = linkfolder(
175
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/FR")]
176
+ )
177
+
178
+ def config(self) -> TipsterCollection:
179
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
119
180
 
120
181
 
121
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk4/FR94")])
122
182
  @TIPSTER
123
- def fr94(documents):
183
+ class Fr94(Dataset):
124
184
  """Federal Register (1994)"""
125
- return {"path": documents}
185
+
186
+ DOCUMENTS = linkfolder(
187
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk4/FR94")]
188
+ )
189
+
190
+ def config(self) -> TipsterCollection:
191
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
126
192
 
127
193
 
128
194
  # # ZIFF (1988-92)
129
195
 
130
196
 
131
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/ZIFF")])
132
197
  @TIPSTER
133
- def ziff1(documents):
198
+ class Ziff1(Dataset):
134
199
  """Information from the Computer Select disks (1989-90)"""
135
- return {"path": documents}
200
+
201
+ DOCUMENTS = linkfolder(
202
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk1/ZIFF")]
203
+ )
204
+
205
+ def config(self) -> TipsterCollection:
206
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
136
207
 
137
208
 
138
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/ZIFF")])
139
209
  @TIPSTER
140
- def ziff2(documents):
210
+ class Ziff2(Dataset):
141
211
  """Information from the Computer Select disks (1989-90)"""
142
- return {"path": documents}
212
+
213
+ DOCUMENTS = linkfolder(
214
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk2/ZIFF")]
215
+ )
216
+
217
+ def config(self) -> TipsterCollection:
218
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
143
219
 
144
220
 
145
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk3/ZIFF")])
146
221
  @TIPSTER
147
- def ziff3(documents):
222
+ class Ziff3(Dataset):
148
223
  """Information from the Computer Select disks (1990-91)"""
149
- return {"path": documents}
224
+
225
+ DOCUMENTS = linkfolder(
226
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk3/ZIFF")]
227
+ )
228
+
229
+ def config(self) -> TipsterCollection:
230
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
150
231
 
151
232
 
152
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk3/SJM")])
153
233
  @TIPSTER
154
- def sjm1(documents):
234
+ class Sjm1(Dataset):
155
235
  """San Jose Mercury News (1991)"""
156
- return {"path": documents}
236
+
237
+ DOCUMENTS = linkfolder(
238
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk3/SJM")]
239
+ )
240
+
241
+ def config(self) -> TipsterCollection:
242
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
157
243
 
158
244
 
159
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk4/CR")])
160
245
  @TIPSTER
161
- def cr1(documents):
246
+ class Cr1(Dataset):
162
247
  """TODO"""
163
- return {"path": documents}
248
+
249
+ DOCUMENTS = linkfolder(
250
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk4/CR")]
251
+ )
252
+
253
+ def config(self) -> TipsterCollection:
254
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
164
255
 
165
256
 
166
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk4/FT")])
167
257
  @TIPSTER
168
- def ft1(documents):
258
+ class Ft1(Dataset):
169
259
  """Financial Times"""
170
- return {"path": documents}
260
+
261
+ DOCUMENTS = linkfolder(
262
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk4/FT")]
263
+ )
264
+
265
+ def config(self) -> TipsterCollection:
266
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
171
267
 
172
268
 
173
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk5/FBIS")])
174
269
  @TIPSTER
175
- def fbis1(documents):
270
+ class Fbis1(Dataset):
176
271
  """Foreign Broadcast Information Service (1996)"""
177
- return {"path": documents}
272
+
273
+ DOCUMENTS = linkfolder(
274
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk5/FBIS")]
275
+ )
276
+
277
+ def config(self) -> TipsterCollection:
278
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
178
279
 
179
280
 
180
- @linkfolder("documents", [DatafolderPath("gov.nist.trec.tipster", "Disk5/LATIMES")])
181
281
  @TIPSTER
182
- def la8990(documents):
282
+ class La8990(Dataset):
183
283
  """Los Angeles Times (1989-90)"""
184
- return {"path": documents}
284
+
285
+ DOCUMENTS = linkfolder(
286
+ "documents", [DatafolderPath("gov.nist.trec.tipster", "Disk5/LATIMES")]
287
+ )
288
+
289
+ def config(self) -> TipsterCollection:
290
+ return TipsterCollection.C(path=self.DOCUMENTS.path)
@@ -2,28 +2,17 @@
2
2
 
3
3
  from datamaestro.data import File
4
4
  from datamaestro.data.ml import Supervised
5
- from datamaestro.definitions import datatasks, datatags, dataset
6
- from datamaestro.download.single import filedownloader
5
+ from datamaestro.definitions import Dataset, datatasks, datatags, dataset
6
+ from datamaestro.download.single import FileDownloader
7
7
  from datamaestro.utils import HashCheck
8
8
 
9
9
 
10
10
  @datatags("unsupervised")
11
11
  @datatasks("information extraction")
12
- @filedownloader(
13
- "train.json",
14
- "https://github.com/thunlp/FewRel/raw/master/data/train_wiki.json",
15
- checker=HashCheck("5e663e9c3f1bfbdb2de72696e9504fd7"),
16
- )
17
- @filedownloader(
18
- "validation.json",
19
- "https://github.com/thunlp/FewRel/raw/master/data/val_wiki.json",
20
- checker=HashCheck("3f25573428c0332cb64b367a275ab0c7"),
21
- )
22
12
  @dataset(
23
- Supervised,
24
13
  url="https://thunlp.github.io/1/fewrel1.html",
25
14
  )
26
- def v1(train, validation):
15
+ class V1(Dataset):
27
16
  """FewRel 1.0 - a Few-shot Relation classification dataset
28
17
 
29
18
  FewRel is a Few-shot Relation classification dataset, which features 70, 000 natural
@@ -32,4 +21,20 @@ def v1(train, validation):
32
21
  Only the train and validation dataset are available. The test set is hidden
33
22
  for the leaderboard.
34
23
  """
35
- return {"train": File.C(path=train), "validation": File.C(path=validation)}
24
+
25
+ TRAIN = FileDownloader(
26
+ "train.json",
27
+ "https://github.com/thunlp/FewRel/raw/master/data/train_wiki.json",
28
+ checker=HashCheck("5e663e9c3f1bfbdb2de72696e9504fd7"),
29
+ )
30
+ VALIDATION = FileDownloader(
31
+ "validation.json",
32
+ "https://github.com/thunlp/FewRel/raw/master/data/val_wiki.json",
33
+ checker=HashCheck("3f25573428c0332cb64b367a275ab0c7"),
34
+ )
35
+
36
+ def config(self) -> Supervised:
37
+ return Supervised.C(
38
+ train=File.C(path=self.TRAIN.path),
39
+ validation=File.C(path=self.VALIDATION.path),
40
+ )
@@ -1,14 +1,24 @@
1
1
  from datamaestro.data import File
2
2
  from datamaestro.definitions import (
3
+ Dataset,
3
4
  datatasks,
4
5
  datatags,
5
6
  dataset,
6
7
  metadataset,
7
8
  )
8
- from datamaestro.download.archive import zipdownloader
9
+ from datamaestro.download.archive import ZipDownloader
9
10
  from datamaestro_text.data.text import TrainingText
10
11
 
11
12
 
13
+ def _wikitext(data, type):
14
+ """Helper to build a TrainingText from data path and type."""
15
+ return TrainingText.C(
16
+ train=File.C(path=data / ("wiki.train.%s" % type)),
17
+ validation=File.C(path=data / ("wiki.valid.%s" % type)),
18
+ test=File.C(path=data / ("wiki.test.%s" % type)),
19
+ )
20
+
21
+
12
22
  @datatags("text")
13
23
  @datatasks("language modeling")
14
24
  @metadataset(TrainingText)
@@ -29,44 +39,52 @@ def WikiText(data, type):
29
39
 
30
40
  https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
31
41
  """
32
- return {
33
- "train": File.C(path=data / ("wiki.train.%s" % type)),
34
- "validation": File.C(path=data / ("wiki.valid.%s" % type)),
35
- "test": File.C(path=data / ("wiki.test.%s" % type)),
36
- }
42
+ return _wikitext(data, type)
37
43
 
38
44
 
39
- @zipdownloader(
40
- "data", "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip"
41
- )
42
- @dataset(WikiText, id="2.tokens")
43
- def wikitext_2_words(data):
45
+ @dataset(WikiText, id=".2.tokens")
46
+ class Wikitext2Words(Dataset):
44
47
  """The small wikitext corpus, already tokenized"""
45
- return WikiText(data, "tokens")
46
48
 
49
+ DATA = ZipDownloader(
50
+ "data",
51
+ "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip",
52
+ )
47
53
 
48
- @zipdownloader(
49
- "data",
50
- "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip",
51
- )
52
- @dataset(WikiText, id="2.raw")
53
- def wikitext_2_raw(data):
54
+ def config(self) -> TrainingText:
55
+ return _wikitext(self.DATA.path, "tokens")
56
+
57
+
58
+ @dataset(WikiText, id=".2.raw")
59
+ class Wikitext2Raw(Dataset):
54
60
  """The small wikitext corpus (raw data)"""
55
- return WikiText(data, "raw")
56
61
 
62
+ DATA = ZipDownloader(
63
+ "data",
64
+ "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip",
65
+ )
57
66
 
58
- @zipdownloader(
59
- "data", "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"
60
- )
61
- @dataset(WikiText, id="103.tokens")
62
- def wikitext_103_words(data):
63
- return WikiText(data, "tokens")
67
+ def config(self) -> TrainingText:
68
+ return _wikitext(self.DATA.path, "raw")
64
69
 
65
70
 
66
- @zipdownloader(
67
- "data",
68
- "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip",
69
- )
70
- @dataset(WikiText, id="103.raw")
71
- def wikitext_103_raw(data):
72
- return WikiText(data, "raw")
71
+ @dataset(WikiText, id=".103.tokens")
72
+ class Wikitext103Words(Dataset):
73
+ DATA = ZipDownloader(
74
+ "data",
75
+ "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip",
76
+ )
77
+
78
+ def config(self) -> TrainingText:
79
+ return _wikitext(self.DATA.path, "tokens")
80
+
81
+
82
+ @dataset(WikiText, id=".103.raw")
83
+ class Wikitext103Raw(Dataset):
84
+ DATA = ZipDownloader(
85
+ "data",
86
+ "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip",
87
+ )
88
+
89
+ def config(self) -> TrainingText:
90
+ return _wikitext(self.DATA.path, "raw")
@@ -1,53 +1,44 @@
1
- # tasks:
2
- # - Recommendation
3
- # - Collaborative Filtering
4
-
5
- # download:
6
- # handler: /archive:Zip
7
- # url: http://files.grouplens.org/datasets/movielens/ml-20m.zip
8
- # size: 190M
9
- # checksum: cd245b17a1ae2cc31bb14903e1204af3
10
- # ...
11
- # ---
12
- # id: tmdb
13
- # description: TMDB (The Movie database) download for MovieLens movies
14
- # download:
15
- # handler: tmdb:MovieLens
16
-
17
-
18
- from datamaestro.definitions import dataset
19
- from datamaestro.download.archive import zipdownloader
1
+ from datamaestro.definitions import Dataset, dataset
2
+ from datamaestro.download.archive import ZipDownloader
20
3
  import datamaestro.data.csv as csv
21
4
  from datamaestro_text.data.recommendation import Movielens
22
5
 
23
6
 
24
- @zipdownloader(
25
- "ds", "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
26
- )
27
7
  @dataset(url="https://grouplens.org/datasets/movielens/latest/", timestamp=True)
28
- def small(ds) -> Movielens:
8
+ class Small(Dataset):
29
9
  """MovieLens (small dataset)
30
10
 
31
11
  100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users (as of 9/2018)
32
12
  """
33
- return {
34
- "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
35
- "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
36
- "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
37
- "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
38
- }
13
+
14
+ DS = ZipDownloader(
15
+ "ds", "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
16
+ )
17
+
18
+ def config(self) -> Movielens:
19
+ return Movielens.C(
20
+ ratings=csv.Generic.C(path=self.DS.path / "ratings.csv", names_row=0),
21
+ links=csv.Generic.C(path=self.DS.path / "links.csv", names_row=0),
22
+ movies=csv.Generic.C(path=self.DS.path / "movies.csv", names_row=0),
23
+ tags=csv.Generic.C(path=self.DS.path / "tags.csv", names_row=0),
24
+ )
39
25
 
40
26
 
41
- @zipdownloader("ds", "http://files.grouplens.org/datasets/movielens/ml-latest.zip")
42
27
  @dataset(url="https://grouplens.org/datasets/movielens/latest/", timestamp=True)
43
- def full(ds) -> Movielens:
28
+ class Full(Dataset):
44
29
  """MovieLens (full dataset)
45
30
 
46
31
  27,000,000 ratings and 1,100,000 tag applications applied to 58,000 movies by 280,000 users (as of 9/2018)
47
32
  """
48
- return {
49
- "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
50
- "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
51
- "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
52
- "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
53
- }
33
+
34
+ DS = ZipDownloader(
35
+ "ds", "http://files.grouplens.org/datasets/movielens/ml-latest.zip"
36
+ )
37
+
38
+ def config(self) -> Movielens:
39
+ return Movielens.C(
40
+ ratings=csv.Generic.C(path=self.DS.path / "ratings.csv", names_row=0),
41
+ links=csv.Generic.C(path=self.DS.path / "links.csv", names_row=0),
42
+ movies=csv.Generic.C(path=self.DS.path / "movies.csv", names_row=0),
43
+ tags=csv.Generic.C(path=self.DS.path / "tags.csv", names_row=0),
44
+ )
@@ -1,7 +1,8 @@
1
- from datamaestro.definitions import dataset
1
+ from datamaestro.definitions import Dataset, dataset
2
2
  from datamaestro.data.ml import Supervised
3
3
  from datamaestro_text.data.tagging import CoNLL_U
4
- from datamaestro.download.archive import zipdownloader
4
+ from datamaestro.download.archive import ZipDownloader
5
+
5
6
 
6
7
  # --- gsd
7
8
 
@@ -22,22 +23,26 @@ from datamaestro.download.archive import zipdownloader
22
23
  # path: fr_gsd-ud-test.conllu
23
24
 
24
25
 
25
- @zipdownloader(
26
- "ds", "https://codeload.github.com/UniversalDependencies/UD_French-GSD/zip/master"
27
- )
28
26
  @dataset(url="https://github.com/UniversalDependencies/UD_French-GSD")
29
- def gsd(ds) -> Supervised:
27
+ class Gsd(Dataset):
30
28
  """French GSD
31
29
 
32
30
  The UD_French-GSD was converted in 2015 from the content head version of the
33
31
  universal dependency treebank v2.0 (https://github.com/ryanmcd/uni-dep-tb). It
34
32
  is updated since 2015 independently from the previous source.
35
33
  """
36
- return {
37
- "train": CoNLL_U.C(path=ds / "fr_gsd-ud-train.conllu"),
38
- "test": CoNLL_U.C(path=ds / "fr_gsd-ud-dev.conllu"),
39
- "validation": CoNLL_U.C(path=ds / "fr_gsd-ud-test.conllu"),
40
- }
34
+
35
+ DS = ZipDownloader(
36
+ "ds",
37
+ "https://codeload.github.com/UniversalDependencies/UD_French-GSD/zip/master",
38
+ )
39
+
40
+ def config(self) -> Supervised:
41
+ return Supervised.C(
42
+ train=CoNLL_U.C(path=self.DS.path / "fr_gsd-ud-train.conllu"),
43
+ test=CoNLL_U.C(path=self.DS.path / "fr_gsd-ud-dev.conllu"),
44
+ validation=CoNLL_U.C(path=self.DS.path / "fr_gsd-ud-test.conllu"),
45
+ )
41
46
 
42
47
 
43
48
  # --- partut
@@ -1,11 +1,11 @@
1
1
  from pathlib import Path
2
- from experimaestro.tools.documentation import DocumentationAnalyzer
2
+ from datamaestro.test.checks import DatamaestroAnalyzer
3
3
 
4
4
 
5
5
  def test_documented():
6
6
  """Test if every configuration is documented"""
7
7
  doc_path = Path(__file__).parents[3] / "docs" / "source" / "index.rst"
8
- analyzer = DocumentationAnalyzer(
8
+ analyzer = DatamaestroAnalyzer(
9
9
  doc_path, set(["datamaestro_text"]), set(["datamaestro_text.test"])
10
10
  )
11
11
 
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '2026.2.2'
32
- __version_tuple__ = version_tuple = (2026, 2, 2)
31
+ __version__ = version = '2026.2.3'
32
+ __version_tuple__ = version_tuple = (2026, 2, 3)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2026.2.2
3
+ Version: 2026.2.3
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Project-URL: Homepage, https://github.com/experimaestro/datamaestro_text
6
6
  Project-URL: Documentation, https://datamaestro-text.readthedocs.io/en/latest/
@@ -22,7 +22,7 @@ Classifier: Programming Language :: Python :: 3.12
22
22
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
23
  Requires-Python: >=3.10
24
24
  Requires-Dist: attrs
25
- Requires-Dist: datamaestro>=1.6.2
25
+ Requires-Dist: datamaestro>=1.8.0
26
26
  Requires-Dist: experimaestro
27
27
  Requires-Dist: ir-datasets>=0.5.8
28
28
  Description-Content-Type: text/markdown