datamaestro-text 2026.2.2__py3-none-any.whl → 2026.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. datamaestro_text/config/com/github/aagohary/canard.py +27 -24
  2. datamaestro_text/config/com/github/apple/ml-qrecc.py +30 -25
  3. datamaestro_text/config/com/github/ikat.py +76 -61
  4. datamaestro_text/config/com/github/prdwb/orconvqa.py +41 -37
  5. datamaestro_text/config/com/microsoft/msmarco/passage.py +278 -207
  6. datamaestro_text/config/com/oscar-corpus.py +13 -10
  7. datamaestro_text/config/com/sentiment140.py +17 -12
  8. datamaestro_text/config/com/smashwords/bookcorpus.py +13 -10
  9. datamaestro_text/config/edu/stanford/aclimdb.py +14 -9
  10. datamaestro_text/config/edu/stanford/glove.py +66 -31
  11. datamaestro_text/config/edu/upenn/ldc/aquaint.py +35 -17
  12. datamaestro_text/config/fr/granddebat.py +57 -48
  13. datamaestro_text/config/gov/nist/ir/covid.py +61 -50
  14. datamaestro_text/config/gov/nist/trec/adhoc.py +395 -255
  15. datamaestro_text/config/gov/nist/trec/tipster.py +170 -64
  16. datamaestro_text/config/io/github/thunlp/fewrel.py +20 -15
  17. datamaestro_text/config/io/metamind/research/wikitext.py +50 -32
  18. datamaestro_text/config/org/grouplens/movielens.py +28 -37
  19. datamaestro_text/config/org/universaldependencies/french.py +16 -11
  20. datamaestro_text/test/test_documented.py +2 -2
  21. datamaestro_text/version.py +2 -2
  22. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/METADATA +2 -2
  23. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/RECORD +26 -26
  24. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/WHEEL +0 -0
  25. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/entry_points.txt +0 -0
  26. {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/licenses/LICENSE +0 -0
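The hunks below all follow one pattern: 2026.2.3 migrates every dataset definition from the decorator-plus-function style to `Dataset` subclasses. The lowercase downloader decorators (`zipdownloader`, `tardownloader`, `filedownloader`, `linkfolder`) become `ZipDownloader`/`TarDownloader`/`FileDownloader`/`linkfolder` class attributes, and the configuration is now built in a `config()` method instead of being returned by the decorated function. A minimal sketch of the before/after shape, using illustrative names and an illustrative URL (not taken from the package):

    # Before (2026.2.2): downloads attached by decorators, config returned by a function
    @zipdownloader("dir", "http://example.org/data.zip")
    @dataset(Supervised, url="http://example.org/")
    def example(dir):
        return Supervised.C(train=Generic.C(path=dir / "train.csv"))


    # After (2026.2.3): downloads are class attributes, config() builds the result
    @dataset(url="http://example.org/")
    class Example(Dataset):
        DIR = ZipDownloader("dir", "http://example.org/data.zip")

        def config(self) -> Supervised:
            return Supervised.C(train=Generic.C(path=self.DIR.path / "train.csv"))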
datamaestro_text/config/com/sentiment140.py
@@ -1,19 +1,14 @@
 from datamaestro.data.csv import Generic
-from datamaestro.definitions import datatasks, datatags, dataset
-from datamaestro.download.archive import zipdownloader
+from datamaestro.definitions import Dataset, datatasks, datatags, dataset
+from datamaestro.download.archive import ZipDownloader
 from datamaestro.data.ml import Supervised
 from datamaestro.utils import HashCheck


-@zipdownloader(
-    "dir",
-    "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip",
-    checker=HashCheck("1647eb110dd2492512e27b9a70d5d1bc"),
-)
 @datatasks("sentiment analysis")
 @datatags("english", "sentiment", "text")
-@dataset(Supervised, url="http://help.sentiment140.com/for-students/", size="228M")
-def english(dir):
+@dataset(url="http://help.sentiment140.com/for-students/", size="228M")
+class English(Dataset):
     """Sentiment analysis dataset 140

     The data is a CSV with emoticons removed. Data file format has 6 fields:
@@ -26,7 +21,17 @@ def english(dir):

     If you use this data, please cite Sentiment140 as your source.
     """
-    return Supervised.C(
-        train=Generic.C(path=dir / "training.1600000.processed.noemoticon.csv"),
-        test=Generic.C(path=dir / "testdata.manual.2009.06.14.csv"),
+
+    DIR = ZipDownloader(
+        "dir",
+        "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip",
+        checker=HashCheck("1647eb110dd2492512e27b9a70d5d1bc"),
     )
+
+    def config(self) -> Supervised:
+        return Supervised.C(
+            train=Generic.C(
+                path=self.DIR.path / "training.1600000.processed.noemoticon.csv"
+            ),
+            test=Generic.C(path=self.DIR.path / "testdata.manual.2009.06.14.csv"),
+        )
datamaestro_text/config/com/smashwords/bookcorpus.py
@@ -1,23 +1,26 @@
 # See documentation on https://datamaestro.readthedocs.io

-from datamaestro.definitions import datatasks, datatags, dataset
+from datamaestro.definitions import Dataset, datatasks, datatags, dataset
 from datamaestro_text.data.text import TextFolder
-from datamaestro.download.archive import tardownloader
+from datamaestro.download.archive import TarDownloader
 from datamaestro.utils import HashCheck


 @datatags("text", "books", "English")
 @datatasks("language modeling")
-@tardownloader(
-    "folder",
-    "https://storage.googleapis.com/huggingface-nlp/datasets/bookcorpus/bookcorpus.tar.bz2",
-    checker=HashCheck("5c906ede3c5265f8934b62c275a754bc"),
-)
-@dataset(TextFolder, id="", url="https://yknzhu.wixsite.com/mbweb", size="4.3G")
-def main(folder):
+@dataset(id="", url="https://yknzhu.wixsite.com/mbweb", size="4.3G")
+class Main(Dataset):
     """Unpublished books from Smashwords

     The books are concatened in two files hosted on huggingface NLP storage.
     Each sentence is on a separate line and tokens are space separated.
     """
-    return {"path": folder}
+
+    FOLDER = TarDownloader(
+        "folder",
+        "https://storage.googleapis.com/huggingface-nlp/datasets/bookcorpus/bookcorpus.tar.bz2",
+        checker=HashCheck("5c906ede3c5265f8934b62c275a754bc"),
+    )
+
+    def config(self) -> TextFolder:
+        return TextFolder.C(path=self.FOLDER.path)
datamaestro_text/config/edu/stanford/aclimdb.py
@@ -1,16 +1,21 @@
 from datamaestro.data.ml import FolderBased, Supervised
-from datamaestro.definitions import dataset
-from datamaestro.download.archive import tardownloader
+from datamaestro.definitions import Dataset, dataset
+from datamaestro.download.archive import TarDownloader


-@tardownloader("data", "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz")
-@dataset(Supervised, url="http://ai.stanford.edu/~amaas/data/sentiment/", id="")
-def aclimdb(data):
+@dataset(url="http://ai.stanford.edu/~amaas/data/sentiment/", id="")
+class Aclimdb(Dataset):
     """Large Movie Review Dataset

     Paper http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
     """
-    return {
-        "train": FolderBased.C(path=data / "train", classes=["neg", "pos"]),
-        "test": FolderBased.C(path=data / "test", classes=["neg", "pos"]),
-    }
+
+    DATA = TarDownloader(
+        "data", "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
+    )
+
+    def config(self) -> Supervised:
+        return Supervised.C(
+            train=FolderBased.C(path=self.DATA.path / "train", classes=["neg", "pos"]),
+            test=FolderBased.C(path=self.DATA.path / "test", classes=["neg", "pos"]),
+        )
datamaestro_text/config/edu/stanford/glove.py
@@ -4,10 +4,10 @@ GloVe is an unsupervised learning algorithm for obtaining vector representations
 and the resulting representations showcase interesting linear substructures of the word vector space.
 """

-from datamaestro.definitions import dataset
+from datamaestro.definitions import Dataset, dataset
 from datamaestro.download import reference
-from datamaestro.download.archive import zipdownloader
-from datamaestro.download.single import filedownloader
+from datamaestro.download.archive import ZipDownloader
+from datamaestro.download.single import FileDownloader
 from datamaestro_text.data.embeddings import WordEmbeddingsText


@@ -16,65 +16,100 @@ from datamaestro_text.data.embeddings import WordEmbeddingsText
 # tokens: 6G
 # vocabulary: 400K
 # cased: false
-@zipdownloader("embeddings", "http://nlp.stanford.edu/data/glove.6B.zip")
-@dataset(WordEmbeddingsText, id="6b")
-def glove_6b(embeddings):
+@dataset(id=".6b")
+class Glove6B(Dataset):
     """Embeddings for 6B words in various dimensions"""
-    return {"path": embeddings}

+    EMBEDDINGS = ZipDownloader(
+        "embeddings", "http://nlp.stanford.edu/data/glove.6B.zip"
+    )

-@reference("data_6b", glove_6b)
-@dataset(WordEmbeddingsText, id="6b.50")
-def glove_6b_50(data_6b):
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(path=self.EMBEDDINGS.path)
+
+
+@dataset(id=".6b.50")
+class Glove6B50(Dataset):
     """Glove 6B - dimension 50"""
-    return {"path": data_6b.path / "glove.6B.50d.txt"}
+
+    DATA_6B = reference(varname="data_6b", reference=Glove6B)
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(
+            path=self.DATA_6B.prepare().path / "glove.6B.50d.txt"
+        )


-@reference("data_6b", glove_6b)
-@dataset(WordEmbeddingsText, id="6b.100")
-def glove_6b_100(data_6b):
+@dataset(id=".6b.100")
+class Glove6B100(Dataset):
     """Glove 6B - dimension 100"""
-    return {"path": data_6b.path / "glove.6B.100d.txt"}

+    DATA_6B = reference(varname="data_6b", reference=Glove6B)

-@reference("data_6b", glove_6b)
-@dataset(WordEmbeddingsText, id="6b.200")
-def glove_6b_200(data_6b):
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(
+            path=self.DATA_6B.prepare().path / "glove.6B.100d.txt"
+        )
+
+
+@dataset(id=".6b.200")
+class Glove6B200(Dataset):
     """Glove 6B - dimension 200"""
-    return {"path": data_6b.path / "glove.6B.200d.txt"}
+
+    DATA_6B = reference(varname="data_6b", reference=Glove6B)
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(
+            path=self.DATA_6B.prepare().path / "glove.6B.200d.txt"
+        )


 ...


-@reference("data_6b", glove_6b)
-@dataset(WordEmbeddingsText, id="6b.300")
-def glove_6b_300(data_6b):
+@dataset(id=".6b.300")
+class Glove6B300(Dataset):
     """Glove 6B - dimension 200"""
-    return {"path": data_6b.path / "glove.6B.200d.txt"}

+    DATA_6B = reference(varname="data_6b", reference=Glove6B)
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(
+            path=self.DATA_6B.prepare().path / "glove.6B.200d.txt"
+        )

-@filedownloader("embeddings", "http://nlp.stanford.edu/data/glove.42B.300d.zip")
-@dataset(WordEmbeddingsText, id="42b")
+
+@dataset(id=".42b")
 # size: 2.03G
 # statistics:
 # cased: true
 # tokens: 42B
 # vocabulary: 2.2M
 # dimension: 300
-def glove_42b(embeddings):
+class Glove42B(Dataset):
     """Glove embeddings trained on Common Crawl with 42B tokens"""
-    return {"path": embeddings}

+    EMBEDDINGS = FileDownloader(
+        "embeddings", "http://nlp.stanford.edu/data/glove.42B.300d.zip"
+    )
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(path=self.EMBEDDINGS.path)

-@filedownloader("embeddings", "http://nlp.stanford.edu/data/glove.840B.300d.zip")
-@dataset(WordEmbeddingsText, id="840b")
+
+@dataset(id=".840b")
 # size: 2.03G
 # statistics:
 # cased: true
 # tokens: 840G
 # vocabulary: 2.2M
 # dimension: 300
-def glove_840b(embeddings):
+class Glove840B(Dataset):
     """Glove embeddings trained on Common Crawl with 840B tokens"""
-    return {"path": embeddings}
+
+    EMBEDDINGS = FileDownloader(
+        "embeddings", "http://nlp.stanford.edu/data/glove.840B.300d.zip"
+    )
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(path=self.EMBEDDINGS.path)
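Two details worth noting in the GloVe hunks: dataset identifiers gain a leading dot (`id="6b"` becomes `id=".6b"`), and a dataset that derives from another one now declares a `reference(...)` class attribute and resolves it with `.prepare()` inside `config()`. The `.6b.300` definition also carries over an apparent copy/paste slip already present in 2026.2.2: its docstring and path still say dimension 200 (`glove.6B.200d.txt`). A sketch of the dependency pattern, with illustrative names and URL (the real classes are in the hunk above):

    @dataset(id=".base")
    class Base(Dataset):
        EMBEDDINGS = ZipDownloader("embeddings", "http://example.org/vectors.zip")

        def config(self) -> WordEmbeddingsText:
            return WordEmbeddingsText.C(path=self.EMBEDDINGS.path)


    @dataset(id=".base.50")
    class Base50(Dataset):
        # reference() points at another Dataset class; prepare() yields its configuration
        BASE = reference(varname="base", reference=Base)

        def config(self) -> WordEmbeddingsText:
            return WordEmbeddingsText.C(path=self.BASE.prepare().path / "vectors.50d.txt")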
datamaestro_text/config/edu/upenn/ldc/aquaint.py
@@ -1,7 +1,7 @@
 """The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""

 from datamaestro.context import DatafolderPath
-from datamaestro.definitions import dataset
+from datamaestro.definitions import Dataset, dataset
 from datamaestro.download.links import links, linkfolder
 from datamaestro_text.data.ir.trec import TipsterCollection

@@ -9,29 +9,47 @@ from datamaestro_text.data.ir.trec import TipsterCollection
 URL = "https://catalog.ldc.upenn.edu/LDC2002T31"


-@linkfolder("documents", [DatafolderPath("edu.upenn.ldc.aquaint", "APW")])
-@dataset(TipsterCollection, url=URL, id="apw")
-def apw(documents):
+@dataset(url=URL, id=".apw")
+class Apw(Dataset):
     """Associated Press (1998-2000)"""
-    return {"path": documents}

+    DOCUMENTS = linkfolder(
+        "documents", [DatafolderPath("edu.upenn.ldc.aquaint", "APW")]
+    )

-@linkfolder("documents", [DatafolderPath("edu.upenn.ldc.aquaint", "NYT")])
-@dataset(TipsterCollection, url=URL, id="nyt")
-def nyt(documents):
+    def config(self) -> TipsterCollection:
+        return TipsterCollection.C(path=self.DOCUMENTS.path)
+
+
+@dataset(url=URL, id=".nyt")
+class Nyt(Dataset):
     """New York Times (1998-2000)"""
-    return {"path": documents}
+
+    DOCUMENTS = linkfolder(
+        "documents", [DatafolderPath("edu.upenn.ldc.aquaint", "NYT")]
+    )
+
+    def config(self) -> TipsterCollection:
+        return TipsterCollection.C(path=self.DOCUMENTS.path)


-@linkfolder("documents", [DatafolderPath("edu.upenn.ldc.aquaint", "XIE")])
-@dataset(TipsterCollection, url=URL, id="xie")
-def xie(documents):
+@dataset(url=URL, id=".xie")
+class Xie(Dataset):
     """Xinhua News Agency newswires (1996-2000)"""
-    return {"path": documents}

+    DOCUMENTS = linkfolder(
+        "documents", [DatafolderPath("edu.upenn.ldc.aquaint", "XIE")]
+    )

-@links("documents", apw=apw.path, nyt=nyt.path, xie=xie.path)
-@dataset(TipsterCollection, url=URL, id="")
-def aquaint(documents):
+    def config(self) -> TipsterCollection:
+        return TipsterCollection.C(path=self.DOCUMENTS.path)
+
+
+@dataset(url=URL, id="")
+class Aquaint(Dataset):
     """Aquaint documents"""
-    return {"path": documents}
+
+    DOCUMENTS = links("documents", apw=Apw, nyt=Nyt, xie=Xie)
+
+    def config(self) -> TipsterCollection:
+        return TipsterCollection.C(path=self.DOCUMENTS.path)
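The combined AQUAINT dataset shows how composition changes in this release: the old `@links("documents", apw=apw.path, ...)` decorator wired in the `.path` attributes of other dataset functions, while the new `links("documents", apw=Apw, nyt=Nyt, xie=Xie)` class attribute passes the `Dataset` classes themselves and leaves path resolution to datamaestro.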
datamaestro_text/config/fr/granddebat.py
@@ -1,9 +1,8 @@
 # See documentation on https://datamaestro.readthedocs.io

-from pathlib import Path
-from datamaestro.definitions import datatags, dataset
+from datamaestro.definitions import Dataset, datatags, dataset
 from datamaestro_text.data.debate import GrandDebatFile
-from datamaestro.download.single import filedownloader
+from datamaestro.download.single import FileDownloader
 from datamaestro.utils import HashCheck
 from datamaestro.stream import Transform
 import io
@@ -46,18 +45,11 @@ class JsonToJsonl(Transform):
         return r_file


-@filedownloader(
-    "la_transition_ecologique_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_TRANSITION_ECOLOGIQUE.json",
-    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def transition(la_transition_ecologique_2019_03_21: Path):
+class Transition(Dataset):
     """Grand Débat National (transition écologique)

     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -71,21 +63,23 @@ def transition(la_transition_ecologique_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-    return GrandDebatFile.C(path=la_transition_ecologique_2019_03_21)
+
+    FILE = FileDownloader(
+        "la_transition_ecologique_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_TRANSITION_ECOLOGIQUE.json",
+        checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)


-@filedownloader(
-    "fiscalité_et_dépenses_publiques_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json",
-    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def fiscalité(fiscalité_et_dépenses_publiques_2019_03_21: Path):
+class Fiscalité(Dataset):
     """Grand Débat National (fiscalité et dépenses publiques)

     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -99,21 +93,23 @@ def fiscalité(fiscalité_et_dépenses_publiques_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-    return GrandDebatFile.C(path=fiscalité_et_dépenses_publiques_2019_03_21)
+
+    FILE = FileDownloader(
+        "fiscalité_et_dépenses_publiques_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json",
+        checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)


-@filedownloader(
-    "democratie_et_citoyennete_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/DEMOCRATIE_ET_CITOYENNETE.json",
-    checker=HashCheck("049aaeca7e51747f2da5d68143c760fa"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def démocratie(democratie_et_citoyennete_2019_03_21: Path):
+class Démocratie(Dataset):
     """Grand Débat National (démocratie et citoyenneté)

     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -127,21 +123,23 @@ def démocratie(democratie_et_citoyennete_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-    return GrandDebatFile.C(path=democratie_et_citoyennete_2019_03_21)
+
+    FILE = FileDownloader(
+        "democratie_et_citoyennete_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/DEMOCRATIE_ET_CITOYENNETE.json",
+        checker=HashCheck("049aaeca7e51747f2da5d68143c760fa"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)


-@filedownloader(
-    "organisation_etat_services_publics_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json",
-    checker=HashCheck("0ccb7c401889f738b73b0caab897a68b"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def organisation(organisation_etat_services_publics_2019_03_21: Path):
+class Organisation(Dataset):
     """Grand Débat National (organisation de l'État et des services publics)

     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -155,21 +153,23 @@ def organisation(organisation_etat_services_publics_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-    return GrandDebatFile.C(path=organisation_etat_services_publics_2019_03_21)
+
+    FILE = FileDownloader(
+        "organisation_etat_services_publics_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json",
+        checker=HashCheck("0ccb7c401889f738b73b0caab897a68b"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)


-@filedownloader(
-    "les_evenements_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LES_EVENEMENTS.json",
-    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def evenements(les_evenements_2019_03_21: Path):
+class Evenements(Dataset):
     """Grand Débat National (événements)

     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -183,4 +183,13 @@ def evenements(les_evenements_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-    return GrandDebatFile.C(path=les_evenements_2019_03_21)
+
+    FILE = FileDownloader(
+        "les_evenements_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LES_EVENEMENTS.json",
+        checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)
datamaestro_text/config/gov/nist/ir/covid.py
@@ -1,9 +1,9 @@
 """CORD-19 dataset"""

 from datamaestro.annotations.agreement import useragreement
-from datamaestro.definitions import datatasks, dataset
+from datamaestro.definitions import Dataset, datatasks, dataset
 from datamaestro.download import reference
-from datamaestro.download.single import filedownloader
+from datamaestro.download.single import FileDownloader
 from datamaestro.utils import HashCheck
 from datamaestro_text.data.ir import Adhoc
 import datamaestro_text.data.ir.cord19 as d_cord19
@@ -14,19 +14,19 @@ cord19_lua = useragreement(

 By accessing, downloading or otherwise using any Journals, Articles, Metadata, Abstracts,
 Full-Texts or any other content types provided in the COVID-19 Open Research Dataset (CORD-19)
-Database (the Data), You expressly acknowledge and agree to the following:
+Database (the "Data"), You expressly acknowledge and agree to the following:

-AI2 grants to You a worldwide, perpetual, non-exclusive, non-transferablelicenseto use and
+• AI2 grants to You a worldwide, perpetual, non-exclusive, non-transferablelicenseto use and
 make derivatives of the Datafor text and data mining only.

-AI2 warrants that it has the right to make the Data available to Youas provided for in and
+• AI2 warrants that it has the right to make the Data available to Youas provided for in and
 subject to this Agreement and in accordance with applicable law. EXCEPT FOR THE LIMITED WARRANTY
-IN THIS SECTION, THE DATA IS PROVIDED AS IS”, WITHOUT ANY WARRANTIES OF ANY KIND.
+IN THIS SECTION, THE DATA IS PROVIDED "AS IS", WITHOUT ANY WARRANTIES OF ANY KIND.

-You agree to comply with all applicable local, state, national, and international laws and
-regulations with respect to AI2s license and Youruse of the Data.• Data provided by AI2 is
+• You agree to comply with all applicable local, state, national, and international laws and
+regulations with respect to AI2's license and Youruse of the Data.• Data provided by AI2 is
 from copyrighted sources of the respective copyright holders. You are solely responsible
-for Your and Your users compliance with any copyright, patent or trademark restrictions
+for Your and Your users' compliance with any copyright, patent or trademark restrictions
 and are referred to the copyright, patent or trademark notices appearing in the original
 sources, all of which are hereby incorporated by reference""",
 id="ai2.cord19",
@@ -34,62 +34,73 @@ sources, all of which are hereby incorporated by reference""",


 @cord19_lua
-@filedownloader(
-    "data.csv",
-    url="https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv",
-    checker=HashCheck("80d664e496b8b7e50a39c6f6bb92e0ef"),
-)
 @dataset(
-    d_cord19.Documents,
     url="https://ir.nist.gov/covidSubmit/index.html",
 )
-def cord19_round5_metadata(data):
+class Cord19Round5Metadata(Dataset):
     """Cord 19 metadata (round 5)

     Released on 2020-07-16
     """
-    return {
-        "path": data,
-        "names_row": 0,
-        # Number of documents
-        "count": 192509,
-    }
-
-
-@filedownloader(
-    "data.xml",
-    url="https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml",
-    checker=HashCheck("0307a37b6b9f1a5f233340a769d538ea"),
-)
-@dataset(d_cord19.Topics)
-def cord19_round5_topics(data):
+
+    DATA = FileDownloader(
+        "data.csv",
+        url="https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv",
+        checker=HashCheck("80d664e496b8b7e50a39c6f6bb92e0ef"),
+    )
+
+    def config(self) -> d_cord19.Documents:
+        return d_cord19.Documents.C(
+            path=self.DATA.path,
+            names_row=0,
+            # Number of documents
+            count=192509,
+        )
+
+
+@dataset()
+class Cord19Round5Topics(Dataset):
     """CORD-19 topics (round 5)"""
-    return {"path": data}

+    DATA = FileDownloader(
+        "data.xml",
+        url="https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml",
+        checker=HashCheck("0307a37b6b9f1a5f233340a769d538ea"),
+    )

-@filedownloader(
-    "data.ssv",
-    url="https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt",
-    checker=HashCheck("8138424a59daea0aba751c8a891e5f54"),
-)
-@dataset(TrecAdhocAssessments)
-def cord19_round5_assessments(data):
+    def config(self) -> d_cord19.Topics:
+        return d_cord19.Topics.C(path=self.DATA.path)
+
+
+@dataset()
+class Cord19Round5Assessments(Dataset):
     """CORD19 assessments (round 5)"""
-    return {"path": data}
+
+    DATA = FileDownloader(
+        "data.ssv",
+        url="https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt",
+        checker=HashCheck("8138424a59daea0aba751c8a891e5f54"),
+    )
+
+    def config(self) -> TrecAdhocAssessments:
+        return TrecAdhocAssessments.C(path=self.DATA.path)


-@reference("collection", cord19_round5_metadata)
-@reference("topics", cord19_round5_topics)
-@reference("qrels", cord19_round5_assessments)
 @datatasks("information retrieval", "passage retrieval")
-@dataset(Adhoc, url="https://ir.nist.gov/covidSubmit/data.html")
-def cord19_round5(topics, qrels, collection):
+@dataset(url="https://ir.nist.gov/covidSubmit/data.html")
+class Cord19Round5(Dataset):
     """CORD-19 IR collection (round 5)

     This is the primary test collection for ad hoc retrieval that is the outcome of all five rounds of TREC-COVID. The test set, called TREC-COVID Complete, consists of the Round 5 document set (July 16 release of CORD-19); the final set of 50 topics; and the cumulative judgments from all assessing rounds with CORD-UIDs mapped to July 16 ids if necessary, previously judged documents no longer in the July 16 release removed, and the last judgments for documents judged multiple times due to significant content changes between rounds. Note that no TREC-COVID submissions correspond to this collection since all TREC-COVID submissions were subject to residual collection evaluation.
     """
-    return {
-        "documents": collection,
-        "topics": topics,
-        "assessments": qrels,
-    }
+
+    COLLECTION = reference(varname="collection", reference=Cord19Round5Metadata)
+    TOPICS = reference(varname="topics", reference=Cord19Round5Topics)
+    QRELS = reference(varname="qrels", reference=Cord19Round5Assessments)
+
+    def config(self) -> Adhoc:
+        return Adhoc.C(
+            documents=self.COLLECTION.prepare(),
+            topics=self.TOPICS.prepare(),
+            assessments=self.QRELS.prepare(),
+        )
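The TREC-COVID hunk shows the same `reference(...)` + `.prepare()` composition at the collection level: the `Adhoc` configuration is now assembled from the prepared configurations of the three referenced datasets. None of this should change how datasets are consumed; a hedged usage sketch follows, using datamaestro's `prepare_dataset` helper, though the exact identifier below is inferred from the config module path and should be verified against the installed package:

    from datamaestro import prepare_dataset

    # Identifier inferred from gov/nist/ir/covid.py; check it with the datamaestro CLI.
    adhoc = prepare_dataset("gov.nist.ir.covid.cord19_round5")
    print(adhoc.topics, adhoc.assessments)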