datamaestro-text 2026.2.2-py3-none-any.whl → 2026.2.3-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry; it is provided for informational purposes only.
- datamaestro_text/config/com/github/aagohary/canard.py +27 -24
- datamaestro_text/config/com/github/apple/ml-qrecc.py +30 -25
- datamaestro_text/config/com/github/ikat.py +76 -61
- datamaestro_text/config/com/github/prdwb/orconvqa.py +41 -37
- datamaestro_text/config/com/microsoft/msmarco/passage.py +278 -207
- datamaestro_text/config/com/oscar-corpus.py +13 -10
- datamaestro_text/config/com/sentiment140.py +17 -12
- datamaestro_text/config/com/smashwords/bookcorpus.py +13 -10
- datamaestro_text/config/edu/stanford/aclimdb.py +14 -9
- datamaestro_text/config/edu/stanford/glove.py +66 -31
- datamaestro_text/config/edu/upenn/ldc/aquaint.py +35 -17
- datamaestro_text/config/fr/granddebat.py +57 -48
- datamaestro_text/config/gov/nist/ir/covid.py +61 -50
- datamaestro_text/config/gov/nist/trec/adhoc.py +395 -255
- datamaestro_text/config/gov/nist/trec/tipster.py +170 -64
- datamaestro_text/config/io/github/thunlp/fewrel.py +20 -15
- datamaestro_text/config/io/metamind/research/wikitext.py +50 -32
- datamaestro_text/config/org/grouplens/movielens.py +28 -37
- datamaestro_text/config/org/universaldependencies/french.py +16 -11
- datamaestro_text/test/test_documented.py +2 -2
- datamaestro_text/version.py +2 -2
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/METADATA +2 -2
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/RECORD +26 -26
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/WHEEL +0 -0
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/licenses/LICENSE +0 -0
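Every configuration module below follows the same migration: downloads that 2026.2.2 declared as module-level decorators feeding a plain function are, in 2026.2.3, declared inside a class deriving from Dataset, with each download held as a class attribute and the typed configuration built by a config() method. A condensed sketch of the two styles, using the sentiment140 module from the first hunk below as the model (abridged for illustration, not the complete file):

# 2026.2.2 style: downloads were decorators and the function received
# the downloaded paths as arguments:
#
#     @zipdownloader("dir", URL, checker=HashCheck(MD5))
#     @dataset(...)
#     def english(dir): ...
#
# 2026.2.3 style: the dataset is a Dataset subclass; downloads are class
# attributes and config() assembles the configuration objects.
from datamaestro.data.csv import Generic
from datamaestro.data.ml import Supervised
from datamaestro.definitions import Dataset, dataset
from datamaestro.download.archive import ZipDownloader
from datamaestro.utils import HashCheck


@dataset(url="http://help.sentiment140.com/for-students/", size="228M")
class English(Dataset):
    DIR = ZipDownloader(
        "dir",
        "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip",
        checker=HashCheck("1647eb110dd2492512e27b9a70d5d1bc"),
    )

    def config(self) -> Supervised:
        # The .C(...) helpers build configuration instances from keyword arguments
        return Supervised.C(
            train=Generic.C(
                path=self.DIR.path / "training.1600000.processed.noemoticon.csv"
            ),
            test=Generic.C(path=self.DIR.path / "testdata.manual.2009.06.14.csv"),
        )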
datamaestro_text/config/com/sentiment140.py

@@ -1,19 +1,14 @@
 from datamaestro.data.csv import Generic
-from datamaestro.definitions import datatasks, datatags, dataset
-from datamaestro.download.archive import zipdownloader
+from datamaestro.definitions import Dataset, datatasks, datatags, dataset
+from datamaestro.download.archive import ZipDownloader
 from datamaestro.data.ml import Supervised
 from datamaestro.utils import HashCheck
 
 
-@zipdownloader(
-    "dir",
-    "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip",
-    checker=HashCheck("1647eb110dd2492512e27b9a70d5d1bc"),
-)
 @datatags("english", "sentiment", "text")
 @datatasks("sentiment analysis")
-@dataset(…
-def english(dir):
+@dataset(url="http://help.sentiment140.com/for-students/", size="228M")
+class English(Dataset):
     """Sentiment analysis dataset 140
 
     The data is a CSV with emoticons removed. Data file format has 6 fields:
@@ -26,7 +21,17 @@ def english(dir):
 
     If you use this data, please cite Sentiment140 as your source.
     """
-…
-…
-…
+
+    DIR = ZipDownloader(
+        "dir",
+        "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip",
+        checker=HashCheck("1647eb110dd2492512e27b9a70d5d1bc"),
     )
+
+    def config(self) -> Supervised:
+        return Supervised.C(
+            train=Generic.C(
+                path=self.DIR.path / "training.1600000.processed.noemoticon.csv"
+            ),
+            test=Generic.C(path=self.DIR.path / "testdata.manual.2009.06.14.csv"),
+        )
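For consumers the rewrite is transparent when datasets are loaded by identifier. A minimal usage sketch, assuming the identifier derived from the module path and the former function name (an assumption, to be checked with the datamaestro search command):

from datamaestro import prepare_dataset

# "com.sentiment140.english" is an assumed identifier (module path plus the
# former function name); verify it with `datamaestro search`.
ds = prepare_dataset("com.sentiment140.english")
print(ds.train.path)  # training.1600000.processed.noemoticon.csv
print(ds.test.path)   # testdata.manual.2009.06.14.csv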
datamaestro_text/config/com/smashwords/bookcorpus.py

@@ -1,23 +1,26 @@
 # See documentation on https://datamaestro.readthedocs.io
 
-from datamaestro.definitions import datatasks, datatags, dataset
+from datamaestro.definitions import Dataset, datatasks, datatags, dataset
 from datamaestro_text.data.text import TextFolder
-from datamaestro.download.archive import tardownloader
+from datamaestro.download.archive import TarDownloader
 from datamaestro.utils import HashCheck
 
 
 @datatags("text", "books", "English")
 @datatasks("language modeling")
-@tardownloader(
-    "folder",
-    "https://storage.googleapis.com/huggingface-nlp/datasets/bookcorpus/bookcorpus.tar.bz2",
-    checker=HashCheck("5c906ede3c5265f8934b62c275a754bc"),
-)
-@dataset(TextFolder, id="", url="https://yknzhu.wixsite.com/mbweb", size="4.3G")
-def main(folder):
+@dataset(id="", url="https://yknzhu.wixsite.com/mbweb", size="4.3G")
+class Main(Dataset):
     """Unpublished books from Smashwords
 
     The books are concatened in two files hosted on huggingface NLP storage.
     Each sentence is on a separate line and tokens are space separated.
     """
-…
+
+    FOLDER = TarDownloader(
+        "folder",
+        "https://storage.googleapis.com/huggingface-nlp/datasets/bookcorpus/bookcorpus.tar.bz2",
+        checker=HashCheck("5c906ede3c5265f8934b62c275a754bc"),
+    )
+
+    def config(self) -> TextFolder:
+        return TextFolder.C(path=self.FOLDER.path)
datamaestro_text/config/edu/stanford/aclimdb.py

@@ -1,16 +1,21 @@
 from datamaestro.data.ml import FolderBased, Supervised
-from datamaestro.definitions import dataset
-from datamaestro.download.archive import tardownloader
+from datamaestro.definitions import Dataset, dataset
+from datamaestro.download.archive import TarDownloader
 
 
-@…
-…
-def aclimdb(data):
+@dataset(url="http://ai.stanford.edu/~amaas/data/sentiment/", id="")
+class Aclimdb(Dataset):
     """Large Movie Review Dataset
 
     Paper http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
     """
-…
-…
-    "…
-…
+
+    DATA = TarDownloader(
+        "data", "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
+    )
+
+    def config(self) -> Supervised:
+        return Supervised.C(
+            train=FolderBased.C(path=self.DATA.path / "train", classes=["neg", "pos"]),
+            test=FolderBased.C(path=self.DATA.path / "test", classes=["neg", "pos"]),
+        )
datamaestro_text/config/edu/stanford/glove.py

@@ -4,10 +4,10 @@ GloVe is an unsupervised learning algorithm for obtaining vector representations
 and the resulting representations showcase interesting linear substructures of the word vector space.
 """
 
-from datamaestro.definitions import dataset
+from datamaestro.definitions import Dataset, dataset
 from datamaestro.download import reference
-from datamaestro.download.archive import zipdownloader
-from datamaestro.download.single import filedownloader
+from datamaestro.download.archive import ZipDownloader
+from datamaestro.download.single import FileDownloader
 from datamaestro_text.data.embeddings import WordEmbeddingsText
 
 
@@ -16,65 +16,100 @@ from datamaestro_text.data.embeddings import WordEmbeddingsText
 # tokens: 6G
 # vocabulary: 400K
 # cased: false
-@…
-…
-def glove_6b(embeddings):
+@dataset(id=".6b")
+class Glove6B(Dataset):
     """Embeddings for 6B words in various dimensions"""
-    return {"path": embeddings}
 
+    EMBEDDINGS = ZipDownloader(
+        "embeddings", "http://nlp.stanford.edu/data/glove.6B.zip"
+    )
 
-…
-…
-…
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(path=self.EMBEDDINGS.path)
+
+
+@dataset(id=".6b.50")
+class Glove6B50(Dataset):
     """Glove 6B - dimension 50"""
-…
+
+    DATA_6B = reference(varname="data_6b", reference=Glove6B)
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(
+            path=self.DATA_6B.prepare().path / "glove.6B.50d.txt"
+        )
 
 
-@…
-…
-def glove_6b_100(data_6b):
+@dataset(id=".6b.100")
+class Glove6B100(Dataset):
     """Glove 6B - dimension 100"""
-    return {"path": data_6b.path / "glove.6B.100d.txt"}
 
+    DATA_6B = reference(varname="data_6b", reference=Glove6B)
 
-…
-…
-…
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(
+            path=self.DATA_6B.prepare().path / "glove.6B.100d.txt"
+        )
+
+
+@dataset(id=".6b.200")
+class Glove6B200(Dataset):
     """Glove 6B - dimension 200"""
-…
+
+    DATA_6B = reference(varname="data_6b", reference=Glove6B)
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(
+            path=self.DATA_6B.prepare().path / "glove.6B.200d.txt"
+        )
 
 
 ...
 
 
-@…
-…
-def glove_6b_300(data_6b):
+@dataset(id=".6b.300")
+class Glove6B300(Dataset):
     """Glove 6B - dimension 200"""
-    return {"path": data_6b.path / "glove.6B.200d.txt"}
 
+    DATA_6B = reference(varname="data_6b", reference=Glove6B)
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(
+            path=self.DATA_6B.prepare().path / "glove.6B.200d.txt"
+        )
 
-…
-@dataset(…
+
+@dataset(id=".42b")
 # size: 2.03G
 # statistics:
 # cased: true
 # tokens: 42B
 # vocabulary: 2.2M
 # dimension: 300
-…
+class Glove42B(Dataset):
     """Glove embeddings trained on Common Crawl with 42B tokens"""
-    return {"path": embeddings}
 
+    EMBEDDINGS = FileDownloader(
+        "embeddings", "http://nlp.stanford.edu/data/glove.42B.300d.zip"
+    )
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(path=self.EMBEDDINGS.path)
 
-…
-@dataset(…
+
+@dataset(id=".840b")
 # size: 2.03G
 # statistics:
 # cased: true
 # tokens: 840G
 # vocabulary: 2.2M
 # dimension: 300
-…
+class Glove840B(Dataset):
     """Glove embeddings trained on Common Crawl with 840B tokens"""
-…
+
+    EMBEDDINGS = FileDownloader(
+        "embeddings", "http://nlp.stanford.edu/data/glove.840B.300d.zip"
+    )
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(path=self.EMBEDDINGS.path)
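glove.py also shows how one dataset builds on another in the class-based style: the per-dimension datasets download nothing themselves, holding instead a reference to Glove6B and resolving a file inside the unzipped archive when config() runs. Condensed from the hunk above (Glove6B is the class defined earlier in the same module):

from datamaestro.definitions import Dataset, dataset
from datamaestro.download import reference
from datamaestro_text.data.embeddings import WordEmbeddingsText


@dataset(id=".6b.50")
class Glove6B50(Dataset):
    # Reuse the Glove6B download rather than fetching the archive again;
    # prepare() on the reference resolves (and, if needed, triggers) it.
    DATA_6B = reference(varname="data_6b", reference=Glove6B)

    def config(self) -> WordEmbeddingsText:
        return WordEmbeddingsText.C(
            path=self.DATA_6B.prepare().path / "glove.6B.50d.txt"
        )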
datamaestro_text/config/edu/upenn/ldc/aquaint.py

@@ -1,7 +1,7 @@
 """The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""
 
 from datamaestro.context import DatafolderPath
-from datamaestro.definitions import dataset
+from datamaestro.definitions import Dataset, dataset
 from datamaestro.download.links import links, linkfolder
 from datamaestro_text.data.ir.trec import TipsterCollection
 
@@ -9,29 +9,47 @@ from datamaestro_text.data.ir.trec import TipsterCollection
 URL = "https://catalog.ldc.upenn.edu/LDC2002T31"
 
 
-@…
-…
-def apw(documents):
+@dataset(url=URL, id=".apw")
+class Apw(Dataset):
     """Associated Press (1998-2000)"""
-    return {"path": documents}
 
+    DOCUMENTS = linkfolder(
+        "documents", [DatafolderPath("edu.upenn.ldc.aquaint", "APW")]
+    )
 
-…
-…
-…
+    def config(self) -> TipsterCollection:
+        return TipsterCollection.C(path=self.DOCUMENTS.path)
+
+
+@dataset(url=URL, id=".nyt")
+class Nyt(Dataset):
     """New York Times (1998-2000)"""
-…
+
+    DOCUMENTS = linkfolder(
+        "documents", [DatafolderPath("edu.upenn.ldc.aquaint", "NYT")]
+    )
+
+    def config(self) -> TipsterCollection:
+        return TipsterCollection.C(path=self.DOCUMENTS.path)
 
 
-@…
-…
-def xie(documents):
+@dataset(url=URL, id=".xie")
+class Xie(Dataset):
     """Xinhua News Agency newswires (1996-2000)"""
-    return {"path": documents}
 
+    DOCUMENTS = linkfolder(
+        "documents", [DatafolderPath("edu.upenn.ldc.aquaint", "XIE")]
+    )
 
-…
-…
-…
+    def config(self) -> TipsterCollection:
+        return TipsterCollection.C(path=self.DOCUMENTS.path)
+
+
+@dataset(url=URL, id="")
+class Aquaint(Dataset):
     """Aquaint documents"""
-…
+
+    DOCUMENTS = links("documents", apw=Apw, nyt=Nyt, xie=Xie)
+
+    def config(self) -> TipsterCollection:
+        return TipsterCollection.C(path=self.DOCUMENTS.path)
datamaestro_text/config/fr/granddebat.py

@@ -1,9 +1,8 @@
 # See documentation on https://datamaestro.readthedocs.io
 
-from …
-from datamaestro.definitions import datatags, dataset
+from datamaestro.definitions import Dataset, datatags, dataset
 from datamaestro_text.data.debate import GrandDebatFile
-from datamaestro.download.single import filedownloader
+from datamaestro.download.single import FileDownloader
 from datamaestro.utils import HashCheck
 from datamaestro.stream import Transform
 import io
@@ -46,18 +45,11 @@ class JsonToJsonl(Transform):
         return r_file
 
 
-@filedownloader(
-    "la_transition_ecologique_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_TRANSITION_ECOLOGIQUE.json",
-    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def transition(la_transition_ecologique_2019_03_21: Path):
+class Transition(Dataset):
     """Grand Débat National (transition écologique)
 
     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -71,21 +63,23 @@ def transition(la_transition_ecologique_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-…
+
+    FILE = FileDownloader(
+        "la_transition_ecologique_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_TRANSITION_ECOLOGIQUE.json",
+        checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)
 
 
-@filedownloader(
-    "fiscalité_et_dépenses_publiques_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json",
-    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def fiscalité(fiscalité_et_dépenses_publiques_2019_03_21: Path):
+class Fiscalité(Dataset):
     """Grand Débat National (fiscalité et dépenses publiques)
 
     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -99,21 +93,23 @@ def fiscalité(fiscalité_et_dépenses_publiques_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-…
+
+    FILE = FileDownloader(
+        "fiscalité_et_dépenses_publiques_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json",
+        checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)
 
 
-@filedownloader(
-    "democratie_et_citoyennete_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/DEMOCRATIE_ET_CITOYENNETE.json",
-    checker=HashCheck("049aaeca7e51747f2da5d68143c760fa"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def démocratie(democratie_et_citoyennete_2019_03_21: Path):
+class Démocratie(Dataset):
     """Grand Débat National (démocratie et citoyenneté)
 
     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -127,21 +123,23 @@ def démocratie(democratie_et_citoyennete_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-…
+
+    FILE = FileDownloader(
+        "democratie_et_citoyennete_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/DEMOCRATIE_ET_CITOYENNETE.json",
+        checker=HashCheck("049aaeca7e51747f2da5d68143c760fa"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)
 
 
-@filedownloader(
-    "organisation_etat_services_publics_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json",
-    checker=HashCheck("0ccb7c401889f738b73b0caab897a68b"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def organisation(organisation_etat_services_publics_2019_03_21: Path):
+class Organisation(Dataset):
     """Grand Débat National (organisation de l'État et des services publics)
 
     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -155,21 +153,23 @@ def organisation(organisation_etat_services_publics_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-…
+
+    FILE = FileDownloader(
+        "organisation_etat_services_publics_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json",
+        checker=HashCheck("0ccb7c401889f738b73b0caab897a68b"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)
 
 
-@filedownloader(
-    "les_evenements_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LES_EVENEMENTS.json",
-    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def evenements(les_evenements_2019_03_21: Path):
+class Evenements(Dataset):
     """Grand Débat National (événements)
 
     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -183,4 +183,13 @@ def evenements(les_evenements_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-…
+
+    FILE = FileDownloader(
+        "les_evenements_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LES_EVENEMENTS.json",
+        checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)
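All five Grand Débat datasets keep the same download-time conversion: each FileDownloader is given transforms=JsonToJsonl(), so the official JSON dump is stored locally as JSON Lines. A reading sketch, assuming standard JSONL output with one contribution per line (the path shown is hypothetical; the real file lives in the datamaestro data directory and is exposed through the GrandDebatFile configuration):

import json
from pathlib import Path

# Hypothetical local path for illustration only
path = Path("la_transition_ecologique_2019_03_21.jsonl")
with path.open(encoding="utf-8") as fp:
    for line in fp:
        contribution = json.loads(line)  # one JSON object per line
        # ... process the contribution dict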
datamaestro_text/config/gov/nist/ir/covid.py

@@ -1,9 +1,9 @@
 """CORD-19 dataset"""
 
 from datamaestro.annotations.agreement import useragreement
-from datamaestro.definitions import datatasks, dataset
+from datamaestro.definitions import Dataset, datatasks, dataset
 from datamaestro.download import reference
-from datamaestro.download.single import filedownloader
+from datamaestro.download.single import FileDownloader
 from datamaestro.utils import HashCheck
 from datamaestro_text.data.ir import Adhoc
 import datamaestro_text.data.ir.cord19 as d_cord19
@@ -14,19 +14,19 @@ cord19_lua = useragreement(
 
 By accessing, downloading or otherwise using any Journals, Articles, Metadata, Abstracts,
 Full-Texts or any other content types provided in the COVID-19 Open Research Dataset (CORD-19)
-Database (the …
+Database (the "Data"), You expressly acknowledge and agree to the following:
 
-…
+\u2022 AI2 grants to You a worldwide, perpetual, non-exclusive, non-transferablelicenseto use and
 make derivatives of the Datafor text and data mining only.
 
-…
+\u2022 AI2 warrants that it has the right to make the Data available to Youas provided for in and
 subject to this Agreement and in accordance with applicable law. EXCEPT FOR THE LIMITED WARRANTY
-IN THIS SECTION, THE DATA IS PROVIDED …
+IN THIS SECTION, THE DATA IS PROVIDED "AS IS", WITHOUT ANY WARRANTIES OF ANY KIND.
 
-…
-regulations with respect to AI2…
+\u2022 You agree to comply with all applicable local, state, national, and international laws and
+regulations with respect to AI2's license and Youruse of the Data.\u2022 Data provided by AI2 is
 from copyrighted sources of the respective copyright holders. You are solely responsible
-for Your and Your users…
+for Your and Your users' compliance with any copyright, patent or trademark restrictions
 and are referred to the copyright, patent or trademark notices appearing in the original
 sources, all of which are hereby incorporated by reference""",
     id="ai2.cord19",
@@ -34,62 +34,73 @@ sources, all of which are hereby incorporated by reference""",
 
 
 @cord19_lua
-@filedownloader(
-    "data.csv",
-    url="https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv",
-    checker=HashCheck("80d664e496b8b7e50a39c6f6bb92e0ef"),
-)
 @dataset(
-    d_cord19.Documents,
     url="https://ir.nist.gov/covidSubmit/index.html",
 )
-def cord19_round5_metadata(data):
+class Cord19Round5Metadata(Dataset):
     """Cord 19 metadata (round 5)
 
     Released on 2020-07-16
     """
-…
-…
-    "…
-…
-    "…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
+
+    DATA = FileDownloader(
+        "data.csv",
+        url="https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv",
+        checker=HashCheck("80d664e496b8b7e50a39c6f6bb92e0ef"),
+    )
+
+    def config(self) -> d_cord19.Documents:
+        return d_cord19.Documents.C(
+            path=self.DATA.path,
+            names_row=0,
+            # Number of documents
+            count=192509,
+        )
+
+
+@dataset()
+class Cord19Round5Topics(Dataset):
     """CORD-19 topics (round 5)"""
-    return {"path": data}
 
+    DATA = FileDownloader(
+        "data.xml",
+        url="https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml",
+        checker=HashCheck("0307a37b6b9f1a5f233340a769d538ea"),
+    )
 
-…
-…
-…
-…
-)
-…
-def cord19_round5_assessments(data):
+    def config(self) -> d_cord19.Topics:
+        return d_cord19.Topics.C(path=self.DATA.path)
+
+
+@dataset()
+class Cord19Round5Assessments(Dataset):
     """CORD19 assessments (round 5)"""
-…
+
+    DATA = FileDownloader(
+        "data.ssv",
+        url="https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt",
+        checker=HashCheck("8138424a59daea0aba751c8a891e5f54"),
+    )
+
+    def config(self) -> TrecAdhocAssessments:
+        return TrecAdhocAssessments.C(path=self.DATA.path)
 
 
-@reference("collection", cord19_round5_metadata)
-@reference("topics", cord19_round5_topics)
-@reference("qrels", cord19_round5_assessments)
 @datatasks("information retrieval", "passage retrieval")
-@dataset(…
-…
+@dataset(url="https://ir.nist.gov/covidSubmit/data.html")
+class Cord19Round5(Dataset):
     """CORD-19 IR collection (round 5)
 
     This is the primary test collection for ad hoc retrieval that is the outcome of all five rounds of TREC-COVID. The test set, called TREC-COVID Complete, consists of the Round 5 document set (July 16 release of CORD-19); the final set of 50 topics; and the cumulative judgments from all assessing rounds with CORD-UIDs mapped to July 16 ids if necessary, previously judged documents no longer in the July 16 release removed, and the last judgments for documents judged multiple times due to significant content changes between rounds. Note that no TREC-COVID submissions correspond to this collection since all TREC-COVID submissions were subject to residual collection evaluation.
     """
-…
-…
-…
-…
-…
+
+    COLLECTION = reference(varname="collection", reference=Cord19Round5Metadata)
+    TOPICS = reference(varname="topics", reference=Cord19Round5Topics)
+    QRELS = reference(varname="qrels", reference=Cord19Round5Assessments)
+
+    def config(self) -> Adhoc:
+        return Adhoc.C(
+            documents=self.COLLECTION.prepare(),
+            topics=self.TOPICS.prepare(),
+            assessments=self.QRELS.prepare(),
+        )