datamaestro-text 2025.7.28__py3-none-any.whl → 2026.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/config/com/github/aagohary/canard.py +3 -3
- datamaestro_text/config/com/github/apple/ml-qrecc.py +2 -1
- datamaestro_text/config/com/microsoft/msmarco/passage.py +1 -1
- datamaestro_text/config/com/oscar-corpus.py +1 -1
- datamaestro_text/config/com/sentiment140.py +2 -2
- datamaestro_text/config/com/smashwords/bookcorpus.py +1 -1
- datamaestro_text/config/edu/stanford/aclimdb.py +2 -2
- datamaestro_text/config/fr/granddebat.py +186 -0
- datamaestro_text/config/io/github/thunlp/fewrel.py +1 -1
- datamaestro_text/config/io/metamind/research/wikitext.py +3 -3
- datamaestro_text/config/org/grouplens/movielens.py +8 -8
- datamaestro_text/config/org/universaldependencies/french.py +3 -3
- datamaestro_text/data/conversation/base.py +2 -2
- datamaestro_text/data/debate/__init__.py +5 -0
- datamaestro_text/data/debate/granddebat.py +68 -0
- datamaestro_text/data/ir/__init__.py +19 -2
- datamaestro_text/data/ir/csv.py +7 -8
- datamaestro_text/data/ir/formats.py +1 -3
- datamaestro_text/datasets/irds/data.py +24 -13
- datamaestro_text/datasets/irds/datasets.py +1 -1
- datamaestro_text/transforms/ir/__init__.py +1 -1
- datamaestro_text/version.py +16 -3
- {datamaestro_text-2025.7.28.dist-info → datamaestro_text-2026.1.1.dist-info}/METADATA +20 -16
- {datamaestro_text-2025.7.28.dist-info → datamaestro_text-2026.1.1.dist-info}/RECORD +27 -25
- {datamaestro_text-2025.7.28.dist-info → datamaestro_text-2026.1.1.dist-info}/WHEEL +1 -2
- datamaestro_text-2025.7.28.dist-info/top_level.txt +0 -1
- {datamaestro_text-2025.7.28.dist-info → datamaestro_text-2026.1.1.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2025.7.28.dist-info → datamaestro_text-2026.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -37,7 +37,7 @@ def main(train, dev, test):
|
|
|
37
37
|
Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
|
|
38
38
|
"""
|
|
39
39
|
return {
|
|
40
|
-
"train": CanardDataset(path=train),
|
|
41
|
-
"validation": CanardDataset(path=dev),
|
|
42
|
-
"test": CanardDataset(path=test),
|
|
40
|
+
"train": CanardDataset.C(path=train),
|
|
41
|
+
"validation": CanardDataset.C(path=dev),
|
|
42
|
+
"test": CanardDataset.C(path=test),
|
|
43
43
|
}
|
|
@@ -51,6 +51,7 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
|
|
|
51
51
|
)
|
|
52
52
|
class Content(LZ4JSONLDocumentStore):
|
|
53
53
|
"""QReCC mentionned URLs content"""
|
|
54
|
+
|
|
54
55
|
@staticmethod
|
|
55
56
|
def __create_dataset__(dataset, options=None):
|
|
56
57
|
ds = reference(reference=main).setup(dataset, options)
|
|
@@ -65,7 +66,7 @@ class Content(LZ4JSONLDocumentStore):
|
|
|
65
66
|
"id",
|
|
66
67
|
).setup(dataset, options)
|
|
67
68
|
|
|
68
|
-
return Content(jsonl_path=store_path)
|
|
69
|
+
return Content.C(jsonl_path=store_path)
|
|
69
70
|
|
|
70
71
|
@staticmethod
|
|
71
72
|
def _documents(path: Path):
|
|
@@ -47,7 +47,7 @@ http://www.msmarco.org/dataset.aspx""",
|
|
|
47
47
|
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
48
48
|
def collection_etc(data) -> Folder:
|
|
49
49
|
"""Documents and some more files"""
|
|
50
|
-
return Folder(path=data)
|
|
50
|
+
return Folder.C(path=data)
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
@lua
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from datamaestro.definitions import
|
|
1
|
+
from datamaestro.definitions import datatasks, datatags, dataset
|
|
2
2
|
from datamaestro.download.single import filedownloader
|
|
3
3
|
from datamaestro_text.data.text import TextFile
|
|
4
4
|
from datamaestro.utils import HashCheck
|
|
@@ -27,6 +27,6 @@ def english(dir):
|
|
|
27
27
|
If you use this data, please cite Sentiment140 as your source.
|
|
28
28
|
"""
|
|
29
29
|
return Supervised.C(
|
|
30
|
-
train=Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
|
|
31
|
-
test=Generic(path=dir / "testdata.manual.2009.06.14.csv"),
|
|
30
|
+
train=Generic.C(path=dir / "training.1600000.processed.noemoticon.csv"),
|
|
31
|
+
test=Generic.C(path=dir / "testdata.manual.2009.06.14.csv"),
|
|
32
32
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# See documentation on https://datamaestro.readthedocs.io
|
|
2
2
|
|
|
3
|
-
from datamaestro.definitions import
|
|
3
|
+
from datamaestro.definitions import datatasks, datatags, dataset
|
|
4
4
|
from datamaestro_text.data.text import TextFolder
|
|
5
5
|
from datamaestro.download.archive import tardownloader
|
|
6
6
|
from datamaestro.utils import HashCheck
|
|
@@ -11,6 +11,6 @@ def aclimdb(data):
|
|
|
11
11
|
Paper http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
|
|
12
12
|
"""
|
|
13
13
|
return {
|
|
14
|
-
"train": FolderBased(path=data / "train", classes=["neg", "pos"]),
|
|
15
|
-
"test": FolderBased(path=data / "test", classes=["neg", "pos"]),
|
|
14
|
+
"train": FolderBased.C(path=data / "train", classes=["neg", "pos"]),
|
|
15
|
+
"test": FolderBased.C(path=data / "test", classes=["neg", "pos"]),
|
|
16
16
|
}
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from datamaestro.definitions import datatags, dataset
|
|
5
|
+
from datamaestro_text.data.debate import GrandDebatFile
|
|
6
|
+
from datamaestro.download.single import filedownloader
|
|
7
|
+
from datamaestro.utils import HashCheck
|
|
8
|
+
from datamaestro.stream import Transform
|
|
9
|
+
import io
|
|
10
|
+
import json
|
|
11
|
+
import ijson
|
|
12
|
+
import os
|
|
13
|
+
import threading
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class JsonToJsonl(Transform):
|
|
17
|
+
"""Transforms a JSON file with an array into a JSONL file with one line per
|
|
18
|
+
array element"""
|
|
19
|
+
|
|
20
|
+
def __call__(self, fileobj: io.IOBase) -> io.IOBase:
|
|
21
|
+
# Stream items from the top-level array into a read-end pipe.
|
|
22
|
+
try:
|
|
23
|
+
fileobj.seek(0)
|
|
24
|
+
except Exception:
|
|
25
|
+
pass
|
|
26
|
+
|
|
27
|
+
r_fd, w_fd = os.pipe()
|
|
28
|
+
r_file = os.fdopen(r_fd, "rb")
|
|
29
|
+
w_file = os.fdopen(w_fd, "wb")
|
|
30
|
+
|
|
31
|
+
def _writer(fin, fout):
|
|
32
|
+
try:
|
|
33
|
+
for item in ijson.items(fin, "item"):
|
|
34
|
+
line = json.dumps(item, ensure_ascii=False) + "\n"
|
|
35
|
+
fout.write(line.encode("utf-8"))
|
|
36
|
+
fout.close()
|
|
37
|
+
except Exception:
|
|
38
|
+
try:
|
|
39
|
+
fout.close()
|
|
40
|
+
except Exception:
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
t = threading.Thread(target=_writer, args=(fileobj, w_file), daemon=True)
|
|
44
|
+
t.start()
|
|
45
|
+
|
|
46
|
+
return r_file
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@filedownloader(
|
|
50
|
+
"la_transition_ecologique_2019_03_21.jsonl",
|
|
51
|
+
"http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_TRANSITION_ECOLOGIQUE.json",
|
|
52
|
+
checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
|
|
53
|
+
transforms=JsonToJsonl(),
|
|
54
|
+
)
|
|
55
|
+
@datatags("politics", "debate", "french")
|
|
56
|
+
@dataset(
|
|
57
|
+
GrandDebatFile,
|
|
58
|
+
url="https://granddebat.fr",
|
|
59
|
+
)
|
|
60
|
+
def transition(la_transition_ecologique_2019_03_21: Path):
|
|
61
|
+
"""Grand Débat National (transition écologique)
|
|
62
|
+
|
|
63
|
+
The *Grand Débat National* (GDN) is a country-wide citizen consultation held
|
|
64
|
+
in France in 2019.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
The consultation prompted citizens to express their views across four main
|
|
68
|
+
themes: *Taxation and public spending*, *Organization of the state and
|
|
69
|
+
public services*, *Democracy and citizenship*, and *Ecological transition*.
|
|
70
|
+
A significant portion of this consultation involved online questionnaires,
|
|
71
|
+
each concluding with a critical open-ended prompt: "Do you have anything to
|
|
72
|
+
add about [theme]?".
|
|
73
|
+
"""
|
|
74
|
+
return GrandDebatFile.C(path=la_transition_ecologique_2019_03_21)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@filedownloader(
|
|
78
|
+
"fiscalité_et_dépenses_publiques_2019_03_21.jsonl",
|
|
79
|
+
"http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json",
|
|
80
|
+
checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
|
|
81
|
+
transforms=JsonToJsonl(),
|
|
82
|
+
)
|
|
83
|
+
@datatags("politics", "debate", "french")
|
|
84
|
+
@dataset(
|
|
85
|
+
GrandDebatFile,
|
|
86
|
+
url="https://granddebat.fr",
|
|
87
|
+
)
|
|
88
|
+
def fiscalité(fiscalité_et_dépenses_publiques_2019_03_21: Path):
|
|
89
|
+
"""Grand Débat National (fiscalité et dépenses publiques)
|
|
90
|
+
|
|
91
|
+
The *Grand Débat National* (GDN) is a country-wide citizen consultation held
|
|
92
|
+
in France in 2019.
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
The consultation prompted citizens to express their views across four main
|
|
96
|
+
themes: *Taxation and public spending*, *Organization of the state and
|
|
97
|
+
public services*, *Democracy and citizenship*, and *Ecological transition*.
|
|
98
|
+
A significant portion of this consultation involved online questionnaires,
|
|
99
|
+
each concluding with a critical open-ended prompt: "Do you have anything to
|
|
100
|
+
add about [theme]?".
|
|
101
|
+
"""
|
|
102
|
+
return GrandDebatFile.C(path=fiscalité_et_dépenses_publiques_2019_03_21)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@filedownloader(
|
|
106
|
+
"democratie_et_citoyennete_2019_03_21.jsonl",
|
|
107
|
+
"http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/DEMOCRATIE_ET_CITOYENNETE.json",
|
|
108
|
+
checker=HashCheck("049aaeca7e51747f2da5d68143c760fa"),
|
|
109
|
+
transforms=JsonToJsonl(),
|
|
110
|
+
)
|
|
111
|
+
@datatags("politics", "debate", "french")
|
|
112
|
+
@dataset(
|
|
113
|
+
GrandDebatFile,
|
|
114
|
+
url="https://granddebat.fr",
|
|
115
|
+
)
|
|
116
|
+
def démocratie(democratie_et_citoyennete_2019_03_21: Path):
|
|
117
|
+
"""Grand Débat National (démocratie et citoyenneté)
|
|
118
|
+
|
|
119
|
+
The *Grand Débat National* (GDN) is a country-wide citizen consultation held
|
|
120
|
+
in France in 2019.
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
The consultation prompted citizens to express their views across four main
|
|
124
|
+
themes: *Taxation and public spending*, *Organization of the state and
|
|
125
|
+
public services*, *Democracy and citizenship*, and *Ecological transition*.
|
|
126
|
+
A significant portion of this consultation involved online questionnaires,
|
|
127
|
+
each concluding with a critical open-ended prompt: "Do you have anything to
|
|
128
|
+
add about [theme]?".
|
|
129
|
+
"""
|
|
130
|
+
return GrandDebatFile.C(path=democratie_et_citoyennete_2019_03_21)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@filedownloader(
|
|
134
|
+
"organisation_etat_services_publics_2019_03_21.jsonl",
|
|
135
|
+
"http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json",
|
|
136
|
+
checker=HashCheck("0ccb7c401889f738b73b0caab897a68b"),
|
|
137
|
+
transforms=JsonToJsonl(),
|
|
138
|
+
)
|
|
139
|
+
@datatags("politics", "debate", "french")
|
|
140
|
+
@dataset(
|
|
141
|
+
GrandDebatFile,
|
|
142
|
+
url="https://granddebat.fr",
|
|
143
|
+
)
|
|
144
|
+
def organisation(organisation_etat_services_publics_2019_03_21: Path):
|
|
145
|
+
"""Grand Débat National (organisation de l'État et des services publics)
|
|
146
|
+
|
|
147
|
+
The *Grand Débat National* (GDN) is a country-wide citizen consultation held
|
|
148
|
+
in France in 2019.
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
The consultation prompted citizens to express their views across four main
|
|
152
|
+
themes: *Taxation and public spending*, *Organization of the state and
|
|
153
|
+
public services*, *Democracy and citizenship*, and *Ecological transition*.
|
|
154
|
+
A significant portion of this consultation involved online questionnaires,
|
|
155
|
+
each concluding with a critical open-ended prompt: "Do you have anything to
|
|
156
|
+
add about [theme]?".
|
|
157
|
+
"""
|
|
158
|
+
return GrandDebatFile.C(path=organisation_etat_services_publics_2019_03_21)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@filedownloader(
|
|
162
|
+
"les_evenements_2019_03_21.jsonl",
|
|
163
|
+
"http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LES_EVENEMENTS.json",
|
|
164
|
+
checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
|
|
165
|
+
transforms=JsonToJsonl(),
|
|
166
|
+
)
|
|
167
|
+
@datatags("politics", "debate", "french")
|
|
168
|
+
@dataset(
|
|
169
|
+
GrandDebatFile,
|
|
170
|
+
url="https://granddebat.fr",
|
|
171
|
+
)
|
|
172
|
+
def evenements(les_evenements_2019_03_21: Path):
|
|
173
|
+
"""Grand Débat National (événements)
|
|
174
|
+
|
|
175
|
+
The *Grand Débat National* (GDN) is a country-wide citizen consultation held
|
|
176
|
+
in France in 2019.
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
The consultation prompted citizens to express their views across four main
|
|
180
|
+
themes: *Taxation and public spending*, *Organization of the state and
|
|
181
|
+
public services*, *Democracy and citizenship*, and *Ecological transition*.
|
|
182
|
+
A significant portion of this consultation involved online questionnaires,
|
|
183
|
+
each concluding with a critical open-ended prompt: "Do you have anything to
|
|
184
|
+
add about [theme]?".
|
|
185
|
+
"""
|
|
186
|
+
return GrandDebatFile.C(path=les_evenements_2019_03_21)
|
|
@@ -32,4 +32,4 @@ def v1(train, validation):
|
|
|
32
32
|
Only the train and validation dataset are available. The test set is hidden
|
|
33
33
|
for the leaderboard.
|
|
34
34
|
"""
|
|
35
|
-
return {"train": File(path=train), "validation": File(path=validation)}
|
|
35
|
+
return {"train": File.C(path=train), "validation": File.C(path=validation)}
|
|
@@ -30,9 +30,9 @@ def WikiText(data, type):
|
|
|
30
30
|
https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
|
|
31
31
|
"""
|
|
32
32
|
return {
|
|
33
|
-
"train": File(path=data / ("wiki.train.%s" % type)),
|
|
34
|
-
"validation": File(path=data / ("wiki.valid.%s" % type)),
|
|
35
|
-
"test": File(path=data / ("wiki.test.%s" % type)),
|
|
33
|
+
"train": File.C(path=data / ("wiki.train.%s" % type)),
|
|
34
|
+
"validation": File.C(path=data / ("wiki.valid.%s" % type)),
|
|
35
|
+
"test": File.C(path=data / ("wiki.test.%s" % type)),
|
|
36
36
|
}
|
|
37
37
|
|
|
38
38
|
|
|
@@ -31,10 +31,10 @@ def small(ds) -> Movielens:
|
|
|
31
31
|
100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users (as of 9/2018)
|
|
32
32
|
"""
|
|
33
33
|
return {
|
|
34
|
-
"ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
|
|
35
|
-
"links": csv.Generic(path=ds / "links.csv", names_row=0),
|
|
36
|
-
"movies": csv.Generic(path=ds / "movies.csv", names_row=0),
|
|
37
|
-
"tags": csv.Generic(path=ds / "tags.csv", names_row=0),
|
|
34
|
+
"ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
|
|
35
|
+
"links": csv.Generic.C(path=ds / "links.csv", names_row=0),
|
|
36
|
+
"movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
|
|
37
|
+
"tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
|
|
@@ -46,8 +46,8 @@ def full(ds) -> Movielens:
|
|
|
46
46
|
27,000,000 ratings and 1,100,000 tag applications applied to 58,000 movies by 280,000 users (as of 9/2018)
|
|
47
47
|
"""
|
|
48
48
|
return {
|
|
49
|
-
"ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
|
|
50
|
-
"links": csv.Generic(path=ds / "links.csv", names_row=0),
|
|
51
|
-
"movies": csv.Generic(path=ds / "movies.csv", names_row=0),
|
|
52
|
-
"tags": csv.Generic(path=ds / "tags.csv", names_row=0),
|
|
49
|
+
"ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
|
|
50
|
+
"links": csv.Generic.C(path=ds / "links.csv", names_row=0),
|
|
51
|
+
"movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
|
|
52
|
+
"tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
|
|
53
53
|
}
|
|
@@ -34,9 +34,9 @@ def gsd(ds) -> Supervised:
|
|
|
34
34
|
is updated since 2015 independently from the previous source.
|
|
35
35
|
"""
|
|
36
36
|
return {
|
|
37
|
-
"train": CoNLL_U(path=ds / "fr_gsd-ud-train.conllu"),
|
|
38
|
-
"test": CoNLL_U(path=ds / "fr_gsd-ud-dev.conllu"),
|
|
39
|
-
"validation": CoNLL_U(path=ds / "fr_gsd-ud-test.conllu"),
|
|
37
|
+
"train": CoNLL_U.C(path=ds / "fr_gsd-ud-train.conllu"),
|
|
38
|
+
"test": CoNLL_U.C(path=ds / "fr_gsd-ud-dev.conllu"),
|
|
39
|
+
"validation": CoNLL_U.C(path=ds / "fr_gsd-ud-test.conllu"),
|
|
40
40
|
}
|
|
41
41
|
|
|
42
42
|
|
|
@@ -267,7 +267,7 @@ class ConversationUserTopics(Topics):
|
|
|
267
267
|
"""Returns an iterator over topics"""
|
|
268
268
|
# Extracts topics from conversations, Each user query is a topic (can perform retrieval on it)
|
|
269
269
|
# TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
|
|
270
|
-
|
|
270
|
+
|
|
271
271
|
records: List[TopicRecord] = []
|
|
272
272
|
for conversation in self.conversations.__iter__():
|
|
273
273
|
nodes = [
|
|
@@ -279,4 +279,4 @@ class ConversationUserTopics(Topics):
|
|
|
279
279
|
records.append(
|
|
280
280
|
node.entry.update(ConversationHistoryItem(node.history()))
|
|
281
281
|
)
|
|
282
|
-
return iter(records)
|
|
282
|
+
return iter(records)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Data classes for the Grand Débat National dataset"""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Iterator, List, Optional
|
|
6
|
+
|
|
7
|
+
from datamaestro.data import File
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
|
|
11
|
+
class GrandDebatResponse:
|
|
12
|
+
"""A response to a question in the Grand Débat National"""
|
|
13
|
+
|
|
14
|
+
question_id: str
|
|
15
|
+
question_title: str
|
|
16
|
+
value: Optional[str]
|
|
17
|
+
formatted_value: Optional[str]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
|
|
21
|
+
class GrandDebatEntry:
|
|
22
|
+
"""An entry (contribution) in the Grand Débat National dataset"""
|
|
23
|
+
|
|
24
|
+
id: str
|
|
25
|
+
reference: str
|
|
26
|
+
title: str
|
|
27
|
+
created_at: str
|
|
28
|
+
published_at: str
|
|
29
|
+
updated_at: Optional[str]
|
|
30
|
+
trashed: bool
|
|
31
|
+
trashed_status: Optional[str]
|
|
32
|
+
author_id: str
|
|
33
|
+
author_type: str
|
|
34
|
+
author_zip_code: str
|
|
35
|
+
responses: List[GrandDebatResponse] = field(default_factory=list)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class GrandDebatFile(File):
|
|
39
|
+
"""A Grand Débat National JSONL file with iteration support"""
|
|
40
|
+
|
|
41
|
+
def __iter__(self) -> Iterator[GrandDebatEntry]:
|
|
42
|
+
"""Iterate over entries in the JSONL file"""
|
|
43
|
+
with self.path.open("r", encoding="utf-8") as f:
|
|
44
|
+
for line in f:
|
|
45
|
+
data = json.loads(line)
|
|
46
|
+
responses = [
|
|
47
|
+
GrandDebatResponse(
|
|
48
|
+
question_id=r["questionId"],
|
|
49
|
+
question_title=r["questionTitle"],
|
|
50
|
+
value=r.get("value"),
|
|
51
|
+
formatted_value=r.get("formattedValue"),
|
|
52
|
+
)
|
|
53
|
+
for r in data.get("responses", [])
|
|
54
|
+
]
|
|
55
|
+
yield GrandDebatEntry(
|
|
56
|
+
id=data["id"],
|
|
57
|
+
reference=data["reference"],
|
|
58
|
+
title=data["title"],
|
|
59
|
+
created_at=data["createdAt"],
|
|
60
|
+
published_at=data["publishedAt"],
|
|
61
|
+
updated_at=data.get("updatedAt"),
|
|
62
|
+
trashed=data["trashed"],
|
|
63
|
+
trashed_status=data.get("trashedStatus"),
|
|
64
|
+
author_id=data["authorId"],
|
|
65
|
+
author_type=data["authorType"],
|
|
66
|
+
author_zip_code=data["authorZipCode"],
|
|
67
|
+
responses=responses,
|
|
68
|
+
)
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Generic data types for information retrieval"""
|
|
2
2
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
+
from enum import Enum
|
|
4
5
|
from functools import cached_property
|
|
5
6
|
import logging
|
|
6
7
|
from pathlib import Path
|
|
@@ -88,6 +89,19 @@ class Documents(Base):
|
|
|
88
89
|
...
|
|
89
90
|
|
|
90
91
|
|
|
92
|
+
class FileAccess(Enum):
|
|
93
|
+
"""Defines how to access files (e.g. for document stores)"""
|
|
94
|
+
|
|
95
|
+
FILE = 0
|
|
96
|
+
"""Direct file access"""
|
|
97
|
+
|
|
98
|
+
MMAP = 1
|
|
99
|
+
"""Use mmap"""
|
|
100
|
+
|
|
101
|
+
MEMORY = 2
|
|
102
|
+
"""Use memory"""
|
|
103
|
+
|
|
104
|
+
|
|
91
105
|
class DocumentStore(Documents):
|
|
92
106
|
"""A document store
|
|
93
107
|
|
|
@@ -97,6 +111,10 @@ class DocumentStore(Documents):
|
|
|
97
111
|
- return the number of documents
|
|
98
112
|
"""
|
|
99
113
|
|
|
114
|
+
file_access: Meta[FileAccess] = FileAccess.MMAP
|
|
115
|
+
"""How to access the file collection (might not have any impact, depends on
|
|
116
|
+
the docstore)"""
|
|
117
|
+
|
|
100
118
|
def docid_internal2external(self, docid: int):
|
|
101
119
|
"""Converts an internal collection ID (integer) to an external ID"""
|
|
102
120
|
raise NotImplementedError(f"For class {self.__class__}")
|
|
@@ -327,5 +345,4 @@ class PairwiseSampleDataset(Base, ABC):
|
|
|
327
345
|
"""Datasets where each record is a query with positive and negative samples"""
|
|
328
346
|
|
|
329
347
|
@abstractmethod
|
|
330
|
-
def iter(self) -> Iterator[PairwiseSample]:
|
|
331
|
-
...
|
|
348
|
+
def iter(self) -> Iterator[PairwiseSample]: ...
|
datamaestro_text/data/ir/csv.py
CHANGED
|
@@ -1,27 +1,26 @@
|
|
|
1
1
|
from functools import cached_property
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import Iterator, Tuple, Type
|
|
4
3
|
|
|
5
|
-
from experimaestro import Param,
|
|
6
|
-
from datamaestro.definitions import argument
|
|
4
|
+
from experimaestro import Param, Meta
|
|
7
5
|
from datamaestro.record import Record, RecordType
|
|
8
6
|
import datamaestro_text.data.ir as ir
|
|
9
7
|
from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
|
|
10
8
|
from datamaestro_text.interfaces.plaintext import read_tsv
|
|
11
9
|
|
|
12
10
|
|
|
13
|
-
@argument("path", type=Path)
|
|
14
|
-
@argument("separator", type=str, default="\t", ignored=True)
|
|
15
11
|
class AdhocRunWithText(ir.AdhocRun):
|
|
16
12
|
"(qid, doc.id, query, passage)"
|
|
17
|
-
|
|
13
|
+
|
|
14
|
+
path: Meta[Path]
|
|
15
|
+
separator: Meta[str] = "\t"
|
|
18
16
|
|
|
19
17
|
|
|
20
|
-
@argument("path", type=Path)
|
|
21
|
-
@argument("separator", type=str, default="\t", ignored=True)
|
|
22
18
|
class Topics(ir.Topics):
|
|
23
19
|
"Pairs of query id - query using a separator"
|
|
24
20
|
|
|
21
|
+
path: Meta[Path]
|
|
22
|
+
separator: Meta[str] = "\t"
|
|
23
|
+
|
|
25
24
|
def iter(self):
|
|
26
25
|
return (
|
|
27
26
|
Record(IDItem(qid), SimpleTextItem(title))
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from abc import ABC, abstractmethod
|
|
3
3
|
from dataclasses import dataclass
|
|
4
|
-
from functools import partial
|
|
4
|
+
from functools import cached_property, partial
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import Dict, Iterator, List, NamedTuple, Tuple, Type
|
|
7
7
|
|
|
@@ -9,7 +9,6 @@ import ir_datasets
|
|
|
9
9
|
import ir_datasets.datasets as _irds
|
|
10
10
|
from datamaestro.record import RecordType, record_type
|
|
11
11
|
from experimaestro import Config, Meta, Option, Param
|
|
12
|
-
from experimaestro.compat import cached_property
|
|
13
12
|
from ir_datasets.formats import (
|
|
14
13
|
GenericDoc,
|
|
15
14
|
GenericDocPair,
|
|
@@ -112,6 +111,9 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
112
111
|
_irds.beir.BeirCordDoc: tuple_constructor(
|
|
113
112
|
formats.CordDocument, "doc_id", "text", "title", "url", "pubmed_id"
|
|
114
113
|
),
|
|
114
|
+
_irds.miracl.MiraclDoc: tuple_constructor(
|
|
115
|
+
formats.DocumentWithTitle, "doc_id", "title", "text"
|
|
116
|
+
),
|
|
115
117
|
_irds.beir.BeirTitleDoc: tuple_constructor(
|
|
116
118
|
formats.TitleDocument, "doc_id", "text", "title"
|
|
117
119
|
),
|
|
@@ -202,11 +204,11 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
202
204
|
|
|
203
205
|
def iter(self) -> Iterator[ir.DocumentRecord]:
|
|
204
206
|
"""Returns an iterator over adhoc documents"""
|
|
205
|
-
for doc in self.
|
|
207
|
+
for doc in self._docs:
|
|
206
208
|
yield self.converter(self.document_recordtype, doc)
|
|
207
209
|
|
|
208
210
|
def iter_documents_from(self, start=0):
|
|
209
|
-
for doc in self.
|
|
211
|
+
for doc in self._docs[start:]:
|
|
210
212
|
yield self.converter(self.document_recordtype, doc)
|
|
211
213
|
|
|
212
214
|
@property
|
|
@@ -215,11 +217,26 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
215
217
|
|
|
216
218
|
@cached_property
|
|
217
219
|
def store(self):
|
|
218
|
-
|
|
220
|
+
kwargs = {}
|
|
221
|
+
try:
|
|
222
|
+
# Translate to ir datasets docstore options
|
|
223
|
+
import ir_datasets.indices as ir_indices
|
|
224
|
+
|
|
225
|
+
file_access = {
|
|
226
|
+
ir.FileAccess.MMAP: ir_indices.FileAccess.MMAP,
|
|
227
|
+
ir.FileAccess.FILE: ir_indices.FileAccess.FILE,
|
|
228
|
+
ir.FileAccess.MEMORY: ir_indices.FileAccess.MEMORY,
|
|
229
|
+
}[self.file_access]
|
|
230
|
+
kwargs = {"options": ir_indices.DocstoreOptions(file_access=file_access)}
|
|
231
|
+
except ImportError:
|
|
232
|
+
logging.warning(
|
|
233
|
+
"This version of ir-datasets cannot handle docstore options"
|
|
234
|
+
)
|
|
235
|
+
return self.dataset.docs_store(**kwargs)
|
|
219
236
|
|
|
220
|
-
@
|
|
237
|
+
@property
|
|
221
238
|
def _docs(self):
|
|
222
|
-
return self.
|
|
239
|
+
return iter(self.store)
|
|
223
240
|
|
|
224
241
|
def docid_internal2external(self, ix: int):
|
|
225
242
|
return self._docs[ix].doc_id
|
|
@@ -249,12 +266,6 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
249
266
|
return converter
|
|
250
267
|
|
|
251
268
|
|
|
252
|
-
if hasattr(_irds, "miracl"):
|
|
253
|
-
Documents.CONVERTERS[_irds.miracl.MiraclDoc] = tuple_constructor(
|
|
254
|
-
formats.DocumentWithTitle, "doc_id", "text", "title"
|
|
255
|
-
)
|
|
256
|
-
|
|
257
|
-
|
|
258
269
|
class LZ4DocumentStore(ir.DocumentStore, ABC):
|
|
259
270
|
"""A LZ4-based document store"""
|
|
260
271
|
|
datamaestro_text/version.py
CHANGED
|
@@ -1,7 +1,14 @@
|
|
|
1
1
|
# file generated by setuptools-scm
|
|
2
2
|
# don't change, don't track in version control
|
|
3
3
|
|
|
4
|
-
__all__ = [
|
|
4
|
+
__all__ = [
|
|
5
|
+
"__version__",
|
|
6
|
+
"__version_tuple__",
|
|
7
|
+
"version",
|
|
8
|
+
"version_tuple",
|
|
9
|
+
"__commit_id__",
|
|
10
|
+
"commit_id",
|
|
11
|
+
]
|
|
5
12
|
|
|
6
13
|
TYPE_CHECKING = False
|
|
7
14
|
if TYPE_CHECKING:
|
|
@@ -9,13 +16,19 @@ if TYPE_CHECKING:
|
|
|
9
16
|
from typing import Union
|
|
10
17
|
|
|
11
18
|
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
+
COMMIT_ID = Union[str, None]
|
|
12
20
|
else:
|
|
13
21
|
VERSION_TUPLE = object
|
|
22
|
+
COMMIT_ID = object
|
|
14
23
|
|
|
15
24
|
version: str
|
|
16
25
|
__version__: str
|
|
17
26
|
__version_tuple__: VERSION_TUPLE
|
|
18
27
|
version_tuple: VERSION_TUPLE
|
|
28
|
+
commit_id: COMMIT_ID
|
|
29
|
+
__commit_id__: COMMIT_ID
|
|
19
30
|
|
|
20
|
-
__version__ = version = '
|
|
21
|
-
__version_tuple__ = version_tuple = (
|
|
31
|
+
__version__ = version = '2026.1.1'
|
|
32
|
+
__version_tuple__ = version_tuple = (2026, 1, 1)
|
|
33
|
+
|
|
34
|
+
__commit_id__ = commit_id = None
|
|
@@ -1,33 +1,37 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2026.1.1
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
|
+
Project-URL: Homepage, https://github.com/experimaestro/datamaestro_text
|
|
6
|
+
Project-URL: Documentation, https://datamaestro-text.readthedocs.io/en/latest/
|
|
7
|
+
Project-URL: Repository, https://github.com/experimaestro/datamaestro_text
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/experimaestro/datamaestro_text/issues
|
|
5
9
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
|
-
License: GPL-3
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
Project-URL: repository, https://github.com/experimaestro/datamaestro_text
|
|
10
|
-
Keywords: dataset manager,information retrieval,experiments
|
|
10
|
+
License: GPL-3.0-or-later
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: dataset manager,experiments,information retrieval
|
|
11
13
|
Classifier: Development Status :: 4 - Beta
|
|
12
14
|
Classifier: Intended Audience :: Science/Research
|
|
13
15
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
14
16
|
Classifier: Operating System :: OS Independent
|
|
15
17
|
Classifier: Programming Language :: Python
|
|
16
18
|
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
22
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
23
|
Requires-Python: >=3.10
|
|
19
|
-
Description-Content-Type: text/markdown
|
|
20
|
-
License-File: LICENSE
|
|
21
|
-
Requires-Dist: datamaestro>=1.5.0
|
|
22
|
-
Requires-Dist: ir_datasets>=0.5.8
|
|
23
24
|
Requires-Dist: attrs
|
|
25
|
+
Requires-Dist: datamaestro>=1.6.2
|
|
26
|
+
Requires-Dist: experimaestro
|
|
27
|
+
Requires-Dist: ir-datasets>=0.5.8
|
|
24
28
|
Provides-Extra: dev
|
|
25
|
-
Requires-Dist:
|
|
26
|
-
Requires-Dist:
|
|
27
|
-
Requires-Dist:
|
|
28
|
-
Requires-Dist:
|
|
29
|
-
Requires-Dist:
|
|
30
|
-
|
|
29
|
+
Requires-Dist: docutils; extra == 'dev'
|
|
30
|
+
Requires-Dist: flake8; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
32
|
+
Requires-Dist: sphinx<8; extra == 'dev'
|
|
33
|
+
Requires-Dist: sphobjinv; extra == 'dev'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
31
35
|
|
|
32
36
|
[pre-commit](https://github.com/pre-commit/pre-commit) [PyPI version](https://badge.fury.io/py/datamaestro-text)
|
|
33
37
|
|
|
@@ -1,26 +1,27 @@
|
|
|
1
1
|
datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
|
|
2
|
-
datamaestro_text/version.py,sha256=
|
|
2
|
+
datamaestro_text/version.py,sha256=KRGjQBj37k6x1t02kZiDs0px7bfHmVSimH49hjV1IAU,710
|
|
3
3
|
datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
|
|
5
|
-
datamaestro_text/config/com/oscar-corpus.py,sha256=
|
|
6
|
-
datamaestro_text/config/com/sentiment140.py,sha256=
|
|
5
|
+
datamaestro_text/config/com/oscar-corpus.py,sha256=6F2RYOyE9_5uq_t8VrTggWxcFzefFPrmcxQXvXhfia8,723
|
|
6
|
+
datamaestro_text/config/com/sentiment140.py,sha256=itfBEgcOniECXKOw8I2dhzyS9LOMsltMLfKK6NGRpVY,1293
|
|
7
7
|
datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
|
|
8
8
|
datamaestro_text/config/com/github/ikat.py,sha256=DCayX-t2OBeW5bOJvRxoQgIH3vy-__mYzdmVcnayAkk,4230
|
|
9
|
-
datamaestro_text/config/com/github/aagohary/canard.py,sha256=
|
|
10
|
-
datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=
|
|
9
|
+
datamaestro_text/config/com/github/aagohary/canard.py,sha256=5fLwCLNBGM_7--naTCDayAMYLvK3yTD8auaEf-dqrb4,1768
|
|
10
|
+
datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=zP3w7A9KSvJVCo44OaB1az1pDKWxE6qXS4qFm3hqg3Y,3064
|
|
11
11
|
datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=MFJYaxJoqJQ6kMDHa2PIE0zoxYTA8Kyl26-vzFoMML0,3032
|
|
12
12
|
datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
|
|
13
13
|
datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
|
|
14
|
-
datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=
|
|
15
|
-
datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=
|
|
14
|
+
datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=hN2KOdi6ToHlodozqsYAOtxaqiUGkGGtRtb3RFSgnEU,11645
|
|
15
|
+
datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=LgUcnR-z99kTrZj6QaCLuLrj1bG-wHMM5GlVNmbrY2k,851
|
|
16
16
|
datamaestro_text/config/edu/cornell/nlvr.yaml,sha256=9Yk5VZMncSmrP7JNuGXqExksgX5nQ_Zfnlps8hWze3Q,921
|
|
17
17
|
datamaestro_text/config/edu/stanford/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
-
datamaestro_text/config/edu/stanford/aclimdb.py,sha256=
|
|
18
|
+
datamaestro_text/config/edu/stanford/aclimdb.py,sha256=QtriReAVsbJlxkgfJWQCZdCeJ9LswYnOR9mFrgghL9c,647
|
|
19
19
|
datamaestro_text/config/edu/stanford/glove.py,sha256=bXxwiJqT8alPs5MwwxVuY0xBO8g1QWhm2L3AKVskTlA,2391
|
|
20
20
|
datamaestro_text/config/edu/stanford/im2p.yaml,sha256=JoToNyEPpmwdyLFedCBot5ypyw7p9rzi12mGXJuZin0,2909
|
|
21
21
|
datamaestro_text/config/edu/upenn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
22
|
datamaestro_text/config/edu/upenn/ldc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
23
|
datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=YEU3kIXzv71Vjg9bUoXyQ-vSLvC-y4LlE3mJQf38XuY,1589
|
|
24
|
+
datamaestro_text/config/fr/granddebat.py,sha256=JRLC3q6o-XhJECjAh40w2p40pCSRw9K3-YMDUpdNwMM,7016
|
|
24
25
|
datamaestro_text/config/gov/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
26
|
datamaestro_text/config/gov/nist/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
27
|
datamaestro_text/config/gov/nist/ir/covid.py,sha256=wn2E7sQ8M6pAucVD4sKJYImyzKUKphyiDFJD0oYRCbg,4004
|
|
@@ -32,14 +33,14 @@ datamaestro_text/config/gov/nist/trec/index.yaml,sha256=oSUhUmtukf5oXqUbJLhae8xZ
|
|
|
32
33
|
datamaestro_text/config/gov/nist/trec/tipster.py,sha256=DirpnHpS10e27LcL7v9ksKreKVy7EgfVhyztV49VRds,5364
|
|
33
34
|
datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJolWCts_VETACha3hfQ,563
|
|
34
35
|
datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
|
|
35
|
-
datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=
|
|
36
|
+
datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=rg_qAnMrXYUZhQYxA12r_Npl0ggyfTLJQjdSCjU0QxM,1228
|
|
36
37
|
datamaestro_text/config/io/metamind/research/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
|
-
datamaestro_text/config/io/metamind/research/wikitext.py,sha256=
|
|
38
|
+
datamaestro_text/config/io/metamind/research/wikitext.py,sha256=DjyBmG74JvuMt9RpMwuLAnxzOdByIWsk4VnXgkJp1NM,2307
|
|
38
39
|
datamaestro_text/config/net/mattmahoney/enwiki.yaml,sha256=HCUn3s0AydXX3BjJ6yUXY0vGLGWSBkOCaDhQ4PA2Adg,2452
|
|
39
40
|
datamaestro_text/config/org/acm/recsys/cb2014.yaml,sha256=5SAK3Am1k0HFugSSCIQN5mLPBfr1zZZAkhLrSH5pHQc,1274
|
|
40
41
|
datamaestro_text/config/org/cocodataset/index.yaml,sha256=KISJChMeKwlZbSnHmRcGMsm6jbcFGVe1aA4GhP2fzqw,474
|
|
41
|
-
datamaestro_text/config/org/grouplens/movielens.py,sha256=
|
|
42
|
-
datamaestro_text/config/org/universaldependencies/french.py,sha256=
|
|
42
|
+
datamaestro_text/config/org/grouplens/movielens.py,sha256=tV6OSTDdtjll1dQBCsYIlsBbtOO-MCiLles2aj0MgDA,1840
|
|
43
|
+
datamaestro_text/config/org/universaldependencies/french.py,sha256=etedb3_SC-fV5Oa2rM4_smZk6t4CPiNvU4C4keUFZHY,2214
|
|
43
44
|
datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml,sha256=IBy82CDNNLjJPNPzues1EgDXu0A5WDvUFeVNSOyrIpI,1137
|
|
44
45
|
datamaestro_text/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
46
|
datamaestro_text/data/embeddings.py,sha256=qdeZ4m6drFNkqW_4pKxJKdJHtOnDRs0P7QVJ7AO6xFE,1127
|
|
@@ -47,24 +48,26 @@ datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG
|
|
|
47
48
|
datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
|
|
48
49
|
datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
|
|
49
50
|
datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
|
|
50
|
-
datamaestro_text/data/conversation/base.py,sha256=
|
|
51
|
+
datamaestro_text/data/conversation/base.py,sha256=rLOP2dqvMr2E9ONKgPF6CKUCAEHcXt5WqabJyo76AEI,7508
|
|
51
52
|
datamaestro_text/data/conversation/canard.py,sha256=FI02rPNuzSsA_uXOcgcuQn31d3QahmHndzYCV1Zm8rk,3305
|
|
52
53
|
datamaestro_text/data/conversation/ikat.py,sha256=djxTv0vHLXIUPzfmwpB25fyUWJNVPqthTtDbQ6CzmQo,4363
|
|
53
54
|
datamaestro_text/data/conversation/orconvqa.py,sha256=RL9cpK7QK2dsk_j2e4Th8wzGFq-G3ww_EjdM1eKh-8o,3764
|
|
54
55
|
datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
|
|
55
|
-
datamaestro_text/data/
|
|
56
|
+
datamaestro_text/data/debate/__init__.py,sha256=PzCV3Bd9fmonE-OQp4VtK1NglH42-iv34WAWUIU-eYk,187
|
|
57
|
+
datamaestro_text/data/debate/granddebat.py,sha256=4-HMfgvF2bPru56D3hkA1E2bN3dgIUmcvX9eOIXroLA,2176
|
|
58
|
+
datamaestro_text/data/ir/__init__.py,sha256=ZRJrUeeUyD1ncMN5JINVvFJ2lDr3KsbgiiEBJkczSi0,9814
|
|
56
59
|
datamaestro_text/data/ir/base.py,sha256=uwIiKn0ryK5DWUQsEegeTs6bye0uAOGl0XVm_cvV3ZU,1506
|
|
57
60
|
datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
|
|
58
|
-
datamaestro_text/data/ir/csv.py,sha256=
|
|
61
|
+
datamaestro_text/data/ir/csv.py,sha256=0jnaV-wKLgslH7izR-xP_RX7l90vykQTn3bPhaCFR-c,1027
|
|
59
62
|
datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
|
|
60
|
-
datamaestro_text/data/ir/formats.py,sha256=
|
|
63
|
+
datamaestro_text/data/ir/formats.py,sha256=eyP7PJ6A4Pd1uv3nbeU1N_Q3Bee7XSYTaYsiHP1MFns,3639
|
|
61
64
|
datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
|
|
62
65
|
datamaestro_text/data/ir/stores.py,sha256=A4Ew0L4P6iLLmDKhxqjjVkHcz797BHf4d76YguGkB1A,4412
|
|
63
66
|
datamaestro_text/data/ir/trec.py,sha256=IOtQRMUz8zx-dYEMR2NIIM6qXEUjsV0eVOhGvKIRJK4,1974
|
|
64
67
|
datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
|
|
65
68
|
datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
|
|
66
|
-
datamaestro_text/datasets/irds/data.py,sha256=
|
|
67
|
-
datamaestro_text/datasets/irds/datasets.py,sha256=
|
|
69
|
+
datamaestro_text/datasets/irds/data.py,sha256=5ZtJTEV9qtbl_Do4VR6EvYoxPTlsRkkjoBunXDLfmHI,23012
|
|
70
|
+
datamaestro_text/datasets/irds/datasets.py,sha256=CJ8MA44XCwIQGZTzYIJnR-qFm890rUZZB7C3lKIwNyY,5627
|
|
68
71
|
datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
|
|
69
72
|
datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
|
|
70
73
|
datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
|
|
@@ -74,15 +77,14 @@ datamaestro_text/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
|
|
|
74
77
|
datamaestro_text/test/test_datasets.py,sha256=hD1pe-CjEnOj0JNqDLasz_97ltFzMbYfyYBy1QyYdf8,202
|
|
75
78
|
datamaestro_text/test/test_documented.py,sha256=Kl90fz_r_dsV0oXE1Mad34GiQyZ9dc6bfZKNtxP0O2s,453
|
|
76
79
|
datamaestro_text/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
77
|
-
datamaestro_text/transforms/ir/__init__.py,sha256=
|
|
80
|
+
datamaestro_text/transforms/ir/__init__.py,sha256=Pb8C-jwjtCur6gU-Lv4AosSFFKw2o6jMQcJE1A_5PD8,6555
|
|
78
81
|
datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_p5ha0,3340
|
|
79
82
|
datamaestro_text/utils/files.py,sha256=jhcirufgTztEkx1hs2-qgScEcHnIcGF_BjCeuCSsNv0,2838
|
|
80
83
|
datamaestro_text/utils/iter.py,sha256=QBajeSPLHvkeh6BCTZDSqWlOYNjwUDvgTTZ_YxJntXw,2701
|
|
81
84
|
datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
|
|
82
85
|
datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
|
|
83
|
-
datamaestro_text-
|
|
84
|
-
datamaestro_text-
|
|
85
|
-
datamaestro_text-
|
|
86
|
-
datamaestro_text-
|
|
87
|
-
datamaestro_text-
|
|
88
|
-
datamaestro_text-2025.7.28.dist-info/RECORD,,
|
|
86
|
+
datamaestro_text-2026.1.1.dist-info/METADATA,sha256=PnX4hQbQmrRUUBily4GEDJ53HsE-u1QqhK7gcv5nWeU,2103
|
|
87
|
+
datamaestro_text-2026.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
88
|
+
datamaestro_text-2026.1.1.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
|
|
89
|
+
datamaestro_text-2026.1.1.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
90
|
+
datamaestro_text-2026.1.1.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
datamaestro_text
|
{datamaestro_text-2025.7.28.dist-info → datamaestro_text-2026.1.1.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{datamaestro_text-2025.7.28.dist-info → datamaestro_text-2026.1.1.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|