datamaestro-text 2025.9.11__py3-none-any.whl → 2026.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/__init__.py +1 -1
- datamaestro_text/config/com/github/ikat.py +0 -1
- datamaestro_text/config/com/oscar-corpus.py +1 -1
- datamaestro_text/config/com/smashwords/bookcorpus.py +1 -1
- datamaestro_text/config/edu/stanford/aclimdb.py +1 -1
- datamaestro_text/config/edu/stanford/glove.py +0 -1
- datamaestro_text/config/fr/granddebat.py +186 -0
- datamaestro_text/config/gov/nist/ir/covid.py +1 -2
- datamaestro_text/config/io/metamind/research/wikitext.py +1 -1
- datamaestro_text/data/conversation/__init__.py +6 -6
- datamaestro_text/data/conversation/base.py +4 -4
- datamaestro_text/data/conversation/canard.py +3 -4
- datamaestro_text/data/conversation/ikat.py +0 -1
- datamaestro_text/data/conversation/orconvqa.py +3 -3
- datamaestro_text/data/debate/__init__.py +5 -0
- datamaestro_text/data/debate/granddebat.py +68 -0
- datamaestro_text/data/embeddings.py +1 -0
- datamaestro_text/data/ir/__init__.py +1 -1
- datamaestro_text/data/ir/base.py +1 -1
- datamaestro_text/data/ir/csv.py +7 -8
- datamaestro_text/data/ir/data.py +1 -1
- datamaestro_text/data/ir/formats.py +2 -3
- datamaestro_text/data/ir/stores.py +1 -1
- datamaestro_text/data/text.py +1 -0
- datamaestro_text/datasets/__init__.py +1 -0
- datamaestro_text/datasets/irds/data.py +14 -20
- datamaestro_text/datasets/irds/datasets.py +1 -1
- datamaestro_text/download/tmdb.py +0 -1
- datamaestro_text/transforms/ir/__init__.py +13 -14
- datamaestro_text/utils/shuffle.py +1 -1
- datamaestro_text/version.py +3 -3
- {datamaestro_text-2025.9.11.dist-info → datamaestro_text-2026.2.2.dist-info}/METADATA +15 -17
- {datamaestro_text-2025.9.11.dist-info → datamaestro_text-2026.2.2.dist-info}/RECORD +36 -33
- {datamaestro_text-2025.9.11.dist-info → datamaestro_text-2026.2.2.dist-info}/WHEEL +1 -2
- datamaestro_text-2025.9.11.dist-info/top_level.txt +0 -1
- {datamaestro_text-2025.9.11.dist-info → datamaestro_text-2026.2.2.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2025.9.11.dist-info → datamaestro_text-2026.2.2.dist-info}/licenses/LICENSE +0 -0
datamaestro_text/__init__.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# See documentation on https://datamaestro.readthedocs.io
|
|
2
2
|
|
|
3
|
-
from datamaestro.definitions import
|
|
3
|
+
from datamaestro.definitions import datatasks, datatags, dataset
|
|
4
4
|
from datamaestro_text.data.text import TextFolder
|
|
5
5
|
from datamaestro.download.archive import tardownloader
|
|
6
6
|
from datamaestro.utils import HashCheck
|
|
@@ -5,7 +5,6 @@ GloVe is an unsupervised learning algorithm for obtaining vector representations
|
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from datamaestro.definitions import dataset
|
|
8
|
-
from datamaestro.data import Base, Generic
|
|
9
8
|
from datamaestro.download import reference
|
|
10
9
|
from datamaestro.download.archive import zipdownloader
|
|
11
10
|
from datamaestro.download.single import filedownloader
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from datamaestro.definitions import datatags, dataset
|
|
5
|
+
from datamaestro_text.data.debate import GrandDebatFile
|
|
6
|
+
from datamaestro.download.single import filedownloader
|
|
7
|
+
from datamaestro.utils import HashCheck
|
|
8
|
+
from datamaestro.stream import Transform
|
|
9
|
+
import io
|
|
10
|
+
import json
|
|
11
|
+
import ijson
|
|
12
|
+
import os
|
|
13
|
+
import threading
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class JsonToJsonl(Transform):
|
|
17
|
+
"""Transforms a JSON file with an array into a JSONL file with one line per
|
|
18
|
+
array element"""
|
|
19
|
+
|
|
20
|
+
def __call__(self, fileobj: io.IOBase) -> io.IOBase:
|
|
21
|
+
# Stream items from the top-level array into a read-end pipe.
|
|
22
|
+
try:
|
|
23
|
+
fileobj.seek(0)
|
|
24
|
+
except Exception:
|
|
25
|
+
pass
|
|
26
|
+
|
|
27
|
+
r_fd, w_fd = os.pipe()
|
|
28
|
+
r_file = os.fdopen(r_fd, "rb")
|
|
29
|
+
w_file = os.fdopen(w_fd, "wb")
|
|
30
|
+
|
|
31
|
+
def _writer(fin, fout):
|
|
32
|
+
try:
|
|
33
|
+
for item in ijson.items(fin, "item"):
|
|
34
|
+
line = json.dumps(item, ensure_ascii=False) + "\n"
|
|
35
|
+
fout.write(line.encode("utf-8"))
|
|
36
|
+
fout.close()
|
|
37
|
+
except Exception:
|
|
38
|
+
try:
|
|
39
|
+
fout.close()
|
|
40
|
+
except Exception:
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
t = threading.Thread(target=_writer, args=(fileobj, w_file), daemon=True)
|
|
44
|
+
t.start()
|
|
45
|
+
|
|
46
|
+
return r_file
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@filedownloader(
|
|
50
|
+
"la_transition_ecologique_2019_03_21.jsonl",
|
|
51
|
+
"http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_TRANSITION_ECOLOGIQUE.json",
|
|
52
|
+
checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
|
|
53
|
+
transforms=JsonToJsonl(),
|
|
54
|
+
)
|
|
55
|
+
@datatags("politics", "debate", "french")
|
|
56
|
+
@dataset(
|
|
57
|
+
GrandDebatFile,
|
|
58
|
+
url="https://granddebat.fr",
|
|
59
|
+
)
|
|
60
|
+
def transition(la_transition_ecologique_2019_03_21: Path):
|
|
61
|
+
"""Grand Débat National (transition écologique)
|
|
62
|
+
|
|
63
|
+
The *Grand Débat National* (GDN) is a country-wide citizen consultation held
|
|
64
|
+
in France in 2019.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
The consultation prompted citizens to express their views across four main
|
|
68
|
+
themes: *Taxation and public spending*, *Organization of the state and
|
|
69
|
+
public services*, *Democracy and citizenship*, and *Ecological transition*.
|
|
70
|
+
A significant portion of this consultation involved online questionnaires,
|
|
71
|
+
each concluding with a critical open-ended prompt: "Do you have anything to
|
|
72
|
+
add about [theme]?".
|
|
73
|
+
"""
|
|
74
|
+
return GrandDebatFile.C(path=la_transition_ecologique_2019_03_21)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@filedownloader(
|
|
78
|
+
"fiscalité_et_dépenses_publiques_2019_03_21.jsonl",
|
|
79
|
+
"http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json",
|
|
80
|
+
checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
|
|
81
|
+
transforms=JsonToJsonl(),
|
|
82
|
+
)
|
|
83
|
+
@datatags("politics", "debate", "french")
|
|
84
|
+
@dataset(
|
|
85
|
+
GrandDebatFile,
|
|
86
|
+
url="https://granddebat.fr",
|
|
87
|
+
)
|
|
88
|
+
def fiscalité(fiscalité_et_dépenses_publiques_2019_03_21: Path):
|
|
89
|
+
"""Grand Débat National (fiscalité et dépenses publiques)
|
|
90
|
+
|
|
91
|
+
The *Grand Débat National* (GDN) is a country-wide citizen consultation held
|
|
92
|
+
in France in 2019.
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
The consultation prompted citizens to express their views across four main
|
|
96
|
+
themes: *Taxation and public spending*, *Organization of the state and
|
|
97
|
+
public services*, *Democracy and citizenship*, and *Ecological transition*.
|
|
98
|
+
A significant portion of this consultation involved online questionnaires,
|
|
99
|
+
each concluding with a critical open-ended prompt: "Do you have anything to
|
|
100
|
+
add about [theme]?".
|
|
101
|
+
"""
|
|
102
|
+
return GrandDebatFile.C(path=fiscalité_et_dépenses_publiques_2019_03_21)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@filedownloader(
|
|
106
|
+
"democratie_et_citoyennete_2019_03_21.jsonl",
|
|
107
|
+
"http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/DEMOCRATIE_ET_CITOYENNETE.json",
|
|
108
|
+
checker=HashCheck("049aaeca7e51747f2da5d68143c760fa"),
|
|
109
|
+
transforms=JsonToJsonl(),
|
|
110
|
+
)
|
|
111
|
+
@datatags("politics", "debate", "french")
|
|
112
|
+
@dataset(
|
|
113
|
+
GrandDebatFile,
|
|
114
|
+
url="https://granddebat.fr",
|
|
115
|
+
)
|
|
116
|
+
def démocratie(democratie_et_citoyennete_2019_03_21: Path):
|
|
117
|
+
"""Grand Débat National (démocratie et citoyenneté)
|
|
118
|
+
|
|
119
|
+
The *Grand Débat National* (GDN) is a country-wide citizen consultation held
|
|
120
|
+
in France in 2019.
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
The consultation prompted citizens to express their views across four main
|
|
124
|
+
themes: *Taxation and public spending*, *Organization of the state and
|
|
125
|
+
public services*, *Democracy and citizenship*, and *Ecological transition*.
|
|
126
|
+
A significant portion of this consultation involved online questionnaires,
|
|
127
|
+
each concluding with a critical open-ended prompt: "Do you have anything to
|
|
128
|
+
add about [theme]?".
|
|
129
|
+
"""
|
|
130
|
+
return GrandDebatFile.C(path=democratie_et_citoyennete_2019_03_21)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@filedownloader(
|
|
134
|
+
"organisation_etat_services_publics_2019_03_21.jsonl",
|
|
135
|
+
"http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json",
|
|
136
|
+
checker=HashCheck("0ccb7c401889f738b73b0caab897a68b"),
|
|
137
|
+
transforms=JsonToJsonl(),
|
|
138
|
+
)
|
|
139
|
+
@datatags("politics", "debate", "french")
|
|
140
|
+
@dataset(
|
|
141
|
+
GrandDebatFile,
|
|
142
|
+
url="https://granddebat.fr",
|
|
143
|
+
)
|
|
144
|
+
def organisation(organisation_etat_services_publics_2019_03_21: Path):
|
|
145
|
+
"""Grand Débat National (organisation de l'État et des services publics)
|
|
146
|
+
|
|
147
|
+
The *Grand Débat National* (GDN) is a country-wide citizen consultation held
|
|
148
|
+
in France in 2019.
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
The consultation prompted citizens to express their views across four main
|
|
152
|
+
themes: *Taxation and public spending*, *Organization of the state and
|
|
153
|
+
public services*, *Democracy and citizenship*, and *Ecological transition*.
|
|
154
|
+
A significant portion of this consultation involved online questionnaires,
|
|
155
|
+
each concluding with a critical open-ended prompt: "Do you have anything to
|
|
156
|
+
add about [theme]?".
|
|
157
|
+
"""
|
|
158
|
+
return GrandDebatFile.C(path=organisation_etat_services_publics_2019_03_21)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@filedownloader(
|
|
162
|
+
"les_evenements_2019_03_21.jsonl",
|
|
163
|
+
"http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LES_EVENEMENTS.json",
|
|
164
|
+
checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
|
|
165
|
+
transforms=JsonToJsonl(),
|
|
166
|
+
)
|
|
167
|
+
@datatags("politics", "debate", "french")
|
|
168
|
+
@dataset(
|
|
169
|
+
GrandDebatFile,
|
|
170
|
+
url="https://granddebat.fr",
|
|
171
|
+
)
|
|
172
|
+
def evenements(les_evenements_2019_03_21: Path):
|
|
173
|
+
"""Grand Débat National (événements)
|
|
174
|
+
|
|
175
|
+
The *Grand Débat National* (GDN) is a country-wide citizen consultation held
|
|
176
|
+
in France in 2019.
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
The consultation prompted citizens to express their views across four main
|
|
180
|
+
themes: *Taxation and public spending*, *Organization of the state and
|
|
181
|
+
public services*, *Democracy and citizenship*, and *Ecological transition*.
|
|
182
|
+
A significant portion of this consultation involved online questionnaires,
|
|
183
|
+
each concluding with a critical open-ended prompt: "Do you have anything to
|
|
184
|
+
add about [theme]?".
|
|
185
|
+
"""
|
|
186
|
+
return GrandDebatFile.C(path=les_evenements_2019_03_21)
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from .base import (
|
|
2
|
-
AnswerEntry,
|
|
3
|
-
ConversationDataset,
|
|
4
|
-
ConversationHistory,
|
|
5
|
-
ConversationHistoryItem,
|
|
6
|
-
DecontextualizedItem,
|
|
7
|
-
EntryType,
|
|
2
|
+
AnswerEntry as AnswerEntry,
|
|
3
|
+
ConversationDataset as ConversationDataset,
|
|
4
|
+
ConversationHistory as ConversationHistory,
|
|
5
|
+
ConversationHistoryItem as ConversationHistoryItem,
|
|
6
|
+
DecontextualizedItem as DecontextualizedItem,
|
|
7
|
+
EntryType as EntryType,
|
|
8
8
|
)
|
|
@@ -2,13 +2,13 @@ from abc import ABC, abstractmethod
|
|
|
2
2
|
from enum import Enum
|
|
3
3
|
from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
|
|
4
4
|
from experimaestro import Param
|
|
5
|
-
from typing import Dict,
|
|
5
|
+
from typing import Dict, Iterator, List, Optional, Sequence, Tuple
|
|
6
6
|
from attr import define
|
|
7
7
|
from datamaestro.record import record_type
|
|
8
8
|
from datamaestro.data import Base
|
|
9
9
|
from datamaestro.record import Record, Item
|
|
10
10
|
from datamaestro_text.data.ir import TopicRecord, Topics
|
|
11
|
-
from datamaestro_text.utils.iter import FactoryIterable, LazyList
|
|
11
|
+
from datamaestro_text.utils.iter import FactoryIterable, LazyList
|
|
12
12
|
|
|
13
13
|
# ---- Basic types
|
|
14
14
|
|
|
@@ -267,7 +267,7 @@ class ConversationUserTopics(Topics):
|
|
|
267
267
|
"""Returns an iterator over topics"""
|
|
268
268
|
# Extracts topics from conversations, Each user query is a topic (can perform retrieval on it)
|
|
269
269
|
# TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
|
|
270
|
-
|
|
270
|
+
|
|
271
271
|
records: List[TopicRecord] = []
|
|
272
272
|
for conversation in self.conversations.__iter__():
|
|
273
273
|
nodes = [
|
|
@@ -279,4 +279,4 @@ class ConversationUserTopics(Topics):
|
|
|
279
279
|
records.append(
|
|
280
280
|
node.entry.update(ConversationHistoryItem(node.history()))
|
|
281
281
|
)
|
|
282
|
-
return iter(records)
|
|
282
|
+
return iter(records)
|
|
@@ -11,7 +11,6 @@ from datamaestro_text.data.conversation.base import (
|
|
|
11
11
|
EntryType,
|
|
12
12
|
)
|
|
13
13
|
from datamaestro_text.data.ir import IDItem, SimpleTextItem
|
|
14
|
-
import logging
|
|
15
14
|
|
|
16
15
|
|
|
17
16
|
@define(kw_only=True)
|
|
@@ -82,9 +81,9 @@ class CanardDataset(ConversationDataset, File):
|
|
|
82
81
|
)
|
|
83
82
|
else:
|
|
84
83
|
# The utterance before the last is the last user query
|
|
85
|
-
assert (
|
|
86
|
-
entry.history
|
|
87
|
-
)
|
|
84
|
+
assert entry.history[-2] == history[-1][SimpleTextItem].text, (
|
|
85
|
+
f"{entry.dialogue_id} {entry.history} / {history[-4:-1]}"
|
|
86
|
+
)
|
|
88
87
|
|
|
89
88
|
# The last utterance is the system side
|
|
90
89
|
history.append(
|
|
@@ -113,9 +113,9 @@ class OrConvQADataset(ConversationDataset, File):
|
|
|
113
113
|
if relevance > 0:
|
|
114
114
|
relevances[rank] = (entry.answer.answer_start, None)
|
|
115
115
|
|
|
116
|
-
assert (
|
|
117
|
-
len(relevances)
|
|
118
|
-
)
|
|
116
|
+
assert len(relevances) <= 1, (
|
|
117
|
+
f"Too many relevance labels ({len(relevances)}) for {entry.query_id}"
|
|
118
|
+
)
|
|
119
119
|
|
|
120
120
|
history.append(
|
|
121
121
|
Record(
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Data classes for the Grand Débat National dataset"""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Iterator, List, Optional
|
|
6
|
+
|
|
7
|
+
from datamaestro.data import File
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
|
|
11
|
+
class GrandDebatResponse:
|
|
12
|
+
"""A response to a question in the Grand Débat National"""
|
|
13
|
+
|
|
14
|
+
question_id: str
|
|
15
|
+
question_title: str
|
|
16
|
+
value: Optional[str]
|
|
17
|
+
formatted_value: Optional[str]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
|
|
21
|
+
class GrandDebatEntry:
|
|
22
|
+
"""An entry (contribution) in the Grand Débat National dataset"""
|
|
23
|
+
|
|
24
|
+
id: str
|
|
25
|
+
reference: str
|
|
26
|
+
title: str
|
|
27
|
+
created_at: str
|
|
28
|
+
published_at: str
|
|
29
|
+
updated_at: Optional[str]
|
|
30
|
+
trashed: bool
|
|
31
|
+
trashed_status: Optional[str]
|
|
32
|
+
author_id: str
|
|
33
|
+
author_type: str
|
|
34
|
+
author_zip_code: str
|
|
35
|
+
responses: List[GrandDebatResponse] = field(default_factory=list)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class GrandDebatFile(File):
|
|
39
|
+
"""A Grand Débat National JSONL file with iteration support"""
|
|
40
|
+
|
|
41
|
+
def __iter__(self) -> Iterator[GrandDebatEntry]:
|
|
42
|
+
"""Iterate over entries in the JSONL file"""
|
|
43
|
+
with self.path.open("r", encoding="utf-8") as f:
|
|
44
|
+
for line in f:
|
|
45
|
+
data = json.loads(line)
|
|
46
|
+
responses = [
|
|
47
|
+
GrandDebatResponse(
|
|
48
|
+
question_id=r["questionId"],
|
|
49
|
+
question_title=r["questionTitle"],
|
|
50
|
+
value=r.get("value"),
|
|
51
|
+
formatted_value=r.get("formattedValue"),
|
|
52
|
+
)
|
|
53
|
+
for r in data.get("responses", [])
|
|
54
|
+
]
|
|
55
|
+
yield GrandDebatEntry(
|
|
56
|
+
id=data["id"],
|
|
57
|
+
reference=data["reference"],
|
|
58
|
+
title=data["title"],
|
|
59
|
+
created_at=data["createdAt"],
|
|
60
|
+
published_at=data["publishedAt"],
|
|
61
|
+
updated_at=data.get("updatedAt"),
|
|
62
|
+
trashed=data["trashed"],
|
|
63
|
+
trashed_status=data.get("trashedStatus"),
|
|
64
|
+
author_id=data["authorId"],
|
|
65
|
+
author_type=data["authorType"],
|
|
66
|
+
author_zip_code=data["authorZipCode"],
|
|
67
|
+
responses=responses,
|
|
68
|
+
)
|
|
@@ -6,7 +6,7 @@ from functools import cached_property
|
|
|
6
6
|
import logging
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
from attrs import define
|
|
9
|
-
from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type
|
|
9
|
+
from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type
|
|
10
10
|
import random
|
|
11
11
|
from experimaestro import Config
|
|
12
12
|
from datamaestro.definitions import datatasks, Param, Meta
|
datamaestro_text/data/ir/base.py
CHANGED
datamaestro_text/data/ir/csv.py
CHANGED
|
@@ -1,27 +1,26 @@
|
|
|
1
1
|
from functools import cached_property
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import Iterator, Tuple, Type
|
|
4
3
|
|
|
5
|
-
from experimaestro import Param,
|
|
6
|
-
from datamaestro.definitions import argument
|
|
4
|
+
from experimaestro import Param, Meta
|
|
7
5
|
from datamaestro.record import Record, RecordType
|
|
8
6
|
import datamaestro_text.data.ir as ir
|
|
9
7
|
from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
|
|
10
8
|
from datamaestro_text.interfaces.plaintext import read_tsv
|
|
11
9
|
|
|
12
10
|
|
|
13
|
-
@argument("path", type=Path)
|
|
14
|
-
@argument("separator", type=str, default="\t", ignored=True)
|
|
15
11
|
class AdhocRunWithText(ir.AdhocRun):
|
|
16
12
|
"(qid, doc.id, query, passage)"
|
|
17
|
-
|
|
13
|
+
|
|
14
|
+
path: Meta[Path]
|
|
15
|
+
separator: Meta[str] = "\t"
|
|
18
16
|
|
|
19
17
|
|
|
20
|
-
@argument("path", type=Path)
|
|
21
|
-
@argument("separator", type=str, default="\t", ignored=True)
|
|
22
18
|
class Topics(ir.Topics):
|
|
23
19
|
"Pairs of query id - query using a separator"
|
|
24
20
|
|
|
21
|
+
path: Meta[Path]
|
|
22
|
+
separator: Meta[str] = "\t"
|
|
23
|
+
|
|
25
24
|
def iter(self):
|
|
26
25
|
return (
|
|
27
26
|
Record(IDItem(qid), SimpleTextItem(title))
|
datamaestro_text/data/ir/data.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
from .base import *
|
|
1
|
+
from .base import * # noqa: F403
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from functools import cached_property
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import Tuple, List
|
|
3
3
|
from attrs import define
|
|
4
4
|
from datamaestro.record import record_type
|
|
5
5
|
from ir_datasets.datasets.wapo import WapoDocMedia
|
|
@@ -11,9 +11,8 @@ from ir_datasets.datasets.cord19 import Cord19FullTextSection
|
|
|
11
11
|
class DocumentWithTitle(TextItem):
|
|
12
12
|
"""Web document with title and body"""
|
|
13
13
|
|
|
14
|
-
body: str
|
|
15
|
-
|
|
16
14
|
title: str
|
|
15
|
+
body: str
|
|
17
16
|
|
|
18
17
|
@cached_property
|
|
19
18
|
def text(self):
|
|
@@ -82,7 +82,7 @@ class IKatClueWeb22DocumentStore(LZ4DocumentStore):
|
|
|
82
82
|
|
|
83
83
|
file_checksum = hasher.hexdigest()
|
|
84
84
|
assert file_checksum == checksum, (
|
|
85
|
-
f"Expected {checksum},
|
|
85
|
+
f"Expected {checksum}, got {file_checksum} for {filename}"
|
|
86
86
|
)
|
|
87
87
|
|
|
88
88
|
# Get the MD5 hashes of all the passages
|
datamaestro_text/data/text.py
CHANGED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# IR datasets integration package
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from abc import ABC, abstractmethod
|
|
3
|
-
from
|
|
4
|
-
from functools import partial
|
|
3
|
+
from functools import cached_property, partial
|
|
5
4
|
from pathlib import Path
|
|
6
5
|
from typing import Dict, Iterator, List, NamedTuple, Tuple, Type
|
|
7
6
|
|
|
@@ -9,7 +8,6 @@ import ir_datasets
|
|
|
9
8
|
import ir_datasets.datasets as _irds
|
|
10
9
|
from datamaestro.record import RecordType, record_type
|
|
11
10
|
from experimaestro import Config, Meta, Option, Param
|
|
12
|
-
from experimaestro.compat import cached_property
|
|
13
11
|
from ir_datasets.formats import (
|
|
14
12
|
GenericDoc,
|
|
15
13
|
GenericDocPair,
|
|
@@ -112,6 +110,9 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
112
110
|
_irds.beir.BeirCordDoc: tuple_constructor(
|
|
113
111
|
formats.CordDocument, "doc_id", "text", "title", "url", "pubmed_id"
|
|
114
112
|
),
|
|
113
|
+
_irds.miracl.MiraclDoc: tuple_constructor(
|
|
114
|
+
formats.DocumentWithTitle, "doc_id", "title", "text"
|
|
115
|
+
),
|
|
115
116
|
_irds.beir.BeirTitleDoc: tuple_constructor(
|
|
116
117
|
formats.TitleDocument, "doc_id", "text", "title"
|
|
117
118
|
),
|
|
@@ -202,11 +203,11 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
202
203
|
|
|
203
204
|
def iter(self) -> Iterator[ir.DocumentRecord]:
|
|
204
205
|
"""Returns an iterator over adhoc documents"""
|
|
205
|
-
for doc in self.
|
|
206
|
+
for doc in self._docs:
|
|
206
207
|
yield self.converter(self.document_recordtype, doc)
|
|
207
208
|
|
|
208
209
|
def iter_documents_from(self, start=0):
|
|
209
|
-
for doc in self.
|
|
210
|
+
for doc in self._docs[start:]:
|
|
210
211
|
yield self.converter(self.document_recordtype, doc)
|
|
211
212
|
|
|
212
213
|
@property
|
|
@@ -219,19 +220,22 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
219
220
|
try:
|
|
220
221
|
# Translate to ir datasets docstore options
|
|
221
222
|
import ir_datasets.indices as ir_indices
|
|
223
|
+
|
|
222
224
|
file_access = {
|
|
223
225
|
ir.FileAccess.MMAP: ir_indices.FileAccess.MMAP,
|
|
224
226
|
ir.FileAccess.FILE: ir_indices.FileAccess.FILE,
|
|
225
|
-
ir.FileAccess.MEMORY: ir_indices.FileAccess.MEMORY
|
|
227
|
+
ir.FileAccess.MEMORY: ir_indices.FileAccess.MEMORY,
|
|
226
228
|
}[self.file_access]
|
|
227
229
|
kwargs = {"options": ir_indices.DocstoreOptions(file_access=file_access)}
|
|
228
230
|
except ImportError:
|
|
229
|
-
logging.warning(
|
|
231
|
+
logging.warning(
|
|
232
|
+
"This version of ir-datasets cannot handle docstore options"
|
|
233
|
+
)
|
|
230
234
|
return self.dataset.docs_store(**kwargs)
|
|
231
235
|
|
|
232
|
-
@
|
|
236
|
+
@property
|
|
233
237
|
def _docs(self):
|
|
234
|
-
return self.
|
|
238
|
+
return iter(self.store)
|
|
235
239
|
|
|
236
240
|
def docid_internal2external(self, ix: int):
|
|
237
241
|
return self._docs[ix].doc_id
|
|
@@ -261,12 +265,6 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
261
265
|
return converter
|
|
262
266
|
|
|
263
267
|
|
|
264
|
-
if hasattr(_irds, "miracl"):
|
|
265
|
-
Documents.CONVERTERS[_irds.miracl.MiraclDoc] = tuple_constructor(
|
|
266
|
-
formats.DocumentWithTitle, "doc_id", "text", "title"
|
|
267
|
-
)
|
|
268
|
-
|
|
269
|
-
|
|
270
268
|
class LZ4DocumentStore(ir.DocumentStore, ABC):
|
|
271
269
|
"""A LZ4-based document store"""
|
|
272
270
|
|
|
@@ -614,11 +612,7 @@ class Cast2022TopicsHandler(CastTopicsHandler):
|
|
|
614
612
|
records = []
|
|
615
613
|
nodes: Dict[str, ConversationTreeNode] = {}
|
|
616
614
|
|
|
617
|
-
for (
|
|
618
|
-
query
|
|
619
|
-
) in (
|
|
620
|
-
self.dataset.dataset.queries_iter()
|
|
621
|
-
): # type: _irds.trec_cast.Cast2022Query
|
|
615
|
+
for query in self.dataset.dataset.queries_iter(): # type: _irds.trec_cast.Cast2022Query
|
|
622
616
|
parent = nodes[query.parent_id] if query.parent_id else None
|
|
623
617
|
|
|
624
618
|
if query.participant == "User":
|
|
@@ -2,7 +2,6 @@ import logging
|
|
|
2
2
|
import gzip
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Type
|
|
6
5
|
from experimaestro import Config, Task, Param, Annotated, pathgenerator, Option, tqdm
|
|
7
6
|
import numpy as np
|
|
8
7
|
from datamaestro.record import RecordType
|
|
@@ -131,26 +130,26 @@ class ShuffledTrainingTripletsLines(Task):
|
|
|
131
130
|
|
|
132
131
|
def __validate__(self):
|
|
133
132
|
if self.topic_ids:
|
|
134
|
-
assert self.data.topic_recordtype.has(
|
|
135
|
-
|
|
136
|
-
)
|
|
133
|
+
assert self.data.topic_recordtype.has(ir.IDItem), (
|
|
134
|
+
f"No topic ID in the source data ({self.data.topic_recordtype})"
|
|
135
|
+
)
|
|
137
136
|
else:
|
|
138
|
-
assert self.data.topic_recordtype.has(
|
|
139
|
-
|
|
140
|
-
)
|
|
137
|
+
assert self.data.topic_recordtype.has(ir.TextItem), (
|
|
138
|
+
f"No topic text in the source data ({self.data.topic_recordtype})"
|
|
139
|
+
)
|
|
141
140
|
|
|
142
141
|
if self.doc_ids:
|
|
143
|
-
assert self.data.document_recordtype.has(
|
|
144
|
-
|
|
145
|
-
)
|
|
142
|
+
assert self.data.document_recordtype.has(ir.IDItem), (
|
|
143
|
+
"No doc ID in the source data"
|
|
144
|
+
)
|
|
146
145
|
else:
|
|
147
|
-
assert self.data.document_recordtype.has(
|
|
148
|
-
|
|
149
|
-
)
|
|
146
|
+
assert self.data.document_recordtype.has(ir.TextItem), (
|
|
147
|
+
"No doc text in the source data"
|
|
148
|
+
)
|
|
150
149
|
|
|
151
150
|
def task_outputs(self, dep):
|
|
152
151
|
return dep(
|
|
153
|
-
ir.TrainingTripletsLines(
|
|
152
|
+
ir.TrainingTripletsLines.C(
|
|
154
153
|
id="",
|
|
155
154
|
path=self.path,
|
|
156
155
|
topic_ids=self.topic_ids,
|
datamaestro_text/version.py
CHANGED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '
|
|
32
|
-
__version_tuple__ = version_tuple = (
|
|
31
|
+
__version__ = version = '2026.2.2'
|
|
32
|
+
__version_tuple__ = version_tuple = (2026, 2, 2)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id =
|
|
34
|
+
__commit_id__ = commit_id = None
|
|
@@ -1,33 +1,31 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2026.2.2
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
|
+
Project-URL: Homepage, https://github.com/experimaestro/datamaestro_text
|
|
6
|
+
Project-URL: Documentation, https://datamaestro-text.readthedocs.io/en/latest/
|
|
7
|
+
Project-URL: Repository, https://github.com/experimaestro/datamaestro_text
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/experimaestro/datamaestro_text/issues
|
|
5
9
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
|
-
License: GPL-3
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
Project-URL: repository, https://github.com/experimaestro/datamaestro_text
|
|
10
|
-
Keywords: dataset manager,information retrieval,experiments
|
|
10
|
+
License: GPL-3.0-or-later
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: dataset manager,experiments,information retrieval
|
|
11
13
|
Classifier: Development Status :: 4 - Beta
|
|
12
14
|
Classifier: Intended Audience :: Science/Research
|
|
13
15
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
14
16
|
Classifier: Operating System :: OS Independent
|
|
15
17
|
Classifier: Programming Language :: Python
|
|
16
18
|
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
22
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
23
|
Requires-Python: >=3.10
|
|
19
|
-
Description-Content-Type: text/markdown
|
|
20
|
-
License-File: LICENSE
|
|
21
|
-
Requires-Dist: datamaestro>=1.5.0
|
|
22
|
-
Requires-Dist: ir_datasets>=0.5.8
|
|
23
24
|
Requires-Dist: attrs
|
|
24
|
-
|
|
25
|
-
Requires-Dist:
|
|
26
|
-
Requires-Dist:
|
|
27
|
-
|
|
28
|
-
Requires-Dist: flake8; extra == "dev"
|
|
29
|
-
Requires-Dist: sphinx; extra == "dev"
|
|
30
|
-
Dynamic: license-file
|
|
25
|
+
Requires-Dist: datamaestro>=1.6.2
|
|
26
|
+
Requires-Dist: experimaestro
|
|
27
|
+
Requires-Dist: ir-datasets>=0.5.8
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
31
29
|
|
|
32
30
|
[](https://github.com/pre-commit/pre-commit) [](https://badge.fury.io/py/datamaestro-text)
|
|
33
31
|
|
|
@@ -1,29 +1,30 @@
|
|
|
1
|
-
datamaestro_text/__init__.py,sha256=
|
|
2
|
-
datamaestro_text/version.py,sha256=
|
|
1
|
+
datamaestro_text/__init__.py,sha256=MP7ShYx32k5irdgml1PjnmSofzioYQh9rzUEcHs5eys,276
|
|
2
|
+
datamaestro_text/version.py,sha256=PcJXzZYuv0SaBM1rOymP9IhKDJxqcLKUPHINlOD-hL0,710
|
|
3
3
|
datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
|
|
5
|
-
datamaestro_text/config/com/oscar-corpus.py,sha256=
|
|
5
|
+
datamaestro_text/config/com/oscar-corpus.py,sha256=gEWz8Nxpv7VXU8X-vfRZLwPfq0KXtkGSNtsfoqfcUI0,702
|
|
6
6
|
datamaestro_text/config/com/sentiment140.py,sha256=itfBEgcOniECXKOw8I2dhzyS9LOMsltMLfKK6NGRpVY,1293
|
|
7
7
|
datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
|
|
8
|
-
datamaestro_text/config/com/github/ikat.py,sha256=
|
|
8
|
+
datamaestro_text/config/com/github/ikat.py,sha256=nAmBre9zNlnGhx-C50EvLGvHqtoB7Ce-mZUZqM_ymO8,4219
|
|
9
9
|
datamaestro_text/config/com/github/aagohary/canard.py,sha256=5fLwCLNBGM_7--naTCDayAMYLvK3yTD8auaEf-dqrb4,1768
|
|
10
10
|
datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=zP3w7A9KSvJVCo44OaB1az1pDKWxE6qXS4qFm3hqg3Y,3064
|
|
11
11
|
datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=MFJYaxJoqJQ6kMDHa2PIE0zoxYTA8Kyl26-vzFoMML0,3032
|
|
12
12
|
datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
|
|
13
13
|
datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
|
|
14
14
|
datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=hN2KOdi6ToHlodozqsYAOtxaqiUGkGGtRtb3RFSgnEU,11645
|
|
15
|
-
datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=
|
|
15
|
+
datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=LgUcnR-z99kTrZj6QaCLuLrj1bG-wHMM5GlVNmbrY2k,851
|
|
16
16
|
datamaestro_text/config/edu/cornell/nlvr.yaml,sha256=9Yk5VZMncSmrP7JNuGXqExksgX5nQ_Zfnlps8hWze3Q,921
|
|
17
17
|
datamaestro_text/config/edu/stanford/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
-
datamaestro_text/config/edu/stanford/aclimdb.py,sha256=
|
|
19
|
-
datamaestro_text/config/edu/stanford/glove.py,sha256=
|
|
18
|
+
datamaestro_text/config/edu/stanford/aclimdb.py,sha256=gv_4IauUCURbMzMWpSMyx3qgOAXVQuBwKR-mMaKExpc,626
|
|
19
|
+
datamaestro_text/config/edu/stanford/glove.py,sha256=FiVYbzQMD11CiKfklrggtm7YXBCevyTXXwhehRd65H8,2348
|
|
20
20
|
datamaestro_text/config/edu/stanford/im2p.yaml,sha256=JoToNyEPpmwdyLFedCBot5ypyw7p9rzi12mGXJuZin0,2909
|
|
21
21
|
datamaestro_text/config/edu/upenn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
22
|
datamaestro_text/config/edu/upenn/ldc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
23
|
datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=YEU3kIXzv71Vjg9bUoXyQ-vSLvC-y4LlE3mJQf38XuY,1589
|
|
24
|
+
datamaestro_text/config/fr/granddebat.py,sha256=JRLC3q6o-XhJECjAh40w2p40pCSRw9K3-YMDUpdNwMM,7016
|
|
24
25
|
datamaestro_text/config/gov/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
26
|
datamaestro_text/config/gov/nist/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
-
datamaestro_text/config/gov/nist/ir/covid.py,sha256=
|
|
27
|
+
datamaestro_text/config/gov/nist/ir/covid.py,sha256=i9xxZcrKeX1gezK_TE68oropMF9PKHX2ofyREEUWYPY,4003
|
|
27
28
|
datamaestro_text/config/gov/nist/trec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
29
|
datamaestro_text/config/gov/nist/trec/adhoc.py,sha256=vcFaLlZ-chwDt013MYI8bYZ2ug39jPaeimsiok_sqfU,11035
|
|
29
30
|
datamaestro_text/config/gov/nist/trec/clueweb.yaml,sha256=sm4UbdtMzWoDVPsewtVDS7Vj2jBOdgp18Xqo1X4ysQc,792
|
|
@@ -34,7 +35,7 @@ datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJo
|
|
|
34
35
|
datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
|
|
35
36
|
datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=rg_qAnMrXYUZhQYxA12r_Npl0ggyfTLJQjdSCjU0QxM,1228
|
|
36
37
|
datamaestro_text/config/io/metamind/research/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
|
-
datamaestro_text/config/io/metamind/research/wikitext.py,sha256=
|
|
38
|
+
datamaestro_text/config/io/metamind/research/wikitext.py,sha256=jw_CbBbradIUp_mrhG-z3rfa4_0ybvIBSkDqJvGLCCI,2301
|
|
38
39
|
datamaestro_text/config/net/mattmahoney/enwiki.yaml,sha256=HCUn3s0AydXX3BjJ6yUXY0vGLGWSBkOCaDhQ4PA2Adg,2452
|
|
39
40
|
datamaestro_text/config/org/acm/recsys/cb2014.yaml,sha256=5SAK3Am1k0HFugSSCIQN5mLPBfr1zZZAkhLrSH5pHQc,1274
|
|
40
41
|
datamaestro_text/config/org/cocodataset/index.yaml,sha256=KISJChMeKwlZbSnHmRcGMsm6jbcFGVe1aA4GhP2fzqw,474
|
|
@@ -42,47 +43,49 @@ datamaestro_text/config/org/grouplens/movielens.py,sha256=tV6OSTDdtjll1dQBCsYIls
|
|
|
42
43
|
datamaestro_text/config/org/universaldependencies/french.py,sha256=etedb3_SC-fV5Oa2rM4_smZk6t4CPiNvU4C4keUFZHY,2214
|
|
43
44
|
datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml,sha256=IBy82CDNNLjJPNPzues1EgDXu0A5WDvUFeVNSOyrIpI,1137
|
|
44
45
|
datamaestro_text/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
|
-
datamaestro_text/data/embeddings.py,sha256=
|
|
46
|
+
datamaestro_text/data/embeddings.py,sha256=YMoNLyVvaOt86bq_8X71_Fgu7EYYI71vr67xSQsi57I,1128
|
|
46
47
|
datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG3IHkkPJX9Y,279
|
|
47
48
|
datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
|
|
48
|
-
datamaestro_text/data/text.py,sha256=
|
|
49
|
-
datamaestro_text/data/conversation/__init__.py,sha256=
|
|
50
|
-
datamaestro_text/data/conversation/base.py,sha256=
|
|
51
|
-
datamaestro_text/data/conversation/canard.py,sha256=
|
|
52
|
-
datamaestro_text/data/conversation/ikat.py,sha256=
|
|
53
|
-
datamaestro_text/data/conversation/orconvqa.py,sha256=
|
|
49
|
+
datamaestro_text/data/text.py,sha256=Lln4eoegU9B27oS-2mv3eEQC6MyRBgVhoewQ2-YNxEQ,497
|
|
50
|
+
datamaestro_text/data/conversation/__init__.py,sha256=Kk7FxPz_0oGO2PtIa8zH7UBqbCUsywTHfA-yKd_KO6c,284
|
|
51
|
+
datamaestro_text/data/conversation/base.py,sha256=gF_-izQ1ijX7w49pKQvjfjUVzrX3VSHXxcqVIPWmAfY,7488
|
|
52
|
+
datamaestro_text/data/conversation/canard.py,sha256=aYpkHzuJWGT3-myFNUjCYAtvG3gVh_d3Zc5lyiasQ04,3290
|
|
53
|
+
datamaestro_text/data/conversation/ikat.py,sha256=hoGqHUWyT8BhC_ouUmnwoh93B2jGLHn8uc6npKP4Sl8,4319
|
|
54
|
+
datamaestro_text/data/conversation/orconvqa.py,sha256=zNp02jyYgny0qtIFOMjmrUy7hG8VKWcELHWrg3FBCc0,3764
|
|
54
55
|
datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
|
|
55
|
-
datamaestro_text/data/
|
|
56
|
-
datamaestro_text/data/
|
|
56
|
+
datamaestro_text/data/debate/__init__.py,sha256=PzCV3Bd9fmonE-OQp4VtK1NglH42-iv34WAWUIU-eYk,187
|
|
57
|
+
datamaestro_text/data/debate/granddebat.py,sha256=4-HMfgvF2bPru56D3hkA1E2bN3dgIUmcvX9eOIXroLA,2176
|
|
58
|
+
datamaestro_text/data/ir/__init__.py,sha256=oYI7eIScg-olxPh95XBgTK-E2PunieXvqQPlrRlHU8M,9799
|
|
59
|
+
datamaestro_text/data/ir/base.py,sha256=ksluGOOzOwbdZ2SPnwiDMMUhBa6P1Ti2sr6Ch5xXUgg,1493
|
|
57
60
|
datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
|
|
58
|
-
datamaestro_text/data/ir/csv.py,sha256=
|
|
59
|
-
datamaestro_text/data/ir/data.py,sha256=
|
|
60
|
-
datamaestro_text/data/ir/formats.py,sha256=
|
|
61
|
+
datamaestro_text/data/ir/csv.py,sha256=0jnaV-wKLgslH7izR-xP_RX7l90vykQTn3bPhaCFR-c,1027
|
|
62
|
+
datamaestro_text/data/ir/data.py,sha256=6ASVsyVVfiSd1m8C8QTrxVLnFVmtoW3d9c9nQ07zlbY,34
|
|
63
|
+
datamaestro_text/data/ir/formats.py,sha256=rKflCuY8UBpXC3nltBqzC4waWYoxuyP91xJvG7p690Y,3630
|
|
61
64
|
datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
|
|
62
|
-
datamaestro_text/data/ir/stores.py,sha256=
|
|
65
|
+
datamaestro_text/data/ir/stores.py,sha256=rdOwYCG_NzHSsUQpJ1aneiA2SDWrcfdi16aY-df852U,4408
|
|
63
66
|
datamaestro_text/data/ir/trec.py,sha256=IOtQRMUz8zx-dYEMR2NIIM6qXEUjsV0eVOhGvKIRJK4,1974
|
|
64
67
|
datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
|
|
68
|
+
datamaestro_text/datasets/__init__.py,sha256=ORn-Q1gGibg-N5grVc7MqOYfExels3FRI51oQ4xI1QA,34
|
|
65
69
|
datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
|
|
66
|
-
datamaestro_text/datasets/irds/data.py,sha256=
|
|
67
|
-
datamaestro_text/datasets/irds/datasets.py,sha256=
|
|
70
|
+
datamaestro_text/datasets/irds/data.py,sha256=sIU7_rt4I1E9rjkIGcpNfbD5mtO97vxFsUDmouRMDV4,22914
|
|
71
|
+
datamaestro_text/datasets/irds/datasets.py,sha256=CJ8MA44XCwIQGZTzYIJnR-qFm890rUZZB7C3lKIwNyY,5627
|
|
68
72
|
datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
|
|
69
73
|
datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
|
|
70
|
-
datamaestro_text/download/tmdb.py,sha256=
|
|
74
|
+
datamaestro_text/download/tmdb.py,sha256=sfnSUJwGSjBsLNVVhT30db2m0R8mrRkDZpbpBUt7GMg,3960
|
|
71
75
|
datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
|
|
72
76
|
datamaestro_text/interfaces/trec.py,sha256=GrP0N_Hcj5f73KS6CSzkyN4aaI-XoBJ19oVMpHVR3QM,3579
|
|
73
77
|
datamaestro_text/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
74
78
|
datamaestro_text/test/test_datasets.py,sha256=hD1pe-CjEnOj0JNqDLasz_97ltFzMbYfyYBy1QyYdf8,202
|
|
75
79
|
datamaestro_text/test/test_documented.py,sha256=Kl90fz_r_dsV0oXE1Mad34GiQyZ9dc6bfZKNtxP0O2s,453
|
|
76
80
|
datamaestro_text/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
77
|
-
datamaestro_text/transforms/ir/__init__.py,sha256=
|
|
81
|
+
datamaestro_text/transforms/ir/__init__.py,sha256=7D6wurKVQf-f2mu1I3tT-baQbKo7yRCxW8pOHh-MSjM,6539
|
|
78
82
|
datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_p5ha0,3340
|
|
79
83
|
datamaestro_text/utils/files.py,sha256=jhcirufgTztEkx1hs2-qgScEcHnIcGF_BjCeuCSsNv0,2838
|
|
80
84
|
datamaestro_text/utils/iter.py,sha256=QBajeSPLHvkeh6BCTZDSqWlOYNjwUDvgTTZ_YxJntXw,2701
|
|
81
85
|
datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
|
|
82
|
-
datamaestro_text/utils/shuffle.py,sha256=
|
|
83
|
-
datamaestro_text-
|
|
84
|
-
datamaestro_text-
|
|
85
|
-
datamaestro_text-
|
|
86
|
-
datamaestro_text-
|
|
87
|
-
datamaestro_text-
|
|
88
|
-
datamaestro_text-2025.9.11.dist-info/RECORD,,
|
|
86
|
+
datamaestro_text/utils/shuffle.py,sha256=xXzgBQ8An7tKboxI0z123Tl6ywXI4S0tWf8MnfOon0c,3491
|
|
87
|
+
datamaestro_text-2026.2.2.dist-info/METADATA,sha256=cHXRhpnNO6sliuE09Jg-eHJtr2kl1Z4Dy3mE1RCGELA,1886
|
|
88
|
+
datamaestro_text-2026.2.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
89
|
+
datamaestro_text-2026.2.2.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
|
|
90
|
+
datamaestro_text-2026.2.2.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
91
|
+
datamaestro_text-2026.2.2.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
datamaestro_text
|
{datamaestro_text-2025.9.11.dist-info → datamaestro_text-2026.2.2.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{datamaestro_text-2025.9.11.dist-info → datamaestro_text-2026.2.2.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|