datamaestro-text 2025.4.3__py3-none-any.whl → 2025.6.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/config/com/github/aagohary/canard.py +19 -12
- datamaestro_text/config/com/github/apple/ml-qrecc.py +6 -8
- datamaestro_text/config/com/github/prdwb/orconvqa.py +3 -3
- datamaestro_text/config/com/sentiment140.py +1 -2
- datamaestro_text/config/edu/stanford/glove.py +1 -0
- datamaestro_text/data/conversation/base.py +10 -8
- datamaestro_text/data/conversation/canard.py +52 -13
- datamaestro_text/data/conversation/orconvqa.py +0 -1
- datamaestro_text/data/ir/base.py +6 -0
- datamaestro_text/data/ir/formats.py +31 -4
- datamaestro_text/datasets/irds/data.py +65 -0
- datamaestro_text/datasets/irds/datasets.py +4 -8
- datamaestro_text/utils/iter.py +5 -2
- datamaestro_text/version.py +2 -2
- {datamaestro_text-2025.4.3.dist-info → datamaestro_text-2025.6.11.dist-info}/METADATA +8 -2
- {datamaestro_text-2025.4.3.dist-info → datamaestro_text-2025.6.11.dist-info}/RECORD +20 -20
- {datamaestro_text-2025.4.3.dist-info → datamaestro_text-2025.6.11.dist-info}/WHEEL +1 -1
- {datamaestro_text-2025.4.3.dist-info → datamaestro_text-2025.6.11.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2025.4.3.dist-info → datamaestro_text-2025.6.11.dist-info}/licenses/LICENSE +0 -0
- {datamaestro_text-2025.4.3.dist-info → datamaestro_text-2025.6.11.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,5 @@
|
|
|
1
|
-
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
-
|
|
3
1
|
from datamaestro.definitions import datatasks, datatags, dataset
|
|
4
|
-
from datamaestro.download.
|
|
2
|
+
from datamaestro.download.single import filedownloader
|
|
5
3
|
from datamaestro.utils import HashCheck
|
|
6
4
|
|
|
7
5
|
from datamaestro.data.ml import Supervised
|
|
@@ -10,14 +8,23 @@ from datamaestro_text.data.conversation.canard import CanardDataset
|
|
|
10
8
|
|
|
11
9
|
@datatags("conversation", "context", "query")
|
|
12
10
|
@datatasks("query rewriting")
|
|
13
|
-
@
|
|
14
|
-
"
|
|
15
|
-
"https://
|
|
16
|
-
|
|
17
|
-
|
|
11
|
+
@filedownloader(
|
|
12
|
+
"train.json",
|
|
13
|
+
"https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/train.json",
|
|
14
|
+
checker=HashCheck("73624ac646fb81e09b0fd7f01370ada3"),
|
|
15
|
+
)
|
|
16
|
+
@filedownloader(
|
|
17
|
+
"dev.json",
|
|
18
|
+
"https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/dev.json",
|
|
19
|
+
checker=HashCheck("c84525631a83bc771c58ff31f4a9b601"),
|
|
20
|
+
)
|
|
21
|
+
@filedownloader(
|
|
22
|
+
"test.json",
|
|
23
|
+
"https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/test.json",
|
|
24
|
+
checker=HashCheck("3fc14d0078e7a5056f5da571728f024e"),
|
|
18
25
|
)
|
|
19
26
|
@dataset(Supervised, url="https://sites.google.com/view/qanta/projects/canard", id="")
|
|
20
|
-
def main(
|
|
27
|
+
def main(train, dev, test):
|
|
21
28
|
"""Question-in-context rewriting
|
|
22
29
|
|
|
23
30
|
CANARD is a dataset for question-in-context rewriting that consists of
|
|
@@ -30,7 +37,7 @@ def main(archive):
|
|
|
30
37
|
Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
|
|
31
38
|
"""
|
|
32
39
|
return {
|
|
33
|
-
"train": CanardDataset(path=
|
|
34
|
-
"validation": CanardDataset(path=
|
|
35
|
-
"test": CanardDataset(path=
|
|
40
|
+
"train": CanardDataset(path=train),
|
|
41
|
+
"validation": CanardDataset(path=dev),
|
|
42
|
+
"test": CanardDataset(path=test),
|
|
36
43
|
}
|
|
@@ -25,12 +25,11 @@ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
|
|
|
25
25
|
checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
|
|
26
26
|
)
|
|
27
27
|
@dataset(
|
|
28
|
-
Supervised[QReCCDataset, None, QReCCDataset],
|
|
29
28
|
url="https://github.com/apple/ml-qrecc",
|
|
30
29
|
doi="https://doi.org/10.48550/arXiv.2010.04898",
|
|
31
30
|
id="",
|
|
32
31
|
)
|
|
33
|
-
def main(data: Path):
|
|
32
|
+
def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
|
|
34
33
|
"""Open-Domain Question Answering Goes Conversational via Question Rewriting
|
|
35
34
|
|
|
36
35
|
We introduce QReCC (Question Rewriting in Conversational Context), an
|
|
@@ -40,10 +39,10 @@ def main(data: Path):
|
|
|
40
39
|
answering that includes the individual subtasks of question rewriting,
|
|
41
40
|
passage retrieval and reading comprehension
|
|
42
41
|
"""
|
|
43
|
-
return
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
42
|
+
return Supervised(
|
|
43
|
+
train=QReCCDataset(path=data / "qrecc_train.json"),
|
|
44
|
+
test=QReCCDataset(path=data / "qrecc_test.json"),
|
|
45
|
+
)
|
|
47
46
|
|
|
48
47
|
|
|
49
48
|
@dataset(
|
|
@@ -52,7 +51,6 @@ def main(data: Path):
|
|
|
52
51
|
)
|
|
53
52
|
class Content(LZ4JSONLDocumentStore):
|
|
54
53
|
"""QReCC mentionned URLs content"""
|
|
55
|
-
|
|
56
54
|
@staticmethod
|
|
57
55
|
def __create_dataset__(dataset, options=None):
|
|
58
56
|
ds = reference(reference=main).setup(dataset, options)
|
|
@@ -67,7 +65,7 @@ class Content(LZ4JSONLDocumentStore):
|
|
|
67
65
|
"id",
|
|
68
66
|
).setup(dataset, options)
|
|
69
67
|
|
|
70
|
-
return
|
|
68
|
+
return Content(jsonl_path=store_path)
|
|
71
69
|
|
|
72
70
|
@staticmethod
|
|
73
71
|
def _documents(path: Path):
|
|
@@ -49,9 +49,9 @@ def preprocessed(train, dev, test):
|
|
|
49
49
|
Each dataset is an instance of :class:`datamaestro_text.data.conversation.OrConvQADataset`
|
|
50
50
|
"""
|
|
51
51
|
return {
|
|
52
|
-
"train": OrConvQADataset(path=train),
|
|
53
|
-
"validation": OrConvQADataset(path=dev),
|
|
54
|
-
"test": OrConvQADataset(path=test),
|
|
52
|
+
"train": OrConvQADataset.C(path=train),
|
|
53
|
+
"validation": OrConvQADataset.C(path=dev),
|
|
54
|
+
"test": OrConvQADataset.C(path=test),
|
|
55
55
|
}
|
|
56
56
|
|
|
57
57
|
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
from datamaestro.data.csv import Generic
|
|
2
|
-
from datamaestro.definitions import
|
|
2
|
+
from datamaestro.definitions import datatasks, datatags, dataset
|
|
3
3
|
from datamaestro.download.archive import zipdownloader
|
|
4
4
|
from datamaestro.data.ml import Supervised
|
|
5
5
|
from datamaestro.utils import HashCheck
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
|
|
9
8
|
@zipdownloader(
|
|
10
9
|
"dir",
|
|
11
10
|
"http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip",
|
|
@@ -129,6 +129,8 @@ class ConversationNode:
|
|
|
129
129
|
|
|
130
130
|
|
|
131
131
|
class ConversationTree(ABC):
|
|
132
|
+
"""Represents a conversation tree"""
|
|
133
|
+
|
|
132
134
|
@abstractmethod
|
|
133
135
|
def root(self) -> ConversationNode:
|
|
134
136
|
...
|
|
@@ -192,7 +194,7 @@ class SingleConversationTreeNode(ConversationNode):
|
|
|
192
194
|
return (
|
|
193
195
|
SingleConversationTreeNode(self.tree, self.index + 1)
|
|
194
196
|
if self.index < len(self.tree.history) - 1
|
|
195
|
-
else
|
|
197
|
+
else None
|
|
196
198
|
)
|
|
197
199
|
|
|
198
200
|
def children(self) -> List[ConversationNode]:
|
|
@@ -207,8 +209,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
207
209
|
"""A conversation tree node"""
|
|
208
210
|
|
|
209
211
|
entry: Record
|
|
210
|
-
|
|
211
|
-
|
|
212
|
+
_parent: Optional["ConversationTreeNode"]
|
|
213
|
+
_children: List["ConversationTreeNode"]
|
|
212
214
|
|
|
213
215
|
def __init__(self, entry):
|
|
214
216
|
self.entry = entry
|
|
@@ -216,8 +218,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
216
218
|
self.children = []
|
|
217
219
|
|
|
218
220
|
def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
|
|
219
|
-
self.
|
|
220
|
-
node.
|
|
221
|
+
self._children.append(node)
|
|
222
|
+
node._parent = self
|
|
221
223
|
return node
|
|
222
224
|
|
|
223
225
|
def conversation(self, skip_self: bool) -> ConversationHistory:
|
|
@@ -232,14 +234,14 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
232
234
|
def __iter__(self) -> Iterator["ConversationTreeNode"]:
|
|
233
235
|
"""Iterates over all conversation tree nodes (pre-order)"""
|
|
234
236
|
yield self.entry
|
|
235
|
-
for child in self.
|
|
237
|
+
for child in self._children:
|
|
236
238
|
yield from child
|
|
237
239
|
|
|
238
240
|
def parent(self) -> Optional[ConversationNode]:
|
|
239
|
-
return self.
|
|
241
|
+
return self._parent
|
|
240
242
|
|
|
241
243
|
def children(self) -> List[ConversationNode]:
|
|
242
|
-
return self.
|
|
244
|
+
return self._children
|
|
243
245
|
|
|
244
246
|
def root(self):
|
|
245
247
|
return self
|
|
@@ -1,12 +1,17 @@
|
|
|
1
1
|
from typing import Iterator, List
|
|
2
2
|
from attr import define
|
|
3
3
|
import json
|
|
4
|
+
from datamaestro.record import Record
|
|
4
5
|
from datamaestro.data import File
|
|
5
|
-
from .base import (
|
|
6
|
+
from datamaestro_text.data.conversation.base import (
|
|
7
|
+
ConversationDataset,
|
|
6
8
|
ConversationTree,
|
|
7
9
|
SingleConversationTree,
|
|
10
|
+
SimpleDecontextualizedItem,
|
|
11
|
+
EntryType,
|
|
8
12
|
)
|
|
9
|
-
from . import
|
|
13
|
+
from datamaestro_text.data.ir import IDItem, SimpleTextItem
|
|
14
|
+
import logging
|
|
10
15
|
|
|
11
16
|
|
|
12
17
|
@define(kw_only=True)
|
|
@@ -30,7 +35,10 @@ class CanardConversation:
|
|
|
30
35
|
|
|
31
36
|
|
|
32
37
|
class CanardDataset(ConversationDataset, File):
|
|
33
|
-
"""A dataset in the CANARD JSON format
|
|
38
|
+
"""A dataset in the CANARD JSON format
|
|
39
|
+
|
|
40
|
+
The CANARD dataset is composed of
|
|
41
|
+
"""
|
|
34
42
|
|
|
35
43
|
def entries(self) -> Iterator[CanardConversation]:
|
|
36
44
|
"""Iterates over re-written query with their context"""
|
|
@@ -47,22 +55,53 @@ class CanardDataset(ConversationDataset, File):
|
|
|
47
55
|
)
|
|
48
56
|
|
|
49
57
|
def __iter__(self) -> Iterator[ConversationTree]:
|
|
50
|
-
history = []
|
|
58
|
+
history: list[Record] = []
|
|
51
59
|
current_id = None
|
|
52
60
|
|
|
53
61
|
for entry in self.entries():
|
|
54
|
-
# Check if current conversation
|
|
55
|
-
if current_id != entry.dialogue_id
|
|
56
|
-
|
|
57
|
-
|
|
62
|
+
# Check if current conversation, otherwise we are OK
|
|
63
|
+
if current_id != entry.dialogue_id:
|
|
64
|
+
if current_id is not None:
|
|
65
|
+
history.reverse()
|
|
66
|
+
yield SingleConversationTree(current_id, history)
|
|
67
|
+
history = []
|
|
68
|
+
|
|
69
|
+
current_id = entry.dialogue_id
|
|
70
|
+
|
|
71
|
+
if not history:
|
|
72
|
+
# First round
|
|
73
|
+
# The two first items are the wikipedia title and section,
|
|
74
|
+
# we interpret them as two user queries
|
|
75
|
+
assert len(entry.history) == 2
|
|
76
|
+
history.extend(
|
|
77
|
+
Record(
|
|
78
|
+
SimpleTextItem(text),
|
|
79
|
+
EntryType.USER_QUERY,
|
|
80
|
+
)
|
|
81
|
+
for text in entry.history
|
|
82
|
+
)
|
|
83
|
+
else:
|
|
84
|
+
# The utterance before the last is the last user query
|
|
85
|
+
assert (
|
|
86
|
+
entry.history[-2] == history[-1][SimpleTextItem].text
|
|
87
|
+
), f"{entry.dialogue_id} {entry.history} / {history[-4:-1]}"
|
|
88
|
+
|
|
89
|
+
# The last utterance is the system side
|
|
90
|
+
history.append(
|
|
91
|
+
Record(SimpleTextItem(entry.history[-1]), EntryType.SYSTEM_ANSWER)
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
assert len(entry.history) == len(history)
|
|
58
95
|
|
|
59
96
|
# Add to current
|
|
60
97
|
history.append(
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
98
|
+
Record(
|
|
99
|
+
IDItem(f"{entry.dialogue_id}-{entry.query_no}"),
|
|
100
|
+
SimpleTextItem(entry.query),
|
|
101
|
+
SimpleDecontextualizedItem(entry.rewrite),
|
|
102
|
+
EntryType.USER_QUERY,
|
|
65
103
|
)
|
|
66
104
|
)
|
|
67
105
|
|
|
68
|
-
|
|
106
|
+
if current_id:
|
|
107
|
+
yield SingleConversationTree(current_id, history)
|
datamaestro_text/data/ir/base.py
CHANGED
|
@@ -58,7 +58,7 @@ class NFCorpusDocument(TextItem):
|
|
|
58
58
|
|
|
59
59
|
@cached_property
|
|
60
60
|
def text(self):
|
|
61
|
-
return self.abstract
|
|
61
|
+
return f"{self.title} {self.abstract}"
|
|
62
62
|
|
|
63
63
|
|
|
64
64
|
@define
|
|
@@ -99,8 +99,8 @@ class WapoDocument(TextItem):
|
|
|
99
99
|
body_media: Tuple[WapoDocMedia, ...]
|
|
100
100
|
|
|
101
101
|
@cached_property
|
|
102
|
-
def text(self):
|
|
103
|
-
return self.
|
|
102
|
+
def text(self):
|
|
103
|
+
return f"{self.title} {self.body_paras_html}"
|
|
104
104
|
|
|
105
105
|
|
|
106
106
|
@define
|
|
@@ -126,13 +126,24 @@ class OrConvQADocument(TextItem):
|
|
|
126
126
|
def text(self):
|
|
127
127
|
return f"{self.title} {self.body}"
|
|
128
128
|
|
|
129
|
+
|
|
129
130
|
@define
|
|
131
|
+
class DprW100Doc(TextItem):
|
|
132
|
+
text: str
|
|
133
|
+
title: str
|
|
134
|
+
|
|
135
|
+
@define
|
|
136
|
+
class MsMarcoV2Passage(TextItem):
|
|
137
|
+
text: str
|
|
138
|
+
spans: Tuple[Tuple[int, int], ...]
|
|
139
|
+
msmarco_document_id: str
|
|
130
140
|
class Touche2020(TextItem):
|
|
131
141
|
text: str
|
|
132
142
|
title: str
|
|
133
143
|
stance: str
|
|
134
144
|
url: str
|
|
135
145
|
|
|
146
|
+
|
|
136
147
|
@define
|
|
137
148
|
class SciDocs(TextItem):
|
|
138
149
|
text: str
|
|
@@ -175,7 +186,8 @@ class TrecMb14Query(TextItem):
|
|
|
175
186
|
def get_text(self):
|
|
176
187
|
return f"{self.query}"
|
|
177
188
|
|
|
178
|
-
|
|
189
|
+
|
|
190
|
+
@define
|
|
179
191
|
class SciDocsTopic(TextItem):
|
|
180
192
|
text: str
|
|
181
193
|
authors: List[str]
|
|
@@ -183,6 +195,7 @@ class SciDocsTopic(TextItem):
|
|
|
183
195
|
cited_by: List[str]
|
|
184
196
|
references: List[str]
|
|
185
197
|
|
|
198
|
+
|
|
186
199
|
@define()
|
|
187
200
|
class TrecTopic(SimpleTextItem):
|
|
188
201
|
description: str
|
|
@@ -190,3 +203,17 @@ class TrecTopic(SimpleTextItem):
|
|
|
190
203
|
|
|
191
204
|
|
|
192
205
|
TrecTopicRecord = record_type(IDItem, TrecTopic)
|
|
206
|
+
|
|
207
|
+
@define
|
|
208
|
+
class DprW100Query(TextItem):
|
|
209
|
+
text: str
|
|
210
|
+
answers: Tuple[str]
|
|
211
|
+
|
|
212
|
+
@define
|
|
213
|
+
class TrecBackgroundLinkingQuery(IDItem):
|
|
214
|
+
query_id: str
|
|
215
|
+
doc_id: str
|
|
216
|
+
url: str
|
|
217
|
+
|
|
218
|
+
def get_text(self):
|
|
219
|
+
raise NotImplementedError()
|
|
@@ -37,6 +37,7 @@ from datamaestro_text.data.ir.base import (
|
|
|
37
37
|
SimpleAdhocAssessment,
|
|
38
38
|
SimpleTextItem,
|
|
39
39
|
TopicRecord,
|
|
40
|
+
UrlItem,
|
|
40
41
|
create_record,
|
|
41
42
|
)
|
|
42
43
|
|
|
@@ -165,6 +166,19 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
165
166
|
"source",
|
|
166
167
|
"source_content_type",
|
|
167
168
|
),
|
|
169
|
+
_irds.dpr_w100.DprW100Doc: tuple_constructor(
|
|
170
|
+
formats.DprW100Doc,
|
|
171
|
+
"doc_id",
|
|
172
|
+
"text",
|
|
173
|
+
"title",
|
|
174
|
+
),
|
|
175
|
+
_irds.msmarco_passage_v2.MsMarcoV2Passage: tuple_constructor(
|
|
176
|
+
formats.MsMarcoV2Passage,
|
|
177
|
+
"doc_id",
|
|
178
|
+
"text",
|
|
179
|
+
"spans",
|
|
180
|
+
"msmarco_document_id",
|
|
181
|
+
),
|
|
168
182
|
}
|
|
169
183
|
|
|
170
184
|
"""Wraps an ir datasets collection -- and provide a default text
|
|
@@ -385,6 +399,12 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
385
399
|
"tweet_time",
|
|
386
400
|
"description",
|
|
387
401
|
),
|
|
402
|
+
_irds.dpr_w100.DprW100Query: tuple_constructor(
|
|
403
|
+
formats.DprW100Query,
|
|
404
|
+
"query_id",
|
|
405
|
+
"text",
|
|
406
|
+
"answers"
|
|
407
|
+
),
|
|
388
408
|
}
|
|
389
409
|
|
|
390
410
|
HANDLERS = {
|
|
@@ -415,7 +435,52 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
415
435
|
def iter(self) -> Iterator[TopicRecord]:
|
|
416
436
|
"""Returns an iterator over topics"""
|
|
417
437
|
return self.handler.iter()
|
|
438
|
+
|
|
439
|
+
class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
|
|
440
|
+
def __init__(self, dataset):
|
|
441
|
+
self.dataset = dataset
|
|
442
|
+
|
|
443
|
+
@cached_property
|
|
444
|
+
def ext2records(self):
|
|
445
|
+
return {record[IDItem].id: record for record in self.records}
|
|
446
|
+
|
|
447
|
+
def topic_int(self, internal_topic_id: int) -> TopicRecord:
|
|
448
|
+
"""Returns a document given its internal ID"""
|
|
449
|
+
return self.records[internal_topic_id]
|
|
450
|
+
|
|
451
|
+
def topic_ext(self, external_topic_id: str) -> TopicRecord:
|
|
452
|
+
"""Returns a document given its external ID"""
|
|
453
|
+
return self.ext2records[external_topic_id]
|
|
418
454
|
|
|
455
|
+
def iter(self) -> Iterator[ir.TopicRecord]:
|
|
456
|
+
"""Returns an iterator over topics"""
|
|
457
|
+
return iter(self.records)
|
|
458
|
+
|
|
459
|
+
@cached_property
|
|
460
|
+
def records(self):
|
|
461
|
+
try:
|
|
462
|
+
records = []
|
|
463
|
+
|
|
464
|
+
for query in self.dataset.dataset.queries_iter():
|
|
465
|
+
topic = Record(
|
|
466
|
+
IDItem(query.query_id),
|
|
467
|
+
# Following BEIR documentation, we use title of documents as queries: https://github.com/beir-cellar/beir/blob/main/examples/dataset/README.md#queries-and-qrels
|
|
468
|
+
SimpleTextItem(self.dataset.dataset.docs_store().get(query.doc_id).title),
|
|
469
|
+
UrlItem(query.url),
|
|
470
|
+
)
|
|
471
|
+
records.append(topic)
|
|
472
|
+
except Exception:
|
|
473
|
+
logging.exception("Error while computing topic records")
|
|
474
|
+
raise
|
|
475
|
+
|
|
476
|
+
return records
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
Topics.HANDLERS.update(
|
|
480
|
+
{
|
|
481
|
+
_irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler
|
|
482
|
+
}
|
|
483
|
+
)
|
|
419
484
|
|
|
420
485
|
class CastTopicsHandler(TopicsHandler):
|
|
421
486
|
def __init__(self, dataset):
|
|
@@ -65,7 +65,7 @@ class QrelsDataset(Dataset):
|
|
|
65
65
|
return True
|
|
66
66
|
|
|
67
67
|
def _prepare(self, download=False) -> Documents:
|
|
68
|
-
return AdhocAssessments(id=self.fullid)
|
|
68
|
+
return AdhocAssessments.C(id=self.fullid)
|
|
69
69
|
|
|
70
70
|
|
|
71
71
|
class QueriesDataset(Dataset):
|
|
@@ -78,7 +78,7 @@ class QueriesDataset(Dataset):
|
|
|
78
78
|
return True
|
|
79
79
|
|
|
80
80
|
def _prepare(self, download=False) -> Documents:
|
|
81
|
-
return Topics(id=self.fullid)
|
|
81
|
+
return Topics.C(id=self.fullid)
|
|
82
82
|
|
|
83
83
|
|
|
84
84
|
# class ScoredDocuments(Dataset):
|
|
@@ -96,7 +96,7 @@ class DocumentsDataset(Dataset):
|
|
|
96
96
|
return True
|
|
97
97
|
|
|
98
98
|
def _prepare(self, download=False) -> Documents:
|
|
99
|
-
return Documents(id=self.fullid)
|
|
99
|
+
return Documents.C(id=self.fullid)
|
|
100
100
|
|
|
101
101
|
|
|
102
102
|
class TrainingTripletsDataset(Dataset):
|
|
@@ -116,10 +116,6 @@ class AdhocRunDataset(Dataset):
|
|
|
116
116
|
def _prepare(self, download=False) -> AdhocRun:
|
|
117
117
|
return AdhocRun(id=self.fullid)
|
|
118
118
|
|
|
119
|
-
@property
|
|
120
|
-
def configtype(self):
|
|
121
|
-
return AdhocRun
|
|
122
|
-
|
|
123
119
|
|
|
124
120
|
class Collection(Dataset):
|
|
125
121
|
base = Adhoc
|
|
@@ -127,7 +123,7 @@ class Collection(Dataset):
|
|
|
127
123
|
topics: QueriesDataset
|
|
128
124
|
|
|
129
125
|
def _prepare(self, download=False) -> Documents:
|
|
130
|
-
return Adhoc(
|
|
126
|
+
return Adhoc.C(
|
|
131
127
|
id=self.fullid,
|
|
132
128
|
topics=self.topics.prepare(download),
|
|
133
129
|
assessments=self.assessments.prepare(download),
|
datamaestro_text/utils/iter.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Callable, TypeVar, Iterator, List, Union
|
|
1
|
+
from typing import Callable, Sequence, TypeVar, Iterator, List, Union
|
|
2
2
|
|
|
3
3
|
T = TypeVar("T")
|
|
4
4
|
|
|
@@ -45,7 +45,7 @@ class RangeView:
|
|
|
45
45
|
return RangeView(self.source, key)
|
|
46
46
|
|
|
47
47
|
|
|
48
|
-
class LazyList:
|
|
48
|
+
class LazyList(Sequence):
|
|
49
49
|
"""Iterable-based list
|
|
50
50
|
|
|
51
51
|
The list is only materialized if needed"""
|
|
@@ -63,6 +63,9 @@ class LazyList:
|
|
|
63
63
|
else:
|
|
64
64
|
return iter(self.materialized_list)
|
|
65
65
|
|
|
66
|
+
def __len__(self):
|
|
67
|
+
return len(self.iterable)
|
|
68
|
+
|
|
66
69
|
def __getitem__(self, index):
|
|
67
70
|
# Materialize the list if accessing an index above the threshold or any slice
|
|
68
71
|
if isinstance(index, slice) or index >= self.materialize_threshold:
|
datamaestro_text/version.py
CHANGED
|
@@ -17,5 +17,5 @@ __version__: str
|
|
|
17
17
|
__version_tuple__: VERSION_TUPLE
|
|
18
18
|
version_tuple: VERSION_TUPLE
|
|
19
19
|
|
|
20
|
-
__version__ = version = '2025.
|
|
21
|
-
__version_tuple__ = version_tuple = (2025,
|
|
20
|
+
__version__ = version = '2025.6.11'
|
|
21
|
+
__version_tuple__ = version_tuple = (2025, 6, 11)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version: 2025.
|
|
3
|
+
Version: 2025.6.11
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
6
|
License: GPL-3
|
|
@@ -18,9 +18,15 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
18
18
|
Requires-Python: >=3.8
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
|
-
Requires-Dist: datamaestro>=1.2
|
|
21
|
+
Requires-Dist: datamaestro>=1.4.2
|
|
22
22
|
Requires-Dist: ir_datasets>=0.5.8
|
|
23
23
|
Requires-Dist: attrs
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest; extra == "dev"
|
|
26
|
+
Requires-Dist: docutils; extra == "dev"
|
|
27
|
+
Requires-Dist: sphobjinv; extra == "dev"
|
|
28
|
+
Requires-Dist: flake8; extra == "dev"
|
|
29
|
+
Requires-Dist: sphinx; extra == "dev"
|
|
24
30
|
Dynamic: license-file
|
|
25
31
|
|
|
26
32
|
[](https://github.com/pre-commit/pre-commit) [](https://badge.fury.io/py/datamaestro-text)
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
|
|
2
|
-
datamaestro_text/version.py,sha256=
|
|
2
|
+
datamaestro_text/version.py,sha256=M2FaCl2nFdTTxwIhOsBo1SQ_3ytid7NHYp2QLIimPXY,519
|
|
3
3
|
datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
|
|
5
5
|
datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
|
|
6
|
-
datamaestro_text/config/com/sentiment140.py,sha256=
|
|
6
|
+
datamaestro_text/config/com/sentiment140.py,sha256=3cZfqs395gY14gGojRC_RTYEdG5l8RUTSdF9li8ynBc,1283
|
|
7
7
|
datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
|
|
8
|
-
datamaestro_text/config/com/github/aagohary/canard.py,sha256=
|
|
9
|
-
datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256
|
|
10
|
-
datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=
|
|
8
|
+
datamaestro_text/config/com/github/aagohary/canard.py,sha256=DREIKU2FFrfwibjU48Y1AsKTUzhwxmD4Kuykd4bDIus,1762
|
|
9
|
+
datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=-iHKJocWZ8N9N-P8E45y4ewg3OT_23XonlDh5-NcH2g,3055
|
|
10
|
+
datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=MFJYaxJoqJQ6kMDHa2PIE0zoxYTA8Kyl26-vzFoMML0,3032
|
|
11
11
|
datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
|
|
12
12
|
datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
|
|
13
13
|
datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=tVldwVh0pQpXXifqcMNEN9cLO1HXkXoEhToSazSx5RE,11643
|
|
@@ -15,7 +15,7 @@ datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=hCFjZg9t1AHET05qU31u
|
|
|
15
15
|
datamaestro_text/config/edu/cornell/nlvr.yaml,sha256=9Yk5VZMncSmrP7JNuGXqExksgX5nQ_Zfnlps8hWze3Q,921
|
|
16
16
|
datamaestro_text/config/edu/stanford/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
datamaestro_text/config/edu/stanford/aclimdb.py,sha256=lsPDxnp_rWOCpBte6pZ0_LVaC33w5mmgfGh51rcTgt8,643
|
|
18
|
-
datamaestro_text/config/edu/stanford/glove.py,sha256=
|
|
18
|
+
datamaestro_text/config/edu/stanford/glove.py,sha256=bXxwiJqT8alPs5MwwxVuY0xBO8g1QWhm2L3AKVskTlA,2391
|
|
19
19
|
datamaestro_text/config/edu/stanford/im2p.yaml,sha256=JoToNyEPpmwdyLFedCBot5ypyw7p9rzi12mGXJuZin0,2909
|
|
20
20
|
datamaestro_text/config/edu/upenn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
21
|
datamaestro_text/config/edu/upenn/ldc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -46,23 +46,23 @@ datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG
|
|
|
46
46
|
datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
|
|
47
47
|
datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
|
|
48
48
|
datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
|
|
49
|
-
datamaestro_text/data/conversation/base.py,sha256=
|
|
50
|
-
datamaestro_text/data/conversation/canard.py,sha256=
|
|
51
|
-
datamaestro_text/data/conversation/orconvqa.py,sha256=
|
|
49
|
+
datamaestro_text/data/conversation/base.py,sha256=PUVRCSMBlV9bSayBl-vnzsYvyr2Tdv_zTadIC_Tswe0,6508
|
|
50
|
+
datamaestro_text/data/conversation/canard.py,sha256=FI02rPNuzSsA_uXOcgcuQn31d3QahmHndzYCV1Zm8rk,3305
|
|
51
|
+
datamaestro_text/data/conversation/orconvqa.py,sha256=RL9cpK7QK2dsk_j2e4Th8wzGFq-G3ww_EjdM1eKh-8o,3764
|
|
52
52
|
datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
|
|
53
53
|
datamaestro_text/data/ir/__init__.py,sha256=EOVnRv9oKEEucMOgrFjhXxeUWEkaJkbXBndpkKhZaiY,8701
|
|
54
|
-
datamaestro_text/data/ir/base.py,sha256=
|
|
54
|
+
datamaestro_text/data/ir/base.py,sha256=TSaY8UWoixqvKg1z5JfkXPnuZiVoChYEUvc7jjbpZqY,1495
|
|
55
55
|
datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
|
|
56
56
|
datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
|
|
57
57
|
datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
|
|
58
|
-
datamaestro_text/data/ir/formats.py,sha256=
|
|
58
|
+
datamaestro_text/data/ir/formats.py,sha256=K0mqmCGg0seneKo-Rt3cBfDVjEMS4_6t1MfNj4iW9Y4,3637
|
|
59
59
|
datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
|
|
60
60
|
datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
|
|
61
61
|
datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
|
|
62
62
|
datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
|
|
63
63
|
datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
|
|
64
|
-
datamaestro_text/datasets/irds/data.py,sha256=
|
|
65
|
-
datamaestro_text/datasets/irds/datasets.py,sha256=
|
|
64
|
+
datamaestro_text/datasets/irds/data.py,sha256=0V5nPMQeh1I3Sp36x_NE7xrbC_vbZAKhU5NONj95aok,22058
|
|
65
|
+
datamaestro_text/datasets/irds/datasets.py,sha256=n5mvD2omVI4PkeuJgX13Cio6gLrgYjxq8ZUKKyqocEs,5625
|
|
66
66
|
datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
|
|
67
67
|
datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
|
|
68
68
|
datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
|
|
@@ -75,12 +75,12 @@ datamaestro_text/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
|
|
|
75
75
|
datamaestro_text/transforms/ir/__init__.py,sha256=eWxr0ZtfK9F9J4TachW5V8Ra5AhQJueDyOGR8vXi1uA,6553
|
|
76
76
|
datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_p5ha0,3340
|
|
77
77
|
datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1vRQ,170
|
|
78
|
-
datamaestro_text/utils/iter.py,sha256=
|
|
78
|
+
datamaestro_text/utils/iter.py,sha256=uzBmavBeqVyEvYmi9Ds2dqh0ywhJE-1SBO5v8AoIKcc,2537
|
|
79
79
|
datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
|
|
80
80
|
datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
|
|
81
|
-
datamaestro_text-2025.
|
|
82
|
-
datamaestro_text-2025.
|
|
83
|
-
datamaestro_text-2025.
|
|
84
|
-
datamaestro_text-2025.
|
|
85
|
-
datamaestro_text-2025.
|
|
86
|
-
datamaestro_text-2025.
|
|
81
|
+
datamaestro_text-2025.6.11.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
82
|
+
datamaestro_text-2025.6.11.dist-info/METADATA,sha256=oMXEU9_JPRCrUCoSE6IGmmyLzfdpVQgKHAaaEbFXMXY,1847
|
|
83
|
+
datamaestro_text-2025.6.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
84
|
+
datamaestro_text-2025.6.11.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
|
|
85
|
+
datamaestro_text-2025.6.11.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
|
|
86
|
+
datamaestro_text-2025.6.11.dist-info/RECORD,,
|
{datamaestro_text-2025.4.3.dist-info → datamaestro_text-2025.6.11.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{datamaestro_text-2025.4.3.dist-info → datamaestro_text-2025.6.11.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|