datamaestro-text 2024.3.10__py3-none-any.whl → 2025.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/config/com/github/apple/ml-qrecc.py +87 -0
- datamaestro_text/config/com/github/prdwb/orconvqa.py +4 -7
- datamaestro_text/data/conversation/base.py +76 -10
- datamaestro_text/data/conversation/orconvqa.py +12 -2
- datamaestro_text/data/conversation/qrecc.py +99 -0
- datamaestro_text/data/ir/__init__.py +3 -2
- datamaestro_text/data/ir/formats.py +20 -5
- datamaestro_text/data/ir/stores.py +13 -6
- datamaestro_text/datasets/irds/data.py +237 -124
- datamaestro_text/datasets/irds/helpers.py +58 -2
- datamaestro_text/version.py +2 -2
- {datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/METADATA +3 -3
- {datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/RECORD +17 -15
- {datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/WHEEL +1 -1
- {datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/LICENSE +0 -0
- {datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import json
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from datamaestro.definitions import datatasks, datatags, dataset
|
|
7
|
+
from datamaestro.data.ml import Supervised
|
|
8
|
+
from datamaestro.download import reference
|
|
9
|
+
from datamaestro.download.archive import zipdownloader
|
|
10
|
+
from datamaestro.download.wayback import wayback_documents
|
|
11
|
+
from datamaestro.utils import HashCheck
|
|
12
|
+
from datamaestro_text.data.conversation.qrecc import QReCCDataset
|
|
13
|
+
from datamaestro_text.datasets.irds.data import (
|
|
14
|
+
LZ4JSONLDocumentStore,
|
|
15
|
+
SimpleJsonDocument,
|
|
16
|
+
)
|
|
17
|
+
from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@datatags("conversation", "context", "query")
|
|
21
|
+
@datatasks("query rewriting")
|
|
22
|
+
@zipdownloader(
|
|
23
|
+
"data",
|
|
24
|
+
"https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
|
|
25
|
+
checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
|
|
26
|
+
)
|
|
27
|
+
@dataset(
|
|
28
|
+
Supervised[QReCCDataset, None, QReCCDataset],
|
|
29
|
+
url="https://github.com/apple/ml-qrecc",
|
|
30
|
+
doi="https://doi.org/10.48550/arXiv.2010.04898",
|
|
31
|
+
id="",
|
|
32
|
+
)
|
|
33
|
+
def main(data: Path):
|
|
34
|
+
"""Open-Domain Question Answering Goes Conversational via Question Rewriting
|
|
35
|
+
|
|
36
|
+
We introduce QReCC (Question Rewriting in Conversational Context), an
|
|
37
|
+
end-to-end open-domain question answering dataset comprising 14K
|
|
38
|
+
conversations with 81K question-answer pairs. The goal of this dataset is to
|
|
39
|
+
provide a challenging benchmark for end-to-end conversational question
|
|
40
|
+
answering that includes the individual subtasks of question rewriting,
|
|
41
|
+
passage retrieval and reading comprehension.
|
|
42
|
+
"""
|
|
43
|
+
return {
|
|
44
|
+
"train": QReCCDataset(path=data / "qrecc_train.json"),
|
|
45
|
+
"test": QReCCDataset(path=data / "qrecc_test.json"),
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataset(
|
|
50
|
+
url="https://github.com/apple/ml-qrecc",
|
|
51
|
+
doi="https://doi.org/10.48550/arXiv.2010.04898",
|
|
52
|
+
)
|
|
53
|
+
class Content(LZ4JSONLDocumentStore):
|
|
54
|
+
"""Content of the URLs mentioned in QReCC"""
|
|
55
|
+
|
|
56
|
+
@staticmethod
|
|
57
|
+
def __create_dataset__(dataset, options=None):
|
|
58
|
+
ds = reference(reference=main).setup(dataset, options)
|
|
59
|
+
documents_path = wayback_documents(
|
|
60
|
+
"20191127", lambda: Content._urls(ds), name="wayback.jsonl"
|
|
61
|
+
).setup(dataset, options)
|
|
62
|
+
|
|
63
|
+
store_path = lz4docstore_builder(
|
|
64
|
+
"store",
|
|
65
|
+
lambda: Content._documents(documents_path),
|
|
66
|
+
SimpleJsonDocument,
|
|
67
|
+
"id",
|
|
68
|
+
).setup(dataset, options)
|
|
69
|
+
|
|
70
|
+
return LZ4JSONLDocumentStore(jsonl_path=store_path)
|
|
71
|
+
|
|
72
|
+
@staticmethod
|
|
73
|
+
def _documents(path: Path):
|
|
74
|
+
"""Iterates over documents from wayback"""
|
|
75
|
+
with path.open("rt") as fp:
|
|
76
|
+
for line in fp:
|
|
77
|
+
yield SimpleJsonDocument(**json.loads(line))
|
|
78
|
+
|
|
79
|
+
@staticmethod
|
|
80
|
+
def _urls(supervised: Supervised[QReCCDataset, None, QReCCDataset]):
|
|
81
|
+
urls = set()
|
|
82
|
+
for ds in [supervised.train, supervised.test]:
|
|
83
|
+
for entry in ds.entries():
|
|
84
|
+
if entry.answer_url:
|
|
85
|
+
url = re.sub("#.*$", "", entry.answer_url)
|
|
86
|
+
urls.add(url)
|
|
87
|
+
return urls
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
# See documentation on https://datamaestro.readthedocs.io
|
|
2
2
|
|
|
3
|
-
from collections import namedtuple
|
|
4
3
|
import gzip
|
|
5
4
|
import json
|
|
6
5
|
from pathlib import Path
|
|
7
|
-
from typing import Iterator
|
|
8
|
-
import attrs
|
|
6
|
+
from typing import Iterator
|
|
9
7
|
from datamaestro.definitions import datatasks, datatags, dataset
|
|
10
8
|
from datamaestro.download.single import filedownloader
|
|
11
9
|
from datamaestro.utils import HashCheck
|
|
@@ -14,10 +12,7 @@ from datamaestro.utils import HashCheck
|
|
|
14
12
|
from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
|
|
15
13
|
from datamaestro.data.ml import Supervised
|
|
16
14
|
|
|
17
|
-
from datamaestro_text.data.ir import DocumentStore
|
|
18
|
-
from datamaestro_text.data.ir.formats import OrConvQADocument
|
|
19
15
|
from datamaestro_text.data.ir.stores import OrConvQADocumentStore
|
|
20
|
-
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
|
|
21
16
|
from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader
|
|
22
17
|
|
|
23
18
|
|
|
@@ -63,7 +58,9 @@ def preprocessed(train, dev, test):
|
|
|
63
58
|
def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
|
|
64
59
|
with gzip.open(source, "rt") as fp:
|
|
65
60
|
for line in fp:
|
|
66
|
-
|
|
61
|
+
data = json.loads(line)
|
|
62
|
+
data["body"] = data.pop("text")
|
|
63
|
+
yield OrConvQADocumentStore.NAMED_TUPLE(**data)
|
|
67
64
|
|
|
68
65
|
|
|
69
66
|
@lz4docstore_downloader(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from enum import Enum
|
|
3
|
-
from typing import Dict, Generic, Iterator, List, Optional, Sequence
|
|
3
|
+
from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
|
|
4
4
|
from attr import define
|
|
5
5
|
from datamaestro.data import Base
|
|
6
6
|
from datamaestro.record import Record, Item
|
|
@@ -61,6 +61,20 @@ class AnswerEntry(Item):
|
|
|
61
61
|
"""The system answer"""
|
|
62
62
|
|
|
63
63
|
|
|
64
|
+
@define
|
|
65
|
+
class AnswerDocumentID(Item):
|
|
66
|
+
"""An answer as a document ID"""
|
|
67
|
+
|
|
68
|
+
document_id: str
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@define
|
|
72
|
+
class AnswerDocumentURL(Item):
|
|
73
|
+
"""An answer as a document URL"""
|
|
74
|
+
|
|
75
|
+
url: str
|
|
76
|
+
|
|
77
|
+
|
|
64
78
|
@define
|
|
65
79
|
class RetrievedEntry(Item):
|
|
66
80
|
"""List of system-retrieved documents and their relevance"""
|
|
@@ -68,8 +82,8 @@ class RetrievedEntry(Item):
|
|
|
68
82
|
documents: List[str]
|
|
69
83
|
"""List of retrieved documents"""
|
|
70
84
|
|
|
71
|
-
|
|
72
|
-
"""List of
|
|
85
|
+
relevant_documents: Optional[Dict[int, Tuple[Optional[int], Optional[int]]]] = None
|
|
86
|
+
"""Relevant documents (optional): maps the rank of each relevant document to a (start, stop) position, either of which may be None"""
|
|
73
87
|
|
|
74
88
|
|
|
75
89
|
@define
|
|
@@ -95,56 +109,99 @@ class ConversationHistoryItem(Item):
|
|
|
95
109
|
|
|
96
110
|
|
|
97
111
|
class ConversationNode:
|
|
112
|
+
@abstractmethod
|
|
98
113
|
def entry(self) -> Record:
|
|
99
114
|
"""The current conversation entry"""
|
|
100
115
|
...
|
|
101
116
|
|
|
117
|
+
@abstractmethod
|
|
102
118
|
def history(self) -> ConversationHistory:
|
|
103
119
|
"""Preceding conversation entries, from most recent to more ancient"""
|
|
104
120
|
...
|
|
105
121
|
|
|
122
|
+
@abstractmethod
|
|
123
|
+
def parent(self) -> Optional["ConversationNode"]:
|
|
124
|
+
...
|
|
125
|
+
|
|
126
|
+
@abstractmethod
|
|
127
|
+
def children(self) -> List["ConversationNode"]:
|
|
128
|
+
...
|
|
129
|
+
|
|
106
130
|
|
|
107
|
-
class ConversationTree:
|
|
131
|
+
class ConversationTree(ABC):
|
|
132
|
+
@abstractmethod
|
|
133
|
+
def root(self) -> ConversationNode:
|
|
134
|
+
...
|
|
135
|
+
|
|
136
|
+
@abstractmethod
|
|
108
137
|
def __iter__(self) -> Iterator[ConversationNode]:
|
|
109
138
|
"""Iterates over conversation nodes"""
|
|
110
|
-
|
|
139
|
+
...
|
|
111
140
|
|
|
112
141
|
|
|
113
142
|
# ---- A conversation tree
|
|
114
143
|
|
|
115
144
|
|
|
116
|
-
class SingleConversationTree(ConversationTree):
|
|
145
|
+
class SingleConversationTree(ConversationTree, ABC):
|
|
117
146
|
"""Simple conversations, based on a sequence of entries"""
|
|
118
147
|
|
|
119
148
|
id: str
|
|
120
|
-
history:
|
|
149
|
+
history: List[Record]
|
|
121
150
|
|
|
122
151
|
def __init__(self, id: Optional[str], history: List[Record]):
|
|
123
152
|
"""Create a simple conversation
|
|
124
153
|
|
|
125
|
-
:param history: The entries, in reverse order (i.e. more ancient first)
|
|
154
|
+
:param history: The entries, in **reverse** chronological order (i.e. most recent first)
|
|
126
155
|
"""
|
|
127
156
|
self.history = history or []
|
|
157
|
+
self.id = id
|
|
128
158
|
|
|
129
159
|
def add(self, entry: Record):
|
|
130
160
|
self.history.insert(0, entry)
|
|
131
161
|
|
|
132
162
|
def __iter__(self) -> Iterator[ConversationNode]:
|
|
133
|
-
|
|
163
|
+
"""Iterates over the conversation (starting with the beginning)"""
|
|
164
|
+
for ix in reversed(range(len(self.history))):
|
|
134
165
|
yield SingleConversationTreeNode(self, ix)
|
|
135
166
|
|
|
167
|
+
def root(self):
|
|
168
|
+
return SingleConversationTreeNode(self, len(self.history) - 1)
|
|
169
|
+
|
|
136
170
|
|
|
137
171
|
@define
|
|
138
172
|
class SingleConversationTreeNode(ConversationNode):
|
|
139
173
|
tree: SingleConversationTree
|
|
140
174
|
index: int
|
|
141
175
|
|
|
176
|
+
@property
|
|
142
177
|
def entry(self) -> Record:
|
|
143
178
|
return self.tree.history[self.index]
|
|
144
179
|
|
|
180
|
+
@entry.setter
|
|
181
|
+
def entry(self, record: Record):
|
|
182
|
+
try:
|
|
183
|
+
self.tree.history[self.index] = record
|
|
184
|
+
except Exception as e:
|
|
185
|
+
print(e)
|
|
186
|
+
raise
|
|
187
|
+
|
|
145
188
|
def history(self) -> Sequence[Record]:
|
|
146
189
|
return self.tree.history[self.index + 1 :]
|
|
147
190
|
|
|
191
|
+
def parent(self) -> Optional[ConversationNode]:
|
|
192
|
+
return (
|
|
193
|
+
SingleConversationTreeNode(self.tree, self.index + 1)
|
|
194
|
+
if self.index < len(self.tree.history) - 1
|
|
195
|
+
else []
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
def children(self) -> List[ConversationNode]:
|
|
199
|
+
return (
|
|
200
|
+
[SingleConversationTreeNode(self.tree, self.index - 1)]
|
|
201
|
+
if self.index > 0
|
|
202
|
+
else []
|
|
203
|
+
)
|
|
204
|
+
|
|
148
205
|
|
|
149
206
|
class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
150
207
|
"""A conversation tree node"""
|
|
@@ -178,6 +235,15 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
178
235
|
for child in self.children:
|
|
179
236
|
yield from child
|
|
180
237
|
|
|
238
|
+
def parent(self) -> Optional[ConversationNode]:
|
|
239
|
+
return self.parent
|
|
240
|
+
|
|
241
|
+
def children(self) -> List[ConversationNode]:
|
|
242
|
+
return self.children
|
|
243
|
+
|
|
244
|
+
def root(self):
|
|
245
|
+
return self
|
|
246
|
+
|
|
181
247
|
|
|
182
248
|
class ConversationDataset(Base, ABC):
|
|
183
249
|
"""A dataset made of conversations"""
|
|
@@ -186,4 +252,4 @@ class ConversationDataset(Base, ABC):
|
|
|
186
252
|
def __iter__(self) -> Iterator[ConversationTree]:
|
|
187
253
|
"""Return an iterator over conversations"""
|
|
188
254
|
for i in range(len(self)):
|
|
189
|
-
|
|
255
|
+
yield self.get(i)
|
|
@@ -102,16 +102,26 @@ class OrConvQADataset(ConversationDataset, File):
|
|
|
102
102
|
# Add to current
|
|
103
103
|
history.append(
|
|
104
104
|
Record(
|
|
105
|
-
IDItem(
|
|
105
|
+
IDItem(entry.query_id),
|
|
106
106
|
SimpleTextItem(entry.query),
|
|
107
107
|
SimpleDecontextualizedItem(entry.rewrite),
|
|
108
108
|
EntryType.USER_QUERY,
|
|
109
109
|
)
|
|
110
110
|
)
|
|
111
|
+
|
|
112
|
+
relevances = {}
|
|
113
|
+
for rank, relevance in enumerate(entry.retrieval_labels):
|
|
114
|
+
if relevance > 0:
|
|
115
|
+
relevances[rank] = (entry.answer.answer_start, None)
|
|
116
|
+
|
|
117
|
+
assert (
|
|
118
|
+
len(relevances) <= 1
|
|
119
|
+
), f"Too many relevance labels ({len(relevances)}) for {entry.query_id}"
|
|
120
|
+
|
|
111
121
|
history.append(
|
|
112
122
|
Record(
|
|
113
123
|
AnswerEntry(entry.answer.text),
|
|
114
|
-
RetrievedEntry(entry.evidences,
|
|
124
|
+
RetrievedEntry(entry.evidences, relevances),
|
|
115
125
|
EntryType.SYSTEM_ANSWER,
|
|
116
126
|
)
|
|
117
127
|
)
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from typing import Iterator, List, Optional
|
|
2
|
+
from attr import define
|
|
3
|
+
import json
|
|
4
|
+
from datamaestro.data import File
|
|
5
|
+
from datamaestro.record import Record
|
|
6
|
+
|
|
7
|
+
from datamaestro_text.data.ir.base import (
|
|
8
|
+
IDItem,
|
|
9
|
+
SimpleTextItem,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
from .base import (
|
|
14
|
+
AnswerDocumentURL,
|
|
15
|
+
AnswerEntry,
|
|
16
|
+
ConversationTree,
|
|
17
|
+
EntryType,
|
|
18
|
+
SimpleDecontextualizedItem,
|
|
19
|
+
SingleConversationTree,
|
|
20
|
+
)
|
|
21
|
+
from . import ConversationDataset
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@define(kw_only=True)
|
|
25
|
+
class QReCCDatasetEntry:
|
|
26
|
+
"""A query with past history"""
|
|
27
|
+
|
|
28
|
+
conversation_no: int
|
|
29
|
+
"""Conversation ID"""
|
|
30
|
+
|
|
31
|
+
turn_no: int
|
|
32
|
+
"""The turn in the conversation"""
|
|
33
|
+
|
|
34
|
+
conversation_source: str
|
|
35
|
+
"""Conversation source"""
|
|
36
|
+
|
|
37
|
+
question: str
|
|
38
|
+
"""The last issued query"""
|
|
39
|
+
|
|
40
|
+
rewrite: str
|
|
41
|
+
"""Manually rewritten query"""
|
|
42
|
+
|
|
43
|
+
context: List[str]
|
|
44
|
+
"""The list of queries asked by the user"""
|
|
45
|
+
|
|
46
|
+
answer: str
|
|
47
|
+
"""The answer"""
|
|
48
|
+
|
|
49
|
+
answer_url: str
|
|
50
|
+
"""The URL containing the answer"""
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class QReCCDataset(ConversationDataset, File):
|
|
54
|
+
def entries(self) -> Iterator[QReCCDatasetEntry]:
|
|
55
|
+
"""Iterates over rewritten queries with their context"""
|
|
56
|
+
with self.path.open("rt") as fp:
|
|
57
|
+
data = json.load(fp)
|
|
58
|
+
|
|
59
|
+
data = [
|
|
60
|
+
QReCCDatasetEntry(**{key.lower(): value for key, value in entry.items()})
|
|
61
|
+
for entry in data
|
|
62
|
+
]
|
|
63
|
+
return iter(data)
|
|
64
|
+
|
|
65
|
+
def __iter__(self) -> Iterator[ConversationTree]:
|
|
66
|
+
history: List[Record] = []
|
|
67
|
+
current_id: Optional[str] = None
|
|
68
|
+
|
|
69
|
+
for entry in self.entries():
|
|
70
|
+
# Creates a new conversation if needed
|
|
71
|
+
if entry.conversation_no != current_id:
|
|
72
|
+
if current_id is not None:
|
|
73
|
+
history.reverse()
|
|
74
|
+
yield SingleConversationTree(current_id, history)
|
|
75
|
+
|
|
76
|
+
current_id = entry.conversation_no
|
|
77
|
+
history = []
|
|
78
|
+
|
|
79
|
+
# Add to current
|
|
80
|
+
history.append(
|
|
81
|
+
Record(
|
|
82
|
+
IDItem(f"{entry.conversation_no}#{entry.turn_no}"),
|
|
83
|
+
SimpleTextItem(entry.question),
|
|
84
|
+
AnswerDocumentURL(entry.answer_url),
|
|
85
|
+
SimpleDecontextualizedItem(entry.rewrite),
|
|
86
|
+
EntryType.USER_QUERY,
|
|
87
|
+
)
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
history.append(
|
|
91
|
+
Record(
|
|
92
|
+
AnswerEntry(entry.answer),
|
|
93
|
+
EntryType.SYSTEM_ANSWER,
|
|
94
|
+
)
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# Yields the last one
|
|
98
|
+
history.reverse()
|
|
99
|
+
yield SingleConversationTree(current_id, history)
|
|
@@ -25,6 +25,7 @@ from .base import ( # noqa: F401
|
|
|
25
25
|
create_record,
|
|
26
26
|
# Other things
|
|
27
27
|
AdhocAssessment,
|
|
28
|
+
AdhocAssessedTopic,
|
|
28
29
|
)
|
|
29
30
|
|
|
30
31
|
|
|
@@ -83,7 +84,7 @@ class DocumentStore(Documents):
|
|
|
83
84
|
def document_int(self, internal_docid: int) -> DocumentRecord:
|
|
84
85
|
"""Returns a document given its internal ID"""
|
|
85
86
|
docid = self.docid_internal2external(internal_docid)
|
|
86
|
-
return self.
|
|
87
|
+
return self.document_ext(docid)
|
|
87
88
|
|
|
88
89
|
def document_ext(self, docid: str) -> DocumentRecord:
|
|
89
90
|
"""Returns a document given its external ID"""
|
|
@@ -159,7 +160,7 @@ class TopicsStore(Topics):
|
|
|
159
160
|
class AdhocAssessments(Base, ABC):
|
|
160
161
|
"""Ad-hoc assessments (qrels)"""
|
|
161
162
|
|
|
162
|
-
def iter(self) -> Iterator[
|
|
163
|
+
def iter(self) -> Iterator[AdhocAssessedTopic]:
|
|
163
164
|
"""Returns an iterator over assessments"""
|
|
164
165
|
raise NotImplementedError(f"For class {self.__class__}")
|
|
165
166
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from functools import cached_property
|
|
2
|
-
from typing import ClassVar, Tuple
|
|
2
|
+
from typing import ClassVar, Tuple, List
|
|
3
3
|
from attrs import define
|
|
4
4
|
from datamaestro.record import record_type
|
|
5
5
|
from ir_datasets.datasets.wapo import WapoDocMedia
|
|
@@ -117,7 +117,6 @@ class TweetDoc(TextItem):
|
|
|
117
117
|
|
|
118
118
|
@define
|
|
119
119
|
class OrConvQADocument(TextItem):
|
|
120
|
-
id: str
|
|
121
120
|
title: str
|
|
122
121
|
body: str
|
|
123
122
|
aid: str
|
|
@@ -127,12 +126,21 @@ class OrConvQADocument(TextItem):
|
|
|
127
126
|
def text(self):
|
|
128
127
|
return f"{self.title} {self.body}"
|
|
129
128
|
|
|
129
|
+
@define
|
|
130
|
+
class Touche2020(TextItem):
|
|
131
|
+
text: str
|
|
132
|
+
title: str
|
|
133
|
+
stance: str
|
|
134
|
+
url: str
|
|
130
135
|
|
|
131
136
|
@define
|
|
132
|
-
class
|
|
137
|
+
class SciDocs(TextItem):
|
|
133
138
|
text: str
|
|
134
|
-
|
|
135
|
-
|
|
139
|
+
title: str
|
|
140
|
+
authors: List[str]
|
|
141
|
+
year: int
|
|
142
|
+
cited_by: List[str]
|
|
143
|
+
references: List[str]
|
|
136
144
|
|
|
137
145
|
|
|
138
146
|
@define
|
|
@@ -167,6 +175,13 @@ class TrecMb14Query(TextItem):
|
|
|
167
175
|
def get_text(self):
|
|
168
176
|
return f"{self.query}"
|
|
169
177
|
|
|
178
|
+
@define
|
|
179
|
+
class SciDocsTopic(TextItem):
|
|
180
|
+
text: str
|
|
181
|
+
authors: List[str]
|
|
182
|
+
year: int
|
|
183
|
+
cited_by: List[str]
|
|
184
|
+
references: List[str]
|
|
170
185
|
|
|
171
186
|
@define()
|
|
172
187
|
class TrecTopic(SimpleTextItem):
|
|
@@ -1,16 +1,21 @@
|
|
|
1
1
|
from collections import namedtuple
|
|
2
|
-
from typing import List
|
|
2
|
+
from typing import List, NamedTuple
|
|
3
3
|
from experimaestro import Constant
|
|
4
4
|
import attrs
|
|
5
5
|
|
|
6
|
+
from datamaestro.record import Record
|
|
7
|
+
from datamaestro_text.data.ir.base import IDItem
|
|
6
8
|
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
|
|
7
9
|
from datamaestro_text.data.ir.formats import OrConvQADocument
|
|
8
10
|
|
|
9
11
|
|
|
10
12
|
class OrConvQADocumentStore(LZ4DocumentStore):
|
|
11
|
-
NAMED_TUPLE
|
|
12
|
-
|
|
13
|
-
|
|
13
|
+
class NAMED_TUPLE(NamedTuple):
|
|
14
|
+
id: str
|
|
15
|
+
title: str
|
|
16
|
+
body: str
|
|
17
|
+
aid: str
|
|
18
|
+
bid: int
|
|
14
19
|
|
|
15
20
|
lookup_field: Constant[str] = "id"
|
|
16
21
|
fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
|
|
@@ -18,5 +23,7 @@ class OrConvQADocumentStore(LZ4DocumentStore):
|
|
|
18
23
|
|
|
19
24
|
data_cls = NAMED_TUPLE
|
|
20
25
|
|
|
21
|
-
def converter(self, data: NAMED_TUPLE) ->
|
|
22
|
-
|
|
26
|
+
def converter(self, data: NAMED_TUPLE) -> Record:
|
|
27
|
+
fields = data._asdict()
|
|
28
|
+
del fields["id"]
|
|
29
|
+
return Record(OrConvQADocument(**fields), IDItem(data.id))
|
|
@@ -1,35 +1,44 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from abc import ABC, abstractmethod
|
|
3
|
+
from dataclasses import dataclass
|
|
2
4
|
from functools import partial
|
|
3
|
-
import logging
|
|
4
5
|
from pathlib import Path
|
|
5
|
-
from typing import Iterator, Tuple, Type
|
|
6
|
+
from typing import Dict, Iterator, List, NamedTuple, Tuple, Type
|
|
7
|
+
|
|
6
8
|
import ir_datasets
|
|
7
|
-
|
|
9
|
+
import ir_datasets.datasets as _irds
|
|
10
|
+
from datamaestro.record import RecordType, record_type
|
|
11
|
+
from experimaestro import Config, Meta, Option, Param
|
|
12
|
+
from experimaestro.compat import cached_property
|
|
8
13
|
from ir_datasets.formats import (
|
|
9
14
|
GenericDoc,
|
|
10
|
-
GenericQuery,
|
|
11
15
|
GenericDocPair,
|
|
16
|
+
GenericQuery,
|
|
12
17
|
TrecParsedDoc,
|
|
13
18
|
TrecQuery,
|
|
14
19
|
)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
from experimaestro.compat import cached_property
|
|
18
|
-
from experimaestro import Option
|
|
19
|
-
from datamaestro.record import RecordType, record_type
|
|
20
|
+
from ir_datasets.indices import PickleLz4FullStore
|
|
21
|
+
|
|
20
22
|
import datamaestro_text.data.ir as ir
|
|
23
|
+
import datamaestro_text.data.ir.formats as formats
|
|
24
|
+
from datamaestro_text.data.conversation.base import (
|
|
25
|
+
AnswerDocumentID,
|
|
26
|
+
AnswerEntry,
|
|
27
|
+
ConversationHistoryItem,
|
|
28
|
+
ConversationTreeNode,
|
|
29
|
+
DecontextualizedDictItem,
|
|
30
|
+
EntryType,
|
|
31
|
+
)
|
|
21
32
|
from datamaestro_text.data.ir.base import (
|
|
22
|
-
Record,
|
|
23
|
-
TopicRecord,
|
|
24
|
-
DocumentRecord,
|
|
25
|
-
SimpleTextItem,
|
|
26
33
|
AdhocAssessedTopic,
|
|
27
|
-
|
|
34
|
+
DocumentRecord,
|
|
28
35
|
IDItem,
|
|
36
|
+
Record,
|
|
37
|
+
SimpleAdhocAssessment,
|
|
38
|
+
SimpleTextItem,
|
|
39
|
+
TopicRecord,
|
|
29
40
|
create_record,
|
|
30
41
|
)
|
|
31
|
-
import datamaestro_text.data.ir.formats as formats
|
|
32
|
-
|
|
33
42
|
|
|
34
43
|
# Interface between ir_datasets and datamaestro:
|
|
35
44
|
# provides adapted data types
|
|
@@ -108,6 +117,12 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
108
117
|
_irds.beir.BeirTitleUrlDoc: tuple_constructor(
|
|
109
118
|
formats.TitleUrlDocument, "doc_id", "text", "title", "url"
|
|
110
119
|
),
|
|
120
|
+
_irds.beir.BeirToucheDoc: tuple_constructor(
|
|
121
|
+
formats.Touche2020, "doc_id", "text", "title", "stance", "url"
|
|
122
|
+
),
|
|
123
|
+
_irds.beir.BeirSciDoc: tuple_constructor(
|
|
124
|
+
formats.SciDocs, "doc_id", "text", "title", "authors", "year", "cited_by", "references"
|
|
125
|
+
),
|
|
111
126
|
_irds.msmarco_document.MsMarcoDocument: tuple_constructor(
|
|
112
127
|
formats.MsMarcoDocument, "doc_id", "url", "title", "body"
|
|
113
128
|
),
|
|
@@ -215,20 +230,6 @@ if hasattr(_irds, "miracl"):
|
|
|
215
230
|
)
|
|
216
231
|
|
|
217
232
|
|
|
218
|
-
# Fix while PR https://github.com/allenai/ir_datasets/pull/252
|
|
219
|
-
# is not in.
|
|
220
|
-
class DMPickleLz4FullStore(PickleLz4FullStore):
|
|
221
|
-
def get_many(self, doc_ids, field=None):
|
|
222
|
-
result = {}
|
|
223
|
-
field_idx = self._doc_cls._fields.index(field) if field is not None else None
|
|
224
|
-
for doc in self.get_many_iter(doc_ids):
|
|
225
|
-
if field is not None:
|
|
226
|
-
result[getattr(doc, self._id_field)] = doc[field_idx]
|
|
227
|
-
else:
|
|
228
|
-
result[getattr(doc, self._id_field)] = doc
|
|
229
|
-
return result
|
|
230
|
-
|
|
231
|
-
|
|
232
233
|
class LZ4DocumentStore(ir.DocumentStore):
|
|
233
234
|
"""A LZ4-based document store"""
|
|
234
235
|
|
|
@@ -242,7 +243,7 @@ class LZ4DocumentStore(ir.DocumentStore):
|
|
|
242
243
|
|
|
243
244
|
@cached_property
|
|
244
245
|
def store(self):
|
|
245
|
-
return
|
|
246
|
+
return PickleLz4FullStore(
|
|
246
247
|
self.path, None, self.data_cls, self.lookup_field, self.index_fields
|
|
247
248
|
)
|
|
248
249
|
|
|
@@ -254,33 +255,48 @@ class LZ4DocumentStore(ir.DocumentStore):
|
|
|
254
255
|
return getattr(self._docs[ix], self.store._id_field)
|
|
255
256
|
|
|
256
257
|
def document_ext(self, docid: str) -> DocumentRecord:
|
|
257
|
-
return self.converter(self.
|
|
258
|
+
return self.converter(self.store.get(docid))
|
|
258
259
|
|
|
259
260
|
def documents_ext(self, docids: List[str]) -> DocumentRecord:
|
|
260
261
|
"""Returns documents given their external IDs (optimized for batch)"""
|
|
261
262
|
retrieved = self.store.get_many(docids)
|
|
262
|
-
return [
|
|
263
|
-
self.converter(self.document_recordtype, retrieved[docid])
|
|
264
|
-
for docid in docids
|
|
265
|
-
]
|
|
263
|
+
return [self.converter(retrieved[docid]) for docid in docids]
|
|
266
264
|
|
|
265
|
+
@abstractmethod
|
|
267
266
|
def converter(self, data):
|
|
268
|
-
"""Converts a document from LZ4 tuples to
|
|
269
|
-
|
|
270
|
-
return data
|
|
267
|
+
"""Converts a document from LZ4 tuples to a document record"""
|
|
268
|
+
...
|
|
271
269
|
|
|
272
270
|
def iter(self) -> Iterator[DocumentRecord]:
|
|
273
271
|
"""Returns an iterator over documents"""
|
|
274
|
-
return map(
|
|
275
|
-
partial(self.converter, self.document_recordtype), self.store.__iter__()
|
|
276
|
-
)
|
|
272
|
+
return map(self.converter, self.store.__iter__())
|
|
277
273
|
|
|
274
|
+
@cached_property
|
|
278
275
|
def documentcount(self):
|
|
279
276
|
if self.count:
|
|
280
277
|
return self.count
|
|
281
278
|
return self.store.count()
|
|
282
279
|
|
|
283
280
|
|
|
281
|
+
class SimpleJsonDocument(NamedTuple):
|
|
282
|
+
id: str
|
|
283
|
+
text: str
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
class LZ4JSONLDocumentStore(LZ4DocumentStore):
|
|
287
|
+
jsonl_path: Meta[Path]
|
|
288
|
+
"""json-l based document store
|
|
289
|
+
|
|
290
|
+
Each line is of the form
|
|
291
|
+
```json
|
|
292
|
+
{ "id": "...", "text": "..." }
|
|
293
|
+
```
|
|
294
|
+
"""
|
|
295
|
+
|
|
296
|
+
def converter(self, data):
|
|
297
|
+
return DocumentRecord(IDItem(data["id"]), SimpleTextItem(data["text"]))
|
|
298
|
+
|
|
299
|
+
|
|
284
300
|
class TopicsHandler(ABC):
|
|
285
301
|
@abstractmethod
|
|
286
302
|
def topic_int(self, internal_topic_id: int) -> TopicRecord:
|
|
@@ -352,6 +368,12 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
352
368
|
TrecQuery: tuple_constructor(
|
|
353
369
|
formats.TrecTopic, "query_id", "title", "description", "narrative"
|
|
354
370
|
),
|
|
371
|
+
_irds.beir.BeirToucheQuery: tuple_constructor(
|
|
372
|
+
formats.TrecTopic, "query_id", "text", "description", "narrative"
|
|
373
|
+
),
|
|
374
|
+
_irds.beir.BeirSciQuery: tuple_constructor(
|
|
375
|
+
formats.SciDocsTopic, "query_id", "text", "authors", "year", "cited_by", "references"
|
|
376
|
+
),
|
|
355
377
|
_irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
|
|
356
378
|
formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
|
|
357
379
|
),
|
|
@@ -395,99 +417,190 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
395
417
|
return self.handler.iter()
|
|
396
418
|
|
|
397
419
|
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
420
|
+
class CastTopicsHandler(TopicsHandler):
|
|
421
|
+
def __init__(self, dataset):
|
|
422
|
+
self.dataset = dataset
|
|
423
|
+
|
|
424
|
+
@cached_property
|
|
425
|
+
def ext2records(self):
|
|
426
|
+
return {record[IDItem].id: record for record in self.records}
|
|
427
|
+
|
|
428
|
+
def topic_int(self, internal_topic_id: int) -> TopicRecord:
|
|
429
|
+
"""Returns a topic given its internal ID"""
|
|
430
|
+
return self.records[internal_topic_id]
|
|
405
431
|
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
432
|
+
def topic_ext(self, external_topic_id: str) -> TopicRecord:
|
|
433
|
+
"""Returns a topic given its external ID"""
|
|
434
|
+
return self.ext2records[external_topic_id]
|
|
435
|
+
|
|
436
|
+
def iter(self) -> Iterator[ir.TopicRecord]:
|
|
437
|
+
"""Returns an iterator over topics"""
|
|
438
|
+
return iter(self.records)
|
|
439
|
+
|
|
440
|
+
@cached_property
|
|
441
|
+
def records(self):
|
|
442
|
+
try:
|
|
443
|
+
topic_number = None
|
|
444
|
+
node = None
|
|
445
|
+
conversation = []
|
|
446
|
+
records = []
|
|
447
|
+
|
|
448
|
+
for query in self.dataset.dataset.queries_iter():
|
|
449
|
+
decontextualized = DecontextualizedDictItem(
|
|
450
|
+
"manual",
|
|
451
|
+
{
|
|
452
|
+
"manual": query.manual_rewritten_utterance,
|
|
453
|
+
"auto": query.automatic_rewritten_utterance,
|
|
454
|
+
},
|
|
455
|
+
)
|
|
456
|
+
|
|
457
|
+
is_new_conversation = topic_number != query.topic_number
|
|
458
|
+
|
|
459
|
+
topic = Record(
|
|
460
|
+
IDItem(query.query_id),
|
|
461
|
+
SimpleTextItem(query.raw_utterance),
|
|
462
|
+
decontextualized,
|
|
463
|
+
ConversationHistoryItem(
|
|
464
|
+
[] if is_new_conversation else node.conversation(False)
|
|
465
|
+
),
|
|
466
|
+
EntryType.USER_QUERY,
|
|
467
|
+
)
|
|
468
|
+
|
|
469
|
+
if is_new_conversation:
|
|
470
|
+
conversation = []
|
|
471
|
+
node = ConversationTreeNode(topic)
|
|
472
|
+
topic_number = query.topic_number
|
|
473
|
+
else:
|
|
474
|
+
node = node.add(ConversationTreeNode(topic))
|
|
475
|
+
|
|
476
|
+
records.append(topic)
|
|
477
|
+
|
|
478
|
+
conversation.append(node)
|
|
479
|
+
node = node.add(
|
|
480
|
+
ConversationTreeNode(
|
|
481
|
+
Record(
|
|
482
|
+
AnswerDocumentID(self.get_canonical_result_id(query)),
|
|
483
|
+
EntryType.SYSTEM_ANSWER,
|
|
484
|
+
)
|
|
451
485
|
)
|
|
486
|
+
)
|
|
487
|
+
conversation.append(node)
|
|
488
|
+
except Exception:
|
|
489
|
+
logging.exception("Error while computing topic records")
|
|
490
|
+
raise
|
|
491
|
+
|
|
492
|
+
return records
|
|
493
|
+
|
|
494
|
+
@staticmethod
|
|
495
|
+
def get_canonical_result_id():
|
|
496
|
+
return None
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
class Cast2020TopicsHandler(CastTopicsHandler):
|
|
500
|
+
@staticmethod
|
|
501
|
+
def get_canonical_result_id(query: _irds.trec_cast.Cast2020Query):
|
|
502
|
+
return query.manual_canonical_result_id
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
class Cast2021TopicsHandler(CastTopicsHandler):
|
|
506
|
+
@staticmethod
|
|
507
|
+
def get_canonical_result_id(query: _irds.trec_cast.Cast2021Query):
|
|
508
|
+
return query.canonical_result_id
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
class Cast2022TopicsHandler(CastTopicsHandler):
    """Topics handler for ``Cast2022Query`` queries.

    Each query names its parent through ``parent_id``; the handler links
    the entries accordingly instead of assuming a linear conversation.
    """

    def __init__(self, dataset):
        """Keep a reference to the dataset whose queries are iterated."""
        self.dataset = dataset

    @cached_property
    def records(self):
        """Build the list of user-query records.

        Every query becomes a ``ConversationTreeNode`` attached to its
        parent node (when it has one). Only entries whose participant is
        ``"User"`` are returned; other entries are only inserted in the
        tree as system answers.

        Any exception is logged and re-raised.
        """
        user_records = []
        # Nodes seen so far, indexed by query ID, to resolve ``parent_id``
        node_by_id: Dict[str, ConversationTreeNode] = {}

        try:
            # Each query is a _irds.trec_cast.Cast2022Query
            for query in self.dataset.dataset.queries_iter():
                parent = None
                if query.parent_id:
                    parent = node_by_id[query.parent_id]

                if query.participant != "User":
                    entry = Record(
                        AnswerEntry(query.response),
                        EntryType.SYSTEM_ANSWER,
                    )
                    node = ConversationTreeNode(entry)
                else:
                    id_item = IDItem(query.query_id)
                    text_item = SimpleTextItem(query.raw_utterance)
                    rewritten = DecontextualizedDictItem(
                        "manual",
                        {"manual": query.manual_rewritten_utterance},
                    )
                    # History is what precedes this query in the tree
                    history = ConversationHistoryItem(
                        parent.conversation(False) if parent else []
                    )
                    entry = Record(
                        id_item,
                        text_item,
                        rewritten,
                        history,
                        EntryType.USER_QUERY,
                    )
                    node = ConversationTreeNode(entry)
                    user_records.append(entry)

                node_by_id[query.query_id] = node
                if parent:
                    parent.add(node)
        except Exception:
            logging.exception("Error while computing topic records")
            raise

        return user_records
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
# Register the topics handler to use for each CAsT query class
# (no handler is registered for _irds.trec_cast.Cast2019Query)
Topics.HANDLERS[_irds.trec_cast.Cast2020Query] = Cast2020TopicsHandler
Topics.HANDLERS[_irds.trec_cast.Cast2021Query] = Cast2021TopicsHandler
Topics.HANDLERS[_irds.trec_cast.Cast2022Query] = Cast2022TopicsHandler
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
class CastDocHandler:
    """Converter from ``CastDoc`` documents to records."""

    def check(self, cls):
        """Assert that ``cls`` is a subclass of ``CastDoc``."""
        assert issubclass(cls, _irds.trec_cast.CastDoc)

    @cached_property
    def target_cls(self):
        """Document class advertised by this converter."""
        # NOTE(review): __call__ builds a SimpleTextItem, not a
        # TitleUrlDocument -- confirm this is the intended class.
        return formats.TitleUrlDocument

    def __call__(self, _, doc: _irds.trec_cast.CastDoc):
        """Return a record holding the document ID and its passages.

        The passages are joined with a single space into one text item.
        """
        id_item = IDItem(doc.doc_id)
        joined_passages = " ".join(doc.passages)
        return Record(id_item, formats.SimpleTextItem(joined_passages))
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
class CastPassageDocHandler:
    """Converter from ``CastPassageDoc`` documents to records."""

    def check(self, cls):
        """Assert that ``cls`` is a subclass of ``CastPassageDoc``."""
        assert issubclass(cls, _irds.trec_cast.CastPassageDoc)

    @cached_property
    def target_cls(self):
        """Document class produced by this converter."""
        return formats.TitleUrlDocument

    def __call__(self, _, doc: _irds.trec_cast.CastPassageDoc):
        """Return a record with the document ID and a ``TitleUrlDocument``.

        The ``TitleUrlDocument`` is built from the text, title and URL of
        ``doc``, in that order.
        """
        id_item = IDItem(doc.doc_id)
        content = formats.TitleUrlDocument(doc.text, doc.title, doc.url)
        return Record(id_item, content)
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
# Register one converter instance per CAsT document class
Documents.CONVERTERS.update(
    {
        _irds.trec_cast.CastDoc: CastDocHandler(),
        _irds.trec_cast.CastPassageDoc: CastPassageDocHandler(),
    }
)
|
|
491
604
|
|
|
492
605
|
|
|
493
606
|
class Adhoc(ir.Adhoc, IRDSId):
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from typing import Optional, Type, Callable, Iterator
|
|
3
3
|
from ir_datasets.indices import PickleLz4FullStore
|
|
4
|
-
from datamaestro.download import
|
|
4
|
+
from datamaestro.download import Resource
|
|
5
5
|
from datamaestro.utils import FileChecker
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
import urllib3
|
|
8
8
|
|
|
9
9
|
|
|
10
|
-
class lz4docstore_downloader(
|
|
10
|
+
class lz4docstore_downloader(Resource):
|
|
11
11
|
"""Uses ir_datasets Lz4FullStore to build a document store for a stream of documents"""
|
|
12
12
|
|
|
13
13
|
def __init__(
|
|
@@ -69,3 +69,59 @@ class lz4docstore_downloader(Download):
|
|
|
69
69
|
|
|
70
70
|
# All good!
|
|
71
71
|
(destination / "done").touch()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class lz4docstore_builder(Resource):
    """Builds an ir_datasets ``PickleLz4FullStore`` from a stream of documents"""

    def __init__(
        self,
        name: str,
        iter_factory: Callable[[], Iterator],
        doc_cls: Type,
        lookup_field: str,
        *,
        count_hint: Optional[int] = None,
    ):
        """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents

        :param name: The name of the variable for path construction
        :param iter_factory: Callable (no argument) returning an iterator over documents
        :param doc_cls: The class of documents (must be a dataclass because of how ir-datasets works)
        :param lookup_field: Which field to use for lookup
        :param count_hint: Number of documents (hint), defaults to None
        """
        super().__init__(name)
        self.iter_factory = iter_factory
        self.doc_cls = doc_cls
        self.lookup_field = lookup_field
        self.count_hint = count_hint

    def prepare(self):
        """Returns the path of the directory containing the document store"""
        return self.definition.datapath / self.varname

    def download(self, force=False):
        """Builds the document store unless it was already built

        A ``done`` marker file is created in the store directory once the
        build has completed; when it exists, nothing is done unless
        ``force`` is true.

        :param force: Rebuild even if the ``done`` marker exists
        :return: True when the store was already built, None otherwise
        """
        # Creates the directory (and any missing parent) if needed: without
        # ``parents=True`` this fails when the data path does not exist yet
        destination = self.definition.datapath / self.varname
        destination.mkdir(parents=True, exist_ok=True)

        # Early exit
        if (destination / "done").is_file() and not force:
            return True

        logging.info("Building the document index")

        # Builds the LZ4 store
        store = PickleLz4FullStore(
            destination,
            self.iter_factory,
            self.doc_cls,
            lookup_field=self.lookup_field,
            index_fields=[self.lookup_field],
            key_field_prefix=None,
            size_hint=None,
            count_hint=self.count_hint,
        )
        store.build()

        # All good!
        (destination / "done").touch()
|
datamaestro_text/version.py
CHANGED
|
@@ -12,5 +12,5 @@ __version__: str
|
|
|
12
12
|
__version_tuple__: VERSION_TUPLE
|
|
13
13
|
version_tuple: VERSION_TUPLE
|
|
14
14
|
|
|
15
|
-
__version__ = version = '
|
|
16
|
-
__version_tuple__ = version_tuple = (
|
|
15
|
+
__version__ = version = '2025.1.7'
|
|
16
|
+
__version_tuple__ = version_tuple = (2025, 1, 7)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2025.1.7
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
6
|
License: GPL-3
|
|
@@ -18,8 +18,8 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
18
18
|
Requires-Python: >=3.8
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
|
-
Requires-Dist: datamaestro
|
|
22
|
-
Requires-Dist:
|
|
21
|
+
Requires-Dist: datamaestro>=1.2.1
|
|
22
|
+
Requires-Dist: ir_datasets>=0.5.8
|
|
23
23
|
Requires-Dist: attrs
|
|
24
24
|
|
|
25
25
|
[](https://github.com/pre-commit/pre-commit) [](https://badge.fury.io/py/datamaestro-text)
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
|
|
2
|
-
datamaestro_text/version.py,sha256=
|
|
2
|
+
datamaestro_text/version.py,sha256=k65KHkS4PD3NjDDCJgWcTwRBE4yVcKs32B8SUxv2DvE,417
|
|
3
3
|
datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
|
|
5
5
|
datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
|
|
6
6
|
datamaestro_text/config/com/sentiment140.py,sha256=iRV_rSjQcr9WNjjQ4EdznaEMGLADV_rwpeRx7ycQi0k,1383
|
|
7
7
|
datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
|
|
8
8
|
datamaestro_text/config/com/github/aagohary/canard.py,sha256=tIwb_KxMUR3st7rzQUkt6rIjolTl8uKvDq6t795b1nY,1468
|
|
9
|
-
datamaestro_text/config/com/github/
|
|
9
|
+
datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=_lufJlg-4zaQyjr5Ae-X-9hXzpl2JFjfxh_RhnBbva4,3068
|
|
10
|
+
datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=Tg3hbnvilxW_Lwt7fpKvGMtu-6mc9oNIHM-LX6JTR1U,3026
|
|
10
11
|
datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
|
|
11
12
|
datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
|
|
12
13
|
datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=ouNn2nivS6mUMaCyMzqxNv1YMoPrSEX-UcSZpG1v_uw,11645
|
|
@@ -45,23 +46,24 @@ datamaestro_text/data/recommendation.py,sha256=MatelpJQiCMpNDuaQVBkRY809J1qiQo-8
|
|
|
45
46
|
datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
|
|
46
47
|
datamaestro_text/data/text.py,sha256=pOI8nrEd6RoQA28DVH1JufHTunr9vG3FQzwElR8YirI,499
|
|
47
48
|
datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
|
|
48
|
-
datamaestro_text/data/conversation/base.py,sha256=
|
|
49
|
+
datamaestro_text/data/conversation/base.py,sha256=zXX5sPMoVtDf4WTkLx45IvZz6lgKPgcUZFu6N4lVBlc,6457
|
|
49
50
|
datamaestro_text/data/conversation/canard.py,sha256=IMxu5NfytWZLZ_cyT8UFOICbDE82HLJoNBT_6j36Faw,1808
|
|
50
|
-
datamaestro_text/data/conversation/orconvqa.py,sha256=
|
|
51
|
-
datamaestro_text/data/
|
|
51
|
+
datamaestro_text/data/conversation/orconvqa.py,sha256=VG3GV5_IPQcvla9rrQPypDNcZYmKNMgozmr2oudeLA4,3802
|
|
52
|
+
datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
|
|
53
|
+
datamaestro_text/data/ir/__init__.py,sha256=EOVnRv9oKEEucMOgrFjhXxeUWEkaJkbXBndpkKhZaiY,8701
|
|
52
54
|
datamaestro_text/data/ir/base.py,sha256=Cw8egjChpx4ksUwp-vTA70B2OWxROH7FIeJylsXggAk,1429
|
|
53
55
|
datamaestro_text/data/ir/cord19.py,sha256=7xDIzsy63WrA9lxxyNOMu5ECRymu5x23EzYG977nS6Y,1440
|
|
54
56
|
datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
|
|
55
57
|
datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
|
|
56
|
-
datamaestro_text/data/ir/formats.py,sha256=
|
|
58
|
+
datamaestro_text/data/ir/formats.py,sha256=rU9uJhdFDdlnQ2qfFowK_--1WVOH1fVgfXWOPPDF_FY,3160
|
|
57
59
|
datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
|
|
58
|
-
datamaestro_text/data/ir/stores.py,sha256=
|
|
60
|
+
datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
|
|
59
61
|
datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
|
|
60
62
|
datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
|
|
61
63
|
datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
|
|
62
|
-
datamaestro_text/datasets/irds/data.py,sha256=
|
|
64
|
+
datamaestro_text/datasets/irds/data.py,sha256=BTsFZdjFJ0XdlYRh2rcshAoeknHrypf7be2BGUbWWFk,19931
|
|
63
65
|
datamaestro_text/datasets/irds/datasets.py,sha256=qtN-nX2_OV9FD339aZjwVL1jFf6I4T6MbNCcuRAjmtU,5682
|
|
64
|
-
datamaestro_text/datasets/irds/helpers.py,sha256=
|
|
66
|
+
datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
|
|
65
67
|
datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
|
|
66
68
|
datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
|
|
67
69
|
datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
|
|
@@ -76,9 +78,9 @@ datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1v
|
|
|
76
78
|
datamaestro_text/utils/iter.py,sha256=2_UZ8y9Ma4k5U9ZD4w55Zfb6NGrKM1L4G40OygRm1is,2459
|
|
77
79
|
datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
|
|
78
80
|
datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
|
|
79
|
-
datamaestro_text-
|
|
80
|
-
datamaestro_text-
|
|
81
|
-
datamaestro_text-
|
|
82
|
-
datamaestro_text-
|
|
83
|
-
datamaestro_text-
|
|
84
|
-
datamaestro_text-
|
|
81
|
+
datamaestro_text-2025.1.7.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
82
|
+
datamaestro_text-2025.1.7.dist-info/METADATA,sha256=-wsBcUcnEnOMiHb1ROIf43r55ZNNNjpIemuFXm5hHUY,1609
|
|
83
|
+
datamaestro_text-2025.1.7.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
|
|
84
|
+
datamaestro_text-2025.1.7.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
|
|
85
|
+
datamaestro_text-2025.1.7.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
|
|
86
|
+
datamaestro_text-2025.1.7.dist-info/RECORD,,
|
|
File without changes
|
{datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|