datamaestro-text 2025.1.7__py3-none-any.whl → 2025.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. datamaestro_text/config/com/github/aagohary/canard.py +19 -12
  2. datamaestro_text/config/com/github/apple/ml-qrecc.py +6 -8
  3. datamaestro_text/config/com/microsoft/msmarco/passage.py +8 -7
  4. datamaestro_text/config/com/sentiment140.py +1 -6
  5. datamaestro_text/config/edu/stanford/glove.py +1 -0
  6. datamaestro_text/config/edu/upenn/ldc/aquaint.py +3 -4
  7. datamaestro_text/config/gov/nist/trec/tipster.py +1 -1
  8. datamaestro_text/data/conversation/base.py +10 -8
  9. datamaestro_text/data/conversation/canard.py +52 -13
  10. datamaestro_text/data/conversation/orconvqa.py +0 -1
  11. datamaestro_text/data/embeddings.py +3 -3
  12. datamaestro_text/data/ir/base.py +6 -0
  13. datamaestro_text/data/ir/cord19.py +2 -1
  14. datamaestro_text/data/ir/formats.py +31 -4
  15. datamaestro_text/data/recommendation.py +6 -7
  16. datamaestro_text/data/text.py +6 -6
  17. datamaestro_text/datasets/irds/data.py +65 -0
  18. datamaestro_text/datasets/irds/datasets.py +0 -4
  19. datamaestro_text/utils/iter.py +5 -2
  20. datamaestro_text/version.py +9 -4
  21. {datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/METADATA +10 -3
  22. {datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/RECORD +26 -26
  23. {datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/WHEEL +1 -1
  24. {datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/entry_points.txt +0 -0
  25. {datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info/licenses}/LICENSE +0 -0
  26. {datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,5 @@
1
- # See documentation on https://datamaestro.readthedocs.io
2
-
3
1
  from datamaestro.definitions import datatasks, datatags, dataset
4
- from datamaestro.download.archive import zipdownloader
2
+ from datamaestro.download.single import filedownloader
5
3
  from datamaestro.utils import HashCheck
6
4
 
7
5
  from datamaestro.data.ml import Supervised
@@ -10,14 +8,23 @@ from datamaestro_text.data.conversation.canard import CanardDataset
10
8
 
11
9
  @datatags("conversation", "context", "query")
12
10
  @datatasks("query rewriting")
13
- @zipdownloader(
14
- "archive",
15
- "https://obj.umiacs.umd.edu/elgohary/CANARD_Release.zip",
16
- subpath="CANARD_Release",
17
- checker=HashCheck("c9bba7c6bb898f669383415b54fd6ffd"),
11
+ @filedownloader(
12
+ "train.json",
13
+ "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/train.json",
14
+ checker=HashCheck("73624ac646fb81e09b0fd7f01370ada3"),
15
+ )
16
+ @filedownloader(
17
+ "dev.json",
18
+ "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/dev.json",
19
+ checker=HashCheck("c84525631a83bc771c58ff31f4a9b601"),
20
+ )
21
+ @filedownloader(
22
+ "test.json",
23
+ "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/test.json",
24
+ checker=HashCheck("3fc14d0078e7a5056f5da571728f024e"),
18
25
  )
19
26
  @dataset(Supervised, url="https://sites.google.com/view/qanta/projects/canard", id="")
20
- def main(archive):
27
+ def main(train, dev, test):
21
28
  """Question-in-context rewriting
22
29
 
23
30
  CANARD is a dataset for question-in-context rewriting that consists of
@@ -30,7 +37,7 @@ def main(archive):
30
37
  Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
31
38
  """
32
39
  return {
33
- "train": CanardDataset(path=archive / "train.json"),
34
- "validation": CanardDataset(path=archive / "dev.json"),
35
- "test": CanardDataset(path=archive / "test.json"),
40
+ "train": CanardDataset(path=train),
41
+ "validation": CanardDataset(path=dev),
42
+ "test": CanardDataset(path=test),
36
43
  }
@@ -25,12 +25,11 @@ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
25
25
  checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
26
26
  )
27
27
  @dataset(
28
- Supervised[QReCCDataset, None, QReCCDataset],
29
28
  url="https://github.com/apple/ml-qrecc",
30
29
  doi="https://doi.org/10.48550/arXiv.2010.04898",
31
30
  id="",
32
31
  )
33
- def main(data: Path):
32
+ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
34
33
  """Open-Domain Question Answering Goes Conversational via Question Rewriting
35
34
 
36
35
  We introduce QReCC (Question Rewriting in Conversational Context), an
@@ -40,10 +39,10 @@ def main(data: Path):
40
39
  answering that includes the individual subtasks of question rewriting,
41
40
  passage retrieval and reading comprehension
42
41
  """
43
- return {
44
- "train": QReCCDataset(path=data / "qrecc_train.json"),
45
- "test": QReCCDataset(path=data / "qrecc_test.json"),
46
- }
42
+ return Supervised(
43
+ train=QReCCDataset(path=data / "qrecc_train.json"),
44
+ test=QReCCDataset(path=data / "qrecc_test.json"),
45
+ )
47
46
 
48
47
 
49
48
  @dataset(
@@ -52,7 +51,6 @@ def main(data: Path):
52
51
  )
53
52
  class Content(LZ4JSONLDocumentStore):
54
53
  """QReCC mentionned URLs content"""
55
-
56
54
  @staticmethod
57
55
  def __create_dataset__(dataset, options=None):
58
56
  ds = reference(reference=main).setup(dataset, options)
@@ -67,7 +65,7 @@ class Content(LZ4JSONLDocumentStore):
67
65
  "id",
68
66
  ).setup(dataset, options)
69
67
 
70
- return LZ4JSONLDocumentStore(jsonl_path=store_path)
68
+ return Content(jsonl_path=store_path)
71
69
 
72
70
  @staticmethod
73
71
  def _documents(path: Path):
@@ -1,11 +1,11 @@
1
1
  """MS MARCO (Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. A variant of this task will be the part of TREC and AFIRM 2019. For Updates about TREC 2019 please follow This Repository Passage Reranking task Task Given a query q and a the 1000 most relevant passages P = p1, p2, p3,... p1000, as retrieved by BM25 a succeful system is expected to rerank the most relevant passage as high as possible. For this task not all 1000 relevant items have a human labeled relevant passage. Evaluation will be done using MRR.
2
2
 
3
- **Publication**:
4
- Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
5
- MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
3
+ **Publication**:
4
+ Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
5
+ MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
6
6
 
7
7
 
8
- See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
8
+ See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
9
9
  """
10
10
 
11
11
  from datamaestro.annotations.agreement import useragreement
@@ -35,6 +35,7 @@ http://www.msmarco.org/dataset.aspx""",
35
35
 
36
36
  # --- Document collection
37
37
 
38
+
38
39
  # TODO: Not ideal since it would be better to have small versions right away
39
40
  # instead of downloading again the MS Marco Collection
40
41
  @lua
@@ -43,10 +44,10 @@ http://www.msmarco.org/dataset.aspx""",
43
44
  url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
44
45
  checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
45
46
  )
46
- @dataset(Folder, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
47
- def collection_etc(data):
47
+ @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
48
+ def collection_etc(data) -> Folder:
48
49
  """Documents and some more files"""
49
- return {"path": data}
50
+ return Folder(path=data)
50
51
 
51
52
 
52
53
  @lua
@@ -1,14 +1,9 @@
1
1
  from datamaestro.data.csv import Generic
2
- from datamaestro.definitions import argument, datatasks, datatags, dataset
2
+ from datamaestro.definitions import datatasks, datatags, dataset
3
3
  from datamaestro.download.archive import zipdownloader
4
4
  from datamaestro.data.ml import Supervised
5
5
  from datamaestro.utils import HashCheck
6
6
 
7
- # name: Sentiment140
8
- # web: http://help.sentiment140.com/for-students/
9
-
10
- # description: |
11
-
12
7
 
13
8
  @zipdownloader(
14
9
  "dir",
@@ -11,6 +11,7 @@ from datamaestro.download.archive import zipdownloader
11
11
  from datamaestro.download.single import filedownloader
12
12
  from datamaestro_text.data.embeddings import WordEmbeddingsText
13
13
 
14
+
14
15
  # size: 822M
15
16
  # statistics:
16
17
  # tokens: 6G
@@ -1,10 +1,9 @@
1
1
  """The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""
2
2
 
3
- from datamaestro.definitions import DatafolderPath
4
- from datamaestro.data import Base
5
- from datamaestro_text.data.ir.trec import TipsterCollection
6
- from datamaestro.definitions import argument, datatasks, datatags, dataset
3
+ from datamaestro.context import DatafolderPath
4
+ from datamaestro.definitions import dataset
7
5
  from datamaestro.download.links import links, linkfolder
6
+ from datamaestro_text.data.ir.trec import TipsterCollection
8
7
 
9
8
 
10
9
  URL = "https://catalog.ldc.upenn.edu/LDC2002T31"
@@ -22,8 +22,8 @@ from datamaestro_text.data.ir.trec import TipsterCollection
22
22
  from datamaestro.download.links import linkfolder
23
23
  from datamaestro.definitions import (
24
24
  dataset,
25
- DatafolderPath,
26
25
  )
26
+ from datamaestro.context import DatafolderPath
27
27
 
28
28
  # Store meta-information
29
29
  TIPSTER = dataset(TipsterCollection, url="https://catalog.ldc.upenn.edu/LDC93T3A")
@@ -129,6 +129,8 @@ class ConversationNode:
129
129
 
130
130
 
131
131
  class ConversationTree(ABC):
132
+ """Represents a conversation tree"""
133
+
132
134
  @abstractmethod
133
135
  def root(self) -> ConversationNode:
134
136
  ...
@@ -192,7 +194,7 @@ class SingleConversationTreeNode(ConversationNode):
192
194
  return (
193
195
  SingleConversationTreeNode(self.tree, self.index + 1)
194
196
  if self.index < len(self.tree.history) - 1
195
- else []
197
+ else None
196
198
  )
197
199
 
198
200
  def children(self) -> List[ConversationNode]:
@@ -207,8 +209,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
207
209
  """A conversation tree node"""
208
210
 
209
211
  entry: Record
210
- parent: Optional["ConversationTreeNode"]
211
- children: List["ConversationTreeNode"]
212
+ _parent: Optional["ConversationTreeNode"]
213
+ _children: List["ConversationTreeNode"]
212
214
 
213
215
  def __init__(self, entry):
214
216
  self.entry = entry
@@ -216,8 +218,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
216
218
  self.children = []
217
219
 
218
220
  def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
219
- self.children.append(node)
220
- node.parent = self
221
+ self._children.append(node)
222
+ node._parent = self
221
223
  return node
222
224
 
223
225
  def conversation(self, skip_self: bool) -> ConversationHistory:
@@ -232,14 +234,14 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
232
234
  def __iter__(self) -> Iterator["ConversationTreeNode"]:
233
235
  """Iterates over all conversation tree nodes (pre-order)"""
234
236
  yield self.entry
235
- for child in self.children:
237
+ for child in self._children:
236
238
  yield from child
237
239
 
238
240
  def parent(self) -> Optional[ConversationNode]:
239
- return self.parent
241
+ return self._parent
240
242
 
241
243
  def children(self) -> List[ConversationNode]:
242
- return self.children
244
+ return self._children
243
245
 
244
246
  def root(self):
245
247
  return self
@@ -1,12 +1,17 @@
1
1
  from typing import Iterator, List
2
2
  from attr import define
3
3
  import json
4
+ from datamaestro.record import Record
4
5
  from datamaestro.data import File
5
- from .base import (
6
+ from datamaestro_text.data.conversation.base import (
7
+ ConversationDataset,
6
8
  ConversationTree,
7
9
  SingleConversationTree,
10
+ SimpleDecontextualizedItem,
11
+ EntryType,
8
12
  )
9
- from . import ConversationDataset
13
+ from datamaestro_text.data.ir import IDItem, SimpleTextItem
14
+ import logging
10
15
 
11
16
 
12
17
  @define(kw_only=True)
@@ -30,7 +35,10 @@ class CanardConversation:
30
35
 
31
36
 
32
37
  class CanardDataset(ConversationDataset, File):
33
- """A dataset in the CANARD JSON format"""
38
+ """A dataset in the CANARD JSON format
39
+
40
+ The CANARD dataset is composed of
41
+ """
34
42
 
35
43
  def entries(self) -> Iterator[CanardConversation]:
36
44
  """Iterates over re-written query with their context"""
@@ -47,22 +55,53 @@ class CanardDataset(ConversationDataset, File):
47
55
  )
48
56
 
49
57
  def __iter__(self) -> Iterator[ConversationTree]:
50
- history = []
58
+ history: list[Record] = []
51
59
  current_id = None
52
60
 
53
61
  for entry in self.entries():
54
- # Check if current conversation
55
- if current_id != entry.dialogue_id and current_id is not None:
56
- history.reverse()
57
- yield SingleConversationTree(current_id, history)
62
+ # Check if current conversation, otherwise we are OK
63
+ if current_id != entry.dialogue_id:
64
+ if current_id is not None:
65
+ history.reverse()
66
+ yield SingleConversationTree(current_id, history)
67
+ history = []
68
+
69
+ current_id = entry.dialogue_id
70
+
71
+ if not history:
72
+ # First round
73
+ # The two first items are the wikipedia title and section,
74
+ # we interpret them as two user queries
75
+ assert len(entry.history) == 2
76
+ history.extend(
77
+ Record(
78
+ SimpleTextItem(text),
79
+ EntryType.USER_QUERY,
80
+ )
81
+ for text in entry.history
82
+ )
83
+ else:
84
+ # The utterance before the last is the last user query
85
+ assert (
86
+ entry.history[-2] == history[-1][SimpleTextItem].text
87
+ ), f"{entry.dialogue_id} {entry.history} / {history[-4:-1]}"
88
+
89
+ # The last utterance is the system side
90
+ history.append(
91
+ Record(SimpleTextItem(entry.history[-1]), EntryType.SYSTEM_ANSWER)
92
+ )
93
+
94
+ assert len(entry.history) == len(history)
58
95
 
59
96
  # Add to current
60
97
  history.append(
61
- # FIXME: not working anymore
62
- CanardEntry(
63
- query=entry.query,
64
- decontextualized_query=entry.rewrite,
98
+ Record(
99
+ IDItem(f"{entry.dialogue_id}-{entry.query_no}"),
100
+ SimpleTextItem(entry.query),
101
+ SimpleDecontextualizedItem(entry.rewrite),
102
+ EntryType.USER_QUERY,
65
103
  )
66
104
  )
67
105
 
68
- yield current
106
+ if current_id:
107
+ yield SingleConversationTree(current_id, history)
@@ -1,4 +1,3 @@
1
- from functools import cached_property
2
1
  from typing import Iterator, List, Optional
3
2
  from attr import define
4
3
  import json
@@ -1,5 +1,5 @@
1
- from pathlib import Path
2
- from datamaestro.data import Base, File, argument
1
+ from experimaestro import Meta
2
+ from datamaestro.data import Base, File
3
3
  from datamaestro.definitions import datatags
4
4
  import numpy as np
5
5
  from typing import Tuple, List
@@ -18,9 +18,9 @@ class WordEmbeddings(Base):
18
18
  raise NotImplementedError()
19
19
 
20
20
 
21
- @argument("encoding", str, ignored=True, default="utf-8")
22
21
  class WordEmbeddingsText(WordEmbeddings, File):
23
22
  """Word embeddings as a text word / values"""
23
+ encoding: Meta[str] = "utf-8"
24
24
 
25
25
  def load(self):
26
26
  words = []
@@ -43,6 +43,12 @@ class IDItem(Item, ABC):
43
43
 
44
44
  id: str
45
45
 
46
+ @define
47
+ class UrlItem(Item):
48
+ """An url item"""
49
+
50
+ url: str
51
+
46
52
 
47
53
  @define
48
54
  class AdhocAssessment:
@@ -1,7 +1,8 @@
1
1
  from csv import DictReader
2
2
  from typing import Iterator
3
3
 
4
- from datamaestro.data import File, documentation
4
+ from experimaestro import documentation
5
+ from datamaestro.data import File
5
6
  from datamaestro.record import Record
6
7
  from datamaestro_text.data.ir import Documents, TopicRecord, Topics, IDItem
7
8
  from datamaestro_text.data.ir.formats import (
@@ -58,7 +58,7 @@ class NFCorpusDocument(TextItem):
58
58
 
59
59
  @cached_property
60
60
  def text(self):
61
- return self.abstract
61
+ return f"{self.title} {self.abstract}"
62
62
 
63
63
 
64
64
  @define
@@ -99,8 +99,8 @@ class WapoDocument(TextItem):
99
99
  body_media: Tuple[WapoDocMedia, ...]
100
100
 
101
101
  @cached_property
102
- def text(self):
103
- return self.body
102
+ def text(self):
103
+ return f"{self.title} {self.body_paras_html}"
104
104
 
105
105
 
106
106
  @define
@@ -126,13 +126,24 @@ class OrConvQADocument(TextItem):
126
126
  def text(self):
127
127
  return f"{self.title} {self.body}"
128
128
 
129
+
129
130
  @define
131
+ class DprW100Doc(TextItem):
132
+ text: str
133
+ title: str
134
+
135
+ @define
136
+ class MsMarcoV2Passage(TextItem):
137
+ text: str
138
+ spans: Tuple[Tuple[int, int], ...]
139
+ msmarco_document_id: str
130
140
  class Touche2020(TextItem):
131
141
  text: str
132
142
  title: str
133
143
  stance: str
134
144
  url: str
135
145
 
146
+
136
147
  @define
137
148
  class SciDocs(TextItem):
138
149
  text: str
@@ -175,7 +186,8 @@ class TrecMb14Query(TextItem):
175
186
  def get_text(self):
176
187
  return f"{self.query}"
177
188
 
178
- @define
189
+
190
+ @define
179
191
  class SciDocsTopic(TextItem):
180
192
  text: str
181
193
  authors: List[str]
@@ -183,6 +195,7 @@ class SciDocsTopic(TextItem):
183
195
  cited_by: List[str]
184
196
  references: List[str]
185
197
 
198
+
186
199
  @define()
187
200
  class TrecTopic(SimpleTextItem):
188
201
  description: str
@@ -190,3 +203,17 @@ class TrecTopic(SimpleTextItem):
190
203
 
191
204
 
192
205
  TrecTopicRecord = record_type(IDItem, TrecTopic)
206
+
207
+ @define
208
+ class DprW100Query(TextItem):
209
+ text: str
210
+ answers: Tuple[str]
211
+
212
+ @define
213
+ class TrecBackgroundLinkingQuery(IDItem):
214
+ query_id: str
215
+ doc_id: str
216
+ url: str
217
+
218
+ def get_text(self):
219
+ raise NotImplementedError()
@@ -1,14 +1,13 @@
1
- from datamaestro.data import Base, File, argument
1
+ from experimaestro import Param
2
+ from datamaestro.data import Base, File
2
3
  import datamaestro.data.csv as csv
3
4
 
4
5
 
5
- @argument("ratings", type=File)
6
6
  class RatedItems(Base):
7
- pass
7
+ ratings: Param[File]
8
8
 
9
9
 
10
- @argument("links", type=csv.Generic)
11
- @argument("movies", type=csv.Generic)
12
- @argument("tags", type=csv.Generic)
13
10
  class Movielens(RatedItems):
14
- pass
11
+ links: Param[csv.Generic]
12
+ movies: Param[csv.Generic]
13
+ tags: Param[csv.Generic]
@@ -1,15 +1,15 @@
1
- from pathlib import Path
2
- from datamaestro.data import Base, Folder, File, argument
1
+ from typing import Optional
2
+ from experimaestro import Param
3
+ from datamaestro.data import Base, Folder, File
3
4
  from datamaestro.data.ml import Supervised
4
5
 
5
6
 
6
- @argument("train", type=Base)
7
- @argument("test", type=Base, required=False)
8
- @argument("validation", type=Base, required=False)
9
7
  class TrainingText(Supervised):
10
8
  """ "A dataset used for training with a train and a test"""
11
9
 
12
- pass
10
+ train: Param[Base]
11
+ test: Param[Optional[Base]] = None
12
+ validation: Param[Optional[Base]] = None
13
13
 
14
14
 
15
15
  class TextFolder(Folder):
@@ -37,6 +37,7 @@ from datamaestro_text.data.ir.base import (
37
37
  SimpleAdhocAssessment,
38
38
  SimpleTextItem,
39
39
  TopicRecord,
40
+ UrlItem,
40
41
  create_record,
41
42
  )
42
43
 
@@ -165,6 +166,19 @@ class Documents(ir.DocumentStore, IRDSId):
165
166
  "source",
166
167
  "source_content_type",
167
168
  ),
169
+ _irds.dpr_w100.DprW100Doc: tuple_constructor(
170
+ formats.DprW100Doc,
171
+ "doc_id",
172
+ "text",
173
+ "title",
174
+ ),
175
+ _irds.msmarco_passage_v2.MsMarcoV2Passage: tuple_constructor(
176
+ formats.MsMarcoV2Passage,
177
+ "doc_id",
178
+ "text",
179
+ "spans",
180
+ "msmarco_document_id",
181
+ ),
168
182
  }
169
183
 
170
184
  """Wraps an ir datasets collection -- and provide a default text
@@ -385,6 +399,12 @@ class Topics(ir.TopicsStore, IRDSId):
385
399
  "tweet_time",
386
400
  "description",
387
401
  ),
402
+ _irds.dpr_w100.DprW100Query: tuple_constructor(
403
+ formats.DprW100Query,
404
+ "query_id",
405
+ "text",
406
+ "answers"
407
+ ),
388
408
  }
389
409
 
390
410
  HANDLERS = {
@@ -415,7 +435,52 @@ class Topics(ir.TopicsStore, IRDSId):
415
435
  def iter(self) -> Iterator[TopicRecord]:
416
436
  """Returns an iterator over topics"""
417
437
  return self.handler.iter()
438
+
439
+ class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
440
+ def __init__(self, dataset):
441
+ self.dataset = dataset
442
+
443
+ @cached_property
444
+ def ext2records(self):
445
+ return {record[IDItem].id: record for record in self.records}
446
+
447
+ def topic_int(self, internal_topic_id: int) -> TopicRecord:
448
+ """Returns a document given its internal ID"""
449
+ return self.records[internal_topic_id]
450
+
451
+ def topic_ext(self, external_topic_id: str) -> TopicRecord:
452
+ """Returns a document given its external ID"""
453
+ return self.ext2records[external_topic_id]
418
454
 
455
+ def iter(self) -> Iterator[ir.TopicRecord]:
456
+ """Returns an iterator over topics"""
457
+ return iter(self.records)
458
+
459
+ @cached_property
460
+ def records(self):
461
+ try:
462
+ records = []
463
+
464
+ for query in self.dataset.dataset.queries_iter():
465
+ topic = Record(
466
+ IDItem(query.query_id),
467
+ # Following BEIR documentation, we use title of documents as queries: https://github.com/beir-cellar/beir/blob/main/examples/dataset/README.md#queries-and-qrels
468
+ SimpleTextItem(self.dataset.dataset.docs_store().get(query.doc_id).title),
469
+ UrlItem(query.url),
470
+ )
471
+ records.append(topic)
472
+ except Exception:
473
+ logging.exception("Error while computing topic records")
474
+ raise
475
+
476
+ return records
477
+
478
+
479
+ Topics.HANDLERS.update(
480
+ {
481
+ _irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler
482
+ }
483
+ )
419
484
 
420
485
  class CastTopicsHandler(TopicsHandler):
421
486
  def __init__(self, dataset):
@@ -116,10 +116,6 @@ class AdhocRunDataset(Dataset):
116
116
  def _prepare(self, download=False) -> AdhocRun:
117
117
  return AdhocRun(id=self.fullid)
118
118
 
119
- @property
120
- def configtype(self):
121
- return AdhocRun
122
-
123
119
 
124
120
  class Collection(Dataset):
125
121
  base = Adhoc
@@ -1,4 +1,4 @@
1
- from typing import Callable, TypeVar, Iterator, List, Union
1
+ from typing import Callable, Sequence, TypeVar, Iterator, List, Union
2
2
 
3
3
  T = TypeVar("T")
4
4
 
@@ -45,7 +45,7 @@ class RangeView:
45
45
  return RangeView(self.source, key)
46
46
 
47
47
 
48
- class LazyList:
48
+ class LazyList(Sequence):
49
49
  """Iterable-based list
50
50
 
51
51
  The list is only materialized if needed"""
@@ -63,6 +63,9 @@ class LazyList:
63
63
  else:
64
64
  return iter(self.materialized_list)
65
65
 
66
+ def __len__(self):
67
+ return len(self.iterable)
68
+
66
69
  def __getitem__(self, index):
67
70
  # Materialize the list if accessing an index above the threshold or any slice
68
71
  if isinstance(index, slice) or index >= self.materialize_threshold:
@@ -1,8 +1,13 @@
1
- # file generated by setuptools_scm
1
+ # file generated by setuptools-scm
2
2
  # don't change, don't track in version control
3
+
4
+ __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
5
+
3
6
  TYPE_CHECKING = False
4
7
  if TYPE_CHECKING:
5
- from typing import Tuple, Union
8
+ from typing import Tuple
9
+ from typing import Union
10
+
6
11
  VERSION_TUPLE = Tuple[Union[int, str], ...]
7
12
  else:
8
13
  VERSION_TUPLE = object
@@ -12,5 +17,5 @@ __version__: str
12
17
  __version_tuple__: VERSION_TUPLE
13
18
  version_tuple: VERSION_TUPLE
14
19
 
15
- __version__ = version = '2025.1.7'
16
- __version_tuple__ = version_tuple = (2025, 1, 7)
20
+ __version__ = version = '2025.5.13'
21
+ __version_tuple__ = version_tuple = (2025, 5, 13)
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.1.7
3
+ Version: 2025.5.13
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -18,9 +18,16 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
18
  Requires-Python: >=3.8
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: datamaestro>=1.2.1
21
+ Requires-Dist: datamaestro>=1.4.2
22
22
  Requires-Dist: ir_datasets>=0.5.8
23
23
  Requires-Dist: attrs
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest; extra == "dev"
26
+ Requires-Dist: docutils; extra == "dev"
27
+ Requires-Dist: sphobjinv; extra == "dev"
28
+ Requires-Dist: flake8; extra == "dev"
29
+ Requires-Dist: sphinx; extra == "dev"
30
+ Dynamic: license-file
24
31
 
25
32
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
26
33
 
@@ -1,25 +1,25 @@
1
1
  datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
2
- datamaestro_text/version.py,sha256=k65KHkS4PD3NjDDCJgWcTwRBE4yVcKs32B8SUxv2DvE,417
2
+ datamaestro_text/version.py,sha256=EsLzhbhZSIiOqGSyEpMlneQnIpzB12JreUxG8EMn7EE,519
3
3
  datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
5
5
  datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
6
- datamaestro_text/config/com/sentiment140.py,sha256=iRV_rSjQcr9WNjjQ4EdznaEMGLADV_rwpeRx7ycQi0k,1383
6
+ datamaestro_text/config/com/sentiment140.py,sha256=3cZfqs395gY14gGojRC_RTYEdG5l8RUTSdF9li8ynBc,1283
7
7
  datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
8
- datamaestro_text/config/com/github/aagohary/canard.py,sha256=tIwb_KxMUR3st7rzQUkt6rIjolTl8uKvDq6t795b1nY,1468
9
- datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=_lufJlg-4zaQyjr5Ae-X-9hXzpl2JFjfxh_RhnBbva4,3068
8
+ datamaestro_text/config/com/github/aagohary/canard.py,sha256=DREIKU2FFrfwibjU48Y1AsKTUzhwxmD4Kuykd4bDIus,1762
9
+ datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=-iHKJocWZ8N9N-P8E45y4ewg3OT_23XonlDh5-NcH2g,3055
10
10
  datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=Tg3hbnvilxW_Lwt7fpKvGMtu-6mc9oNIHM-LX6JTR1U,3026
11
11
  datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
12
12
  datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
13
- datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=ouNn2nivS6mUMaCyMzqxNv1YMoPrSEX-UcSZpG1v_uw,11645
13
+ datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=tVldwVh0pQpXXifqcMNEN9cLO1HXkXoEhToSazSx5RE,11643
14
14
  datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=hCFjZg9t1AHET05qU31uatiAOD5EEzzWG9y_W90jcOE,861
15
15
  datamaestro_text/config/edu/cornell/nlvr.yaml,sha256=9Yk5VZMncSmrP7JNuGXqExksgX5nQ_Zfnlps8hWze3Q,921
16
16
  datamaestro_text/config/edu/stanford/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  datamaestro_text/config/edu/stanford/aclimdb.py,sha256=lsPDxnp_rWOCpBte6pZ0_LVaC33w5mmgfGh51rcTgt8,643
18
- datamaestro_text/config/edu/stanford/glove.py,sha256=ykkQ7nYWqhmgc2TeohNMliYSiX831cYUygftkBTGIac,2390
18
+ datamaestro_text/config/edu/stanford/glove.py,sha256=bXxwiJqT8alPs5MwwxVuY0xBO8g1QWhm2L3AKVskTlA,2391
19
19
  datamaestro_text/config/edu/stanford/im2p.yaml,sha256=JoToNyEPpmwdyLFedCBot5ypyw7p9rzi12mGXJuZin0,2909
20
20
  datamaestro_text/config/edu/upenn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  datamaestro_text/config/edu/upenn/ldc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
- datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=Y_biKee8LA8arsHiKOUlPBWfylDDM9k-x5UgN-uJdLE,1658
22
+ datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=YEU3kIXzv71Vjg9bUoXyQ-vSLvC-y4LlE3mJQf38XuY,1589
23
23
  datamaestro_text/config/gov/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  datamaestro_text/config/gov/nist/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  datamaestro_text/config/gov/nist/ir/covid.py,sha256=wn2E7sQ8M6pAucVD4sKJYImyzKUKphyiDFJD0oYRCbg,4004
@@ -28,7 +28,7 @@ datamaestro_text/config/gov/nist/trec/adhoc.py,sha256=vcFaLlZ-chwDt013MYI8bYZ2ug
28
28
  datamaestro_text/config/gov/nist/trec/clueweb.yaml,sha256=sm4UbdtMzWoDVPsewtVDS7Vj2jBOdgp18Xqo1X4ysQc,792
29
29
  datamaestro_text/config/gov/nist/trec/deeplearning.yaml,sha256=QGM7PtXLJRttNdOPE16o7-k3e5tA9HgcaM_-qFDV_5Q,2125
30
30
  datamaestro_text/config/gov/nist/trec/index.yaml,sha256=oSUhUmtukf5oXqUbJLhae8xZx7Uac5V6uZoUB-RJ7Sw,2711
31
- datamaestro_text/config/gov/nist/trec/tipster.py,sha256=rmVFcwUPAfD529rneZUlCLBke-edYjrBIH3n02-qfvc,5371
31
+ datamaestro_text/config/gov/nist/trec/tipster.py,sha256=t0w9lOBfvwt6YCYCyDj9fo1QiBXfs0q8qzKl74f40jU,5398
32
32
  datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJolWCts_VETACha3hfQ,563
33
33
  datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
34
34
  datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=LJYexk2ssZ5SqKIKLjPdybS9ju4g9fGIFy5eybtsvYQ,1224
@@ -41,28 +41,28 @@ datamaestro_text/config/org/grouplens/movielens.py,sha256=e78JoiHuwP6gbj7Q84UwPT
41
41
  datamaestro_text/config/org/universaldependencies/french.py,sha256=W_gDtfinjlw08qohX_PWvzQlacDwRFB7PeOzO33mRVU,2208
42
42
  datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml,sha256=IBy82CDNNLjJPNPzues1EgDXu0A5WDvUFeVNSOyrIpI,1137
43
43
  datamaestro_text/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
- datamaestro_text/data/embeddings.py,sha256=AskX7Ggvkpqhb-Je_hBTFp_vfkiWzWtJH1gFQxuUTwM,1155
45
- datamaestro_text/data/recommendation.py,sha256=MatelpJQiCMpNDuaQVBkRY809J1qiQo-8QO9XD2J_dU,303
44
+ datamaestro_text/data/embeddings.py,sha256=qdeZ4m6drFNkqW_4pKxJKdJHtOnDRs0P7QVJ7AO6xFE,1127
45
+ datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG3IHkkPJX9Y,279
46
46
  datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
47
- datamaestro_text/data/text.py,sha256=pOI8nrEd6RoQA28DVH1JufHTunr9vG3FQzwElR8YirI,499
47
+ datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
48
48
  datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
49
- datamaestro_text/data/conversation/base.py,sha256=zXX5sPMoVtDf4WTkLx45IvZz6lgKPgcUZFu6N4lVBlc,6457
50
- datamaestro_text/data/conversation/canard.py,sha256=IMxu5NfytWZLZ_cyT8UFOICbDE82HLJoNBT_6j36Faw,1808
51
- datamaestro_text/data/conversation/orconvqa.py,sha256=VG3GV5_IPQcvla9rrQPypDNcZYmKNMgozmr2oudeLA4,3802
49
+ datamaestro_text/data/conversation/base.py,sha256=PUVRCSMBlV9bSayBl-vnzsYvyr2Tdv_zTadIC_Tswe0,6508
50
+ datamaestro_text/data/conversation/canard.py,sha256=FI02rPNuzSsA_uXOcgcuQn31d3QahmHndzYCV1Zm8rk,3305
51
+ datamaestro_text/data/conversation/orconvqa.py,sha256=RL9cpK7QK2dsk_j2e4Th8wzGFq-G3ww_EjdM1eKh-8o,3764
52
52
  datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
53
53
  datamaestro_text/data/ir/__init__.py,sha256=EOVnRv9oKEEucMOgrFjhXxeUWEkaJkbXBndpkKhZaiY,8701
54
- datamaestro_text/data/ir/base.py,sha256=Cw8egjChpx4ksUwp-vTA70B2OWxROH7FIeJylsXggAk,1429
55
- datamaestro_text/data/ir/cord19.py,sha256=7xDIzsy63WrA9lxxyNOMu5ECRymu5x23EzYG977nS6Y,1440
54
+ datamaestro_text/data/ir/base.py,sha256=TSaY8UWoixqvKg1z5JfkXPnuZiVoChYEUvc7jjbpZqY,1495
55
+ datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
56
56
  datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
57
57
  datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
58
- datamaestro_text/data/ir/formats.py,sha256=rU9uJhdFDdlnQ2qfFowK_--1WVOH1fVgfXWOPPDF_FY,3160
58
+ datamaestro_text/data/ir/formats.py,sha256=K0mqmCGg0seneKo-Rt3cBfDVjEMS4_6t1MfNj4iW9Y4,3637
59
59
  datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
60
60
  datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
61
61
  datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
62
62
  datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
63
63
  datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
64
- datamaestro_text/datasets/irds/data.py,sha256=BTsFZdjFJ0XdlYRh2rcshAoeknHrypf7be2BGUbWWFk,19931
65
- datamaestro_text/datasets/irds/datasets.py,sha256=qtN-nX2_OV9FD339aZjwVL1jFf6I4T6MbNCcuRAjmtU,5682
64
+ datamaestro_text/datasets/irds/data.py,sha256=0V5nPMQeh1I3Sp36x_NE7xrbC_vbZAKhU5NONj95aok,22058
65
+ datamaestro_text/datasets/irds/datasets.py,sha256=yrJx3X7u7oYcHXsL8YmUrXsQhkiqkBC6LjeZA_Ldx5Q,5617
66
66
  datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
67
67
  datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
68
68
  datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
@@ -75,12 +75,12 @@ datamaestro_text/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
75
75
  datamaestro_text/transforms/ir/__init__.py,sha256=eWxr0ZtfK9F9J4TachW5V8Ra5AhQJueDyOGR8vXi1uA,6553
76
76
  datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_p5ha0,3340
77
77
  datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1vRQ,170
78
- datamaestro_text/utils/iter.py,sha256=2_UZ8y9Ma4k5U9ZD4w55Zfb6NGrKM1L4G40OygRm1is,2459
78
+ datamaestro_text/utils/iter.py,sha256=uzBmavBeqVyEvYmi9Ds2dqh0ywhJE-1SBO5v8AoIKcc,2537
79
79
  datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
80
80
  datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
81
- datamaestro_text-2025.1.7.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
82
- datamaestro_text-2025.1.7.dist-info/METADATA,sha256=-wsBcUcnEnOMiHb1ROIf43r55ZNNNjpIemuFXm5hHUY,1609
83
- datamaestro_text-2025.1.7.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
84
- datamaestro_text-2025.1.7.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
85
- datamaestro_text-2025.1.7.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
86
- datamaestro_text-2025.1.7.dist-info/RECORD,,
81
+ datamaestro_text-2025.5.13.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
82
+ datamaestro_text-2025.5.13.dist-info/METADATA,sha256=EYppi8IJMqWU3ObzwSvM_PuOkC_pgwGAxwvaFx2dG3A,1847
83
+ datamaestro_text-2025.5.13.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
84
+ datamaestro_text-2025.5.13.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
85
+ datamaestro_text-2025.5.13.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
86
+ datamaestro_text-2025.5.13.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.7.0)
2
+ Generator: setuptools (80.4.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5