datamaestro-text 2025.1.7__py3-none-any.whl → 2025.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/config/com/github/aagohary/canard.py +19 -12
- datamaestro_text/config/com/github/apple/ml-qrecc.py +6 -8
- datamaestro_text/config/com/microsoft/msmarco/passage.py +8 -7
- datamaestro_text/config/com/sentiment140.py +1 -6
- datamaestro_text/config/edu/stanford/glove.py +1 -0
- datamaestro_text/config/edu/upenn/ldc/aquaint.py +3 -4
- datamaestro_text/config/gov/nist/trec/tipster.py +1 -1
- datamaestro_text/data/conversation/base.py +10 -8
- datamaestro_text/data/conversation/canard.py +52 -13
- datamaestro_text/data/conversation/orconvqa.py +0 -1
- datamaestro_text/data/embeddings.py +3 -3
- datamaestro_text/data/ir/base.py +6 -0
- datamaestro_text/data/ir/cord19.py +2 -1
- datamaestro_text/data/ir/formats.py +31 -4
- datamaestro_text/data/recommendation.py +6 -7
- datamaestro_text/data/text.py +6 -6
- datamaestro_text/datasets/irds/data.py +65 -0
- datamaestro_text/datasets/irds/datasets.py +0 -4
- datamaestro_text/utils/iter.py +5 -2
- datamaestro_text/version.py +9 -4
- {datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/METADATA +10 -3
- {datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/RECORD +26 -26
- {datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/WHEEL +1 -1
- {datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info/licenses}/LICENSE +0 -0
- {datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,5 @@
|
|
|
1
|
-
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
-
|
|
3
1
|
from datamaestro.definitions import datatasks, datatags, dataset
|
|
4
|
-
from datamaestro.download.
|
|
2
|
+
from datamaestro.download.single import filedownloader
|
|
5
3
|
from datamaestro.utils import HashCheck
|
|
6
4
|
|
|
7
5
|
from datamaestro.data.ml import Supervised
|
|
@@ -10,14 +8,23 @@ from datamaestro_text.data.conversation.canard import CanardDataset
|
|
|
10
8
|
|
|
11
9
|
@datatags("conversation", "context", "query")
|
|
12
10
|
@datatasks("query rewriting")
|
|
13
|
-
@
|
|
14
|
-
"
|
|
15
|
-
"https://
|
|
16
|
-
|
|
17
|
-
|
|
11
|
+
@filedownloader(
|
|
12
|
+
"train.json",
|
|
13
|
+
"https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/train.json",
|
|
14
|
+
checker=HashCheck("73624ac646fb81e09b0fd7f01370ada3"),
|
|
15
|
+
)
|
|
16
|
+
@filedownloader(
|
|
17
|
+
"dev.json",
|
|
18
|
+
"https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/dev.json",
|
|
19
|
+
checker=HashCheck("c84525631a83bc771c58ff31f4a9b601"),
|
|
20
|
+
)
|
|
21
|
+
@filedownloader(
|
|
22
|
+
"test.json",
|
|
23
|
+
"https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/test.json",
|
|
24
|
+
checker=HashCheck("3fc14d0078e7a5056f5da571728f024e"),
|
|
18
25
|
)
|
|
19
26
|
@dataset(Supervised, url="https://sites.google.com/view/qanta/projects/canard", id="")
|
|
20
|
-
def main(
|
|
27
|
+
def main(train, dev, test):
|
|
21
28
|
"""Question-in-context rewriting
|
|
22
29
|
|
|
23
30
|
CANARD is a dataset for question-in-context rewriting that consists of
|
|
@@ -30,7 +37,7 @@ def main(archive):
|
|
|
30
37
|
Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
|
|
31
38
|
"""
|
|
32
39
|
return {
|
|
33
|
-
"train": CanardDataset(path=
|
|
34
|
-
"validation": CanardDataset(path=
|
|
35
|
-
"test": CanardDataset(path=
|
|
40
|
+
"train": CanardDataset(path=train),
|
|
41
|
+
"validation": CanardDataset(path=dev),
|
|
42
|
+
"test": CanardDataset(path=test),
|
|
36
43
|
}
|
|
@@ -25,12 +25,11 @@ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
|
|
|
25
25
|
checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
|
|
26
26
|
)
|
|
27
27
|
@dataset(
|
|
28
|
-
Supervised[QReCCDataset, None, QReCCDataset],
|
|
29
28
|
url="https://github.com/apple/ml-qrecc",
|
|
30
29
|
doi="https://doi.org/10.48550/arXiv.2010.04898",
|
|
31
30
|
id="",
|
|
32
31
|
)
|
|
33
|
-
def main(data: Path):
|
|
32
|
+
def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
|
|
34
33
|
"""Open-Domain Question Answering Goes Conversational via Question Rewriting
|
|
35
34
|
|
|
36
35
|
We introduce QReCC (Question Rewriting in Conversational Context), an
|
|
@@ -40,10 +39,10 @@ def main(data: Path):
|
|
|
40
39
|
answering that includes the individual subtasks of question rewriting,
|
|
41
40
|
passage retrieval and reading comprehension
|
|
42
41
|
"""
|
|
43
|
-
return
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
42
|
+
return Supervised(
|
|
43
|
+
train=QReCCDataset(path=data / "qrecc_train.json"),
|
|
44
|
+
test=QReCCDataset(path=data / "qrecc_test.json"),
|
|
45
|
+
)
|
|
47
46
|
|
|
48
47
|
|
|
49
48
|
@dataset(
|
|
@@ -52,7 +51,6 @@ def main(data: Path):
|
|
|
52
51
|
)
|
|
53
52
|
class Content(LZ4JSONLDocumentStore):
|
|
54
53
|
"""QReCC mentionned URLs content"""
|
|
55
|
-
|
|
56
54
|
@staticmethod
|
|
57
55
|
def __create_dataset__(dataset, options=None):
|
|
58
56
|
ds = reference(reference=main).setup(dataset, options)
|
|
@@ -67,7 +65,7 @@ class Content(LZ4JSONLDocumentStore):
|
|
|
67
65
|
"id",
|
|
68
66
|
).setup(dataset, options)
|
|
69
67
|
|
|
70
|
-
return
|
|
68
|
+
return Content(jsonl_path=store_path)
|
|
71
69
|
|
|
72
70
|
@staticmethod
|
|
73
71
|
def _documents(path: Path):
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
"""MS MARCO (Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. A variant of this task will be the part of TREC and AFIRM 2019. For Updates about TREC 2019 please follow This Repository Passage Reranking task Task Given a query q and a the 1000 most relevant passages P = p1, p2, p3,... p1000, as retrieved by BM25 a succeful system is expected to rerank the most relevant passage as high as possible. For this task not all 1000 relevant items have a human labeled relevant passage. Evaluation will be done using MRR.
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
3
|
+
**Publication**:
|
|
4
|
+
Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
|
|
5
|
+
MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
11
|
from datamaestro.annotations.agreement import useragreement
|
|
@@ -35,6 +35,7 @@ http://www.msmarco.org/dataset.aspx""",
|
|
|
35
35
|
|
|
36
36
|
# --- Document collection
|
|
37
37
|
|
|
38
|
+
|
|
38
39
|
# TODO: Not ideal since it would be better to have small versions right away
|
|
39
40
|
# instead of downloading again the MS Marco Collection
|
|
40
41
|
@lua
|
|
@@ -43,10 +44,10 @@ http://www.msmarco.org/dataset.aspx""",
|
|
|
43
44
|
url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
|
|
44
45
|
checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
|
|
45
46
|
)
|
|
46
|
-
@dataset(
|
|
47
|
-
def collection_etc(data):
|
|
47
|
+
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
48
|
+
def collection_etc(data) -> Folder:
|
|
48
49
|
"""Documents and some more files"""
|
|
49
|
-
return
|
|
50
|
+
return Folder(path=data)
|
|
50
51
|
|
|
51
52
|
|
|
52
53
|
@lua
|
|
@@ -1,14 +1,9 @@
|
|
|
1
1
|
from datamaestro.data.csv import Generic
|
|
2
|
-
from datamaestro.definitions import
|
|
2
|
+
from datamaestro.definitions import datatasks, datatags, dataset
|
|
3
3
|
from datamaestro.download.archive import zipdownloader
|
|
4
4
|
from datamaestro.data.ml import Supervised
|
|
5
5
|
from datamaestro.utils import HashCheck
|
|
6
6
|
|
|
7
|
-
# name: Sentiment140
|
|
8
|
-
# web: http://help.sentiment140.com/for-students/
|
|
9
|
-
|
|
10
|
-
# description: |
|
|
11
|
-
|
|
12
7
|
|
|
13
8
|
@zipdownloader(
|
|
14
9
|
"dir",
|
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
"""The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""
|
|
2
2
|
|
|
3
|
-
from datamaestro.
|
|
4
|
-
from datamaestro.
|
|
5
|
-
from datamaestro_text.data.ir.trec import TipsterCollection
|
|
6
|
-
from datamaestro.definitions import argument, datatasks, datatags, dataset
|
|
3
|
+
from datamaestro.context import DatafolderPath
|
|
4
|
+
from datamaestro.definitions import dataset
|
|
7
5
|
from datamaestro.download.links import links, linkfolder
|
|
6
|
+
from datamaestro_text.data.ir.trec import TipsterCollection
|
|
8
7
|
|
|
9
8
|
|
|
10
9
|
URL = "https://catalog.ldc.upenn.edu/LDC2002T31"
|
|
@@ -22,8 +22,8 @@ from datamaestro_text.data.ir.trec import TipsterCollection
|
|
|
22
22
|
from datamaestro.download.links import linkfolder
|
|
23
23
|
from datamaestro.definitions import (
|
|
24
24
|
dataset,
|
|
25
|
-
DatafolderPath,
|
|
26
25
|
)
|
|
26
|
+
from datamaestro.context import DatafolderPath
|
|
27
27
|
|
|
28
28
|
# Store meta-information
|
|
29
29
|
TIPSTER = dataset(TipsterCollection, url="https://catalog.ldc.upenn.edu/LDC93T3A")
|
|
@@ -129,6 +129,8 @@ class ConversationNode:
|
|
|
129
129
|
|
|
130
130
|
|
|
131
131
|
class ConversationTree(ABC):
|
|
132
|
+
"""Represents a conversation tree"""
|
|
133
|
+
|
|
132
134
|
@abstractmethod
|
|
133
135
|
def root(self) -> ConversationNode:
|
|
134
136
|
...
|
|
@@ -192,7 +194,7 @@ class SingleConversationTreeNode(ConversationNode):
|
|
|
192
194
|
return (
|
|
193
195
|
SingleConversationTreeNode(self.tree, self.index + 1)
|
|
194
196
|
if self.index < len(self.tree.history) - 1
|
|
195
|
-
else
|
|
197
|
+
else None
|
|
196
198
|
)
|
|
197
199
|
|
|
198
200
|
def children(self) -> List[ConversationNode]:
|
|
@@ -207,8 +209,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
207
209
|
"""A conversation tree node"""
|
|
208
210
|
|
|
209
211
|
entry: Record
|
|
210
|
-
|
|
211
|
-
|
|
212
|
+
_parent: Optional["ConversationTreeNode"]
|
|
213
|
+
_children: List["ConversationTreeNode"]
|
|
212
214
|
|
|
213
215
|
def __init__(self, entry):
|
|
214
216
|
self.entry = entry
|
|
@@ -216,8 +218,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
216
218
|
self.children = []
|
|
217
219
|
|
|
218
220
|
def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
|
|
219
|
-
self.
|
|
220
|
-
node.
|
|
221
|
+
self._children.append(node)
|
|
222
|
+
node._parent = self
|
|
221
223
|
return node
|
|
222
224
|
|
|
223
225
|
def conversation(self, skip_self: bool) -> ConversationHistory:
|
|
@@ -232,14 +234,14 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
232
234
|
def __iter__(self) -> Iterator["ConversationTreeNode"]:
|
|
233
235
|
"""Iterates over all conversation tree nodes (pre-order)"""
|
|
234
236
|
yield self.entry
|
|
235
|
-
for child in self.
|
|
237
|
+
for child in self._children:
|
|
236
238
|
yield from child
|
|
237
239
|
|
|
238
240
|
def parent(self) -> Optional[ConversationNode]:
|
|
239
|
-
return self.
|
|
241
|
+
return self._parent
|
|
240
242
|
|
|
241
243
|
def children(self) -> List[ConversationNode]:
|
|
242
|
-
return self.
|
|
244
|
+
return self._children
|
|
243
245
|
|
|
244
246
|
def root(self):
|
|
245
247
|
return self
|
|
@@ -1,12 +1,17 @@
|
|
|
1
1
|
from typing import Iterator, List
|
|
2
2
|
from attr import define
|
|
3
3
|
import json
|
|
4
|
+
from datamaestro.record import Record
|
|
4
5
|
from datamaestro.data import File
|
|
5
|
-
from .base import (
|
|
6
|
+
from datamaestro_text.data.conversation.base import (
|
|
7
|
+
ConversationDataset,
|
|
6
8
|
ConversationTree,
|
|
7
9
|
SingleConversationTree,
|
|
10
|
+
SimpleDecontextualizedItem,
|
|
11
|
+
EntryType,
|
|
8
12
|
)
|
|
9
|
-
from . import
|
|
13
|
+
from datamaestro_text.data.ir import IDItem, SimpleTextItem
|
|
14
|
+
import logging
|
|
10
15
|
|
|
11
16
|
|
|
12
17
|
@define(kw_only=True)
|
|
@@ -30,7 +35,10 @@ class CanardConversation:
|
|
|
30
35
|
|
|
31
36
|
|
|
32
37
|
class CanardDataset(ConversationDataset, File):
|
|
33
|
-
"""A dataset in the CANARD JSON format
|
|
38
|
+
"""A dataset in the CANARD JSON format
|
|
39
|
+
|
|
40
|
+
The CANARD dataset is composed of
|
|
41
|
+
"""
|
|
34
42
|
|
|
35
43
|
def entries(self) -> Iterator[CanardConversation]:
|
|
36
44
|
"""Iterates over re-written query with their context"""
|
|
@@ -47,22 +55,53 @@ class CanardDataset(ConversationDataset, File):
|
|
|
47
55
|
)
|
|
48
56
|
|
|
49
57
|
def __iter__(self) -> Iterator[ConversationTree]:
|
|
50
|
-
history = []
|
|
58
|
+
history: list[Record] = []
|
|
51
59
|
current_id = None
|
|
52
60
|
|
|
53
61
|
for entry in self.entries():
|
|
54
|
-
# Check if current conversation
|
|
55
|
-
if current_id != entry.dialogue_id
|
|
56
|
-
|
|
57
|
-
|
|
62
|
+
# Check if current conversation, otherwise we are OK
|
|
63
|
+
if current_id != entry.dialogue_id:
|
|
64
|
+
if current_id is not None:
|
|
65
|
+
history.reverse()
|
|
66
|
+
yield SingleConversationTree(current_id, history)
|
|
67
|
+
history = []
|
|
68
|
+
|
|
69
|
+
current_id = entry.dialogue_id
|
|
70
|
+
|
|
71
|
+
if not history:
|
|
72
|
+
# First round
|
|
73
|
+
# The two first items are the wikipedia title and section,
|
|
74
|
+
# we interpret them as two user queries
|
|
75
|
+
assert len(entry.history) == 2
|
|
76
|
+
history.extend(
|
|
77
|
+
Record(
|
|
78
|
+
SimpleTextItem(text),
|
|
79
|
+
EntryType.USER_QUERY,
|
|
80
|
+
)
|
|
81
|
+
for text in entry.history
|
|
82
|
+
)
|
|
83
|
+
else:
|
|
84
|
+
# The utterance before the last is the last user query
|
|
85
|
+
assert (
|
|
86
|
+
entry.history[-2] == history[-1][SimpleTextItem].text
|
|
87
|
+
), f"{entry.dialogue_id} {entry.history} / {history[-4:-1]}"
|
|
88
|
+
|
|
89
|
+
# The last utterance is the system side
|
|
90
|
+
history.append(
|
|
91
|
+
Record(SimpleTextItem(entry.history[-1]), EntryType.SYSTEM_ANSWER)
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
assert len(entry.history) == len(history)
|
|
58
95
|
|
|
59
96
|
# Add to current
|
|
60
97
|
history.append(
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
98
|
+
Record(
|
|
99
|
+
IDItem(f"{entry.dialogue_id}-{entry.query_no}"),
|
|
100
|
+
SimpleTextItem(entry.query),
|
|
101
|
+
SimpleDecontextualizedItem(entry.rewrite),
|
|
102
|
+
EntryType.USER_QUERY,
|
|
65
103
|
)
|
|
66
104
|
)
|
|
67
105
|
|
|
68
|
-
|
|
106
|
+
if current_id:
|
|
107
|
+
yield SingleConversationTree(current_id, history)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
from
|
|
2
|
-
from datamaestro.data import Base, File
|
|
1
|
+
from experimaestro import Meta
|
|
2
|
+
from datamaestro.data import Base, File
|
|
3
3
|
from datamaestro.definitions import datatags
|
|
4
4
|
import numpy as np
|
|
5
5
|
from typing import Tuple, List
|
|
@@ -18,9 +18,9 @@ class WordEmbeddings(Base):
|
|
|
18
18
|
raise NotImplementedError()
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
@argument("encoding", str, ignored=True, default="utf-8")
|
|
22
21
|
class WordEmbeddingsText(WordEmbeddings, File):
|
|
23
22
|
"""Word embeddings as a text word / values"""
|
|
23
|
+
encoding: Meta[str] = "utf-8"
|
|
24
24
|
|
|
25
25
|
def load(self):
|
|
26
26
|
words = []
|
datamaestro_text/data/ir/base.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from csv import DictReader
|
|
2
2
|
from typing import Iterator
|
|
3
3
|
|
|
4
|
-
from
|
|
4
|
+
from experimaestro import documentation
|
|
5
|
+
from datamaestro.data import File
|
|
5
6
|
from datamaestro.record import Record
|
|
6
7
|
from datamaestro_text.data.ir import Documents, TopicRecord, Topics, IDItem
|
|
7
8
|
from datamaestro_text.data.ir.formats import (
|
|
@@ -58,7 +58,7 @@ class NFCorpusDocument(TextItem):
|
|
|
58
58
|
|
|
59
59
|
@cached_property
|
|
60
60
|
def text(self):
|
|
61
|
-
return self.abstract
|
|
61
|
+
return f"{self.title} {self.abstract}"
|
|
62
62
|
|
|
63
63
|
|
|
64
64
|
@define
|
|
@@ -99,8 +99,8 @@ class WapoDocument(TextItem):
|
|
|
99
99
|
body_media: Tuple[WapoDocMedia, ...]
|
|
100
100
|
|
|
101
101
|
@cached_property
|
|
102
|
-
def text(self):
|
|
103
|
-
return self.
|
|
102
|
+
def text(self):
|
|
103
|
+
return f"{self.title} {self.body_paras_html}"
|
|
104
104
|
|
|
105
105
|
|
|
106
106
|
@define
|
|
@@ -126,13 +126,24 @@ class OrConvQADocument(TextItem):
|
|
|
126
126
|
def text(self):
|
|
127
127
|
return f"{self.title} {self.body}"
|
|
128
128
|
|
|
129
|
+
|
|
129
130
|
@define
|
|
131
|
+
class DprW100Doc(TextItem):
|
|
132
|
+
text: str
|
|
133
|
+
title: str
|
|
134
|
+
|
|
135
|
+
@define
|
|
136
|
+
class MsMarcoV2Passage(TextItem):
|
|
137
|
+
text: str
|
|
138
|
+
spans: Tuple[Tuple[int, int], ...]
|
|
139
|
+
msmarco_document_id: str
|
|
130
140
|
class Touche2020(TextItem):
|
|
131
141
|
text: str
|
|
132
142
|
title: str
|
|
133
143
|
stance: str
|
|
134
144
|
url: str
|
|
135
145
|
|
|
146
|
+
|
|
136
147
|
@define
|
|
137
148
|
class SciDocs(TextItem):
|
|
138
149
|
text: str
|
|
@@ -175,7 +186,8 @@ class TrecMb14Query(TextItem):
|
|
|
175
186
|
def get_text(self):
|
|
176
187
|
return f"{self.query}"
|
|
177
188
|
|
|
178
|
-
|
|
189
|
+
|
|
190
|
+
@define
|
|
179
191
|
class SciDocsTopic(TextItem):
|
|
180
192
|
text: str
|
|
181
193
|
authors: List[str]
|
|
@@ -183,6 +195,7 @@ class SciDocsTopic(TextItem):
|
|
|
183
195
|
cited_by: List[str]
|
|
184
196
|
references: List[str]
|
|
185
197
|
|
|
198
|
+
|
|
186
199
|
@define()
|
|
187
200
|
class TrecTopic(SimpleTextItem):
|
|
188
201
|
description: str
|
|
@@ -190,3 +203,17 @@ class TrecTopic(SimpleTextItem):
|
|
|
190
203
|
|
|
191
204
|
|
|
192
205
|
TrecTopicRecord = record_type(IDItem, TrecTopic)
|
|
206
|
+
|
|
207
|
+
@define
|
|
208
|
+
class DprW100Query(TextItem):
|
|
209
|
+
text: str
|
|
210
|
+
answers: Tuple[str]
|
|
211
|
+
|
|
212
|
+
@define
|
|
213
|
+
class TrecBackgroundLinkingQuery(IDItem):
|
|
214
|
+
query_id: str
|
|
215
|
+
doc_id: str
|
|
216
|
+
url: str
|
|
217
|
+
|
|
218
|
+
def get_text(self):
|
|
219
|
+
raise NotImplementedError()
|
|
@@ -1,14 +1,13 @@
|
|
|
1
|
-
from
|
|
1
|
+
from experimaestro import Param
|
|
2
|
+
from datamaestro.data import Base, File
|
|
2
3
|
import datamaestro.data.csv as csv
|
|
3
4
|
|
|
4
5
|
|
|
5
|
-
@argument("ratings", type=File)
|
|
6
6
|
class RatedItems(Base):
|
|
7
|
-
|
|
7
|
+
ratings: Param[File]
|
|
8
8
|
|
|
9
9
|
|
|
10
|
-
@argument("links", type=csv.Generic)
|
|
11
|
-
@argument("movies", type=csv.Generic)
|
|
12
|
-
@argument("tags", type=csv.Generic)
|
|
13
10
|
class Movielens(RatedItems):
|
|
14
|
-
|
|
11
|
+
links: Param[csv.Generic]
|
|
12
|
+
movies: Param[csv.Generic]
|
|
13
|
+
tags: Param[csv.Generic]
|
datamaestro_text/data/text.py
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
|
-
from
|
|
2
|
-
from
|
|
1
|
+
from typing import Optional
|
|
2
|
+
from experimaestro import Param
|
|
3
|
+
from datamaestro.data import Base, Folder, File
|
|
3
4
|
from datamaestro.data.ml import Supervised
|
|
4
5
|
|
|
5
6
|
|
|
6
|
-
@argument("train", type=Base)
|
|
7
|
-
@argument("test", type=Base, required=False)
|
|
8
|
-
@argument("validation", type=Base, required=False)
|
|
9
7
|
class TrainingText(Supervised):
|
|
10
8
|
""" "A dataset used for training with a train and a test"""
|
|
11
9
|
|
|
12
|
-
|
|
10
|
+
train: Param[Base]
|
|
11
|
+
test: Param[Optional[Base]] = None
|
|
12
|
+
validation: Param[Optional[Base]] = None
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
class TextFolder(Folder):
|
|
@@ -37,6 +37,7 @@ from datamaestro_text.data.ir.base import (
|
|
|
37
37
|
SimpleAdhocAssessment,
|
|
38
38
|
SimpleTextItem,
|
|
39
39
|
TopicRecord,
|
|
40
|
+
UrlItem,
|
|
40
41
|
create_record,
|
|
41
42
|
)
|
|
42
43
|
|
|
@@ -165,6 +166,19 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
165
166
|
"source",
|
|
166
167
|
"source_content_type",
|
|
167
168
|
),
|
|
169
|
+
_irds.dpr_w100.DprW100Doc: tuple_constructor(
|
|
170
|
+
formats.DprW100Doc,
|
|
171
|
+
"doc_id",
|
|
172
|
+
"text",
|
|
173
|
+
"title",
|
|
174
|
+
),
|
|
175
|
+
_irds.msmarco_passage_v2.MsMarcoV2Passage: tuple_constructor(
|
|
176
|
+
formats.MsMarcoV2Passage,
|
|
177
|
+
"doc_id",
|
|
178
|
+
"text",
|
|
179
|
+
"spans",
|
|
180
|
+
"msmarco_document_id",
|
|
181
|
+
),
|
|
168
182
|
}
|
|
169
183
|
|
|
170
184
|
"""Wraps an ir datasets collection -- and provide a default text
|
|
@@ -385,6 +399,12 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
385
399
|
"tweet_time",
|
|
386
400
|
"description",
|
|
387
401
|
),
|
|
402
|
+
_irds.dpr_w100.DprW100Query: tuple_constructor(
|
|
403
|
+
formats.DprW100Query,
|
|
404
|
+
"query_id",
|
|
405
|
+
"text",
|
|
406
|
+
"answers"
|
|
407
|
+
),
|
|
388
408
|
}
|
|
389
409
|
|
|
390
410
|
HANDLERS = {
|
|
@@ -415,7 +435,52 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
415
435
|
def iter(self) -> Iterator[TopicRecord]:
|
|
416
436
|
"""Returns an iterator over topics"""
|
|
417
437
|
return self.handler.iter()
|
|
438
|
+
|
|
439
|
+
class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
|
|
440
|
+
def __init__(self, dataset):
|
|
441
|
+
self.dataset = dataset
|
|
442
|
+
|
|
443
|
+
@cached_property
|
|
444
|
+
def ext2records(self):
|
|
445
|
+
return {record[IDItem].id: record for record in self.records}
|
|
446
|
+
|
|
447
|
+
def topic_int(self, internal_topic_id: int) -> TopicRecord:
|
|
448
|
+
"""Returns a document given its internal ID"""
|
|
449
|
+
return self.records[internal_topic_id]
|
|
450
|
+
|
|
451
|
+
def topic_ext(self, external_topic_id: str) -> TopicRecord:
|
|
452
|
+
"""Returns a document given its external ID"""
|
|
453
|
+
return self.ext2records[external_topic_id]
|
|
418
454
|
|
|
455
|
+
def iter(self) -> Iterator[ir.TopicRecord]:
|
|
456
|
+
"""Returns an iterator over topics"""
|
|
457
|
+
return iter(self.records)
|
|
458
|
+
|
|
459
|
+
@cached_property
|
|
460
|
+
def records(self):
|
|
461
|
+
try:
|
|
462
|
+
records = []
|
|
463
|
+
|
|
464
|
+
for query in self.dataset.dataset.queries_iter():
|
|
465
|
+
topic = Record(
|
|
466
|
+
IDItem(query.query_id),
|
|
467
|
+
# Following BEIR documentation, we use title of documents as queries: https://github.com/beir-cellar/beir/blob/main/examples/dataset/README.md#queries-and-qrels
|
|
468
|
+
SimpleTextItem(self.dataset.dataset.docs_store().get(query.doc_id).title),
|
|
469
|
+
UrlItem(query.url),
|
|
470
|
+
)
|
|
471
|
+
records.append(topic)
|
|
472
|
+
except Exception:
|
|
473
|
+
logging.exception("Error while computing topic records")
|
|
474
|
+
raise
|
|
475
|
+
|
|
476
|
+
return records
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
Topics.HANDLERS.update(
|
|
480
|
+
{
|
|
481
|
+
_irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler
|
|
482
|
+
}
|
|
483
|
+
)
|
|
419
484
|
|
|
420
485
|
class CastTopicsHandler(TopicsHandler):
|
|
421
486
|
def __init__(self, dataset):
|
datamaestro_text/utils/iter.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Callable, TypeVar, Iterator, List, Union
|
|
1
|
+
from typing import Callable, Sequence, TypeVar, Iterator, List, Union
|
|
2
2
|
|
|
3
3
|
T = TypeVar("T")
|
|
4
4
|
|
|
@@ -45,7 +45,7 @@ class RangeView:
|
|
|
45
45
|
return RangeView(self.source, key)
|
|
46
46
|
|
|
47
47
|
|
|
48
|
-
class LazyList:
|
|
48
|
+
class LazyList(Sequence):
|
|
49
49
|
"""Iterable-based list
|
|
50
50
|
|
|
51
51
|
The list is only materialized if needed"""
|
|
@@ -63,6 +63,9 @@ class LazyList:
|
|
|
63
63
|
else:
|
|
64
64
|
return iter(self.materialized_list)
|
|
65
65
|
|
|
66
|
+
def __len__(self):
|
|
67
|
+
return len(self.iterable)
|
|
68
|
+
|
|
66
69
|
def __getitem__(self, index):
|
|
67
70
|
# Materialize the list if accessing an index above the threshold or any slice
|
|
68
71
|
if isinstance(index, slice) or index >= self.materialize_threshold:
|
datamaestro_text/version.py
CHANGED
|
@@ -1,8 +1,13 @@
|
|
|
1
|
-
# file generated by
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
2
|
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
|
|
5
|
+
|
|
3
6
|
TYPE_CHECKING = False
|
|
4
7
|
if TYPE_CHECKING:
|
|
5
|
-
from typing import Tuple
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
from typing import Union
|
|
10
|
+
|
|
6
11
|
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
7
12
|
else:
|
|
8
13
|
VERSION_TUPLE = object
|
|
@@ -12,5 +17,5 @@ __version__: str
|
|
|
12
17
|
__version_tuple__: VERSION_TUPLE
|
|
13
18
|
version_tuple: VERSION_TUPLE
|
|
14
19
|
|
|
15
|
-
__version__ = version = '2025.
|
|
16
|
-
__version_tuple__ = version_tuple = (2025,
|
|
20
|
+
__version__ = version = '2025.5.13'
|
|
21
|
+
__version_tuple__ = version_tuple = (2025, 5, 13)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version: 2025.
|
|
3
|
+
Version: 2025.5.13
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
6
|
License: GPL-3
|
|
@@ -18,9 +18,16 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
18
18
|
Requires-Python: >=3.8
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
|
-
Requires-Dist: datamaestro>=1.2
|
|
21
|
+
Requires-Dist: datamaestro>=1.4.2
|
|
22
22
|
Requires-Dist: ir_datasets>=0.5.8
|
|
23
23
|
Requires-Dist: attrs
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest; extra == "dev"
|
|
26
|
+
Requires-Dist: docutils; extra == "dev"
|
|
27
|
+
Requires-Dist: sphobjinv; extra == "dev"
|
|
28
|
+
Requires-Dist: flake8; extra == "dev"
|
|
29
|
+
Requires-Dist: sphinx; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
24
31
|
|
|
25
32
|
[](https://github.com/pre-commit/pre-commit) [](https://badge.fury.io/py/datamaestro-text)
|
|
26
33
|
|
|
@@ -1,25 +1,25 @@
|
|
|
1
1
|
datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
|
|
2
|
-
datamaestro_text/version.py,sha256=
|
|
2
|
+
datamaestro_text/version.py,sha256=EsLzhbhZSIiOqGSyEpMlneQnIpzB12JreUxG8EMn7EE,519
|
|
3
3
|
datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
|
|
5
5
|
datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
|
|
6
|
-
datamaestro_text/config/com/sentiment140.py,sha256=
|
|
6
|
+
datamaestro_text/config/com/sentiment140.py,sha256=3cZfqs395gY14gGojRC_RTYEdG5l8RUTSdF9li8ynBc,1283
|
|
7
7
|
datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
|
|
8
|
-
datamaestro_text/config/com/github/aagohary/canard.py,sha256=
|
|
9
|
-
datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256
|
|
8
|
+
datamaestro_text/config/com/github/aagohary/canard.py,sha256=DREIKU2FFrfwibjU48Y1AsKTUzhwxmD4Kuykd4bDIus,1762
|
|
9
|
+
datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=-iHKJocWZ8N9N-P8E45y4ewg3OT_23XonlDh5-NcH2g,3055
|
|
10
10
|
datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=Tg3hbnvilxW_Lwt7fpKvGMtu-6mc9oNIHM-LX6JTR1U,3026
|
|
11
11
|
datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
|
|
12
12
|
datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
|
|
13
|
-
datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=
|
|
13
|
+
datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=tVldwVh0pQpXXifqcMNEN9cLO1HXkXoEhToSazSx5RE,11643
|
|
14
14
|
datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=hCFjZg9t1AHET05qU31uatiAOD5EEzzWG9y_W90jcOE,861
|
|
15
15
|
datamaestro_text/config/edu/cornell/nlvr.yaml,sha256=9Yk5VZMncSmrP7JNuGXqExksgX5nQ_Zfnlps8hWze3Q,921
|
|
16
16
|
datamaestro_text/config/edu/stanford/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
datamaestro_text/config/edu/stanford/aclimdb.py,sha256=lsPDxnp_rWOCpBte6pZ0_LVaC33w5mmgfGh51rcTgt8,643
|
|
18
|
-
datamaestro_text/config/edu/stanford/glove.py,sha256=
|
|
18
|
+
datamaestro_text/config/edu/stanford/glove.py,sha256=bXxwiJqT8alPs5MwwxVuY0xBO8g1QWhm2L3AKVskTlA,2391
|
|
19
19
|
datamaestro_text/config/edu/stanford/im2p.yaml,sha256=JoToNyEPpmwdyLFedCBot5ypyw7p9rzi12mGXJuZin0,2909
|
|
20
20
|
datamaestro_text/config/edu/upenn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
21
|
datamaestro_text/config/edu/upenn/ldc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
|
-
datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=
|
|
22
|
+
datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=YEU3kIXzv71Vjg9bUoXyQ-vSLvC-y4LlE3mJQf38XuY,1589
|
|
23
23
|
datamaestro_text/config/gov/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
24
|
datamaestro_text/config/gov/nist/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
25
|
datamaestro_text/config/gov/nist/ir/covid.py,sha256=wn2E7sQ8M6pAucVD4sKJYImyzKUKphyiDFJD0oYRCbg,4004
|
|
@@ -28,7 +28,7 @@ datamaestro_text/config/gov/nist/trec/adhoc.py,sha256=vcFaLlZ-chwDt013MYI8bYZ2ug
|
|
|
28
28
|
datamaestro_text/config/gov/nist/trec/clueweb.yaml,sha256=sm4UbdtMzWoDVPsewtVDS7Vj2jBOdgp18Xqo1X4ysQc,792
|
|
29
29
|
datamaestro_text/config/gov/nist/trec/deeplearning.yaml,sha256=QGM7PtXLJRttNdOPE16o7-k3e5tA9HgcaM_-qFDV_5Q,2125
|
|
30
30
|
datamaestro_text/config/gov/nist/trec/index.yaml,sha256=oSUhUmtukf5oXqUbJLhae8xZx7Uac5V6uZoUB-RJ7Sw,2711
|
|
31
|
-
datamaestro_text/config/gov/nist/trec/tipster.py,sha256=
|
|
31
|
+
datamaestro_text/config/gov/nist/trec/tipster.py,sha256=t0w9lOBfvwt6YCYCyDj9fo1QiBXfs0q8qzKl74f40jU,5398
|
|
32
32
|
datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJolWCts_VETACha3hfQ,563
|
|
33
33
|
datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
|
|
34
34
|
datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=LJYexk2ssZ5SqKIKLjPdybS9ju4g9fGIFy5eybtsvYQ,1224
|
|
@@ -41,28 +41,28 @@ datamaestro_text/config/org/grouplens/movielens.py,sha256=e78JoiHuwP6gbj7Q84UwPT
|
|
|
41
41
|
datamaestro_text/config/org/universaldependencies/french.py,sha256=W_gDtfinjlw08qohX_PWvzQlacDwRFB7PeOzO33mRVU,2208
|
|
42
42
|
datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml,sha256=IBy82CDNNLjJPNPzues1EgDXu0A5WDvUFeVNSOyrIpI,1137
|
|
43
43
|
datamaestro_text/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
44
|
-
datamaestro_text/data/embeddings.py,sha256=
|
|
45
|
-
datamaestro_text/data/recommendation.py,sha256=
|
|
44
|
+
datamaestro_text/data/embeddings.py,sha256=qdeZ4m6drFNkqW_4pKxJKdJHtOnDRs0P7QVJ7AO6xFE,1127
|
|
45
|
+
datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG3IHkkPJX9Y,279
|
|
46
46
|
datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
|
|
47
|
-
datamaestro_text/data/text.py,sha256=
|
|
47
|
+
datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
|
|
48
48
|
datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
|
|
49
|
-
datamaestro_text/data/conversation/base.py,sha256=
|
|
50
|
-
datamaestro_text/data/conversation/canard.py,sha256=
|
|
51
|
-
datamaestro_text/data/conversation/orconvqa.py,sha256=
|
|
49
|
+
datamaestro_text/data/conversation/base.py,sha256=PUVRCSMBlV9bSayBl-vnzsYvyr2Tdv_zTadIC_Tswe0,6508
|
|
50
|
+
datamaestro_text/data/conversation/canard.py,sha256=FI02rPNuzSsA_uXOcgcuQn31d3QahmHndzYCV1Zm8rk,3305
|
|
51
|
+
datamaestro_text/data/conversation/orconvqa.py,sha256=RL9cpK7QK2dsk_j2e4Th8wzGFq-G3ww_EjdM1eKh-8o,3764
|
|
52
52
|
datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
|
|
53
53
|
datamaestro_text/data/ir/__init__.py,sha256=EOVnRv9oKEEucMOgrFjhXxeUWEkaJkbXBndpkKhZaiY,8701
|
|
54
|
-
datamaestro_text/data/ir/base.py,sha256=
|
|
55
|
-
datamaestro_text/data/ir/cord19.py,sha256=
|
|
54
|
+
datamaestro_text/data/ir/base.py,sha256=TSaY8UWoixqvKg1z5JfkXPnuZiVoChYEUvc7jjbpZqY,1495
|
|
55
|
+
datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
|
|
56
56
|
datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
|
|
57
57
|
datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
|
|
58
|
-
datamaestro_text/data/ir/formats.py,sha256=
|
|
58
|
+
datamaestro_text/data/ir/formats.py,sha256=K0mqmCGg0seneKo-Rt3cBfDVjEMS4_6t1MfNj4iW9Y4,3637
|
|
59
59
|
datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
|
|
60
60
|
datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
|
|
61
61
|
datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
|
|
62
62
|
datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
|
|
63
63
|
datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
|
|
64
|
-
datamaestro_text/datasets/irds/data.py,sha256=
|
|
65
|
-
datamaestro_text/datasets/irds/datasets.py,sha256=
|
|
64
|
+
datamaestro_text/datasets/irds/data.py,sha256=0V5nPMQeh1I3Sp36x_NE7xrbC_vbZAKhU5NONj95aok,22058
|
|
65
|
+
datamaestro_text/datasets/irds/datasets.py,sha256=yrJx3X7u7oYcHXsL8YmUrXsQhkiqkBC6LjeZA_Ldx5Q,5617
|
|
66
66
|
datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
|
|
67
67
|
datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
|
|
68
68
|
datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
|
|
@@ -75,12 +75,12 @@ datamaestro_text/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
|
|
|
75
75
|
datamaestro_text/transforms/ir/__init__.py,sha256=eWxr0ZtfK9F9J4TachW5V8Ra5AhQJueDyOGR8vXi1uA,6553
|
|
76
76
|
datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_p5ha0,3340
|
|
77
77
|
datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1vRQ,170
|
|
78
|
-
datamaestro_text/utils/iter.py,sha256=
|
|
78
|
+
datamaestro_text/utils/iter.py,sha256=uzBmavBeqVyEvYmi9Ds2dqh0ywhJE-1SBO5v8AoIKcc,2537
|
|
79
79
|
datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
|
|
80
80
|
datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
|
|
81
|
-
datamaestro_text-2025.
|
|
82
|
-
datamaestro_text-2025.
|
|
83
|
-
datamaestro_text-2025.
|
|
84
|
-
datamaestro_text-2025.
|
|
85
|
-
datamaestro_text-2025.
|
|
86
|
-
datamaestro_text-2025.
|
|
81
|
+
datamaestro_text-2025.5.13.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
82
|
+
datamaestro_text-2025.5.13.dist-info/METADATA,sha256=EYppi8IJMqWU3ObzwSvM_PuOkC_pgwGAxwvaFx2dG3A,1847
|
|
83
|
+
datamaestro_text-2025.5.13.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
|
|
84
|
+
datamaestro_text-2025.5.13.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
|
|
85
|
+
datamaestro_text-2025.5.13.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
|
|
86
|
+
datamaestro_text-2025.5.13.dist-info/RECORD,,
|
{datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info/licenses}/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|