datamaestro-text 2025.6.30__py3-none-any.whl → 2025.9.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/config/com/github/aagohary/canard.py +3 -3
- datamaestro_text/config/com/github/apple/ml-qrecc.py +2 -1
- datamaestro_text/config/com/github/ikat.py +102 -19
- datamaestro_text/config/com/microsoft/msmarco/passage.py +1 -1
- datamaestro_text/config/com/sentiment140.py +4 -4
- datamaestro_text/config/edu/stanford/aclimdb.py +2 -2
- datamaestro_text/config/gov/nist/trec/tipster.py +0 -1
- datamaestro_text/config/io/github/thunlp/fewrel.py +1 -1
- datamaestro_text/config/io/metamind/research/wikitext.py +3 -3
- datamaestro_text/config/org/grouplens/movielens.py +8 -8
- datamaestro_text/config/org/universaldependencies/french.py +3 -3
- datamaestro_text/data/conversation/base.py +34 -9
- datamaestro_text/data/conversation/ikat.py +38 -13
- datamaestro_text/data/ir/__init__.py +44 -4
- datamaestro_text/data/ir/base.py +2 -1
- datamaestro_text/data/ir/formats.py +8 -2
- datamaestro_text/data/ir/stores.py +99 -4
- datamaestro_text/data/ir/trec.py +7 -4
- datamaestro_text/datasets/irds/data.py +47 -16
- datamaestro_text/interfaces/trec.py +28 -1
- datamaestro_text/utils/files.py +103 -0
- datamaestro_text/utils/iter.py +5 -0
- datamaestro_text/version.py +16 -3
- {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/METADATA +3 -3
- {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/RECORD +29 -29
- {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/WHEEL +0 -0
- {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/licenses/LICENSE +0 -0
- {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/top_level.txt +0 -0
|
@@ -37,7 +37,7 @@ def main(train, dev, test):
|
|
|
37
37
|
Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
|
|
38
38
|
"""
|
|
39
39
|
return {
|
|
40
|
-
"train": CanardDataset(path=train),
|
|
41
|
-
"validation": CanardDataset(path=dev),
|
|
42
|
-
"test": CanardDataset(path=test),
|
|
40
|
+
"train": CanardDataset.C(path=train),
|
|
41
|
+
"validation": CanardDataset.C(path=dev),
|
|
42
|
+
"test": CanardDataset.C(path=test),
|
|
43
43
|
}
|
|
@@ -51,6 +51,7 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
|
|
|
51
51
|
)
|
|
52
52
|
class Content(LZ4JSONLDocumentStore):
|
|
53
53
|
"""QReCC mentioned URLs content"""
|
|
54
|
+
|
|
54
55
|
@staticmethod
|
|
55
56
|
def __create_dataset__(dataset, options=None):
|
|
56
57
|
ds = reference(reference=main).setup(dataset, options)
|
|
@@ -65,7 +66,7 @@ class Content(LZ4JSONLDocumentStore):
|
|
|
65
66
|
"id",
|
|
66
67
|
).setup(dataset, options)
|
|
67
68
|
|
|
68
|
-
return Content(jsonl_path=store_path)
|
|
69
|
+
return Content.C(jsonl_path=store_path)
|
|
69
70
|
|
|
70
71
|
@staticmethod
|
|
71
72
|
def _documents(path: Path):
|
|
@@ -1,38 +1,121 @@
|
|
|
1
1
|
# See documentation on https://datamaestro.readthedocs.io
|
|
2
2
|
|
|
3
|
+
import bz2
|
|
4
|
+
from datamaestro.download import reference
|
|
3
5
|
from datamaestro.definitions import datatasks, datatags, dataset
|
|
4
|
-
from
|
|
5
|
-
from
|
|
6
|
+
from datamaestro_text.data.conversation.base import ConversationUserTopics
|
|
7
|
+
from datamaestro_text.data.ir import Adhoc
|
|
6
8
|
|
|
7
9
|
from datamaestro.utils import HashCheck
|
|
10
|
+
from datamaestro.context import DatafolderPath
|
|
8
11
|
from datamaestro.download.single import filedownloader
|
|
9
|
-
from datamaestro_text.data.conversation.ikat import
|
|
10
|
-
from
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
import
|
|
12
|
+
from datamaestro_text.data.conversation.ikat import IkatConversations
|
|
13
|
+
from datamaestro.download.links import linkfolder
|
|
14
|
+
|
|
15
|
+
from datamaestro_text.data.ir.stores import IKatClueWeb22DocumentStore
|
|
16
|
+
from datamaestro_text.data.ir.trec import TrecAdhocAssessments
|
|
17
|
+
from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataset(as_prepare=True)
|
|
21
|
+
def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
|
|
22
|
+
# Number of documents in the dataset
|
|
23
|
+
count = 116_838_987
|
|
24
|
+
|
|
25
|
+
jsonl_folder = linkfolder(
|
|
26
|
+
"documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
|
|
27
|
+
).setup(dataset, options)
|
|
28
|
+
store_path = lz4docstore_builder(
|
|
29
|
+
"store",
|
|
30
|
+
IKatClueWeb22DocumentStore.generator(
|
|
31
|
+
jsonl_folder,
|
|
32
|
+
jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
|
|
33
|
+
jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
|
|
34
|
+
),
|
|
35
|
+
IKatClueWeb22DocumentStore.Document,
|
|
36
|
+
"id",
|
|
37
|
+
count_hint=count,
|
|
38
|
+
).setup(dataset, options)
|
|
39
|
+
|
|
40
|
+
return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
|
|
41
|
+
|
|
15
42
|
|
|
16
43
|
@datatags("conversation", "context", "query")
|
|
17
|
-
@datatasks("query rewriting")
|
|
44
|
+
@datatasks("conversational search", "query rewriting")
|
|
45
|
+
@reference("documents", clueweb22)
|
|
18
46
|
@filedownloader(
|
|
19
|
-
"
|
|
47
|
+
"topics.json",
|
|
20
48
|
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
|
|
21
49
|
checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
|
|
22
50
|
)
|
|
23
|
-
|
|
24
51
|
@dataset(
|
|
25
|
-
|
|
52
|
+
id="2025",
|
|
26
53
|
url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
|
|
27
54
|
)
|
|
28
|
-
|
|
29
|
-
def main(test) -> Supervised[IkatDataset, None, IkatDataset]:
|
|
55
|
+
def test_2025(topics, documents) -> Adhoc.C:
|
|
30
56
|
"""Question-in-context rewriting
|
|
31
57
|
|
|
32
|
-
iKAT is a test dataset for question-in-context rewriting that consists of
|
|
58
|
+
iKAT is a test dataset for question-in-context rewriting that consists of
|
|
33
59
|
questions each given in a dialog context together with a context-independent
|
|
34
|
-
rewriting of the question.
|
|
35
|
-
One of the special features of iKAT is that it includes a Personal PKTB',
|
|
60
|
+
rewriting of the question.
|
|
36
61
|
"""
|
|
37
|
-
|
|
38
|
-
|
|
62
|
+
return Adhoc.C(
|
|
63
|
+
topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
|
|
64
|
+
# TODO: add when available
|
|
65
|
+
assessments=TrecAdhocAssessments.C(path="/to/do"),
|
|
66
|
+
documents=documents,
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@datatags("conversation", "context", "query")
|
|
71
|
+
@datatasks("conversational search", "query rewriting")
|
|
72
|
+
@reference("documents", clueweb22)
|
|
73
|
+
@filedownloader(
|
|
74
|
+
"qrels",
|
|
75
|
+
"https://trec.nist.gov/data/ikat/2024-qrels.txt",
|
|
76
|
+
checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
|
|
77
|
+
)
|
|
78
|
+
@filedownloader(
|
|
79
|
+
"topics.json",
|
|
80
|
+
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
|
|
81
|
+
checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
|
|
82
|
+
)
|
|
83
|
+
@dataset(
|
|
84
|
+
Adhoc,
|
|
85
|
+
id="2024",
|
|
86
|
+
url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
|
|
87
|
+
)
|
|
88
|
+
def test_2024(topics, qrels, documents) -> Adhoc.C:
|
|
89
|
+
"""iKAT 2024 dataset"""
|
|
90
|
+
return Adhoc.C(
|
|
91
|
+
topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
|
|
92
|
+
assessments=TrecAdhocAssessments.C(path=qrels),
|
|
93
|
+
documents=documents,
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@datatags("conversation", "context", "query")
|
|
98
|
+
@datatasks("conversational search", "query rewriting")
|
|
99
|
+
@reference("documents", clueweb22)
|
|
100
|
+
@filedownloader(
|
|
101
|
+
"qrels",
|
|
102
|
+
"https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
|
|
103
|
+
checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
|
|
104
|
+
)
|
|
105
|
+
@filedownloader(
|
|
106
|
+
"topics.json",
|
|
107
|
+
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
|
|
108
|
+
checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
|
|
109
|
+
)
|
|
110
|
+
@dataset(
|
|
111
|
+
Adhoc,
|
|
112
|
+
id="2023",
|
|
113
|
+
url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
|
|
114
|
+
)
|
|
115
|
+
def test_2023(topics, qrels, documents) -> Adhoc.C:
|
|
116
|
+
"""iKAT 2023 dataset"""
|
|
117
|
+
return Adhoc.C(
|
|
118
|
+
topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
|
|
119
|
+
assessments=TrecAdhocAssessments.C(path=qrels),
|
|
120
|
+
documents=documents,
|
|
121
|
+
)
|
|
@@ -47,7 +47,7 @@ http://www.msmarco.org/dataset.aspx""",
|
|
|
47
47
|
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
|
|
48
48
|
def collection_etc(data) -> Folder:
|
|
49
49
|
"""Documents and some more files"""
|
|
50
|
-
return Folder(path=data)
|
|
50
|
+
return Folder.C(path=data)
|
|
51
51
|
|
|
52
52
|
|
|
53
53
|
@lua
|
|
@@ -26,7 +26,7 @@ def english(dir):
|
|
|
26
26
|
|
|
27
27
|
If you use this data, please cite Sentiment140 as your source.
|
|
28
28
|
"""
|
|
29
|
-
return
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
29
|
+
return Supervised.C(
|
|
30
|
+
train=Generic.C(path=dir / "training.1600000.processed.noemoticon.csv"),
|
|
31
|
+
test=Generic.C(path=dir / "testdata.manual.2009.06.14.csv"),
|
|
32
|
+
)
|
|
@@ -11,6 +11,6 @@ def aclimdb(data):
|
|
|
11
11
|
Paper http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
|
|
12
12
|
"""
|
|
13
13
|
return {
|
|
14
|
-
"train": FolderBased(path=data / "train", classes=["neg", "pos"]),
|
|
15
|
-
"test": FolderBased(path=data / "test", classes=["neg", "pos"]),
|
|
14
|
+
"train": FolderBased.C(path=data / "train", classes=["neg", "pos"]),
|
|
15
|
+
"test": FolderBased.C(path=data / "test", classes=["neg", "pos"]),
|
|
16
16
|
}
|
|
@@ -17,7 +17,6 @@ of three CD-ROMs of SGML encoded documents distributed by LDC plus queries and a
|
|
|
17
17
|
See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data/intro_eng.html
|
|
18
18
|
"""
|
|
19
19
|
|
|
20
|
-
from datamaestro.data import Base
|
|
21
20
|
from datamaestro_text.data.ir.trec import TipsterCollection
|
|
22
21
|
from datamaestro.download.links import linkfolder
|
|
23
22
|
from datamaestro.definitions import (
|
|
@@ -32,4 +32,4 @@ def v1(train, validation):
|
|
|
32
32
|
Only the train and validation dataset are available. The test set is hidden
|
|
33
33
|
for the leaderboard.
|
|
34
34
|
"""
|
|
35
|
-
return {"train": File(path=train), "validation": File(path=validation)}
|
|
35
|
+
return {"train": File.C(path=train), "validation": File.C(path=validation)}
|
|
@@ -30,9 +30,9 @@ def WikiText(data, type):
|
|
|
30
30
|
https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
|
|
31
31
|
"""
|
|
32
32
|
return {
|
|
33
|
-
"train": File(path=data / ("wiki.train.%s" % type)),
|
|
34
|
-
"validation": File(path=data / ("wiki.valid.%s" % type)),
|
|
35
|
-
"test": File(path=data / ("wiki.test.%s" % type)),
|
|
33
|
+
"train": File.C(path=data / ("wiki.train.%s" % type)),
|
|
34
|
+
"validation": File.C(path=data / ("wiki.valid.%s" % type)),
|
|
35
|
+
"test": File.C(path=data / ("wiki.test.%s" % type)),
|
|
36
36
|
}
|
|
37
37
|
|
|
38
38
|
|
|
@@ -31,10 +31,10 @@ def small(ds) -> Movielens:
|
|
|
31
31
|
100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users (as of 9/2018)
|
|
32
32
|
"""
|
|
33
33
|
return {
|
|
34
|
-
"ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
|
|
35
|
-
"links": csv.Generic(path=ds / "links.csv", names_row=0),
|
|
36
|
-
"movies": csv.Generic(path=ds / "movies.csv", names_row=0),
|
|
37
|
-
"tags": csv.Generic(path=ds / "tags.csv", names_row=0),
|
|
34
|
+
"ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
|
|
35
|
+
"links": csv.Generic.C(path=ds / "links.csv", names_row=0),
|
|
36
|
+
"movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
|
|
37
|
+
"tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
|
|
@@ -46,8 +46,8 @@ def full(ds) -> Movielens:
|
|
|
46
46
|
27,000,000 ratings and 1,100,000 tag applications applied to 58,000 movies by 280,000 users (as of 9/2018)
|
|
47
47
|
"""
|
|
48
48
|
return {
|
|
49
|
-
"ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
|
|
50
|
-
"links": csv.Generic(path=ds / "links.csv", names_row=0),
|
|
51
|
-
"movies": csv.Generic(path=ds / "movies.csv", names_row=0),
|
|
52
|
-
"tags": csv.Generic(path=ds / "tags.csv", names_row=0),
|
|
49
|
+
"ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
|
|
50
|
+
"links": csv.Generic.C(path=ds / "links.csv", names_row=0),
|
|
51
|
+
"movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
|
|
52
|
+
"tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
|
|
53
53
|
}
|
|
@@ -34,9 +34,9 @@ def gsd(ds) -> Supervised:
|
|
|
34
34
|
is updated since 2015 independently from the previous source.
|
|
35
35
|
"""
|
|
36
36
|
return {
|
|
37
|
-
"train": CoNLL_U(path=ds / "fr_gsd-ud-train.conllu"),
|
|
38
|
-
"test": CoNLL_U(path=ds / "fr_gsd-ud-dev.conllu"),
|
|
39
|
-
"validation": CoNLL_U(path=ds / "fr_gsd-ud-test.conllu"),
|
|
37
|
+
"train": CoNLL_U.C(path=ds / "fr_gsd-ud-train.conllu"),
|
|
38
|
+
"test": CoNLL_U.C(path=ds / "fr_gsd-ud-dev.conllu"),
|
|
39
|
+
"validation": CoNLL_U.C(path=ds / "fr_gsd-ud-test.conllu"),
|
|
40
40
|
}
|
|
41
41
|
|
|
42
42
|
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from enum import Enum
|
|
3
|
+
from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
|
|
4
|
+
from experimaestro import Param
|
|
3
5
|
from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
|
|
4
6
|
from attr import define
|
|
7
|
+
from datamaestro.record import record_type
|
|
5
8
|
from datamaestro.data import Base
|
|
6
9
|
from datamaestro.record import Record, Item
|
|
7
|
-
from datamaestro_text.data.ir import TopicRecord
|
|
10
|
+
from datamaestro_text.data.ir import TopicRecord, Topics
|
|
8
11
|
from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView
|
|
9
12
|
|
|
10
13
|
# ---- Basic types
|
|
@@ -120,20 +123,17 @@ class ConversationNode:
|
|
|
120
123
|
...
|
|
121
124
|
|
|
122
125
|
@abstractmethod
|
|
123
|
-
def parent(self) -> Optional["ConversationNode"]:
|
|
124
|
-
...
|
|
126
|
+
def parent(self) -> Optional["ConversationNode"]: ...
|
|
125
127
|
|
|
126
128
|
@abstractmethod
|
|
127
|
-
def children(self) -> List["ConversationNode"]:
|
|
128
|
-
...
|
|
129
|
+
def children(self) -> List["ConversationNode"]: ...
|
|
129
130
|
|
|
130
131
|
|
|
131
132
|
class ConversationTree(ABC):
|
|
132
133
|
"""Represents a conversation tree"""
|
|
133
134
|
|
|
134
135
|
@abstractmethod
|
|
135
|
-
def root(self) -> ConversationNode:
|
|
136
|
-
...
|
|
136
|
+
def root(self) -> ConversationNode: ...
|
|
137
137
|
|
|
138
138
|
@abstractmethod
|
|
139
139
|
def __iter__(self) -> Iterator[ConversationNode]:
|
|
@@ -253,5 +253,30 @@ class ConversationDataset(Base, ABC):
|
|
|
253
253
|
@abstractmethod
|
|
254
254
|
def __iter__(self) -> Iterator[ConversationTree]:
|
|
255
255
|
"""Return an iterator over conversations"""
|
|
256
|
-
|
|
257
|
-
|
|
256
|
+
...
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
class ConversationUserTopics(Topics):
|
|
260
|
+
"""Extract user topics from conversations"""
|
|
261
|
+
|
|
262
|
+
conversations: Param[ConversationDataset]
|
|
263
|
+
|
|
264
|
+
topic_recordtype = record_type(IDItem, SimpleTextItem)
|
|
265
|
+
|
|
266
|
+
def iter(self) -> Iterator[TopicRecord]:
|
|
267
|
+
"""Returns an iterator over topics"""
|
|
268
|
+
# Extracts topics from conversations: each user query is a topic (retrieval can be performed on it)
|
|
269
|
+
# TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
|
|
270
|
+
|
|
271
|
+
records: List[TopicRecord] = []
|
|
272
|
+
for conversation in self.conversations.__iter__():
|
|
273
|
+
nodes = [
|
|
274
|
+
node
|
|
275
|
+
for node in conversation
|
|
276
|
+
if node.entry[EntryType] == EntryType.USER_QUERY
|
|
277
|
+
]
|
|
278
|
+
for node in nodes:
|
|
279
|
+
records.append(
|
|
280
|
+
node.entry.update(ConversationHistoryItem(node.history()))
|
|
281
|
+
)
|
|
282
|
+
return iter(records)
|
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
from typing import Iterator, List
|
|
1
|
+
from typing import Iterator, List
|
|
2
2
|
from attr import define, field
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
5
|
from datamaestro.data import File
|
|
6
6
|
from datamaestro.record import Record
|
|
7
7
|
|
|
8
|
+
from datamaestro_text.data.ir import Topics
|
|
8
9
|
from datamaestro_text.data.ir.base import (
|
|
9
10
|
IDItem,
|
|
10
11
|
SimpleTextItem,
|
|
@@ -12,7 +13,6 @@ from datamaestro_text.data.ir.base import (
|
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
from .base import (
|
|
15
|
-
AnswerDocumentURL,
|
|
16
16
|
AnswerEntry,
|
|
17
17
|
ConversationTree,
|
|
18
18
|
EntryType,
|
|
@@ -21,6 +21,25 @@ from .base import (
|
|
|
21
21
|
)
|
|
22
22
|
from . import ConversationDataset
|
|
23
23
|
|
|
24
|
+
# Keys to change in the dataset entries for compatibility across different years
|
|
25
|
+
|
|
26
|
+
KEY_MAPPINGS = {
|
|
27
|
+
# Keys to replace: Target Key
|
|
28
|
+
"turns": "responses",
|
|
29
|
+
"utterance": "user_utterance",
|
|
30
|
+
"ptkb_provenance": "relevant_ptkbs",
|
|
31
|
+
"response_provenance": "citations",
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def norm_dict(entry: dict) -> dict:
|
|
36
|
+
"""Convert keys in the entry to match the expected format."""
|
|
37
|
+
normalized = {}
|
|
38
|
+
for k, v in entry.items():
|
|
39
|
+
# Check for direct mapping, then try lowercase mapping
|
|
40
|
+
new_key = KEY_MAPPINGS.get(k) or KEY_MAPPINGS.get(k.lower()) or k.lower()
|
|
41
|
+
normalized[new_key] = v
|
|
42
|
+
return normalized
|
|
24
43
|
|
|
25
44
|
|
|
26
45
|
@define(kw_only=True)
|
|
@@ -47,7 +66,7 @@ class IkatConversationEntry:
|
|
|
47
66
|
|
|
48
67
|
|
|
49
68
|
@define(kw_only=True)
|
|
50
|
-
class
|
|
69
|
+
class IkatConversationTopic:
|
|
51
70
|
"""A query with past history"""
|
|
52
71
|
|
|
53
72
|
number: str
|
|
@@ -60,14 +79,20 @@ class IkatDatasetEntry:
|
|
|
60
79
|
"""The personal knowledge base associated with the user"""
|
|
61
80
|
|
|
62
81
|
responses: List[IkatConversationEntry] = field(
|
|
63
|
-
converter=lambda items: [
|
|
82
|
+
converter=lambda items: [
|
|
83
|
+
IkatConversationEntry(**item) if isinstance(item, dict) else item
|
|
84
|
+
for item in map(norm_dict, items)
|
|
85
|
+
]
|
|
64
86
|
)
|
|
65
87
|
"""The list of responses to the query"""
|
|
66
88
|
|
|
67
89
|
|
|
68
|
-
class
|
|
90
|
+
class IkatConversations(ConversationDataset, File):
|
|
91
|
+
"""A dataset containing conversations from the IKAT project"""
|
|
69
92
|
|
|
70
|
-
|
|
93
|
+
"""Keys to change in the dataset entries for compatibility across different years"""
|
|
94
|
+
|
|
95
|
+
def entries(self) -> Iterator[IkatConversationTopic]:
|
|
71
96
|
"""Reads all conversation entries from the dataset file."""
|
|
72
97
|
with self.path.open("rt") as fp:
|
|
73
98
|
raw_data = json.load(fp)
|
|
@@ -75,12 +100,13 @@ class IkatDataset(ConversationDataset, File):
|
|
|
75
100
|
logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
|
|
76
101
|
logging.debug(f"raw data has keys {raw_data[0].keys()}")
|
|
77
102
|
|
|
78
|
-
processed_data = []
|
|
79
103
|
for entry in raw_data:
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
104
|
+
try:
|
|
105
|
+
normalized_entry = norm_dict(entry)
|
|
106
|
+
yield IkatConversationTopic(**normalized_entry)
|
|
107
|
+
except Exception as e:
|
|
108
|
+
logging.warning(f"Failed to parse entry: {e}")
|
|
109
|
+
raise e
|
|
84
110
|
|
|
85
111
|
def __iter__(self) -> Iterator[ConversationTree]:
|
|
86
112
|
for entry in self.entries():
|
|
@@ -88,7 +114,7 @@ class IkatDataset(ConversationDataset, File):
|
|
|
88
114
|
|
|
89
115
|
for turn in entry.responses:
|
|
90
116
|
turn: IkatConversationEntry = turn # Ensure type is correct
|
|
91
|
-
query_id = f"{entry.number}
|
|
117
|
+
query_id = f"{entry.number}_{turn.turn_id}"
|
|
92
118
|
|
|
93
119
|
# USER QUERY record
|
|
94
120
|
history.append(
|
|
@@ -117,4 +143,3 @@ class IkatDataset(ConversationDataset, File):
|
|
|
117
143
|
# Ensure reverse if needed for compatibility (optional)
|
|
118
144
|
history.reverse()
|
|
119
145
|
yield SingleConversationTree(entry.number, history)
|
|
120
|
-
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
"""Generic data types for information retrieval"""
|
|
2
2
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
+
from enum import Enum
|
|
4
5
|
from functools import cached_property
|
|
6
|
+
import logging
|
|
5
7
|
from pathlib import Path
|
|
6
8
|
from attrs import define
|
|
7
|
-
from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type
|
|
9
|
+
from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING
|
|
8
10
|
import random
|
|
9
11
|
from experimaestro import Config
|
|
10
12
|
from datamaestro.definitions import datatasks, Param, Meta
|
|
@@ -28,6 +30,9 @@ from .base import ( # noqa: F401
|
|
|
28
30
|
AdhocAssessedTopic,
|
|
29
31
|
)
|
|
30
32
|
|
|
33
|
+
#: An adhoc run dictionary (query id -> doc id -> score)
|
|
34
|
+
AdhocRunDict = dict[str, dict[str, float]]
|
|
35
|
+
|
|
31
36
|
|
|
32
37
|
class Documents(Base):
|
|
33
38
|
"""A set of documents with identifiers
|
|
@@ -45,6 +50,22 @@ class Documents(Base):
|
|
|
45
50
|
def iter_documents(self) -> Iterator[DocumentRecord]:
|
|
46
51
|
return self.iter()
|
|
47
52
|
|
|
53
|
+
def iter_documents_from(self, start=0) -> Iterator[DocumentRecord]:
|
|
54
|
+
"""Iterate over a range of documents
|
|
55
|
+
|
|
56
|
+
Can be specialized in a subclass for faster access
|
|
57
|
+
|
|
58
|
+
:param start: The starting document, defaults to 0
|
|
59
|
+
:return: An iterator
|
|
60
|
+
"""
|
|
61
|
+
iter = self.iter()
|
|
62
|
+
if start > 0:
|
|
63
|
+
logging.info("skipping %d documents", start + 1)
|
|
64
|
+
for _ in range(start + 1):
|
|
65
|
+
next(iter)
|
|
66
|
+
|
|
67
|
+
return iter
|
|
68
|
+
|
|
48
69
|
def iter_ids(self) -> Iterator[str]:
|
|
49
70
|
"""Iterates over document ids
|
|
50
71
|
|
|
@@ -68,6 +89,19 @@ class Documents(Base):
|
|
|
68
89
|
...
|
|
69
90
|
|
|
70
91
|
|
|
92
|
+
class FileAccess(Enum):
|
|
93
|
+
"""Defines how to access files (e.g. for document stores)"""
|
|
94
|
+
|
|
95
|
+
FILE = 0
|
|
96
|
+
"""Direct file access"""
|
|
97
|
+
|
|
98
|
+
MMAP = 1
|
|
99
|
+
"""Use mmap"""
|
|
100
|
+
|
|
101
|
+
MEMORY = 2
|
|
102
|
+
"""Use memory"""
|
|
103
|
+
|
|
104
|
+
|
|
71
105
|
class DocumentStore(Documents):
|
|
72
106
|
"""A document store
|
|
73
107
|
|
|
@@ -77,6 +111,10 @@ class DocumentStore(Documents):
|
|
|
77
111
|
- return the number of documents
|
|
78
112
|
"""
|
|
79
113
|
|
|
114
|
+
file_access: Meta[FileAccess] = FileAccess.MMAP
|
|
115
|
+
"""How to access the file collection (might not have any impact, depends on
|
|
116
|
+
the docstore)"""
|
|
117
|
+
|
|
80
118
|
def docid_internal2external(self, docid: int):
|
|
81
119
|
"""Converts an internal collection ID (integer) to an external ID"""
|
|
82
120
|
raise NotImplementedError(f"For class {self.__class__}")
|
|
@@ -168,7 +206,10 @@ class AdhocAssessments(Base, ABC):
|
|
|
168
206
|
class AdhocRun(Base):
|
|
169
207
|
"""IR adhoc run"""
|
|
170
208
|
|
|
171
|
-
|
|
209
|
+
@abstractmethod
|
|
210
|
+
def get_dict(self) -> "AdhocRunDict":
|
|
211
|
+
"""Get the run as a dictionary query ID -> doc ID -> score"""
|
|
212
|
+
...
|
|
172
213
|
|
|
173
214
|
|
|
174
215
|
class AdhocResults(Base):
|
|
@@ -304,5 +345,4 @@ class PairwiseSampleDataset(Base, ABC):
|
|
|
304
345
|
"""Datasets where each record is a query with positive and negative samples"""
|
|
305
346
|
|
|
306
347
|
@abstractmethod
|
|
307
|
-
def iter(self) -> Iterator[PairwiseSample]:
|
|
308
|
-
...
|
|
348
|
+
def iter(self) -> Iterator[PairwiseSample]: ...
|
datamaestro_text/data/ir/base.py
CHANGED
|
@@ -43,6 +43,7 @@ class IDItem(Item, ABC):
|
|
|
43
43
|
|
|
44
44
|
id: str
|
|
45
45
|
|
|
46
|
+
|
|
46
47
|
@define
|
|
47
48
|
class UrlItem(Item):
|
|
48
49
|
"""An url item"""
|
|
@@ -70,7 +71,7 @@ class AdhocAssessedTopic:
|
|
|
70
71
|
"""List of assessments for this topic"""
|
|
71
72
|
|
|
72
73
|
|
|
73
|
-
def create_record(*items: Item, id: str = None, text: str = None):
|
|
74
|
+
def create_record(*items: Item, id: str = None, text: str = None) -> Record:
|
|
74
75
|
"""Easy creation of a text/id item"""
|
|
75
76
|
extra_items = []
|
|
76
77
|
if id is not None:
|
|
@@ -99,7 +99,7 @@ class WapoDocument(TextItem):
|
|
|
99
99
|
body_media: Tuple[WapoDocMedia, ...]
|
|
100
100
|
|
|
101
101
|
@cached_property
|
|
102
|
-
def text(self):
|
|
102
|
+
def text(self):
|
|
103
103
|
return f"{self.title} {self.body_paras_html}"
|
|
104
104
|
|
|
105
105
|
|
|
@@ -132,11 +132,15 @@ class DprW100Doc(TextItem):
|
|
|
132
132
|
text: str
|
|
133
133
|
title: str
|
|
134
134
|
|
|
135
|
-
|
|
135
|
+
|
|
136
|
+
@define
|
|
136
137
|
class MsMarcoV2Passage(TextItem):
|
|
137
138
|
text: str
|
|
138
139
|
spans: Tuple[Tuple[int, int], ...]
|
|
139
140
|
msmarco_document_id: str
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@define
|
|
140
144
|
class Touche2020(TextItem):
|
|
141
145
|
text: str
|
|
142
146
|
title: str
|
|
@@ -204,11 +208,13 @@ class TrecTopic(SimpleTextItem):
|
|
|
204
208
|
|
|
205
209
|
TrecTopicRecord = record_type(IDItem, TrecTopic)
|
|
206
210
|
|
|
211
|
+
|
|
207
212
|
@define
|
|
208
213
|
class DprW100Query(TextItem):
|
|
209
214
|
text: str
|
|
210
215
|
answers: Tuple[str]
|
|
211
216
|
|
|
217
|
+
|
|
212
218
|
@define
|
|
213
219
|
class TrecBackgroundLinkingQuery(IDItem):
|
|
214
220
|
query_id: str
|
|
@@ -1,12 +1,21 @@
|
|
|
1
|
-
|
|
1
|
+
import bz2
|
|
2
|
+
from hashlib import md5, sha256
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
2
6
|
from typing import List, NamedTuple
|
|
7
|
+
from datamaestro_text.utils.files import TQDMFileReader
|
|
3
8
|
from experimaestro import Constant
|
|
4
|
-
import attrs
|
|
5
|
-
|
|
6
9
|
from datamaestro.record import Record
|
|
7
|
-
from datamaestro_text.data.ir.base import
|
|
10
|
+
from datamaestro_text.data.ir.base import (
|
|
11
|
+
DocumentRecord,
|
|
12
|
+
IDItem,
|
|
13
|
+
SimpleTextItem,
|
|
14
|
+
UrlItem,
|
|
15
|
+
)
|
|
8
16
|
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
|
|
9
17
|
from datamaestro_text.data.ir.formats import OrConvQADocument
|
|
18
|
+
from tqdm import tqdm
|
|
10
19
|
|
|
11
20
|
|
|
12
21
|
class OrConvQADocumentStore(LZ4DocumentStore):
|
|
@@ -27,3 +36,89 @@ class OrConvQADocumentStore(LZ4DocumentStore):
|
|
|
27
36
|
fields = data._asdict()
|
|
28
37
|
del fields["id"]
|
|
29
38
|
return Record(OrConvQADocument(**fields), IDItem(data.id))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class IKatClueWeb22DocumentStore(LZ4DocumentStore):
|
|
42
|
+
@staticmethod
|
|
43
|
+
def generator(path: Path, checksums_file: Path, passages_hashes: Path):
|
|
44
|
+
"""Returns an iterator over iKAT 2022-25 documents
|
|
45
|
+
|
|
46
|
+
:param path: The folder containing the files
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
def __iter__():
|
|
50
|
+
errors = False
|
|
51
|
+
|
|
52
|
+
assert checksums_file.is_file(), f"{checksums_file} does not exist"
|
|
53
|
+
assert passages_hashes.is_file(), f"{passages_hashes} does not exist"
|
|
54
|
+
|
|
55
|
+
# Get the list of files
|
|
56
|
+
with checksums_file.open("rt") as fp:
|
|
57
|
+
files = []
|
|
58
|
+
for line in fp:
|
|
59
|
+
checksum, filename = line.strip().split()
|
|
60
|
+
files.append((checksum, filename))
|
|
61
|
+
if not (path / filename).is_file():
|
|
62
|
+
logging.error("File %s does not exist", path / filename)
|
|
63
|
+
errors = True
|
|
64
|
+
|
|
65
|
+
assert not errors, "Errors detected, stopping"
|
|
66
|
+
|
|
67
|
+
# Check the SHA256 sums
|
|
68
|
+
match checksums_file.suffix:
|
|
69
|
+
case ".sha256sums":
|
|
70
|
+
hasher_factory = sha256
|
|
71
|
+
case _:
|
|
72
|
+
raise NotImplementedError(
|
|
73
|
+
f"Cannot handle {checksums_file.suffix} checksum files"
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
for checksum, filename in tqdm(files):
|
|
77
|
+
logging.info("Checking %s", filename)
|
|
78
|
+
hasher = hasher_factory()
|
|
79
|
+
with (path / filename).open("rb") as fp:
|
|
80
|
+
while data := fp.read(2**20):
|
|
81
|
+
hasher.update(data)
|
|
82
|
+
|
|
83
|
+
file_checksum = hasher.hexdigest()
|
|
84
|
+
assert file_checksum == checksum, (
|
|
85
|
+
f"Expected {checksum}, " f"got {file_checksum} for {filename}"
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
# Get the MD5 hashes of all the passages
|
|
89
|
+
logging.info("Reading the hashes of all passages")
|
|
90
|
+
with TQDMFileReader(passages_hashes, "rt", bz2.open) as fp:
|
|
91
|
+
passage_checksums = {}
|
|
92
|
+
for line in tqdm(fp):
|
|
93
|
+
doc_id, passage_no, checksum = line.strip().split()
|
|
94
|
+
passage_checksums[f"{doc_id}:{passage_no}"] = checksum # noqa: E231
|
|
95
|
+
|
|
96
|
+
# Read the files
|
|
97
|
+
logging.info("Starting to read the files")
|
|
98
|
+
for _, filename in tqdm(files):
|
|
99
|
+
with TQDMFileReader(path / filename, "rt", bz2.open) as jsonl_fp:
|
|
100
|
+
for line in jsonl_fp:
|
|
101
|
+
data = json.loads(line)
|
|
102
|
+
expected = passage_checksums[data["id"]]
|
|
103
|
+
computed = md5(data["contents"].encode("utf-8")).hexdigest()
|
|
104
|
+
assert expected == computed, (
|
|
105
|
+
f"Expected {expected}, "
|
|
106
|
+
f"got {computed} for passage {data['id']} in {filename}"
|
|
107
|
+
)
|
|
108
|
+
yield IKatClueWeb22DocumentStore.Document(**data)
|
|
109
|
+
|
|
110
|
+
return __iter__
|
|
111
|
+
|
|
112
|
+
class Document(NamedTuple):
|
|
113
|
+
id: str
|
|
114
|
+
contents: str
|
|
115
|
+
url: str
|
|
116
|
+
|
|
117
|
+
data_cls = Document
|
|
118
|
+
lookup_field: Constant[str] = "id"
|
|
119
|
+
index_fields: Constant[List[str]] = ["id"]
|
|
120
|
+
|
|
121
|
+
def converter(self, data):
|
|
122
|
+
return DocumentRecord(
|
|
123
|
+
IDItem(data.id), SimpleTextItem(data.contents), UrlItem(data.url)
|
|
124
|
+
)
|
datamaestro_text/data/ir/trec.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
+
import re
|
|
1
2
|
from typing import Dict, List, Optional
|
|
2
|
-
from datamaestro.data import Base
|
|
3
3
|
from experimaestro import documentation, Param, Meta
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from datamaestro.record import Record
|
|
6
5
|
from datamaestro_text.data.ir import (
|
|
6
|
+
AdhocRunDict,
|
|
7
7
|
Documents,
|
|
8
8
|
Topics,
|
|
9
9
|
AdhocAssessments,
|
|
@@ -47,6 +47,11 @@ class TrecAdhocAssessments(AdhocAssessments):
|
|
|
47
47
|
class TrecAdhocRun(AdhocRun):
|
|
48
48
|
path: Param[Path]
|
|
49
49
|
|
|
50
|
+
def get_dict(self) -> AdhocRunDict:
|
|
51
|
+
import datamaestro_text.interfaces.trec as trec
|
|
52
|
+
|
|
53
|
+
return trec.parse_run(self.path)
|
|
54
|
+
|
|
50
55
|
|
|
51
56
|
class TrecAdhocResults(AdhocResults):
|
|
52
57
|
"""Adhoc results (TREC format)"""
|
|
@@ -62,8 +67,6 @@ class TrecAdhocResults(AdhocResults):
|
|
|
62
67
|
|
|
63
68
|
def get_results(self) -> Dict[str, float]:
|
|
64
69
|
"""Returns the results as a dictionary {metric_name: value}"""
|
|
65
|
-
import re
|
|
66
|
-
|
|
67
70
|
re_spaces = re.compile(r"\s+")
|
|
68
71
|
|
|
69
72
|
results = {}
|
|
@@ -122,7 +122,14 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
122
122
|
formats.Touche2020, "doc_id", "text", "title", "stance", "url"
|
|
123
123
|
),
|
|
124
124
|
_irds.beir.BeirSciDoc: tuple_constructor(
|
|
125
|
-
formats.SciDocs,
|
|
125
|
+
formats.SciDocs,
|
|
126
|
+
"doc_id",
|
|
127
|
+
"text",
|
|
128
|
+
"title",
|
|
129
|
+
"authors",
|
|
130
|
+
"year",
|
|
131
|
+
"cited_by",
|
|
132
|
+
"references",
|
|
126
133
|
),
|
|
127
134
|
_irds.msmarco_document.MsMarcoDocument: tuple_constructor(
|
|
128
135
|
formats.MsMarcoDocument, "doc_id", "url", "title", "body"
|
|
@@ -198,13 +205,29 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
198
205
|
for doc in self.dataset.docs_iter():
|
|
199
206
|
yield self.converter(self.document_recordtype, doc)
|
|
200
207
|
|
|
208
|
+
def iter_documents_from(self, start=0):
|
|
209
|
+
for doc in self.dataset.docs_iter()[start:]:
|
|
210
|
+
yield self.converter(self.document_recordtype, doc)
|
|
211
|
+
|
|
201
212
|
@property
|
|
202
213
|
def documentcount(self):
|
|
203
214
|
return self.dataset.docs_count()
|
|
204
215
|
|
|
205
216
|
@cached_property
|
|
206
217
|
def store(self):
|
|
207
|
-
|
|
218
|
+
kwargs = {}
|
|
219
|
+
try:
|
|
220
|
+
# Translate to ir datasets docstore options
|
|
221
|
+
import ir_datasets.indices as ir_indices
|
|
222
|
+
file_access = {
|
|
223
|
+
ir.FileAccess.MMAP: ir_indices.FileAccess.MMAP,
|
|
224
|
+
ir.FileAccess.FILE: ir_indices.FileAccess.FILE,
|
|
225
|
+
ir.FileAccess.MEMORY: ir_indices.FileAccess.MEMORY
|
|
226
|
+
}[self.file_access]
|
|
227
|
+
kwargs = {"options": ir_indices.DocstoreOptions(file_access=file_access)}
|
|
228
|
+
except ImportError:
|
|
229
|
+
logging.warning("This version of ir-datasets cannot handle docstore options")
|
|
230
|
+
return self.dataset.docs_store(**kwargs)
|
|
208
231
|
|
|
209
232
|
@cached_property
|
|
210
233
|
def _docs(self):
|
|
@@ -244,7 +267,7 @@ if hasattr(_irds, "miracl"):
|
|
|
244
267
|
)
|
|
245
268
|
|
|
246
269
|
|
|
247
|
-
class LZ4DocumentStore(ir.DocumentStore):
|
|
270
|
+
class LZ4DocumentStore(ir.DocumentStore, ABC):
|
|
248
271
|
"""A LZ4-based document store"""
|
|
249
272
|
|
|
250
273
|
path: Param[Path]
|
|
@@ -253,7 +276,7 @@ class LZ4DocumentStore(ir.DocumentStore):
|
|
|
253
276
|
lookup_field: Param[str]
|
|
254
277
|
|
|
255
278
|
# Extra indexed fields (e.g. URLs)
|
|
256
|
-
index_fields: List[str]
|
|
279
|
+
index_fields: List[str] = []
|
|
257
280
|
|
|
258
281
|
@cached_property
|
|
259
282
|
def store(self):
|
|
@@ -285,6 +308,9 @@ class LZ4DocumentStore(ir.DocumentStore):
|
|
|
285
308
|
"""Returns an iterator over documents"""
|
|
286
309
|
return map(self.converter, self.store.__iter__())
|
|
287
310
|
|
|
311
|
+
def iter_documents_from(self, start=0):
|
|
312
|
+
return map(self.converter, self.store.__iter__()[start:])
|
|
313
|
+
|
|
288
314
|
@cached_property
|
|
289
315
|
def documentcount(self):
|
|
290
316
|
if self.count:
|
|
@@ -386,7 +412,13 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
386
412
|
formats.TrecTopic, "query_id", "text", "description", "narrative"
|
|
387
413
|
),
|
|
388
414
|
_irds.beir.BeirSciQuery: tuple_constructor(
|
|
389
|
-
formats.SciDocsTopic,
|
|
415
|
+
formats.SciDocsTopic,
|
|
416
|
+
"query_id",
|
|
417
|
+
"text",
|
|
418
|
+
"authors",
|
|
419
|
+
"year",
|
|
420
|
+
"cited_by",
|
|
421
|
+
"references",
|
|
390
422
|
),
|
|
391
423
|
_irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
|
|
392
424
|
formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
|
|
@@ -400,10 +432,7 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
400
432
|
"description",
|
|
401
433
|
),
|
|
402
434
|
_irds.dpr_w100.DprW100Query: tuple_constructor(
|
|
403
|
-
formats.DprW100Query,
|
|
404
|
-
"query_id",
|
|
405
|
-
"text",
|
|
406
|
-
"answers"
|
|
435
|
+
formats.DprW100Query, "query_id", "text", "answers"
|
|
407
436
|
),
|
|
408
437
|
}
|
|
409
438
|
|
|
@@ -435,11 +464,12 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
435
464
|
def iter(self) -> Iterator[TopicRecord]:
|
|
436
465
|
"""Returns an iterator over topics"""
|
|
437
466
|
return self.handler.iter()
|
|
438
|
-
|
|
467
|
+
|
|
468
|
+
|
|
439
469
|
class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
|
|
440
470
|
def __init__(self, dataset):
|
|
441
471
|
self.dataset = dataset
|
|
442
|
-
|
|
472
|
+
|
|
443
473
|
@cached_property
|
|
444
474
|
def ext2records(self):
|
|
445
475
|
return {record[IDItem].id: record for record in self.records}
|
|
@@ -462,10 +492,12 @@ class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
|
|
|
462
492
|
records = []
|
|
463
493
|
|
|
464
494
|
for query in self.dataset.dataset.queries_iter():
|
|
465
|
-
topic =
|
|
495
|
+
topic = Record(
|
|
466
496
|
IDItem(query.query_id),
|
|
467
497
|
# Following BEIR documentation, we use title of documents as queries: https://github.com/beir-cellar/beir/blob/main/examples/dataset/README.md#queries-and-qrels
|
|
468
|
-
SimpleTextItem(
|
|
498
|
+
SimpleTextItem(
|
|
499
|
+
self.dataset.dataset.docs_store().get(query.doc_id).title
|
|
500
|
+
),
|
|
469
501
|
UrlItem(query.url),
|
|
470
502
|
)
|
|
471
503
|
records.append(topic)
|
|
@@ -477,11 +509,10 @@ class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
|
|
|
477
509
|
|
|
478
510
|
|
|
479
511
|
Topics.HANDLERS.update(
|
|
480
|
-
{
|
|
481
|
-
_irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler
|
|
482
|
-
}
|
|
512
|
+
{_irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler}
|
|
483
513
|
)
|
|
484
514
|
|
|
515
|
+
|
|
485
516
|
class CastTopicsHandler(TopicsHandler):
|
|
486
517
|
def __init__(self, dataset):
|
|
487
518
|
self.dataset = dataset
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
from attrs import define
|
|
2
1
|
from pathlib import Path
|
|
3
2
|
from typing import Iterator, Optional
|
|
4
3
|
import re
|
|
4
|
+
from datamaestro_text.data.ir import AdhocRunDict
|
|
5
5
|
from datamaestro_text.data.ir.base import (
|
|
6
6
|
AdhocAssessedTopic,
|
|
7
7
|
TopicRecord,
|
|
@@ -10,6 +10,33 @@ from datamaestro_text.data.ir.base import (
|
|
|
10
10
|
)
|
|
11
11
|
from datamaestro_text.data.ir.formats import TrecTopicRecord, TrecTopic
|
|
12
12
|
|
|
13
|
+
# --- Runs
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def parse_run(path: Path) -> AdhocRunDict:
|
|
17
|
+
results = {}
|
|
18
|
+
with path.open("rt") as f:
|
|
19
|
+
for line in f:
|
|
20
|
+
query_id, _q0, doc_id, _rank, score, _model_id = re.split(
|
|
21
|
+
r"\s+", line.strip()
|
|
22
|
+
)
|
|
23
|
+
results.setdefault(query_id, {})[doc_id] = score
|
|
24
|
+
|
|
25
|
+
return results
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def write_run_dict(run: AdhocRunDict, run_path: Path):
|
|
29
|
+
"""Write run dict"""
|
|
30
|
+
with run_path.open("wt") as f:
|
|
31
|
+
for query_id, scored_documents in run.items():
|
|
32
|
+
scored_documents = list(
|
|
33
|
+
[(doc_id, score) for doc_id, score in scored_documents.items()]
|
|
34
|
+
)
|
|
35
|
+
scored_documents.sort(key=lambda x: x[1], reverse=True)
|
|
36
|
+
for ix, (doc_id, score) in enumerate(scored_documents):
|
|
37
|
+
f.write(f"{query_id} Q0 {doc_id} {ix + 1} {score} run\n")
|
|
38
|
+
|
|
39
|
+
|
|
13
40
|
# --- Assessments
|
|
14
41
|
|
|
15
42
|
|
datamaestro_text/utils/files.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from tqdm import tqdm
|
|
1
3
|
import gzip
|
|
2
4
|
from pathlib import Path
|
|
3
5
|
|
|
@@ -6,3 +8,104 @@ def auto_open(path: Path, mode: str):
|
|
|
6
8
|
if path.suffix == ".gz":
|
|
7
9
|
return gzip.open(path, mode)
|
|
8
10
|
return path.open(mode)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class CountingWrapper:
|
|
14
|
+
"""Wrap a file object to count the actual compressed bytes read."""
|
|
15
|
+
|
|
16
|
+
def __init__(self, file_obj):
|
|
17
|
+
self.file_obj = file_obj
|
|
18
|
+
self.bytes_read = 0
|
|
19
|
+
|
|
20
|
+
def read(self, size=-1):
|
|
21
|
+
data = self.file_obj.read(size)
|
|
22
|
+
self.bytes_read += len(data)
|
|
23
|
+
return data
|
|
24
|
+
|
|
25
|
+
def readline(self, size=-1):
|
|
26
|
+
data = self.file_obj.readline(size)
|
|
27
|
+
self.bytes_read += len(data)
|
|
28
|
+
return data
|
|
29
|
+
|
|
30
|
+
def __iter__(self):
|
|
31
|
+
return self
|
|
32
|
+
|
|
33
|
+
def __next__(self):
|
|
34
|
+
line = self.readline()
|
|
35
|
+
if not line:
|
|
36
|
+
raise StopIteration
|
|
37
|
+
return line
|
|
38
|
+
|
|
39
|
+
def close(self):
|
|
40
|
+
self.file_obj.close()
|
|
41
|
+
|
|
42
|
+
def __getattr__(self, attr):
|
|
43
|
+
return getattr(self.file_obj, attr)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class TQDMBytesReader:
|
|
47
|
+
def __init__(self, file_obj, total_size, **tqdm_kwargs):
|
|
48
|
+
self.file_obj = CountingWrapper(file_obj)
|
|
49
|
+
self.tqdm = tqdm(
|
|
50
|
+
total=total_size,
|
|
51
|
+
unit="B",
|
|
52
|
+
unit_scale=True,
|
|
53
|
+
unit_divisor=1024,
|
|
54
|
+
**tqdm_kwargs,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
def _update_progress(self):
|
|
58
|
+
delta = self.file_obj.bytes_read - self.tqdm.n
|
|
59
|
+
if delta > 0:
|
|
60
|
+
self.tqdm.update(delta)
|
|
61
|
+
|
|
62
|
+
def read(self, size=-1):
|
|
63
|
+
data = self.file_obj.read(size)
|
|
64
|
+
self._update_progress()
|
|
65
|
+
return data
|
|
66
|
+
|
|
67
|
+
def readline(self, size=-1):
|
|
68
|
+
line = self.file_obj.readline(size)
|
|
69
|
+
self._update_progress()
|
|
70
|
+
return line
|
|
71
|
+
|
|
72
|
+
def readlines(self, hint=-1):
|
|
73
|
+
lines = self.file_obj.readlines(hint)
|
|
74
|
+
self._update_progress()
|
|
75
|
+
return lines
|
|
76
|
+
|
|
77
|
+
def __iter__(self):
|
|
78
|
+
return self
|
|
79
|
+
|
|
80
|
+
def __next__(self):
|
|
81
|
+
line = self.readline()
|
|
82
|
+
if not line:
|
|
83
|
+
raise StopIteration
|
|
84
|
+
return line
|
|
85
|
+
|
|
86
|
+
def close(self):
|
|
87
|
+
self.tqdm.close()
|
|
88
|
+
self.file_obj.close()
|
|
89
|
+
|
|
90
|
+
def __getattr__(self, attr):
|
|
91
|
+
# Delegate any other attribute to the underlying file object
|
|
92
|
+
return getattr(self.file_obj, attr)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class TQDMFileReader:
|
|
96
|
+
def __init__(self, filepath, mode="rt", file_opener=open, **tqdm_kwargs):
|
|
97
|
+
self.filepath = filepath
|
|
98
|
+
self.mode = mode
|
|
99
|
+
self.file_opener = file_opener
|
|
100
|
+
self.tqdm_kwargs = tqdm_kwargs
|
|
101
|
+
|
|
102
|
+
def __enter__(self):
|
|
103
|
+
self.file_obj = self.file_opener(self.filepath, self.mode)
|
|
104
|
+
total_size = os.path.getsize(self.filepath) # this is compressed size
|
|
105
|
+
self.reader = TQDMBytesReader(
|
|
106
|
+
self.file_obj, total_size=total_size, **self.tqdm_kwargs
|
|
107
|
+
)
|
|
108
|
+
return self.reader
|
|
109
|
+
|
|
110
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
111
|
+
self.reader.close()
|
datamaestro_text/utils/iter.py
CHANGED
|
@@ -82,3 +82,8 @@ class LazyList(Sequence):
|
|
|
82
82
|
# Convert the iterable to a list if it hasn't been already
|
|
83
83
|
if self.materialized_list is None:
|
|
84
84
|
self.materialized_list = list(self.iterable)
|
|
85
|
+
|
|
86
|
+
def reverse(self):
|
|
87
|
+
"""Reverse the list in place, materializing it if necessary"""
|
|
88
|
+
self._materialize()
|
|
89
|
+
self.materialized_list.reverse()
|
datamaestro_text/version.py
CHANGED
|
@@ -1,7 +1,14 @@
|
|
|
1
1
|
# file generated by setuptools-scm
|
|
2
2
|
# don't change, don't track in version control
|
|
3
3
|
|
|
4
|
-
__all__ = [
|
|
4
|
+
__all__ = [
|
|
5
|
+
"__version__",
|
|
6
|
+
"__version_tuple__",
|
|
7
|
+
"version",
|
|
8
|
+
"version_tuple",
|
|
9
|
+
"__commit_id__",
|
|
10
|
+
"commit_id",
|
|
11
|
+
]
|
|
5
12
|
|
|
6
13
|
TYPE_CHECKING = False
|
|
7
14
|
if TYPE_CHECKING:
|
|
@@ -9,13 +16,19 @@ if TYPE_CHECKING:
|
|
|
9
16
|
from typing import Union
|
|
10
17
|
|
|
11
18
|
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
19
|
+
COMMIT_ID = Union[str, None]
|
|
12
20
|
else:
|
|
13
21
|
VERSION_TUPLE = object
|
|
22
|
+
COMMIT_ID = object
|
|
14
23
|
|
|
15
24
|
version: str
|
|
16
25
|
__version__: str
|
|
17
26
|
__version_tuple__: VERSION_TUPLE
|
|
18
27
|
version_tuple: VERSION_TUPLE
|
|
28
|
+
commit_id: COMMIT_ID
|
|
29
|
+
__commit_id__: COMMIT_ID
|
|
19
30
|
|
|
20
|
-
__version__ = version = '2025.
|
|
21
|
-
__version_tuple__ = version_tuple = (2025,
|
|
31
|
+
__version__ = version = '2025.9.11'
|
|
32
|
+
__version_tuple__ = version_tuple = (2025, 9, 11)
|
|
33
|
+
|
|
34
|
+
__commit_id__ = commit_id = 'gadcc9bd27'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version: 2025.
|
|
3
|
+
Version: 2025.9.11
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
6
|
License: GPL-3
|
|
@@ -15,10 +15,10 @@ Classifier: Operating System :: OS Independent
|
|
|
15
15
|
Classifier: Programming Language :: Python
|
|
16
16
|
Classifier: Programming Language :: Python :: 3
|
|
17
17
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
-
Requires-Python: >=3.
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
|
-
Requires-Dist: datamaestro>=1.
|
|
21
|
+
Requires-Dist: datamaestro>=1.5.0
|
|
22
22
|
Requires-Dist: ir_datasets>=0.5.8
|
|
23
23
|
Requires-Dist: attrs
|
|
24
24
|
Provides-Extra: dev
|
|
@@ -1,21 +1,21 @@
|
|
|
1
1
|
datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
|
|
2
|
-
datamaestro_text/version.py,sha256=
|
|
2
|
+
datamaestro_text/version.py,sha256=aHTcqRU_FAq8moUGgwqhCMrhMi8VBbk38TX-uMF8p20,720
|
|
3
3
|
datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
|
|
5
5
|
datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
|
|
6
|
-
datamaestro_text/config/com/sentiment140.py,sha256=
|
|
6
|
+
datamaestro_text/config/com/sentiment140.py,sha256=itfBEgcOniECXKOw8I2dhzyS9LOMsltMLfKK6NGRpVY,1293
|
|
7
7
|
datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
|
|
8
|
-
datamaestro_text/config/com/github/ikat.py,sha256=
|
|
9
|
-
datamaestro_text/config/com/github/aagohary/canard.py,sha256=
|
|
10
|
-
datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=
|
|
8
|
+
datamaestro_text/config/com/github/ikat.py,sha256=DCayX-t2OBeW5bOJvRxoQgIH3vy-__mYzdmVcnayAkk,4230
|
|
9
|
+
datamaestro_text/config/com/github/aagohary/canard.py,sha256=5fLwCLNBGM_7--naTCDayAMYLvK3yTD8auaEf-dqrb4,1768
|
|
10
|
+
datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=zP3w7A9KSvJVCo44OaB1az1pDKWxE6qXS4qFm3hqg3Y,3064
|
|
11
11
|
datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=MFJYaxJoqJQ6kMDHa2PIE0zoxYTA8Kyl26-vzFoMML0,3032
|
|
12
12
|
datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
|
|
13
13
|
datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
|
|
14
|
-
datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=
|
|
14
|
+
datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=hN2KOdi6ToHlodozqsYAOtxaqiUGkGGtRtb3RFSgnEU,11645
|
|
15
15
|
datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=hCFjZg9t1AHET05qU31uatiAOD5EEzzWG9y_W90jcOE,861
|
|
16
16
|
datamaestro_text/config/edu/cornell/nlvr.yaml,sha256=9Yk5VZMncSmrP7JNuGXqExksgX5nQ_Zfnlps8hWze3Q,921
|
|
17
17
|
datamaestro_text/config/edu/stanford/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
-
datamaestro_text/config/edu/stanford/aclimdb.py,sha256=
|
|
18
|
+
datamaestro_text/config/edu/stanford/aclimdb.py,sha256=QtriReAVsbJlxkgfJWQCZdCeJ9LswYnOR9mFrgghL9c,647
|
|
19
19
|
datamaestro_text/config/edu/stanford/glove.py,sha256=bXxwiJqT8alPs5MwwxVuY0xBO8g1QWhm2L3AKVskTlA,2391
|
|
20
20
|
datamaestro_text/config/edu/stanford/im2p.yaml,sha256=JoToNyEPpmwdyLFedCBot5ypyw7p9rzi12mGXJuZin0,2909
|
|
21
21
|
datamaestro_text/config/edu/upenn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -29,17 +29,17 @@ datamaestro_text/config/gov/nist/trec/adhoc.py,sha256=vcFaLlZ-chwDt013MYI8bYZ2ug
|
|
|
29
29
|
datamaestro_text/config/gov/nist/trec/clueweb.yaml,sha256=sm4UbdtMzWoDVPsewtVDS7Vj2jBOdgp18Xqo1X4ysQc,792
|
|
30
30
|
datamaestro_text/config/gov/nist/trec/deeplearning.yaml,sha256=QGM7PtXLJRttNdOPE16o7-k3e5tA9HgcaM_-qFDV_5Q,2125
|
|
31
31
|
datamaestro_text/config/gov/nist/trec/index.yaml,sha256=oSUhUmtukf5oXqUbJLhae8xZx7Uac5V6uZoUB-RJ7Sw,2711
|
|
32
|
-
datamaestro_text/config/gov/nist/trec/tipster.py,sha256=
|
|
32
|
+
datamaestro_text/config/gov/nist/trec/tipster.py,sha256=DirpnHpS10e27LcL7v9ksKreKVy7EgfVhyztV49VRds,5364
|
|
33
33
|
datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJolWCts_VETACha3hfQ,563
|
|
34
34
|
datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
|
|
35
|
-
datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=
|
|
35
|
+
datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=rg_qAnMrXYUZhQYxA12r_Npl0ggyfTLJQjdSCjU0QxM,1228
|
|
36
36
|
datamaestro_text/config/io/metamind/research/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
|
-
datamaestro_text/config/io/metamind/research/wikitext.py,sha256=
|
|
37
|
+
datamaestro_text/config/io/metamind/research/wikitext.py,sha256=DjyBmG74JvuMt9RpMwuLAnxzOdByIWsk4VnXgkJp1NM,2307
|
|
38
38
|
datamaestro_text/config/net/mattmahoney/enwiki.yaml,sha256=HCUn3s0AydXX3BjJ6yUXY0vGLGWSBkOCaDhQ4PA2Adg,2452
|
|
39
39
|
datamaestro_text/config/org/acm/recsys/cb2014.yaml,sha256=5SAK3Am1k0HFugSSCIQN5mLPBfr1zZZAkhLrSH5pHQc,1274
|
|
40
40
|
datamaestro_text/config/org/cocodataset/index.yaml,sha256=KISJChMeKwlZbSnHmRcGMsm6jbcFGVe1aA4GhP2fzqw,474
|
|
41
|
-
datamaestro_text/config/org/grouplens/movielens.py,sha256=
|
|
42
|
-
datamaestro_text/config/org/universaldependencies/french.py,sha256=
|
|
41
|
+
datamaestro_text/config/org/grouplens/movielens.py,sha256=tV6OSTDdtjll1dQBCsYIlsBbtOO-MCiLles2aj0MgDA,1840
|
|
42
|
+
datamaestro_text/config/org/universaldependencies/french.py,sha256=etedb3_SC-fV5Oa2rM4_smZk6t4CPiNvU4C4keUFZHY,2214
|
|
43
43
|
datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml,sha256=IBy82CDNNLjJPNPzues1EgDXu0A5WDvUFeVNSOyrIpI,1137
|
|
44
44
|
datamaestro_text/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
45
|
datamaestro_text/data/embeddings.py,sha256=qdeZ4m6drFNkqW_4pKxJKdJHtOnDRs0P7QVJ7AO6xFE,1127
|
|
@@ -47,42 +47,42 @@ datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG
|
|
|
47
47
|
datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
|
|
48
48
|
datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
|
|
49
49
|
datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
|
|
50
|
-
datamaestro_text/data/conversation/base.py,sha256=
|
|
50
|
+
datamaestro_text/data/conversation/base.py,sha256=BAqu5YOjlRcBOSiyvrjiTIuuc1OalM0BDtcEvJYhO2Y,7515
|
|
51
51
|
datamaestro_text/data/conversation/canard.py,sha256=FI02rPNuzSsA_uXOcgcuQn31d3QahmHndzYCV1Zm8rk,3305
|
|
52
|
-
datamaestro_text/data/conversation/ikat.py,sha256=
|
|
52
|
+
datamaestro_text/data/conversation/ikat.py,sha256=djxTv0vHLXIUPzfmwpB25fyUWJNVPqthTtDbQ6CzmQo,4363
|
|
53
53
|
datamaestro_text/data/conversation/orconvqa.py,sha256=RL9cpK7QK2dsk_j2e4Th8wzGFq-G3ww_EjdM1eKh-8o,3764
|
|
54
54
|
datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
|
|
55
|
-
datamaestro_text/data/ir/__init__.py,sha256=
|
|
56
|
-
datamaestro_text/data/ir/base.py,sha256=
|
|
55
|
+
datamaestro_text/data/ir/__init__.py,sha256=ZRJrUeeUyD1ncMN5JINVvFJ2lDr3KsbgiiEBJkczSi0,9814
|
|
56
|
+
datamaestro_text/data/ir/base.py,sha256=uwIiKn0ryK5DWUQsEegeTs6bye0uAOGl0XVm_cvV3ZU,1506
|
|
57
57
|
datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
|
|
58
58
|
datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
|
|
59
59
|
datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
|
|
60
|
-
datamaestro_text/data/ir/formats.py,sha256=
|
|
60
|
+
datamaestro_text/data/ir/formats.py,sha256=Tbu5rrssq9m76a448-ixgs7a-KKvNCvXYkdv5FkFqp4,3641
|
|
61
61
|
datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
|
|
62
|
-
datamaestro_text/data/ir/stores.py,sha256=
|
|
63
|
-
datamaestro_text/data/ir/trec.py,sha256=
|
|
62
|
+
datamaestro_text/data/ir/stores.py,sha256=A4Ew0L4P6iLLmDKhxqjjVkHcz797BHf4d76YguGkB1A,4412
|
|
63
|
+
datamaestro_text/data/ir/trec.py,sha256=IOtQRMUz8zx-dYEMR2NIIM6qXEUjsV0eVOhGvKIRJK4,1974
|
|
64
64
|
datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
|
|
65
65
|
datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
|
|
66
|
-
datamaestro_text/datasets/irds/data.py,sha256=
|
|
66
|
+
datamaestro_text/datasets/irds/data.py,sha256=YlDbGFsh6_mCmk49F3bwdsLEbpHVvMv4gvc1H8KZnpo,23096
|
|
67
67
|
datamaestro_text/datasets/irds/datasets.py,sha256=n5mvD2omVI4PkeuJgX13Cio6gLrgYjxq8ZUKKyqocEs,5625
|
|
68
68
|
datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
|
|
69
69
|
datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
|
|
70
70
|
datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
|
|
71
71
|
datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
|
|
72
|
-
datamaestro_text/interfaces/trec.py,sha256=
|
|
72
|
+
datamaestro_text/interfaces/trec.py,sha256=GrP0N_Hcj5f73KS6CSzkyN4aaI-XoBJ19oVMpHVR3QM,3579
|
|
73
73
|
datamaestro_text/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
74
74
|
datamaestro_text/test/test_datasets.py,sha256=hD1pe-CjEnOj0JNqDLasz_97ltFzMbYfyYBy1QyYdf8,202
|
|
75
75
|
datamaestro_text/test/test_documented.py,sha256=Kl90fz_r_dsV0oXE1Mad34GiQyZ9dc6bfZKNtxP0O2s,453
|
|
76
76
|
datamaestro_text/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
77
77
|
datamaestro_text/transforms/ir/__init__.py,sha256=eWxr0ZtfK9F9J4TachW5V8Ra5AhQJueDyOGR8vXi1uA,6553
|
|
78
78
|
datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_p5ha0,3340
|
|
79
|
-
datamaestro_text/utils/files.py,sha256=
|
|
80
|
-
datamaestro_text/utils/iter.py,sha256=
|
|
79
|
+
datamaestro_text/utils/files.py,sha256=jhcirufgTztEkx1hs2-qgScEcHnIcGF_BjCeuCSsNv0,2838
|
|
80
|
+
datamaestro_text/utils/iter.py,sha256=QBajeSPLHvkeh6BCTZDSqWlOYNjwUDvgTTZ_YxJntXw,2701
|
|
81
81
|
datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
|
|
82
82
|
datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
|
|
83
|
-
datamaestro_text-2025.
|
|
84
|
-
datamaestro_text-2025.
|
|
85
|
-
datamaestro_text-2025.
|
|
86
|
-
datamaestro_text-2025.
|
|
87
|
-
datamaestro_text-2025.
|
|
88
|
-
datamaestro_text-2025.
|
|
83
|
+
datamaestro_text-2025.9.11.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
84
|
+
datamaestro_text-2025.9.11.dist-info/METADATA,sha256=ChGV_8bnixfGl91eG_3-Qwba8tjMwe2VPCwXdGxG_xM,1848
|
|
85
|
+
datamaestro_text-2025.9.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
86
|
+
datamaestro_text-2025.9.11.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
|
|
87
|
+
datamaestro_text-2025.9.11.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
|
|
88
|
+
datamaestro_text-2025.9.11.dist-info/RECORD,,
|
|
File without changes
|
{datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|