datamaestro-text 2025.6.30__py3-none-any.whl → 2025.7.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/config/com/github/ikat.py +102 -19
- datamaestro_text/config/com/sentiment140.py +4 -4
- datamaestro_text/config/gov/nist/trec/tipster.py +0 -1
- datamaestro_text/data/conversation/base.py +34 -9
- datamaestro_text/data/conversation/ikat.py +38 -13
- datamaestro_text/data/ir/__init__.py +25 -2
- datamaestro_text/data/ir/base.py +2 -1
- datamaestro_text/data/ir/formats.py +8 -2
- datamaestro_text/data/ir/stores.py +99 -4
- datamaestro_text/data/ir/trec.py +7 -4
- datamaestro_text/datasets/irds/data.py +34 -15
- datamaestro_text/interfaces/trec.py +28 -1
- datamaestro_text/utils/files.py +103 -0
- datamaestro_text/utils/iter.py +5 -0
- datamaestro_text/version.py +2 -2
- {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/METADATA +3 -3
- {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/RECORD +21 -21
- {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/WHEEL +0 -0
- {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/licenses/LICENSE +0 -0
- {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/top_level.txt +0 -0
|
@@ -1,38 +1,121 @@
|
|
|
1
1
|
# See documentation on https://datamaestro.readthedocs.io
|
|
2
2
|
|
|
3
|
+
import bz2
|
|
4
|
+
from datamaestro.download import reference
|
|
3
5
|
from datamaestro.definitions import datatasks, datatags, dataset
|
|
4
|
-
from
|
|
5
|
-
from
|
|
6
|
+
from datamaestro_text.data.conversation.base import ConversationUserTopics
|
|
7
|
+
from datamaestro_text.data.ir import Adhoc
|
|
6
8
|
|
|
7
9
|
from datamaestro.utils import HashCheck
|
|
10
|
+
from datamaestro.context import DatafolderPath
|
|
8
11
|
from datamaestro.download.single import filedownloader
|
|
9
|
-
from datamaestro_text.data.conversation.ikat import
|
|
10
|
-
from
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
import
|
|
12
|
+
from datamaestro_text.data.conversation.ikat import IkatConversations
|
|
13
|
+
from datamaestro.download.links import linkfolder
|
|
14
|
+
|
|
15
|
+
from datamaestro_text.data.ir.stores import IKatClueWeb22DocumentStore
|
|
16
|
+
from datamaestro_text.data.ir.trec import TrecAdhocAssessments
|
|
17
|
+
from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataset(as_prepare=True)
def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
    # Prepares the iKAT ClueWeb22 passage store: sets up a link to the JSONL
    # folder registered as "gov.nist.trec.ikat.clueweb22" / "jsonl", then sets
    # up an LZ4 document store built from these files and keyed by "id".
    # NOTE(review): what `linkfolder(...).setup` and
    # `lz4docstore_builder(...).setup` do on disk is defined outside this
    # file -- confirm before relying on it.

    # Number of passages in the collection (also given as a hint to the builder)
    passage_count = 116_838_987

    # Folder containing the JSONL files and the two checksum files
    source = linkfolder(
        "documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
    )
    jsonl_dir = source.setup(dataset, options)

    # Factory of document iterators over the JSONL files
    documents_factory = IKatClueWeb22DocumentStore.generator(
        jsonl_dir,
        jsonl_dir / "ikat_2023_passages_jsonl.sha256sums",
        jsonl_dir / "ikat_2023_passages_hashes.tsv.bz2",
    )
    builder = lz4docstore_builder(
        "store",
        documents_factory,
        IKatClueWeb22DocumentStore.Document,
        "id",
        count_hint=passage_count,
    )
    lz4_path = builder.setup(dataset, options)

    return IKatClueWeb22DocumentStore.C(path=lz4_path, count=passage_count)
|
|
41
|
+
|
|
15
42
|
|
|
16
43
|
@datatags("conversation", "context", "query")
|
|
17
|
-
@datatasks("query rewriting")
|
|
44
|
+
@datatasks("conversational search", "query rewriting")
|
|
45
|
+
@reference("documents", clueweb22)
|
|
18
46
|
@filedownloader(
|
|
19
|
-
"
|
|
47
|
+
"topics.json",
|
|
20
48
|
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
|
|
21
49
|
checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
|
|
22
50
|
)
|
|
23
|
-
|
|
24
51
|
@dataset(
|
|
25
|
-
|
|
52
|
+
id="2025",
|
|
26
53
|
url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
|
|
27
54
|
)
|
|
28
|
-
|
|
29
|
-
def main(test) -> Supervised[IkatDataset, None, IkatDataset]:
|
|
55
|
+
def test_2025(topics, documents) -> Adhoc.C:
|
|
30
56
|
"""Question-in-context rewriting
|
|
31
57
|
|
|
32
|
-
iKAT is a test dataset for question-in-context rewriting that consists of
|
|
58
|
+
iKAT is a test dataset for question-in-context rewriting that consists of
|
|
33
59
|
questions each given in a dialog context together with a context-independent
|
|
34
|
-
rewriting of the question.
|
|
35
|
-
One of the special features of iKAT is that it includes a Personal PKTB',
|
|
60
|
+
rewriting of the question.
|
|
36
61
|
"""
|
|
37
|
-
|
|
38
|
-
|
|
62
|
+
return Adhoc.C(
|
|
63
|
+
topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
|
|
64
|
+
# TODO: add when available
|
|
65
|
+
assessments=TrecAdhocAssessments.C(path="/to/do"),
|
|
66
|
+
documents=documents,
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@datatags("conversation", "context", "query")
@datatasks("conversational search", "query rewriting")
@reference("documents", clueweb22)
@filedownloader(
    "qrels",
    "https://trec.nist.gov/data/ikat/2024-qrels.txt",
    checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
)
@filedownloader(
    "topics.json",
    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
    checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
)
@dataset(
    Adhoc,
    id="2024",
    url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
)
def test_2024(topics, qrels, documents) -> Adhoc.C:
    """iKAT 2024 dataset"""
    # Topics are the user queries extracted from the iKAT conversation file
    conversations = IkatConversations.C(path=topics)
    user_topics = ConversationUserTopics.C(conversations=conversations)

    # Relevance judgments come from the downloaded TREC qrels file
    judgments = TrecAdhocAssessments.C(path=qrels)

    return Adhoc.C(topics=user_topics, assessments=judgments, documents=documents)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@datatags("conversation", "context", "query")
@datatasks("conversational search", "query rewriting")
@reference("documents", clueweb22)
@filedownloader(
    "qrels",
    "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
    checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
)
@filedownloader(
    "topics.json",
    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
    checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
)
@dataset(
    Adhoc,
    id="2023",
    url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
)
def test_2023(topics, qrels, documents) -> Adhoc.C:
    """iKAT 2023 dataset"""
    # Topics are the user queries extracted from the iKAT conversation file
    conversations = IkatConversations.C(path=topics)
    user_topics = ConversationUserTopics.C(conversations=conversations)

    # Relevance judgments come from the downloaded TREC qrels file
    judgments = TrecAdhocAssessments.C(path=qrels)

    return Adhoc.C(topics=user_topics, assessments=judgments, documents=documents)
|
|
@@ -26,7 +26,7 @@ def english(dir):
|
|
|
26
26
|
|
|
27
27
|
If you use this data, please cite Sentiment140 as your source.
|
|
28
28
|
"""
|
|
29
|
-
return
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
29
|
+
return Supervised.C(
|
|
30
|
+
train=Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
|
|
31
|
+
test=Generic(path=dir / "testdata.manual.2009.06.14.csv"),
|
|
32
|
+
)
|
|
@@ -17,7 +17,6 @@ of three CD-ROMs of SGML encoded documents distributed by LDC plus queries and a
|
|
|
17
17
|
See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data/intro_eng.html
|
|
18
18
|
"""
|
|
19
19
|
|
|
20
|
-
from datamaestro.data import Base
|
|
21
20
|
from datamaestro_text.data.ir.trec import TipsterCollection
|
|
22
21
|
from datamaestro.download.links import linkfolder
|
|
23
22
|
from datamaestro.definitions import (
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from enum import Enum
|
|
3
|
+
from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
|
|
4
|
+
from experimaestro import Param
|
|
3
5
|
from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
|
|
4
6
|
from attr import define
|
|
7
|
+
from datamaestro.record import record_type
|
|
5
8
|
from datamaestro.data import Base
|
|
6
9
|
from datamaestro.record import Record, Item
|
|
7
|
-
from datamaestro_text.data.ir import TopicRecord
|
|
10
|
+
from datamaestro_text.data.ir import TopicRecord, Topics
|
|
8
11
|
from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView
|
|
9
12
|
|
|
10
13
|
# ---- Basic types
|
|
@@ -120,20 +123,17 @@ class ConversationNode:
|
|
|
120
123
|
...
|
|
121
124
|
|
|
122
125
|
@abstractmethod
|
|
123
|
-
def parent(self) -> Optional["ConversationNode"]:
|
|
124
|
-
...
|
|
126
|
+
def parent(self) -> Optional["ConversationNode"]: ...
|
|
125
127
|
|
|
126
128
|
@abstractmethod
|
|
127
|
-
def children(self) -> List["ConversationNode"]:
|
|
128
|
-
...
|
|
129
|
+
def children(self) -> List["ConversationNode"]: ...
|
|
129
130
|
|
|
130
131
|
|
|
131
132
|
class ConversationTree(ABC):
|
|
132
133
|
"""Represents a conversation tree"""
|
|
133
134
|
|
|
134
135
|
@abstractmethod
|
|
135
|
-
def root(self) -> ConversationNode:
|
|
136
|
-
...
|
|
136
|
+
def root(self) -> ConversationNode: ...
|
|
137
137
|
|
|
138
138
|
@abstractmethod
|
|
139
139
|
def __iter__(self) -> Iterator[ConversationNode]:
|
|
@@ -253,5 +253,30 @@ class ConversationDataset(Base, ABC):
|
|
|
253
253
|
@abstractmethod
|
|
254
254
|
def __iter__(self) -> Iterator[ConversationTree]:
|
|
255
255
|
"""Return an iterator over conversations"""
|
|
256
|
-
|
|
257
|
-
|
|
256
|
+
...
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
class ConversationUserTopics(Topics):
    """Topics made of the user queries found in a set of conversations

    Every conversation node whose entry is typed ``EntryType.USER_QUERY``
    yields one topic: the node entry, updated with a
    ``ConversationHistoryItem`` holding the node history.
    """

    # The conversations the user queries are extracted from
    conversations: Param[ConversationDataset]

    topic_recordtype = record_type(IDItem, SimpleTextItem)

    def iter(self) -> Iterator[TopicRecord]:
        """Returns an iterator over topics

        All the topics are collected in a list before the iterator is
        returned.
        """
        # Each user query is a topic (retrieval can be performed on it)
        # TODO: merge with xpmir.learning.DatasetConversationBase -> same logic

        topics: List[TopicRecord] = []
        for tree in self.conversations:
            # First select the user turns of this conversation...
            user_nodes = [
                candidate
                for candidate in tree
                if candidate.entry[EntryType] == EntryType.USER_QUERY
            ]
            # ... then attach the conversation history to each of them
            topics.extend(
                user_node.entry.update(ConversationHistoryItem(user_node.history()))
                for user_node in user_nodes
            )
        return iter(topics)
|
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
from typing import Iterator, List
|
|
1
|
+
from typing import Iterator, List
|
|
2
2
|
from attr import define, field
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
5
|
from datamaestro.data import File
|
|
6
6
|
from datamaestro.record import Record
|
|
7
7
|
|
|
8
|
+
from datamaestro_text.data.ir import Topics
|
|
8
9
|
from datamaestro_text.data.ir.base import (
|
|
9
10
|
IDItem,
|
|
10
11
|
SimpleTextItem,
|
|
@@ -12,7 +13,6 @@ from datamaestro_text.data.ir.base import (
|
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
from .base import (
|
|
15
|
-
AnswerDocumentURL,
|
|
16
16
|
AnswerEntry,
|
|
17
17
|
ConversationTree,
|
|
18
18
|
EntryType,
|
|
@@ -21,6 +21,25 @@ from .base import (
|
|
|
21
21
|
)
|
|
22
22
|
from . import ConversationDataset
|
|
23
23
|
|
|
24
|
+
# Keys to change in the dataset entries for compatibility across different years
|
|
25
|
+
|
|
26
|
+
KEY_MAPPINGS = {
    # Key found in some dataset years -> key expected by the entry classes
    "turns": "responses",
    "utterance": "user_utterance",
    "ptkb_provenance": "relevant_ptkbs",
    "response_provenance": "citations",
}


def norm_dict(entry: dict) -> dict:
    """Return a copy of ``entry`` with normalized keys.

    A key is replaced by its ``KEY_MAPPINGS`` target when the key itself, or
    its lowercase form, is listed there; any other key is lowercased. Values
    are kept untouched (nested dictionaries are not processed).

    :param entry: A dictionary with string keys
    :return: A new dictionary with the normalized keys
    """

    def canonical(key):
        lowered = key.lower()
        # Direct mapping first, then lowercase mapping, then plain lowercase
        return KEY_MAPPINGS.get(key) or KEY_MAPPINGS.get(lowered) or lowered

    return {canonical(key): value for key, value in entry.items()}
|
|
24
43
|
|
|
25
44
|
|
|
26
45
|
@define(kw_only=True)
|
|
@@ -47,7 +66,7 @@ class IkatConversationEntry:
|
|
|
47
66
|
|
|
48
67
|
|
|
49
68
|
@define(kw_only=True)
|
|
50
|
-
class
|
|
69
|
+
class IkatConversationTopic:
|
|
51
70
|
"""A query with past history"""
|
|
52
71
|
|
|
53
72
|
number: str
|
|
@@ -60,14 +79,20 @@ class IkatDatasetEntry:
|
|
|
60
79
|
"""The personal knowledge base associated with the user"""
|
|
61
80
|
|
|
62
81
|
responses: List[IkatConversationEntry] = field(
|
|
63
|
-
converter=lambda items: [
|
|
82
|
+
converter=lambda items: [
|
|
83
|
+
IkatConversationEntry(**item) if isinstance(item, dict) else item
|
|
84
|
+
for item in map(norm_dict, items)
|
|
85
|
+
]
|
|
64
86
|
)
|
|
65
87
|
"""The list of responses to the query"""
|
|
66
88
|
|
|
67
89
|
|
|
68
|
-
class
|
|
90
|
+
class IkatConversations(ConversationDataset, File):
|
|
91
|
+
"""A dataset containing conversations from the IKAT project"""
|
|
69
92
|
|
|
70
|
-
|
|
93
|
+
"""Keys to change in the dataset entries for compatibility across different years"""
|
|
94
|
+
|
|
95
|
+
def entries(self) -> Iterator[IkatConversationTopic]:
|
|
71
96
|
"""Reads all conversation entries from the dataset file."""
|
|
72
97
|
with self.path.open("rt") as fp:
|
|
73
98
|
raw_data = json.load(fp)
|
|
@@ -75,12 +100,13 @@ class IkatDataset(ConversationDataset, File):
|
|
|
75
100
|
logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
|
|
76
101
|
logging.debug(f"raw data has keys {raw_data[0].keys()}")
|
|
77
102
|
|
|
78
|
-
processed_data = []
|
|
79
103
|
for entry in raw_data:
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
104
|
+
try:
|
|
105
|
+
normalized_entry = norm_dict(entry)
|
|
106
|
+
yield IkatConversationTopic(**normalized_entry)
|
|
107
|
+
except Exception as e:
|
|
108
|
+
logging.warning(f"Failed to parse entry: {e}")
|
|
109
|
+
raise e
|
|
84
110
|
|
|
85
111
|
def __iter__(self) -> Iterator[ConversationTree]:
|
|
86
112
|
for entry in self.entries():
|
|
@@ -88,7 +114,7 @@ class IkatDataset(ConversationDataset, File):
|
|
|
88
114
|
|
|
89
115
|
for turn in entry.responses:
|
|
90
116
|
turn: IkatConversationEntry = turn # Ensure type is correct
|
|
91
|
-
query_id = f"{entry.number}
|
|
117
|
+
query_id = f"{entry.number}_{turn.turn_id}"
|
|
92
118
|
|
|
93
119
|
# USER QUERY record
|
|
94
120
|
history.append(
|
|
@@ -117,4 +143,3 @@ class IkatDataset(ConversationDataset, File):
|
|
|
117
143
|
# Ensure reverse if needed for compatibility (optional)
|
|
118
144
|
history.reverse()
|
|
119
145
|
yield SingleConversationTree(entry.number, history)
|
|
120
|
-
|
|
@@ -2,9 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
4
|
from functools import cached_property
|
|
5
|
+
import logging
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from attrs import define
|
|
7
|
-
from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type
|
|
8
|
+
from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING
|
|
8
9
|
import random
|
|
9
10
|
from experimaestro import Config
|
|
10
11
|
from datamaestro.definitions import datatasks, Param, Meta
|
|
@@ -28,6 +29,9 @@ from .base import ( # noqa: F401
|
|
|
28
29
|
AdhocAssessedTopic,
|
|
29
30
|
)
|
|
30
31
|
|
|
32
|
+
#: An adhoc run dictionary (query id -> doc id -> score)
|
|
33
|
+
AdhocRunDict = dict[str, dict[str, float]]
|
|
34
|
+
|
|
31
35
|
|
|
32
36
|
class Documents(Base):
|
|
33
37
|
"""A set of documents with identifiers
|
|
@@ -45,6 +49,22 @@ class Documents(Base):
|
|
|
45
49
|
def iter_documents(self) -> Iterator[DocumentRecord]:
|
|
46
50
|
return self.iter()
|
|
47
51
|
|
|
52
|
+
def iter_documents_from(self, start=0) -> Iterator[DocumentRecord]:
    """Iterate over a range of documents

    This generic implementation iterates over all the documents and discards
    the first ``start`` ones. Can be specialized in a subclass for faster
    access.

    :param start: The (0-based) position of the first document to return,
        defaults to 0. Values below 1 return every document.
    :return: An iterator over the documents from position ``start`` on; it is
        empty when there are at most ``start`` documents
    """
    from itertools import islice

    documents = self.iter()
    if start > 0:
        # Skip exactly `start` documents, so that the next one returned is
        # the document at position `start` (same convention as `[start:]`)
        logging.info("skipping %d documents", start)
        documents = islice(documents, start, None)

    return documents
|
|
67
|
+
|
|
48
68
|
def iter_ids(self) -> Iterator[str]:
|
|
49
69
|
"""Iterates over document ids
|
|
50
70
|
|
|
@@ -168,7 +188,10 @@ class AdhocAssessments(Base, ABC):
|
|
|
168
188
|
class AdhocRun(Base):
|
|
169
189
|
"""IR adhoc run"""
|
|
170
190
|
|
|
171
|
-
|
|
191
|
+
@abstractmethod
def get_dict(self) -> "AdhocRunDict":
    """Get the run as a dictionary

    This method is abstract and has no implementation here.

    :return: A mapping query ID -> doc ID -> score
    """
|
|
172
195
|
|
|
173
196
|
|
|
174
197
|
class AdhocResults(Base):
|
datamaestro_text/data/ir/base.py
CHANGED
|
@@ -43,6 +43,7 @@ class IDItem(Item, ABC):
|
|
|
43
43
|
|
|
44
44
|
id: str
|
|
45
45
|
|
|
46
|
+
|
|
46
47
|
@define
|
|
47
48
|
class UrlItem(Item):
|
|
48
49
|
"""An url item"""
|
|
@@ -70,7 +71,7 @@ class AdhocAssessedTopic:
|
|
|
70
71
|
"""List of assessments for this topic"""
|
|
71
72
|
|
|
72
73
|
|
|
73
|
-
def create_record(*items: Item, id: str = None, text: str = None):
|
|
74
|
+
def create_record(*items: Item, id: str = None, text: str = None) -> Record:
|
|
74
75
|
"""Easy creation of a text/id item"""
|
|
75
76
|
extra_items = []
|
|
76
77
|
if id is not None:
|
|
@@ -99,7 +99,7 @@ class WapoDocument(TextItem):
|
|
|
99
99
|
body_media: Tuple[WapoDocMedia, ...]
|
|
100
100
|
|
|
101
101
|
@cached_property
|
|
102
|
-
def text(self):
|
|
102
|
+
def text(self):
|
|
103
103
|
return f"{self.title} {self.body_paras_html}"
|
|
104
104
|
|
|
105
105
|
|
|
@@ -132,11 +132,15 @@ class DprW100Doc(TextItem):
|
|
|
132
132
|
text: str
|
|
133
133
|
title: str
|
|
134
134
|
|
|
135
|
-
|
|
135
|
+
|
|
136
|
+
@define
|
|
136
137
|
class MsMarcoV2Passage(TextItem):
|
|
137
138
|
text: str
|
|
138
139
|
spans: Tuple[Tuple[int, int], ...]
|
|
139
140
|
msmarco_document_id: str
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@define
|
|
140
144
|
class Touche2020(TextItem):
|
|
141
145
|
text: str
|
|
142
146
|
title: str
|
|
@@ -204,11 +208,13 @@ class TrecTopic(SimpleTextItem):
|
|
|
204
208
|
|
|
205
209
|
TrecTopicRecord = record_type(IDItem, TrecTopic)
|
|
206
210
|
|
|
211
|
+
|
|
207
212
|
@define
|
|
208
213
|
class DprW100Query(TextItem):
|
|
209
214
|
text: str
|
|
210
215
|
answers: Tuple[str]
|
|
211
216
|
|
|
217
|
+
|
|
212
218
|
@define
|
|
213
219
|
class TrecBackgroundLinkingQuery(IDItem):
|
|
214
220
|
query_id: str
|
|
@@ -1,12 +1,21 @@
|
|
|
1
|
-
|
|
1
|
+
import bz2
|
|
2
|
+
from hashlib import md5, sha256
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
2
6
|
from typing import List, NamedTuple
|
|
7
|
+
from datamaestro_text.utils.files import TQDMFileReader
|
|
3
8
|
from experimaestro import Constant
|
|
4
|
-
import attrs
|
|
5
|
-
|
|
6
9
|
from datamaestro.record import Record
|
|
7
|
-
from datamaestro_text.data.ir.base import
|
|
10
|
+
from datamaestro_text.data.ir.base import (
|
|
11
|
+
DocumentRecord,
|
|
12
|
+
IDItem,
|
|
13
|
+
SimpleTextItem,
|
|
14
|
+
UrlItem,
|
|
15
|
+
)
|
|
8
16
|
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
|
|
9
17
|
from datamaestro_text.data.ir.formats import OrConvQADocument
|
|
18
|
+
from tqdm import tqdm
|
|
10
19
|
|
|
11
20
|
|
|
12
21
|
class OrConvQADocumentStore(LZ4DocumentStore):
|
|
@@ -27,3 +36,89 @@ class OrConvQADocumentStore(LZ4DocumentStore):
|
|
|
27
36
|
fields = data._asdict()
|
|
28
37
|
del fields["id"]
|
|
29
38
|
return Record(OrConvQADocument(**fields), IDItem(data.id))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class IKatClueWeb22DocumentStore(LZ4DocumentStore):
    """LZ4 document store for the iKAT ClueWeb22 passages"""

    @staticmethod
    def generator(path: Path, checksums_file: Path, passages_hashes: Path):
        """Returns a factory of iterators over iKAT 2022-25 documents

        Nothing is read when this method is called: the work starts when the
        returned callable is called and the resulting generator is iterated.
        It then (1) checks that every file listed in ``checksums_file``
        exists, (2) verifies the checksum of each of these files, (3) loads
        the MD5 hash of every passage, and (4) reads the bz2-compressed JSONL
        files one by one, verifying each passage before yielding it.

        All the verifications raise exceptions (no ``assert``), so they are
        still performed when Python runs with ``-O``.

        :param path: The folder containing the files
        :param checksums_file: A text file with one whitespace-separated
            ``checksum filename`` pair per line; only the ``.sha256sums``
            suffix is supported
        :param passages_hashes: A bz2-compressed text file with one
            whitespace-separated ``doc_id passage_no md5`` triple per line
        :return: A callable (no argument) returning a generator of
            :class:`IKatClueWeb22DocumentStore.Document`
        :raises FileNotFoundError: If one of the two checksum files or one of
            the listed files does not exist
        :raises NotImplementedError: If the checksum file type is not handled
        :raises ValueError: If a file or a passage does not match its
            expected checksum
        """

        def __iter__():
            for required in (checksums_file, passages_hashes):
                if not required.is_file():
                    raise FileNotFoundError(f"{required} does not exist")

            # Get the list of files, reporting every missing one before failing
            files = []
            missing = False
            with checksums_file.open("rt") as fp:
                for line in fp:
                    checksum, filename = line.strip().split()
                    files.append((checksum, filename))
                    if not (path / filename).is_file():
                        logging.error("File %s does not exist", path / filename)
                        missing = True

            if missing:
                raise FileNotFoundError(
                    f"Some files listed in {checksums_file} are missing, stopping"
                )

            # Select the hash function from the type of checksum file
            if checksums_file.suffix != ".sha256sums":
                raise NotImplementedError(
                    f"Cannot handle {checksums_file.suffix} checksum files"
                )
            hasher_factory = sha256

            # Check the checksum of each file (read by chunks of 1 MiB)
            for checksum, filename in tqdm(files):
                logging.info("Checking %s", filename)
                hasher = hasher_factory()
                with (path / filename).open("rb") as fp:
                    while data := fp.read(2**20):
                        hasher.update(data)

                file_checksum = hasher.hexdigest()
                if file_checksum != checksum:
                    raise ValueError(
                        f"Expected {checksum}, got {file_checksum} for {filename}"
                    )

            # Get the MD5 hashes of all the passages
            # (keys are "doc_id:passage_no", matched below with the "id"
            # field of the JSONL records)
            logging.info("Reading the hashes of all passages")
            with TQDMFileReader(passages_hashes, "rt", bz2.open) as fp:
                passage_checksums = {}
                for line in tqdm(fp):
                    doc_id, passage_no, checksum = line.strip().split()
                    passage_checksums[f"{doc_id}:{passage_no}"] = checksum  # noqa: E231

            # Read the files
            logging.info("Starting to read the files")
            for _, filename in tqdm(files):
                with TQDMFileReader(path / filename, "rt", bz2.open) as jsonl_fp:
                    for line in jsonl_fp:
                        data = json.loads(line)
                        # A passage without a known hash raises a KeyError
                        expected = passage_checksums[data["id"]]
                        computed = md5(data["contents"].encode("utf-8")).hexdigest()
                        if expected != computed:
                            raise ValueError(
                                f"Expected {expected}, "
                                f"got {computed} for passage {data['id']} in {filename}"
                            )
                        yield IKatClueWeb22DocumentStore.Document(**data)

        return __iter__

    class Document(NamedTuple):
        # One record of the JSONL files (fields: id, contents, url)
        id: str
        contents: str
        url: str

    data_cls = Document
    lookup_field: Constant[str] = "id"
    index_fields: Constant[List[str]] = ["id"]

    def converter(self, data):
        """Converts a stored ``Document`` into a document record

        :param data: The stored document
        :return: A record with the identifier, the text (``contents``) and
            the URL of the document
        """
        return DocumentRecord(
            IDItem(data.id), SimpleTextItem(data.contents), UrlItem(data.url)
        )
|
datamaestro_text/data/ir/trec.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
+
import re
|
|
1
2
|
from typing import Dict, List, Optional
|
|
2
|
-
from datamaestro.data import Base
|
|
3
3
|
from experimaestro import documentation, Param, Meta
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from datamaestro.record import Record
|
|
6
5
|
from datamaestro_text.data.ir import (
|
|
6
|
+
AdhocRunDict,
|
|
7
7
|
Documents,
|
|
8
8
|
Topics,
|
|
9
9
|
AdhocAssessments,
|
|
@@ -47,6 +47,11 @@ class TrecAdhocAssessments(AdhocAssessments):
|
|
|
47
47
|
class TrecAdhocRun(AdhocRun):
|
|
48
48
|
path: Param[Path]
|
|
49
49
|
|
|
50
|
+
def get_dict(self) -> AdhocRunDict:
    """Get the run stored at ``self.path`` as a dictionary

    :return: The result of ``datamaestro_text.interfaces.trec.parse_run``
    """
    # Imported when the method is called rather than at module level
    from datamaestro_text.interfaces.trec import parse_run

    return parse_run(self.path)
|
|
54
|
+
|
|
50
55
|
|
|
51
56
|
class TrecAdhocResults(AdhocResults):
|
|
52
57
|
"""Adhoc results (TREC format)"""
|
|
@@ -62,8 +67,6 @@ class TrecAdhocResults(AdhocResults):
|
|
|
62
67
|
|
|
63
68
|
def get_results(self) -> Dict[str, float]:
|
|
64
69
|
"""Returns the results as a dictionary {metric_name: value}"""
|
|
65
|
-
import re
|
|
66
|
-
|
|
67
70
|
re_spaces = re.compile(r"\s+")
|
|
68
71
|
|
|
69
72
|
results = {}
|
|
@@ -122,7 +122,14 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
122
122
|
formats.Touche2020, "doc_id", "text", "title", "stance", "url"
|
|
123
123
|
),
|
|
124
124
|
_irds.beir.BeirSciDoc: tuple_constructor(
|
|
125
|
-
formats.SciDocs,
|
|
125
|
+
formats.SciDocs,
|
|
126
|
+
"doc_id",
|
|
127
|
+
"text",
|
|
128
|
+
"title",
|
|
129
|
+
"authors",
|
|
130
|
+
"year",
|
|
131
|
+
"cited_by",
|
|
132
|
+
"references",
|
|
126
133
|
),
|
|
127
134
|
_irds.msmarco_document.MsMarcoDocument: tuple_constructor(
|
|
128
135
|
formats.MsMarcoDocument, "doc_id", "url", "title", "body"
|
|
@@ -198,6 +205,10 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
198
205
|
for doc in self.dataset.docs_iter():
|
|
199
206
|
yield self.converter(self.document_recordtype, doc)
|
|
200
207
|
|
|
208
|
+
def iter_documents_from(self, start=0):
    """Iterates over the converted documents, starting at ``start``

    The documents iterator of the dataset is sliced with ``[start:]``, and
    each document is converted with ``self.converter``.

    :param start: Start of the slice, defaults to 0
    """
    remaining = self.dataset.docs_iter()[start:]
    yield from (
        self.converter(self.document_recordtype, raw_doc) for raw_doc in remaining
    )
|
|
211
|
+
|
|
201
212
|
@property
|
|
202
213
|
def documentcount(self):
|
|
203
214
|
return self.dataset.docs_count()
|
|
@@ -244,7 +255,7 @@ if hasattr(_irds, "miracl"):
|
|
|
244
255
|
)
|
|
245
256
|
|
|
246
257
|
|
|
247
|
-
class LZ4DocumentStore(ir.DocumentStore):
|
|
258
|
+
class LZ4DocumentStore(ir.DocumentStore, ABC):
|
|
248
259
|
"""A LZ4-based document store"""
|
|
249
260
|
|
|
250
261
|
path: Param[Path]
|
|
@@ -253,7 +264,7 @@ class LZ4DocumentStore(ir.DocumentStore):
|
|
|
253
264
|
lookup_field: Param[str]
|
|
254
265
|
|
|
255
266
|
# Extra indexed fields (e.g. URLs)
|
|
256
|
-
index_fields: List[str]
|
|
267
|
+
index_fields: List[str] = []
|
|
257
268
|
|
|
258
269
|
@cached_property
|
|
259
270
|
def store(self):
|
|
@@ -285,6 +296,9 @@ class LZ4DocumentStore(ir.DocumentStore):
|
|
|
285
296
|
"""Returns an iterator over documents"""
|
|
286
297
|
return map(self.converter, self.store.__iter__())
|
|
287
298
|
|
|
299
|
+
def iter_documents_from(self, start=0):
    """Returns an iterator over the converted documents, starting at ``start``

    The store iterator is sliced with ``[start:]``, and each stored document
    goes through ``self.converter``.

    :param start: Start of the slice, defaults to 0
    """
    convert = self.converter
    stored = self.store.__iter__()
    return map(convert, stored[start:])
|
|
301
|
+
|
|
288
302
|
@cached_property
|
|
289
303
|
def documentcount(self):
|
|
290
304
|
if self.count:
|
|
@@ -386,7 +400,13 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
386
400
|
formats.TrecTopic, "query_id", "text", "description", "narrative"
|
|
387
401
|
),
|
|
388
402
|
_irds.beir.BeirSciQuery: tuple_constructor(
|
|
389
|
-
formats.SciDocsTopic,
|
|
403
|
+
formats.SciDocsTopic,
|
|
404
|
+
"query_id",
|
|
405
|
+
"text",
|
|
406
|
+
"authors",
|
|
407
|
+
"year",
|
|
408
|
+
"cited_by",
|
|
409
|
+
"references",
|
|
390
410
|
),
|
|
391
411
|
_irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
|
|
392
412
|
formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
|
|
@@ -400,10 +420,7 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
400
420
|
"description",
|
|
401
421
|
),
|
|
402
422
|
_irds.dpr_w100.DprW100Query: tuple_constructor(
|
|
403
|
-
formats.DprW100Query,
|
|
404
|
-
"query_id",
|
|
405
|
-
"text",
|
|
406
|
-
"answers"
|
|
423
|
+
formats.DprW100Query, "query_id", "text", "answers"
|
|
407
424
|
),
|
|
408
425
|
}
|
|
409
426
|
|
|
@@ -435,11 +452,12 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
435
452
|
def iter(self) -> Iterator[TopicRecord]:
|
|
436
453
|
"""Returns an iterator over topics"""
|
|
437
454
|
return self.handler.iter()
|
|
438
|
-
|
|
455
|
+
|
|
456
|
+
|
|
439
457
|
class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
|
|
440
458
|
def __init__(self, dataset):
|
|
441
459
|
self.dataset = dataset
|
|
442
|
-
|
|
460
|
+
|
|
443
461
|
@cached_property
|
|
444
462
|
def ext2records(self):
|
|
445
463
|
return {record[IDItem].id: record for record in self.records}
|
|
@@ -462,10 +480,12 @@ class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
|
|
|
462
480
|
records = []
|
|
463
481
|
|
|
464
482
|
for query in self.dataset.dataset.queries_iter():
|
|
465
|
-
topic =
|
|
483
|
+
topic = Record(
|
|
466
484
|
IDItem(query.query_id),
|
|
467
485
|
# Following BEIR documentation, we use title of documents as queries: https://github.com/beir-cellar/beir/blob/main/examples/dataset/README.md#queries-and-qrels
|
|
468
|
-
SimpleTextItem(
|
|
486
|
+
SimpleTextItem(
|
|
487
|
+
self.dataset.dataset.docs_store().get(query.doc_id).title
|
|
488
|
+
),
|
|
469
489
|
UrlItem(query.url),
|
|
470
490
|
)
|
|
471
491
|
records.append(topic)
|
|
@@ -477,11 +497,10 @@ class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
|
|
|
477
497
|
|
|
478
498
|
|
|
479
499
|
Topics.HANDLERS.update(
|
|
480
|
-
{
|
|
481
|
-
_irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler
|
|
482
|
-
}
|
|
500
|
+
{_irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler}
|
|
483
501
|
)
|
|
484
502
|
|
|
503
|
+
|
|
485
504
|
class CastTopicsHandler(TopicsHandler):
|
|
486
505
|
def __init__(self, dataset):
|
|
487
506
|
self.dataset = dataset
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
from attrs import define
|
|
2
1
|
from pathlib import Path
|
|
3
2
|
from typing import Iterator, Optional
|
|
4
3
|
import re
|
|
4
|
+
from datamaestro_text.data.ir import AdhocRunDict
|
|
5
5
|
from datamaestro_text.data.ir.base import (
|
|
6
6
|
AdhocAssessedTopic,
|
|
7
7
|
TopicRecord,
|
|
@@ -10,6 +10,33 @@ from datamaestro_text.data.ir.base import (
|
|
|
10
10
|
)
|
|
11
11
|
from datamaestro_text.data.ir.formats import TrecTopicRecord, TrecTopic
|
|
12
12
|
|
|
13
|
+
# --- Runs
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def parse_run(path: Path) -> AdhocRunDict:
    """Parse a run file in TREC format

    Each non-blank line must contain six whitespace-separated fields:
    ``query_id Q0 doc_id rank score run_id``. Only the query ID, the document
    ID and the score are kept; blank lines are ignored.

    :param path: The path of the run file
    :return: A dictionary query ID -> document ID -> score, where scores are
        floats (so that they can be compared and sorted numerically)
    :raises ValueError: If a non-blank line does not have exactly six fields,
        or if a score is not a number
    """
    results = {}
    with path.open("rt") as f:
        for line in f:
            # split() without argument splits on runs of whitespace and
            # ignores leading/trailing whitespace
            fields = line.split()
            if not fields:
                continue

            query_id, _q0, doc_id, _rank, score, _model_id = fields
            results.setdefault(query_id, {})[doc_id] = float(score)

    return results
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def write_run_dict(run: AdhocRunDict, run_path: Path):
    """Write a run dictionary as a text file

    For each query, the documents are written by decreasing score (documents
    with equal scores keep their dictionary order), one line per document:
    ``query_id Q0 doc_id rank score run``, the rank starting at 1.

    :param run: A dictionary query ID -> document ID -> score
    :param run_path: The path of the file to write
    """
    with run_path.open("wt") as out:
        for query_id, doc_scores in run.items():
            ranking = sorted(
                doc_scores.items(), key=lambda pair: pair[1], reverse=True
            )
            for rank, (doc_id, score) in enumerate(ranking, start=1):
                out.write(f"{query_id} Q0 {doc_id} {rank} {score} run\n")
|
|
38
|
+
|
|
39
|
+
|
|
13
40
|
# --- Assessments
|
|
14
41
|
|
|
15
42
|
|
datamaestro_text/utils/files.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from tqdm import tqdm
|
|
1
3
|
import gzip
|
|
2
4
|
from pathlib import Path
|
|
3
5
|
|
|
@@ -6,3 +8,104 @@ def auto_open(path: Path, mode: str):
|
|
|
6
8
|
if path.suffix == ".gz":
|
|
7
9
|
return gzip.open(path, mode)
|
|
8
10
|
return path.open(mode)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class CountingWrapper:
    """Wrap a file object to count the amount of data read from it.

    ``bytes_read`` is increased by the length of whatever ``read``,
    ``readline``, ``readlines`` or iteration return from the wrapped object.
    When the wrapped object is a raw (e.g. compressed) binary file, this is
    the number of bytes actually consumed from it. Any other attribute is
    forwarded to the wrapped object without being counted.
    """

    def __init__(self, file_obj):
        """
        :param file_obj: The file object to wrap (must provide ``read``,
            ``readline``, ``readlines`` and ``close``)
        """
        self.file_obj = file_obj
        self.bytes_read = 0

    def read(self, size=-1):
        """Read from the wrapped object and count the returned data"""
        data = self.file_obj.read(size)
        self.bytes_read += len(data)
        return data

    def readline(self, size=-1):
        """Read one line from the wrapped object and count it"""
        data = self.file_obj.readline(size)
        self.bytes_read += len(data)
        return data

    def readlines(self, hint=-1):
        """Read lines from the wrapped object and count them

        Without this method, the call would be forwarded by ``__getattr__``
        and the data read would not be counted.
        """
        lines = self.file_obj.readlines(hint)
        self.bytes_read += sum(len(line) for line in lines)
        return lines

    def __iter__(self):
        return self

    def __next__(self):
        # Iterate through readline() so that the data is counted
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    def close(self):
        """Close the wrapped object"""
        self.file_obj.close()

    def __getattr__(self, attr):
        # Only called when the attribute is not found on the wrapper itself:
        # forward to the wrapped object
        return getattr(self.file_obj, attr)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class TQDMBytesReader:
    """File-like reader that reports read progress on a tqdm progress bar.

    The given file object is wrapped in a ``CountingWrapper``; after each
    read the bar is advanced to the wrapper's ``bytes_read`` counter.
    Attributes not defined here are forwarded to the wrapper.
    """

    def __init__(self, file_obj, total_size, **tqdm_kwargs):
        """Wrap ``file_obj`` and create the progress bar.

        Args:
            file_obj: Object to read from.
            total_size: Value passed to tqdm as ``total``.
            **tqdm_kwargs: Extra keyword arguments passed to ``tqdm``.
        """
        self.file_obj = CountingWrapper(file_obj)
        self.tqdm = tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            **tqdm_kwargs,
        )

    def _update_progress(self):
        """Advance the bar by what was read since the last update."""
        delta = self.file_obj.bytes_read - self.tqdm.n
        if delta > 0:
            self.tqdm.update(delta)

    def read(self, size=-1):
        """Read up to ``size`` units and refresh the progress bar."""
        data = self.file_obj.read(size)
        self._update_progress()
        return data

    def readline(self, size=-1):
        """Read one line and refresh the progress bar."""
        line = self.file_obj.readline(size)
        self._update_progress()
        return line

    def readlines(self, hint=-1):
        """Read a list of lines and refresh the progress bar."""
        lines = self.file_obj.readlines(hint)
        self._update_progress()
        return lines

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next line, or stop when ``readline`` returns nothing."""
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    def close(self):
        """Close the progress bar and the wrapped file."""
        try:
            self.tqdm.close()
        finally:
            # Release the file even if closing the progress bar fails
            self.file_obj.close()

    def __getattr__(self, attr):
        """Delegate any other attribute to the underlying file object."""
        # Guard against unbounded recursion when file_obj is not set yet
        # (instance created without __init__, e.g. by copy/pickle)
        if attr == "file_obj":
            raise AttributeError(attr)
        return getattr(self.file_obj, attr)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class TQDMFileReader:
    """Context manager that opens a file and yields a progress-reporting reader.

    On entry the file is opened with ``file_opener(filepath, mode)`` and
    wrapped in a ``TQDMBytesReader`` whose total is the on-disk size of
    ``filepath``. On exit the reader (and with it the file) is closed.

    NOTE(review): the total is the size on disk, while progress is counted
    from what the opened stream returns; with a decompressing or text-mode
    opener these two quantities presumably differ -- confirm this is intended.
    """

    def __init__(self, filepath, mode="rt", file_opener=open, **tqdm_kwargs):
        """Record how to open the file; nothing is opened until ``__enter__``.

        Args:
            filepath: Path of the file to read.
            mode: Mode passed to ``file_opener``.
            file_opener: Callable invoked as ``file_opener(filepath, mode)``.
            **tqdm_kwargs: Extra keyword arguments passed on to the reader.
        """
        self.filepath = filepath
        self.mode = mode
        self.file_opener = file_opener
        self.tqdm_kwargs = tqdm_kwargs

    def __enter__(self):
        """Open the file and return the progress-reporting reader."""
        self.file_obj = self.file_opener(self.filepath, self.mode)
        try:
            total_size = os.path.getsize(self.filepath)  # this is compressed size
            self.reader = TQDMBytesReader(
                self.file_obj, total_size=total_size, **self.tqdm_kwargs
            )
        except BaseException:
            # __exit__ is not called when __enter__ fails, so the handle
            # opened above must be released here to avoid leaking it
            self.file_obj.close()
            raise
        return self.reader

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the reader; exceptions from the ``with`` body propagate."""
        self.reader.close()
|
datamaestro_text/utils/iter.py
CHANGED
|
@@ -82,3 +82,8 @@ class LazyList(Sequence):
|
|
|
82
82
|
# Convert the iterable to a list if it hasn't been already
|
|
83
83
|
if self.materialized_list is None:
|
|
84
84
|
self.materialized_list = list(self.iterable)
|
|
85
|
+
|
|
86
|
+
def reverse(self):
    """Reverse the elements in place.

    ``_materialize`` is called first so that ``materialized_list`` is
    available before it is reversed. Returns ``None``.
    """
    self._materialize()
    materialized = self.materialized_list
    materialized.reverse()
|
datamaestro_text/version.py
CHANGED
|
@@ -17,5 +17,5 @@ __version__: str
|
|
|
17
17
|
__version_tuple__: VERSION_TUPLE
|
|
18
18
|
version_tuple: VERSION_TUPLE
|
|
19
19
|
|
|
20
|
-
__version__ = version = '2025.
|
|
21
|
-
__version_tuple__ = version_tuple = (2025,
|
|
20
|
+
__version__ = version = '2025.7.28'
|
|
21
|
+
__version_tuple__ = version_tuple = (2025, 7, 28)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version: 2025.
|
|
3
|
+
Version: 2025.7.28
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
6
|
License: GPL-3
|
|
@@ -15,10 +15,10 @@ Classifier: Operating System :: OS Independent
|
|
|
15
15
|
Classifier: Programming Language :: Python
|
|
16
16
|
Classifier: Programming Language :: Python :: 3
|
|
17
17
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
-
Requires-Python: >=3.
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
|
-
Requires-Dist: datamaestro>=1.
|
|
21
|
+
Requires-Dist: datamaestro>=1.5.0
|
|
22
22
|
Requires-Dist: ir_datasets>=0.5.8
|
|
23
23
|
Requires-Dist: attrs
|
|
24
24
|
Provides-Extra: dev
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
|
|
2
|
-
datamaestro_text/version.py,sha256=
|
|
2
|
+
datamaestro_text/version.py,sha256=rJQHFC3G5XDG0rUPZ6r1msOA_XCbSY-qMukJgu2nA1M,519
|
|
3
3
|
datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
|
|
5
5
|
datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
|
|
6
|
-
datamaestro_text/config/com/sentiment140.py,sha256=
|
|
6
|
+
datamaestro_text/config/com/sentiment140.py,sha256=WKKLaD7psbj9fIaTBHDTzOZanO2mukaB1g7aeTN1jdU,1289
|
|
7
7
|
datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
|
|
8
|
-
datamaestro_text/config/com/github/ikat.py,sha256=
|
|
8
|
+
datamaestro_text/config/com/github/ikat.py,sha256=DCayX-t2OBeW5bOJvRxoQgIH3vy-__mYzdmVcnayAkk,4230
|
|
9
9
|
datamaestro_text/config/com/github/aagohary/canard.py,sha256=DREIKU2FFrfwibjU48Y1AsKTUzhwxmD4Kuykd4bDIus,1762
|
|
10
10
|
datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=oYI0SUxEYzGoL2IbRrnze2cQuWwENwNk4ID9NQuI2Tw,3061
|
|
11
11
|
datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=MFJYaxJoqJQ6kMDHa2PIE0zoxYTA8Kyl26-vzFoMML0,3032
|
|
@@ -29,7 +29,7 @@ datamaestro_text/config/gov/nist/trec/adhoc.py,sha256=vcFaLlZ-chwDt013MYI8bYZ2ug
|
|
|
29
29
|
datamaestro_text/config/gov/nist/trec/clueweb.yaml,sha256=sm4UbdtMzWoDVPsewtVDS7Vj2jBOdgp18Xqo1X4ysQc,792
|
|
30
30
|
datamaestro_text/config/gov/nist/trec/deeplearning.yaml,sha256=QGM7PtXLJRttNdOPE16o7-k3e5tA9HgcaM_-qFDV_5Q,2125
|
|
31
31
|
datamaestro_text/config/gov/nist/trec/index.yaml,sha256=oSUhUmtukf5oXqUbJLhae8xZx7Uac5V6uZoUB-RJ7Sw,2711
|
|
32
|
-
datamaestro_text/config/gov/nist/trec/tipster.py,sha256=
|
|
32
|
+
datamaestro_text/config/gov/nist/trec/tipster.py,sha256=DirpnHpS10e27LcL7v9ksKreKVy7EgfVhyztV49VRds,5364
|
|
33
33
|
datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJolWCts_VETACha3hfQ,563
|
|
34
34
|
datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
|
|
35
35
|
datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=LJYexk2ssZ5SqKIKLjPdybS9ju4g9fGIFy5eybtsvYQ,1224
|
|
@@ -47,42 +47,42 @@ datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG
|
|
|
47
47
|
datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
|
|
48
48
|
datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
|
|
49
49
|
datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
|
|
50
|
-
datamaestro_text/data/conversation/base.py,sha256=
|
|
50
|
+
datamaestro_text/data/conversation/base.py,sha256=BAqu5YOjlRcBOSiyvrjiTIuuc1OalM0BDtcEvJYhO2Y,7515
|
|
51
51
|
datamaestro_text/data/conversation/canard.py,sha256=FI02rPNuzSsA_uXOcgcuQn31d3QahmHndzYCV1Zm8rk,3305
|
|
52
|
-
datamaestro_text/data/conversation/ikat.py,sha256=
|
|
52
|
+
datamaestro_text/data/conversation/ikat.py,sha256=djxTv0vHLXIUPzfmwpB25fyUWJNVPqthTtDbQ6CzmQo,4363
|
|
53
53
|
datamaestro_text/data/conversation/orconvqa.py,sha256=RL9cpK7QK2dsk_j2e4Th8wzGFq-G3ww_EjdM1eKh-8o,3764
|
|
54
54
|
datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
|
|
55
|
-
datamaestro_text/data/ir/__init__.py,sha256=
|
|
56
|
-
datamaestro_text/data/ir/base.py,sha256=
|
|
55
|
+
datamaestro_text/data/ir/__init__.py,sha256=jHoyIWyl0beUbg1gmkwNFf1cQRawB8p3SGfa17gniGM,9442
|
|
56
|
+
datamaestro_text/data/ir/base.py,sha256=uwIiKn0ryK5DWUQsEegeTs6bye0uAOGl0XVm_cvV3ZU,1506
|
|
57
57
|
datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
|
|
58
58
|
datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
|
|
59
59
|
datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
|
|
60
|
-
datamaestro_text/data/ir/formats.py,sha256=
|
|
60
|
+
datamaestro_text/data/ir/formats.py,sha256=Tbu5rrssq9m76a448-ixgs7a-KKvNCvXYkdv5FkFqp4,3641
|
|
61
61
|
datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
|
|
62
|
-
datamaestro_text/data/ir/stores.py,sha256=
|
|
63
|
-
datamaestro_text/data/ir/trec.py,sha256=
|
|
62
|
+
datamaestro_text/data/ir/stores.py,sha256=A4Ew0L4P6iLLmDKhxqjjVkHcz797BHf4d76YguGkB1A,4412
|
|
63
|
+
datamaestro_text/data/ir/trec.py,sha256=IOtQRMUz8zx-dYEMR2NIIM6qXEUjsV0eVOhGvKIRJK4,1974
|
|
64
64
|
datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
|
|
65
65
|
datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
|
|
66
|
-
datamaestro_text/datasets/irds/data.py,sha256=
|
|
66
|
+
datamaestro_text/datasets/irds/data.py,sha256=eUehp_80H1yyh7CVkM7mOWJtB9XHlmI-A7fLewXuaDE,22487
|
|
67
67
|
datamaestro_text/datasets/irds/datasets.py,sha256=n5mvD2omVI4PkeuJgX13Cio6gLrgYjxq8ZUKKyqocEs,5625
|
|
68
68
|
datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
|
|
69
69
|
datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
|
|
70
70
|
datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
|
|
71
71
|
datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
|
|
72
|
-
datamaestro_text/interfaces/trec.py,sha256=
|
|
72
|
+
datamaestro_text/interfaces/trec.py,sha256=GrP0N_Hcj5f73KS6CSzkyN4aaI-XoBJ19oVMpHVR3QM,3579
|
|
73
73
|
datamaestro_text/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
74
74
|
datamaestro_text/test/test_datasets.py,sha256=hD1pe-CjEnOj0JNqDLasz_97ltFzMbYfyYBy1QyYdf8,202
|
|
75
75
|
datamaestro_text/test/test_documented.py,sha256=Kl90fz_r_dsV0oXE1Mad34GiQyZ9dc6bfZKNtxP0O2s,453
|
|
76
76
|
datamaestro_text/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
77
77
|
datamaestro_text/transforms/ir/__init__.py,sha256=eWxr0ZtfK9F9J4TachW5V8Ra5AhQJueDyOGR8vXi1uA,6553
|
|
78
78
|
datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_p5ha0,3340
|
|
79
|
-
datamaestro_text/utils/files.py,sha256=
|
|
80
|
-
datamaestro_text/utils/iter.py,sha256=
|
|
79
|
+
datamaestro_text/utils/files.py,sha256=jhcirufgTztEkx1hs2-qgScEcHnIcGF_BjCeuCSsNv0,2838
|
|
80
|
+
datamaestro_text/utils/iter.py,sha256=QBajeSPLHvkeh6BCTZDSqWlOYNjwUDvgTTZ_YxJntXw,2701
|
|
81
81
|
datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
|
|
82
82
|
datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
|
|
83
|
-
datamaestro_text-2025.
|
|
84
|
-
datamaestro_text-2025.
|
|
85
|
-
datamaestro_text-2025.
|
|
86
|
-
datamaestro_text-2025.
|
|
87
|
-
datamaestro_text-2025.
|
|
88
|
-
datamaestro_text-2025.
|
|
83
|
+
datamaestro_text-2025.7.28.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
84
|
+
datamaestro_text-2025.7.28.dist-info/METADATA,sha256=M0V-4Q2_EBFIRnP0czVXvZC9t_qhhmVRbWSAry31SW4,1848
|
|
85
|
+
datamaestro_text-2025.7.28.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
86
|
+
datamaestro_text-2025.7.28.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
|
|
87
|
+
datamaestro_text-2025.7.28.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
|
|
88
|
+
datamaestro_text-2025.7.28.dist-info/RECORD,,
|
|
File without changes
|
{datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
|
File without changes
|