datamaestro-text 2025.6.11__py3-none-any.whl → 2025.7.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -39,9 +39,9 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
     answering that includes the individual subtasks of question rewriting,
     passage retrieval and reading comprehension
     """
-    return Supervised(
-        train=QReCCDataset(path=data / "qrecc_train.json"),
-        test=QReCCDataset(path=data / "qrecc_test.json"),
+    return Supervised.C(
+        train=QReCCDataset.C(path=data / "qrecc_train.json"),
+        test=QReCCDataset.C(path=data / "qrecc_test.json"),
     )
 
 
@@ -0,0 +1,121 @@
+# See documentation on https://datamaestro.readthedocs.io
+
+import bz2
+from datamaestro.download import reference
+from datamaestro.definitions import datatasks, datatags, dataset
+from datamaestro_text.data.conversation.base import ConversationUserTopics
+from datamaestro_text.data.ir import Adhoc
+
+from datamaestro.utils import HashCheck
+from datamaestro.context import DatafolderPath
+from datamaestro.download.single import filedownloader
+from datamaestro_text.data.conversation.ikat import IkatConversations
+from datamaestro.download.links import linkfolder
+
+from datamaestro_text.data.ir.stores import IKatClueWeb22DocumentStore
+from datamaestro_text.data.ir.trec import TrecAdhocAssessments
+from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
+
+
+@dataset(as_prepare=True)
+def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
+    # Number of documents in the dataset
+    count = 116_838_987
+
+    jsonl_folder = linkfolder(
+        "documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
+    ).setup(dataset, options)
+    store_path = lz4docstore_builder(
+        "store",
+        IKatClueWeb22DocumentStore.generator(
+            jsonl_folder,
+            jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
+            jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
+        ),
+        IKatClueWeb22DocumentStore.Document,
+        "id",
+        count_hint=count,
+    ).setup(dataset, options)
+
+    return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
+
+
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
+    checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
+)
+@dataset(
+    id="2025",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
+)
+def test_2025(topics, documents) -> Adhoc.C:
+    """Question-in-context rewriting
+
+    iKAT is a test dataset for question-in-context rewriting that consists of
+    questions, each given in a dialog context, together with a context-independent
+    rewriting of the question.
+    """
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        # TODO: add when available
+        assessments=TrecAdhocAssessments.C(path="/to/do"),
+        documents=documents,
+    )
+
+
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "qrels",
+    "https://trec.nist.gov/data/ikat/2024-qrels.txt",
+    checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
+)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
+    checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
+)
+@dataset(
+    Adhoc,
+    id="2024",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
+)
+def test_2024(topics, qrels, documents) -> Adhoc.C:
+    """iKAT 2024 dataset"""
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        assessments=TrecAdhocAssessments.C(path=qrels),
+        documents=documents,
+    )
+
+
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "qrels",
+    "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
+    checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
+)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
+    checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
+)
+@dataset(
+    Adhoc,
+    id="2023",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
+)
+def test_2023(topics, qrels, documents) -> Adhoc.C:
+    """iKAT 2023 dataset"""
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        assessments=TrecAdhocAssessments.C(path=qrels),
+        documents=documents,
+    )
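
For orientation, a minimal usage sketch (not part of the diff): loading one of the new iKAT datasets through datamaestro's Python API. The dataset identifier is inferred from the config path (config/com/github/ikat.py) and the @dataset id, and may differ in practice.

    from datamaestro import prepare_dataset

    # Downloads topics/qrels (the ClueWeb22 passages must already be linked
    # in the data folder) before returning an Adhoc configuration
    ikat_2024 = prepare_dataset("com.github.ikat.2024")  # identifier assumed
    for topic in ikat_2024.topics.iter():
        print(topic)
        break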
@@ -26,7 +26,7 @@ def english(dir):
 
     If you use this data, please cite Sentiment140 as your source.
     """
-    return {
-        "train": Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
-        "test": Generic(path=dir / "testdata.manual.2009.06.14.csv"),
-    }
+    return Supervised.C(
+        train=Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
+        test=Generic(path=dir / "testdata.manual.2009.06.14.csv"),
+    )
@@ -17,7 +17,6 @@ of three CD-ROMs of SGML encoded documents distributed by LDC plus queries and a
 See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data/intro_eng.html
 """
 
-from datamaestro.data import Base
 from datamaestro_text.data.ir.trec import TipsterCollection
 from datamaestro.download.links import linkfolder
 from datamaestro.definitions import (
@@ -1,10 +1,13 @@
 from abc import ABC, abstractmethod
 from enum import Enum
+from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
+from experimaestro import Param
 from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
 from attr import define
+from datamaestro.record import record_type
 from datamaestro.data import Base
 from datamaestro.record import Record, Item
-from datamaestro_text.data.ir import TopicRecord
+from datamaestro_text.data.ir import TopicRecord, Topics
 from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView
 
 # ---- Basic types
@@ -120,20 +123,17 @@ class ConversationNode:
         ...
 
     @abstractmethod
-    def parent(self) -> Optional["ConversationNode"]:
-        ...
+    def parent(self) -> Optional["ConversationNode"]: ...
 
     @abstractmethod
-    def children(self) -> List["ConversationNode"]:
-        ...
+    def children(self) -> List["ConversationNode"]: ...
 
 
 class ConversationTree(ABC):
     """Represents a conversation tree"""
 
     @abstractmethod
-    def root(self) -> ConversationNode:
-        ...
+    def root(self) -> ConversationNode: ...
 
     @abstractmethod
     def __iter__(self) -> Iterator[ConversationNode]:
@@ -214,8 +214,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
 
     def __init__(self, entry):
         self.entry = entry
-        self.parent = None
-        self.children = []
+        self._parent = None
+        self._children = []
 
     def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
         self._children.append(node)
@@ -224,10 +224,10 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
 
     def conversation(self, skip_self: bool) -> ConversationHistory:
         def iterator():
-            current = self.parent if skip_self else self
+            current = self.parent() if skip_self else self
             while current is not None:
                 yield current.entry
-                current = current.parent
+                current = current.parent()
 
         return LazyList(FactoryIterable(iterator))
 
@@ -253,5 +253,30 @@ class ConversationDataset(Base, ABC):
     @abstractmethod
     def __iter__(self) -> Iterator[ConversationTree]:
        """Return an iterator over conversations"""
-        for i in range(len(self)):
-            yield self.get(i)
+        ...
+
+
+class ConversationUserTopics(Topics):
+    """Extract user topics from conversations"""
+
+    conversations: Param[ConversationDataset]
+
+    topic_recordtype = record_type(IDItem, SimpleTextItem)
+
+    def iter(self) -> Iterator[TopicRecord]:
+        """Returns an iterator over topics"""
+        # Extract topics from conversations: each user query is a topic (retrieval can be performed on it)
+        # TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
+
+        records: List[TopicRecord] = []
+        for conversation in self.conversations:
+            nodes = [
+                node
+                for node in conversation
+                if node.entry[EntryType] == EntryType.USER_QUERY
+            ]
+            for node in nodes:
+                records.append(
+                    node.entry.update(ConversationHistoryItem(node.history()))
+                )
+        return iter(records)
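
A short sketch of how the new ConversationUserTopics is meant to be used (not from the diff; `conversations` stands for any configured ConversationDataset, and item access via record[...] follows the Record API used elsewhere in this release):

    # Expose every user query of a conversation dataset as a retrievable topic
    topics = ConversationUserTopics.C(conversations=conversations)
    for record in topics.iter():
        # Each record carries the query id and text, plus the conversation
        # history added via ConversationHistoryItem
        print(record[IDItem].id, record[SimpleTextItem].text)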
@@ -0,0 +1,143 @@
+from typing import Iterator, List
+from attr import define, field
+import json
+import logging
+from datamaestro.data import File
+from datamaestro.record import Record
+
+from datamaestro_text.data.ir import Topics
+from datamaestro_text.data.ir.base import (
+    IDItem,
+    SimpleTextItem,
+)
+
+
+from .base import (
+    AnswerEntry,
+    ConversationTree,
+    EntryType,
+    SimpleDecontextualizedItem,
+    SingleConversationTree,
+)
+from . import ConversationDataset
+
+# Keys to change in the dataset entries for compatibility across different years
+
+KEY_MAPPINGS = {
+    # Key to replace -> target key
+    "turns": "responses",
+    "utterance": "user_utterance",
+    "ptkb_provenance": "relevant_ptkbs",
+    "response_provenance": "citations",
+}
+
+
+def norm_dict(entry: dict) -> dict:
+    """Convert keys in the entry to match the expected format."""
+    normalized = {}
+    for k, v in entry.items():
+        # Check for a direct mapping, then try the lowercase mapping
+        new_key = KEY_MAPPINGS.get(k) or KEY_MAPPINGS.get(k.lower()) or k.lower()
+        normalized[new_key] = v
+    return normalized
+
+
+@define(kw_only=True)
+class IkatConversationEntry:
+    """A query with past history"""
+
+    turn_id: int
+    """Turn number in the conversation"""
+
+    user_utterance: str
+    """The last issued query"""
+
+    resolved_utterance: str
+    """Manually rewritten query"""
+
+    response: str
+    """The system response to the query"""
+
+    relevant_ptkbs: List[str]
+    """The list of relevant personal knowledge bases for the query"""
+
+    citations: List[str]
+    """The list of citations for the response"""
+
+
+@define(kw_only=True)
+class IkatConversationTopic:
+    """A conversation topic with its turns"""
+
+    number: str
+    """Conversation ID"""
+
+    title: str
+    """Title of the conversation"""
+
+    ptkb: str
+    """The personal knowledge base associated with the user"""
+
+    responses: List[IkatConversationEntry] = field(
+        converter=lambda items: [
+            IkatConversationEntry(**item) if isinstance(item, dict) else item
+            for item in map(norm_dict, items)
+        ]
+    )
+    """The list of responses to the query"""
+
+
+class IkatConversations(ConversationDataset, File):
+    """A dataset containing conversations from the iKAT project"""
+
+    def entries(self) -> Iterator[IkatConversationTopic]:
+        """Reads all conversation entries from the dataset file."""
+        with self.path.open("rt") as fp:
+            raw_data = json.load(fp)
+
+        logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
+        logging.debug("Raw data has keys %s", raw_data[0].keys())
+
+        for entry in raw_data:
+            try:
+                normalized_entry = norm_dict(entry)
+                yield IkatConversationTopic(**normalized_entry)
+            except Exception as e:
+                logging.warning("Failed to parse entry: %s", e)
+                raise
+
+    def __iter__(self) -> Iterator[ConversationTree]:
+        for entry in self.entries():
+            history: List[Record] = []
+
+            for turn in entry.responses:
+                turn: IkatConversationEntry = turn  # For type checkers
+                query_id = f"{entry.number}_{turn.turn_id}"
+
+                # USER QUERY record
+                history.append(
+                    Record(
+                        IDItem(query_id),
+                        SimpleTextItem(turn.user_utterance),
+                        SimpleDecontextualizedItem(turn.resolved_utterance),
+                        EntryType.USER_QUERY,
+                    )
+                )
+
+                # Build citation info (stubbed relevance to match format)
+                relevances = {}
+                if turn.relevant_ptkbs:
+                    # Example: just use the first as relevant (can be improved)
+                    relevances[0] = (0, None)  # No position info in this structure
+
+                # SYSTEM ANSWER record
+                history.append(
+                    Record(
+                        AnswerEntry(turn.response),
+                        EntryType.SYSTEM_ANSWER,
+                    )
+                )
+
+            # Reverse the history (most recent entry first)
+            history.reverse()
+            yield SingleConversationTree(entry.number, history)
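
To make the key normalization concrete, a small worked example (entry values are made up; the mappings come from KEY_MAPPINGS above):

    # Top level: a year-specific "Turns" key becomes "responses"
    entry = {"number": "9-1", "title": "Diet", "ptkb": "I am vegetarian", "Turns": []}
    assert norm_dict(entry) == {
        "number": "9-1",
        "title": "Diet",
        "ptkb": "I am vegetarian",
        "responses": [],
    }

    # Per turn: "utterance" and "ptkb_provenance" are renamed as well
    assert norm_dict({"utterance": "Hi", "ptkb_provenance": []}) == {
        "user_utterance": "Hi",
        "relevant_ptkbs": [],
    }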
@@ -2,9 +2,10 @@
 
 from abc import ABC, abstractmethod
 from functools import cached_property
+import logging
 from pathlib import Path
 from attrs import define
-from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type
+from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING
 import random
 from experimaestro import Config
 from datamaestro.definitions import datatasks, Param, Meta
@@ -28,6 +29,9 @@ from .base import ( # noqa: F401
     AdhocAssessedTopic,
 )
 
+#: An adhoc run dictionary (query id -> doc id -> score)
+AdhocRunDict = dict[str, dict[str, float]]
+
 
 class Documents(Base):
     """A set of documents with identifiers
@@ -45,6 +49,22 @@ class Documents(Base):
     def iter_documents(self) -> Iterator[DocumentRecord]:
         return self.iter()
 
+    def iter_documents_from(self, start=0) -> Iterator[DocumentRecord]:
+        """Iterate over documents, starting at the given index
+
+        Can be specialized in a subclass for faster access
+
+        :param start: The starting document (0-based index), defaults to 0
+        :return: An iterator
+        """
+        iterator = self.iter()
+        if start > 0:
+            logging.info("skipping %d documents", start)
+            for _ in range(start):
+                next(iterator)
+
+        return iterator
+
     def iter_ids(self) -> Iterator[str]:
         """Iterates over document ids
 
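
A small sketch of the new resume capability (`documents` stands for any Documents instance):

    # Skip the first 1,000 documents, e.g. to resume an interrupted indexing job
    it = documents.iter_documents_from(start=1_000)
    record = next(it)  # the document at 0-based index 1000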
@@ -168,7 +188,10 @@ class AdhocAssessments(Base, ABC):
 class AdhocRun(Base):
     """IR adhoc run"""
 
-    pass
+    @abstractmethod
+    def get_dict(self) -> "AdhocRunDict":
+        """Get the run as a dictionary query ID -> doc ID -> score"""
+        ...
 
 
 class AdhocResults(Base):
@@ -43,6 +43,7 @@ class IDItem(Item, ABC):
 
     id: str
 
+
 @define
 class UrlItem(Item):
     """An url item"""
@@ -70,7 +71,7 @@ class AdhocAssessedTopic:
     """List of assessments for this topic"""
 
 
-def create_record(*items: Item, id: str = None, text: str = None):
+def create_record(*items: Item, id: str = None, text: str = None) -> Record:
     """Easy creation of a text/id item"""
     extra_items = []
     if id is not None:
@@ -99,7 +99,7 @@ class WapoDocument(TextItem):
     body_media: Tuple[WapoDocMedia, ...]
 
     @cached_property
-    def text(self):
+    def text(self):
         return f"{self.title} {self.body_paras_html}"
 
 
@@ -132,11 +132,15 @@ class DprW100Doc(TextItem):
     text: str
     title: str
 
-    @define
+
+@define
 class MsMarcoV2Passage(TextItem):
     text: str
     spans: Tuple[Tuple[int, int], ...]
     msmarco_document_id: str
+
+
+@define
 class Touche2020(TextItem):
     text: str
     title: str
@@ -204,11 +208,13 @@ class TrecTopic(SimpleTextItem):
 
 TrecTopicRecord = record_type(IDItem, TrecTopic)
 
+
 @define
 class DprW100Query(TextItem):
     text: str
     answers: Tuple[str]
 
+
 @define
 class TrecBackgroundLinkingQuery(IDItem):
     query_id: str
@@ -1,12 +1,21 @@
-from collections import namedtuple
+import bz2
+from hashlib import md5, sha256
+import json
+import logging
+from pathlib import Path
 from typing import List, NamedTuple
+from datamaestro_text.utils.files import TQDMFileReader
 from experimaestro import Constant
-import attrs
-
 from datamaestro.record import Record
-from datamaestro_text.data.ir.base import IDItem
+from datamaestro_text.data.ir.base import (
+    DocumentRecord,
+    IDItem,
+    SimpleTextItem,
+    UrlItem,
+)
 from datamaestro_text.datasets.irds.data import LZ4DocumentStore
 from datamaestro_text.data.ir.formats import OrConvQADocument
+from tqdm import tqdm
 
 
 class OrConvQADocumentStore(LZ4DocumentStore):
@@ -27,3 +36,89 @@ class OrConvQADocumentStore(LZ4DocumentStore):
         fields = data._asdict()
         del fields["id"]
         return Record(OrConvQADocument(**fields), IDItem(data.id))
+
+
+class IKatClueWeb22DocumentStore(LZ4DocumentStore):
+    @staticmethod
+    def generator(path: Path, checksums_file: Path, passages_hashes: Path):
+        """Returns an iterator over iKAT 2022-25 documents
+
+        :param path: The folder containing the files
+        """
+
+        def __iter__():
+            errors = False
+
+            assert checksums_file.is_file(), f"{checksums_file} does not exist"
+            assert passages_hashes.is_file(), f"{passages_hashes} does not exist"
+
+            # Get the list of files
+            with checksums_file.open("rt") as fp:
+                files = []
+                for line in fp:
+                    checksum, filename = line.strip().split()
+                    files.append((checksum, filename))
+                    if not (path / filename).is_file():
+                        logging.error("File %s does not exist", path / filename)
+                        errors = True
+
+            assert not errors, "Errors detected, stopping"
+
+            # Check the SHA256 sums
+            match checksums_file.suffix:
+                case ".sha256sums":
+                    hasher_factory = sha256
+                case _:
+                    raise NotImplementedError(
+                        f"Cannot handle {checksums_file.suffix} checksum files"
+                    )
+
+            for checksum, filename in tqdm(files):
+                logging.info("Checking %s", filename)
+                hasher = hasher_factory()
+                with (path / filename).open("rb") as fp:
+                    while data := fp.read(2**20):
+                        hasher.update(data)
+
+                file_checksum = hasher.hexdigest()
+                assert file_checksum == checksum, (
+                    f"Expected {checksum}, got {file_checksum} for {filename}"
+                )
+
+            # Get the MD5 hashes of all the passages
+            logging.info("Reading the hashes of all passages")
+            with TQDMFileReader(passages_hashes, "rt", bz2.open) as fp:
+                passage_checksums = {}
+                for line in tqdm(fp):
+                    doc_id, passage_no, checksum = line.strip().split()
+                    passage_checksums[f"{doc_id}:{passage_no}"] = checksum  # noqa: E231
+
+            # Read the files
+            logging.info("Starting to read the files")
+            for _, filename in tqdm(files):
+                with TQDMFileReader(path / filename, "rt", bz2.open) as jsonl_fp:
+                    for line in jsonl_fp:
+                        data = json.loads(line)
+                        expected = passage_checksums[data["id"]]
+                        computed = md5(data["contents"].encode("utf-8")).hexdigest()
+                        assert expected == computed, (
+                            f"Expected {expected}, "
+                            f"got {computed} for passage {data['id']} in {filename}"
+                        )
+                        yield IKatClueWeb22DocumentStore.Document(**data)
+
+        return __iter__
+
+    class Document(NamedTuple):
+        id: str
+        contents: str
+        url: str
+
+    data_cls = Document
+    lookup_field: Constant[str] = "id"
+    index_fields: Constant[List[str]] = ["id"]
+
+    def converter(self, data):
+        return DocumentRecord(
+            IDItem(data.id), SimpleTextItem(data.contents), UrlItem(data.url)
+        )
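
Once built (see the clueweb22 dataset definition earlier in this diff), the store yields DocumentRecord objects; a minimal sketch, with `store` standing for a prepared IKatClueWeb22DocumentStore:

    for record in store.iter_documents():
        # Each record combines an id, the passage text and its URL
        print(record[IDItem].id, record[UrlItem].url)
        break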
@@ -1,9 +1,9 @@
+import re
 from typing import Dict, List, Optional
-from datamaestro.data import Base
 from experimaestro import documentation, Param, Meta
 from pathlib import Path
-from datamaestro.record import Record
 from datamaestro_text.data.ir import (
+    AdhocRunDict,
     Documents,
     Topics,
     AdhocAssessments,
@@ -47,6 +47,11 @@ class TrecAdhocAssessments(AdhocAssessments):
 class TrecAdhocRun(AdhocRun):
     path: Param[Path]
 
+    def get_dict(self) -> AdhocRunDict:
+        import datamaestro_text.interfaces.trec as trec
+
+        return trec.parse_run(self.path)
+
 
 class TrecAdhocResults(AdhocResults):
     """Adhoc results (TREC format)"""
@@ -62,8 +67,6 @@ class TrecAdhocResults(AdhocResults):
 
     def get_results(self) -> Dict[str, float]:
         """Returns the results as a dictionary {metric_name: value}"""
-        import re
-
         re_spaces = re.compile(r"\s+")
 
         results = {}
@@ -122,7 +122,14 @@ class Documents(ir.DocumentStore, IRDSId):
         formats.Touche2020, "doc_id", "text", "title", "stance", "url"
     ),
     _irds.beir.BeirSciDoc: tuple_constructor(
-        formats.SciDocs, "doc_id", "text", "title", "authors", "year", "cited_by", "references"
+        formats.SciDocs,
+        "doc_id",
+        "text",
+        "title",
+        "authors",
+        "year",
+        "cited_by",
+        "references",
     ),
     _irds.msmarco_document.MsMarcoDocument: tuple_constructor(
         formats.MsMarcoDocument, "doc_id", "url", "title", "body"
@@ -198,6 +205,10 @@ class Documents(ir.DocumentStore, IRDSId):
         for doc in self.dataset.docs_iter():
             yield self.converter(self.document_recordtype, doc)
 
+    def iter_documents_from(self, start=0):
+        for doc in self.dataset.docs_iter()[start:]:
+            yield self.converter(self.document_recordtype, doc)
+
     @property
     def documentcount(self):
         return self.dataset.docs_count()
@@ -244,7 +255,7 @@ if hasattr(_irds, "miracl"):
     )
 
 
-class LZ4DocumentStore(ir.DocumentStore):
+class LZ4DocumentStore(ir.DocumentStore, ABC):
     """A LZ4-based document store"""
 
     path: Param[Path]
@@ -253,7 +264,7 @@ class LZ4DocumentStore(ir.DocumentStore):
     lookup_field: Param[str]
 
     # Extra indexed fields (e.g. URLs)
-    index_fields: List[str]
+    index_fields: List[str] = []
 
     @cached_property
     def store(self):
@@ -285,6 +296,9 @@ class LZ4DocumentStore(ir.DocumentStore):
         """Returns an iterator over documents"""
         return map(self.converter, self.store.__iter__())
 
+    def iter_documents_from(self, start=0):
+        return map(self.converter, self.store.__iter__()[start:])
+
     @cached_property
     def documentcount(self):
         if self.count:
@@ -386,7 +400,13 @@ class Topics(ir.TopicsStore, IRDSId):
         formats.TrecTopic, "query_id", "text", "description", "narrative"
     ),
     _irds.beir.BeirSciQuery: tuple_constructor(
-        formats.SciDocsTopic, "query_id", "text", "authors", "year", "cited_by", "references"
+        formats.SciDocsTopic,
+        "query_id",
+        "text",
+        "authors",
+        "year",
+        "cited_by",
+        "references",
     ),
     _irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
         formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
@@ -400,10 +420,7 @@ class Topics(ir.TopicsStore, IRDSId):
         "description",
     ),
     _irds.dpr_w100.DprW100Query: tuple_constructor(
-        formats.DprW100Query,
-        "query_id",
-        "text",
-        "answers"
+        formats.DprW100Query, "query_id", "text", "answers"
     ),
 }
@@ -435,11 +452,12 @@ class Topics(ir.TopicsStore, IRDSId):
     def iter(self) -> Iterator[TopicRecord]:
         """Returns an iterator over topics"""
         return self.handler.iter()
-
+
+
 class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
     def __init__(self, dataset):
         self.dataset = dataset
-
+
     @cached_property
     def ext2records(self):
         return {record[IDItem].id: record for record in self.records}
@@ -462,10 +480,12 @@ class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
         records = []
 
         for query in self.dataset.dataset.queries_iter():
-            topic = Record(
+            topic = Record(
                 IDItem(query.query_id),
                 # Following BEIR documentation, we use title of documents as queries: https://github.com/beir-cellar/beir/blob/main/examples/dataset/README.md#queries-and-qrels
-                SimpleTextItem(self.dataset.dataset.docs_store().get(query.doc_id).title),
+                SimpleTextItem(
+                    self.dataset.dataset.docs_store().get(query.doc_id).title
+                ),
                 UrlItem(query.url),
             )
             records.append(topic)
@@ -477,11 +497,10 @@ class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
 
 
 Topics.HANDLERS.update(
-    {
-        _irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler
-    }
+    {_irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler}
 )
 
+
 class CastTopicsHandler(TopicsHandler):
     def __init__(self, dataset):
         self.dataset = dataset
@@ -1,7 +1,7 @@
-from attrs import define
 from pathlib import Path
 from typing import Iterator, Optional
 import re
+from datamaestro_text.data.ir import AdhocRunDict
 from datamaestro_text.data.ir.base import (
     AdhocAssessedTopic,
     TopicRecord,
@@ -10,6 +10,32 @@ from datamaestro_text.data.ir.base import (
 )
 from datamaestro_text.data.ir.formats import TrecTopicRecord, TrecTopic
 
+# --- Runs
+
+
+def parse_run(path: Path) -> AdhocRunDict:
+    """Parse a run file in TREC format"""
+    results = {}
+    with path.open("rt") as f:
+        for line in f:
+            query_id, _q0, doc_id, _rank, score, _model_id = re.split(
+                r"\s+", line.strip()
+            )
+            results.setdefault(query_id, {})[doc_id] = float(score)
+
+    return results
+
+
+def write_run_dict(run: AdhocRunDict, run_path: Path):
+    """Write a run dictionary in TREC format"""
+    with run_path.open("wt") as f:
+        for query_id, scored_documents in run.items():
+            # Documents are ranked by decreasing score
+            ranked = sorted(scored_documents.items(), key=lambda x: x[1], reverse=True)
+            for ix, (doc_id, score) in enumerate(ranked):
+                f.write(f"{query_id} Q0 {doc_id} {ix + 1} {score} run\n")
+
+
 # --- Assessments
 
 
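A round-trip sketch for the two run helpers above (paths and scores are illustrative):

    from pathlib import Path

    run = {"q1": {"d1": 1.5, "d2": 0.3}, "q2": {"d9": 2.0}}
    write_run_dict(run, Path("/tmp/example.run"))
    # /tmp/example.run now contains, e.g.:
    #   q1 Q0 d1 1 1.5 run
    #   q1 Q0 d2 2 0.3 run
    #   q2 Q0 d9 1 2.0 run
    assert parse_run(Path("/tmp/example.run")) == run
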
@@ -1,3 +1,5 @@
+import os
+from tqdm import tqdm
 import gzip
 from pathlib import Path
 
@@ -6,3 +8,104 @@ def auto_open(path: Path, mode: str):
     if path.suffix == ".gz":
         return gzip.open(path, mode)
     return path.open(mode)
+
+
+class CountingWrapper:
+    """Wrap a file object to count the actual compressed bytes read."""
+
+    def __init__(self, file_obj):
+        self.file_obj = file_obj
+        self.bytes_read = 0
+
+    def read(self, size=-1):
+        data = self.file_obj.read(size)
+        self.bytes_read += len(data)
+        return data
+
+    def readline(self, size=-1):
+        data = self.file_obj.readline(size)
+        self.bytes_read += len(data)
+        return data
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        line = self.readline()
+        if not line:
+            raise StopIteration
+        return line
+
+    def close(self):
+        self.file_obj.close()
+
+    def __getattr__(self, attr):
+        return getattr(self.file_obj, attr)
+
+
+class TQDMBytesReader:
+    def __init__(self, file_obj, total_size, **tqdm_kwargs):
+        self.file_obj = CountingWrapper(file_obj)
+        self.tqdm = tqdm(
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            **tqdm_kwargs,
+        )
+
+    def _update_progress(self):
+        delta = self.file_obj.bytes_read - self.tqdm.n
+        if delta > 0:
+            self.tqdm.update(delta)
+
+    def read(self, size=-1):
+        data = self.file_obj.read(size)
+        self._update_progress()
+        return data
+
+    def readline(self, size=-1):
+        line = self.file_obj.readline(size)
+        self._update_progress()
+        return line
+
+    def readlines(self, hint=-1):
+        lines = self.file_obj.readlines(hint)
+        self._update_progress()
+        return lines
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        line = self.readline()
+        if not line:
+            raise StopIteration
+        return line
+
+    def close(self):
+        self.tqdm.close()
+        self.file_obj.close()
+
+    def __getattr__(self, attr):
+        # Delegate any other attribute to the underlying file object
+        return getattr(self.file_obj, attr)
+
+
+class TQDMFileReader:
+    def __init__(self, filepath, mode="rt", file_opener=open, **tqdm_kwargs):
+        self.filepath = filepath
+        self.mode = mode
+        self.file_opener = file_opener
+        self.tqdm_kwargs = tqdm_kwargs
+
+    def __enter__(self):
+        self.file_obj = self.file_opener(self.filepath, self.mode)
+        total_size = os.path.getsize(self.filepath)  # this is compressed size
+        self.reader = TQDMBytesReader(
+            self.file_obj, total_size=total_size, **self.tqdm_kwargs
+        )
+        return self.reader
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.reader.close()
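
A minimal sketch of the new TQDMFileReader, mirroring how the iKAT store generator uses it for bz2-compressed JSONL files ("passages.jsonl.bz2" is an illustrative path):

    import bz2
    from datamaestro_text.utils.files import TQDMFileReader

    # The progress bar tracks bytes read against the on-disk file size
    with TQDMFileReader("passages.jsonl.bz2", "rt", bz2.open, desc="passages") as fp:
        for line in fp:
            ...  # process one JSON line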
@@ -82,3 +82,8 @@ class LazyList(Sequence):
         # Convert the iterable to a list if it hasn't been already
         if self.materialized_list is None:
             self.materialized_list = list(self.iterable)
+
+    def reverse(self):
+        """Reverse the list in place, materializing it if necessary"""
+        self._materialize()
+        self.materialized_list.reverse()
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '2025.6.11'
-__version_tuple__ = version_tuple = (2025, 6, 11)
+__version__ = version = '2025.7.28'
+__version_tuple__ = version_tuple = (2025, 7, 28)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datamaestro-text
-Version: 2025.6.11
+Version: 2025.7.28
 Summary: Datamaestro module for text-related datasets
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
 License: GPL-3
@@ -15,10 +15,10 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Requires-Python: >=3.8
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: datamaestro>=1.4.2
+Requires-Dist: datamaestro>=1.5.0
 Requires-Dist: ir_datasets>=0.5.8
 Requires-Dist: attrs
 Provides-Extra: dev
@@ -1,12 +1,13 @@
 datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
-datamaestro_text/version.py,sha256=M2FaCl2nFdTTxwIhOsBo1SQ_3ytid7NHYp2QLIimPXY,519
+datamaestro_text/version.py,sha256=rJQHFC3G5XDG0rUPZ6r1msOA_XCbSY-qMukJgu2nA1M,519
 datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
 datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
-datamaestro_text/config/com/sentiment140.py,sha256=3cZfqs395gY14gGojRC_RTYEdG5l8RUTSdF9li8ynBc,1283
+datamaestro_text/config/com/sentiment140.py,sha256=WKKLaD7psbj9fIaTBHDTzOZanO2mukaB1g7aeTN1jdU,1289
 datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
+datamaestro_text/config/com/github/ikat.py,sha256=DCayX-t2OBeW5bOJvRxoQgIH3vy-__mYzdmVcnayAkk,4230
 datamaestro_text/config/com/github/aagohary/canard.py,sha256=DREIKU2FFrfwibjU48Y1AsKTUzhwxmD4Kuykd4bDIus,1762
-datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=-iHKJocWZ8N9N-P8E45y4ewg3OT_23XonlDh5-NcH2g,3055
+datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=oYI0SUxEYzGoL2IbRrnze2cQuWwENwNk4ID9NQuI2Tw,3061
 datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=MFJYaxJoqJQ6kMDHa2PIE0zoxYTA8Kyl26-vzFoMML0,3032
 datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
 datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
@@ -28,7 +29,7 @@ datamaestro_text/config/gov/nist/trec/adhoc.py,sha256=vcFaLlZ-chwDt013MYI8bYZ2ug
 datamaestro_text/config/gov/nist/trec/clueweb.yaml,sha256=sm4UbdtMzWoDVPsewtVDS7Vj2jBOdgp18Xqo1X4ysQc,792
 datamaestro_text/config/gov/nist/trec/deeplearning.yaml,sha256=QGM7PtXLJRttNdOPE16o7-k3e5tA9HgcaM_-qFDV_5Q,2125
 datamaestro_text/config/gov/nist/trec/index.yaml,sha256=oSUhUmtukf5oXqUbJLhae8xZx7Uac5V6uZoUB-RJ7Sw,2711
-datamaestro_text/config/gov/nist/trec/tipster.py,sha256=t0w9lOBfvwt6YCYCyDj9fo1QiBXfs0q8qzKl74f40jU,5398
+datamaestro_text/config/gov/nist/trec/tipster.py,sha256=DirpnHpS10e27LcL7v9ksKreKVy7EgfVhyztV49VRds,5364
 datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJolWCts_VETACha3hfQ,563
 datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
 datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=LJYexk2ssZ5SqKIKLjPdybS9ju4g9fGIFy5eybtsvYQ,1224
@@ -46,41 +47,42 @@ datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG
 datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
 datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
 datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
-datamaestro_text/data/conversation/base.py,sha256=PUVRCSMBlV9bSayBl-vnzsYvyr2Tdv_zTadIC_Tswe0,6508
+datamaestro_text/data/conversation/base.py,sha256=BAqu5YOjlRcBOSiyvrjiTIuuc1OalM0BDtcEvJYhO2Y,7515
 datamaestro_text/data/conversation/canard.py,sha256=FI02rPNuzSsA_uXOcgcuQn31d3QahmHndzYCV1Zm8rk,3305
+datamaestro_text/data/conversation/ikat.py,sha256=djxTv0vHLXIUPzfmwpB25fyUWJNVPqthTtDbQ6CzmQo,4363
 datamaestro_text/data/conversation/orconvqa.py,sha256=RL9cpK7QK2dsk_j2e4Th8wzGFq-G3ww_EjdM1eKh-8o,3764
 datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
-datamaestro_text/data/ir/__init__.py,sha256=EOVnRv9oKEEucMOgrFjhXxeUWEkaJkbXBndpkKhZaiY,8701
-datamaestro_text/data/ir/base.py,sha256=TSaY8UWoixqvKg1z5JfkXPnuZiVoChYEUvc7jjbpZqY,1495
+datamaestro_text/data/ir/__init__.py,sha256=jHoyIWyl0beUbg1gmkwNFf1cQRawB8p3SGfa17gniGM,9442
+datamaestro_text/data/ir/base.py,sha256=uwIiKn0ryK5DWUQsEegeTs6bye0uAOGl0XVm_cvV3ZU,1506
 datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
 datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
 datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
-datamaestro_text/data/ir/formats.py,sha256=K0mqmCGg0seneKo-Rt3cBfDVjEMS4_6t1MfNj4iW9Y4,3637
+datamaestro_text/data/ir/formats.py,sha256=Tbu5rrssq9m76a448-ixgs7a-KKvNCvXYkdv5FkFqp4,3641
 datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
-datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
-datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
+datamaestro_text/data/ir/stores.py,sha256=A4Ew0L4P6iLLmDKhxqjjVkHcz797BHf4d76YguGkB1A,4412
+datamaestro_text/data/ir/trec.py,sha256=IOtQRMUz8zx-dYEMR2NIIM6qXEUjsV0eVOhGvKIRJK4,1974
 datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
 datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
-datamaestro_text/datasets/irds/data.py,sha256=0V5nPMQeh1I3Sp36x_NE7xrbC_vbZAKhU5NONj95aok,22058
+datamaestro_text/datasets/irds/data.py,sha256=eUehp_80H1yyh7CVkM7mOWJtB9XHlmI-A7fLewXuaDE,22487
 datamaestro_text/datasets/irds/datasets.py,sha256=n5mvD2omVI4PkeuJgX13Cio6gLrgYjxq8ZUKKyqocEs,5625
 datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
 datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
 datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
 datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
-datamaestro_text/interfaces/trec.py,sha256=g5UIjOvhMBaib9mm280dkQLdtLtuId8bjfptaVi5Pew,2709
+datamaestro_text/interfaces/trec.py,sha256=GrP0N_Hcj5f73KS6CSzkyN4aaI-XoBJ19oVMpHVR3QM,3579
 datamaestro_text/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/test/test_datasets.py,sha256=hD1pe-CjEnOj0JNqDLasz_97ltFzMbYfyYBy1QyYdf8,202
 datamaestro_text/test/test_documented.py,sha256=Kl90fz_r_dsV0oXE1Mad34GiQyZ9dc6bfZKNtxP0O2s,453
 datamaestro_text/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/transforms/ir/__init__.py,sha256=eWxr0ZtfK9F9J4TachW5V8Ra5AhQJueDyOGR8vXi1uA,6553
 datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_p5ha0,3340
-datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1vRQ,170
-datamaestro_text/utils/iter.py,sha256=uzBmavBeqVyEvYmi9Ds2dqh0ywhJE-1SBO5v8AoIKcc,2537
+datamaestro_text/utils/files.py,sha256=jhcirufgTztEkx1hs2-qgScEcHnIcGF_BjCeuCSsNv0,2838
+datamaestro_text/utils/iter.py,sha256=QBajeSPLHvkeh6BCTZDSqWlOYNjwUDvgTTZ_YxJntXw,2701
 datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
 datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
-datamaestro_text-2025.6.11.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-datamaestro_text-2025.6.11.dist-info/METADATA,sha256=oMXEU9_JPRCrUCoSE6IGmmyLzfdpVQgKHAaaEbFXMXY,1847
-datamaestro_text-2025.6.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datamaestro_text-2025.6.11.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
-datamaestro_text-2025.6.11.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
-datamaestro_text-2025.6.11.dist-info/RECORD,,
+datamaestro_text-2025.7.28.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+datamaestro_text-2025.7.28.dist-info/METADATA,sha256=M0V-4Q2_EBFIRnP0czVXvZC9t_qhhmVRbWSAry31SW4,1848
+datamaestro_text-2025.7.28.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datamaestro_text-2025.7.28.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
+datamaestro_text-2025.7.28.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
+datamaestro_text-2025.7.28.dist-info/RECORD,,