datamaestro-text 2025.6.30__py3-none-any.whl → 2025.9.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. datamaestro_text/config/com/github/aagohary/canard.py +3 -3
  2. datamaestro_text/config/com/github/apple/ml-qrecc.py +2 -1
  3. datamaestro_text/config/com/github/ikat.py +102 -19
  4. datamaestro_text/config/com/microsoft/msmarco/passage.py +1 -1
  5. datamaestro_text/config/com/sentiment140.py +4 -4
  6. datamaestro_text/config/edu/stanford/aclimdb.py +2 -2
  7. datamaestro_text/config/gov/nist/trec/tipster.py +0 -1
  8. datamaestro_text/config/io/github/thunlp/fewrel.py +1 -1
  9. datamaestro_text/config/io/metamind/research/wikitext.py +3 -3
  10. datamaestro_text/config/org/grouplens/movielens.py +8 -8
  11. datamaestro_text/config/org/universaldependencies/french.py +3 -3
  12. datamaestro_text/data/conversation/base.py +34 -9
  13. datamaestro_text/data/conversation/ikat.py +38 -13
  14. datamaestro_text/data/ir/__init__.py +44 -4
  15. datamaestro_text/data/ir/base.py +2 -1
  16. datamaestro_text/data/ir/formats.py +8 -2
  17. datamaestro_text/data/ir/stores.py +99 -4
  18. datamaestro_text/data/ir/trec.py +7 -4
  19. datamaestro_text/datasets/irds/data.py +47 -16
  20. datamaestro_text/interfaces/trec.py +28 -1
  21. datamaestro_text/utils/files.py +103 -0
  22. datamaestro_text/utils/iter.py +5 -0
  23. datamaestro_text/version.py +16 -3
  24. {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/METADATA +3 -3
  25. {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/RECORD +29 -29
  26. {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/WHEEL +0 -0
  27. {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/entry_points.txt +0 -0
  28. {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/licenses/LICENSE +0 -0
  29. {datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/top_level.txt +0 -0
datamaestro_text/config/com/github/aagohary/canard.py
@@ -37,7 +37,7 @@ def main(train, dev, test):
     Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
     """
     return {
-        "train": CanardDataset(path=train),
-        "validation": CanardDataset(path=dev),
-        "test": CanardDataset(path=test),
+        "train": CanardDataset.C(path=train),
+        "validation": CanardDataset.C(path=dev),
+        "test": CanardDataset.C(path=test),
     }
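
Most of the configuration changes in this release are one mechanical migration: datasets are now built through the `.C(...)` constructor instead of calling the class directly. A minimal standalone sketch of the pattern (the `Dataset` class below is a stand-in, not the experimaestro API):

    # Stand-in illustrating the X(...) -> X.C(...) migration seen throughout this diff
    class Dataset:
        def __init__(self, path: str):
            self.path = path

        @classmethod
        def C(cls, **kwargs) -> "Dataset":
            # In experimaestro, .C(...) builds a configuration object;
            # here it simply forwards to the constructor.
            return cls(**kwargs)

    train = Dataset.C(path="train.json")  # new style
    # train = Dataset(path="train.json")  # old style, removed in 2025.9.11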
datamaestro_text/config/com/github/apple/ml-qrecc.py
@@ -51,6 +51,7 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
     )
 class Content(LZ4JSONLDocumentStore):
     """QReCC mentionned URLs content"""
+
     @staticmethod
     def __create_dataset__(dataset, options=None):
         ds = reference(reference=main).setup(dataset, options)
@@ -65,7 +66,7 @@ class Content(LZ4JSONLDocumentStore):
             "id",
         ).setup(dataset, options)

-        return Content(jsonl_path=store_path)
+        return Content.C(jsonl_path=store_path)

     @staticmethod
     def _documents(path: Path):
datamaestro_text/config/com/github/ikat.py
@@ -1,38 +1,121 @@
 # See documentation on https://datamaestro.readthedocs.io

+import bz2
+from datamaestro.download import reference
 from datamaestro.definitions import datatasks, datatags, dataset
-from datamaestro.data.ml import Supervised
-from datamaestro.data import Base
+from datamaestro_text.data.conversation.base import ConversationUserTopics
+from datamaestro_text.data.ir import Adhoc

 from datamaestro.utils import HashCheck
+from datamaestro.context import DatafolderPath
 from datamaestro.download.single import filedownloader
-from datamaestro_text.data.conversation.ikat import IkatDatasetEntry, IkatDataset
-from datamaestro_text.datasets.irds.data import (
-    SimpleJsonDocument,
-    LZ4JSONLDocumentStore,
-)
-import logging
+from datamaestro_text.data.conversation.ikat import IkatConversations
+from datamaestro.download.links import linkfolder
+
+from datamaestro_text.data.ir.stores import IKatClueWeb22DocumentStore
+from datamaestro_text.data.ir.trec import TrecAdhocAssessments
+from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
+
+
+@dataset(as_prepare=True)
+def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
+    # Number of documents in the dataset
+    count = 116_838_987
+
+    jsonl_folder = linkfolder(
+        "documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
+    ).setup(dataset, options)
+    store_path = lz4docstore_builder(
+        "store",
+        IKatClueWeb22DocumentStore.generator(
+            jsonl_folder,
+            jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
+            jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
+        ),
+        IKatClueWeb22DocumentStore.Document,
+        "id",
+        count_hint=count,
+    ).setup(dataset, options)
+
+    return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
+

 @datatags("conversation", "context", "query")
-@datatasks("query rewriting")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
 @filedownloader(
-    "test.json",
+    "topics.json",
     "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
     checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
 )
-
 @dataset(
-    Base,
+    id="2025",
     url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
 )
-
-def main(test) -> Supervised[IkatDataset, None, IkatDataset]:
+def test_2025(topics, documents) -> Adhoc.C:
     """Question-in-context rewriting

-    iKAT is a test dataset for question-in-context rewriting that consists of
+    iKAT is a test dataset for question-in-context rewriting that consists of
     questions each given in a dialog context together with a context-independent
-    rewriting of the question.
-    One of the special features of iKAT is that it includes a Personal PKTB',
+    rewriting of the question.
     """
-    logging.info("Creating iKAT dataset from %s", test)
-    return IkatDataset.C(path=test)
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        # TODO: add when available
+        assessments=TrecAdhocAssessments.C(path="/to/do"),
+        documents=documents,
+    )
+
+
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "qrels",
+    "https://trec.nist.gov/data/ikat/2024-qrels.txt",
+    checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
+)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
+    checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
+)
+@dataset(
+    Adhoc,
+    id="2024",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
+)
+def test_2024(topics, qrels, documents) -> Adhoc.C:
+    """iKAT 2024 dataset"""
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        assessments=TrecAdhocAssessments.C(path=qrels),
+        documents=documents,
+    )
+
+
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "qrels",
+    "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
+    checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
+)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
+    checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
+)
+@dataset(
+    Adhoc,
+    id="2023",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
+)
+def test_2023(topics, qrels, documents) -> Adhoc.C:
+    """iKAT 2023 dataset"""
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        assessments=TrecAdhocAssessments.C(path=qrels),
+        documents=documents,
+    )
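
The iKAT configuration now exposes three Adhoc datasets (2023, 2024, 2025) sharing one ClueWeb22 passage store. Assuming the usual datamaestro convention of deriving dataset ids from the config path plus the `id` argument (unverified here), preparing one would look roughly like:

    from datamaestro import prepare_dataset

    # Hypothetical id, derived from config/com/github/ikat.py and id="2024"
    ikat = prepare_dataset("com.github.ikat.2024")

    # An Adhoc dataset bundles topics, assessments and a document store
    for topic in ikat.topics.iter():
        ...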
datamaestro_text/config/com/microsoft/msmarco/passage.py
@@ -47,7 +47,7 @@ http://www.msmarco.org/dataset.aspx""",
 @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
 def collection_etc(data) -> Folder:
     """Documents and some more files"""
-    return Folder(path=data)
+    return Folder.C(path=data)


 @lua
datamaestro_text/config/com/sentiment140.py
@@ -26,7 +26,7 @@ def english(dir):

     If you use this data, please cite Sentiment140 as your source.
     """
-    return {
-        "train": Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
-        "test": Generic(path=dir / "testdata.manual.2009.06.14.csv"),
-    }
+    return Supervised.C(
+        train=Generic.C(path=dir / "training.1600000.processed.noemoticon.csv"),
+        test=Generic.C(path=dir / "testdata.manual.2009.06.14.csv"),
+    )
datamaestro_text/config/edu/stanford/aclimdb.py
@@ -11,6 +11,6 @@ def aclimdb(data):
     Paper http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
     """
     return {
-        "train": FolderBased(path=data / "train", classes=["neg", "pos"]),
-        "test": FolderBased(path=data / "test", classes=["neg", "pos"]),
+        "train": FolderBased.C(path=data / "train", classes=["neg", "pos"]),
+        "test": FolderBased.C(path=data / "test", classes=["neg", "pos"]),
     }
datamaestro_text/config/gov/nist/trec/tipster.py
@@ -17,7 +17,6 @@ of three CD-ROMs of SGML encoded documents distributed by LDC plus queries and a
 See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data/intro_eng.html
 """

-from datamaestro.data import Base
 from datamaestro_text.data.ir.trec import TipsterCollection
 from datamaestro.download.links import linkfolder
 from datamaestro.definitions import (
datamaestro_text/config/io/github/thunlp/fewrel.py
@@ -32,4 +32,4 @@ def v1(train, validation):
     Only the train and validation dataset are available. The test set is hidden
     for the leaderboard.
     """
-    return {"train": File(path=train), "validation": File(path=validation)}
+    return {"train": File.C(path=train), "validation": File.C(path=validation)}
datamaestro_text/config/io/metamind/research/wikitext.py
@@ -30,9 +30,9 @@ def WikiText(data, type):
     https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
     """
     return {
-        "train": File(path=data / ("wiki.train.%s" % type)),
-        "validation": File(path=data / ("wiki.valid.%s" % type)),
-        "test": File(path=data / ("wiki.test.%s" % type)),
+        "train": File.C(path=data / ("wiki.train.%s" % type)),
+        "validation": File.C(path=data / ("wiki.valid.%s" % type)),
+        "test": File.C(path=data / ("wiki.test.%s" % type)),
     }

datamaestro_text/config/org/grouplens/movielens.py
@@ -31,10 +31,10 @@ def small(ds) -> Movielens:
     100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users (as of 9/2018)
     """
     return {
-        "ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
-        "links": csv.Generic(path=ds / "links.csv", names_row=0),
-        "movies": csv.Generic(path=ds / "movies.csv", names_row=0),
-        "tags": csv.Generic(path=ds / "tags.csv", names_row=0),
+        "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
+        "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
+        "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
+        "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
     }


@@ -46,8 +46,8 @@ def full(ds) -> Movielens:
     27,000,000 ratings and 1,100,000 tag applications applied to 58,000 movies by 280,000 users (as of 9/2018)
     """
     return {
-        "ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
-        "links": csv.Generic(path=ds / "links.csv", names_row=0),
-        "movies": csv.Generic(path=ds / "movies.csv", names_row=0),
-        "tags": csv.Generic(path=ds / "tags.csv", names_row=0),
+        "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
+        "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
+        "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
+        "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
     }
datamaestro_text/config/org/universaldependencies/french.py
@@ -34,9 +34,9 @@ def gsd(ds) -> Supervised:
     is updated since 2015 independently from the previous source.
     """
     return {
-        "train": CoNLL_U(path=ds / "fr_gsd-ud-train.conllu"),
-        "test": CoNLL_U(path=ds / "fr_gsd-ud-dev.conllu"),
-        "validation": CoNLL_U(path=ds / "fr_gsd-ud-test.conllu"),
+        "train": CoNLL_U.C(path=ds / "fr_gsd-ud-train.conllu"),
+        "test": CoNLL_U.C(path=ds / "fr_gsd-ud-dev.conllu"),
+        "validation": CoNLL_U.C(path=ds / "fr_gsd-ud-test.conllu"),
     }

datamaestro_text/data/conversation/base.py
@@ -1,10 +1,13 @@
 from abc import ABC, abstractmethod
 from enum import Enum
+from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
+from experimaestro import Param
 from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
 from attr import define
+from datamaestro.record import record_type
 from datamaestro.data import Base
 from datamaestro.record import Record, Item
-from datamaestro_text.data.ir import TopicRecord
+from datamaestro_text.data.ir import TopicRecord, Topics
 from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView

 # ---- Basic types
@@ -120,20 +123,17 @@ class ConversationNode:
         ...

     @abstractmethod
-    def parent(self) -> Optional["ConversationNode"]:
-        ...
+    def parent(self) -> Optional["ConversationNode"]: ...

     @abstractmethod
-    def children(self) -> List["ConversationNode"]:
-        ...
+    def children(self) -> List["ConversationNode"]: ...


 class ConversationTree(ABC):
     """Represents a conversation tree"""

     @abstractmethod
-    def root(self) -> ConversationNode:
-        ...
+    def root(self) -> ConversationNode: ...

     @abstractmethod
     def __iter__(self) -> Iterator[ConversationNode]:
@@ -253,5 +253,30 @@ class ConversationDataset(Base, ABC):
     @abstractmethod
     def __iter__(self) -> Iterator[ConversationTree]:
         """Return an iterator over conversations"""
-        for i in range(len(self)):
-            yield self.get(i)
+        ...
+
+
+class ConversationUserTopics(Topics):
+    """Extract user topics from conversations"""
+
+    conversations: Param[ConversationDataset]
+
+    topic_recordtype = record_type(IDItem, SimpleTextItem)
+
+    def iter(self) -> Iterator[TopicRecord]:
+        """Returns an iterator over topics"""
+        # Extracts topics from conversations, Each user query is a topic (can perform retrieval on it)
+        # TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
+
+        records: List[TopicRecord] = []
+        for conversation in self.conversations.__iter__():
+            nodes = [
+                node
+                for node in conversation
+                if node.entry[EntryType] == EntryType.USER_QUERY
+            ]
+            for node in nodes:
+                records.append(
+                    node.entry.update(ConversationHistoryItem(node.history()))
+                )
+        return iter(records)
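
For intuition, here is a self-contained analogue of what ConversationUserTopics does: keep every user turn of a conversation as a standalone, retrievable topic. Plain dataclasses stand in for datamaestro records; all names are illustrative:

    from dataclasses import dataclass
    from enum import Enum
    from typing import List


    class Kind(Enum):
        USER_QUERY = 0
        SYSTEM_ANSWER = 1


    @dataclass
    class Turn:
        kind: Kind
        text: str


    def user_topics(conversation: List[Turn]) -> List[str]:
        # Each user query becomes one topic, as in ConversationUserTopics.iter()
        return [t.text for t in conversation if t.kind is Kind.USER_QUERY]


    conv = [
        Turn(Kind.USER_QUERY, "what is dense retrieval?"),
        Turn(Kind.SYSTEM_ANSWER, "..."),
        Turn(Kind.USER_QUERY, "how does it differ from BM25?"),
    ]
    assert user_topics(conv) == [
        "what is dense retrieval?",
        "how does it differ from BM25?",
    ]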
datamaestro_text/data/conversation/ikat.py
@@ -1,10 +1,11 @@
-from typing import Iterator, List, Optional
+from typing import Iterator, List
 from attr import define, field
 import json
 import logging
 from datamaestro.data import File
 from datamaestro.record import Record

+from datamaestro_text.data.ir import Topics
 from datamaestro_text.data.ir.base import (
     IDItem,
     SimpleTextItem,
@@ -12,7 +13,6 @@ from datamaestro_text.data.ir.base import (
 )

 from .base import (
-    AnswerDocumentURL,
     AnswerEntry,
     ConversationTree,
     EntryType,
@@ -21,6 +21,25 @@ from .base import (
 )
 from . import ConversationDataset

+# Keys to change in the dataset entries for compatibility across different years
+
+KEY_MAPPINGS = {
+    # Keys to replace: Target Key
+    "turns": "responses",
+    "utterance": "user_utterance",
+    "ptkb_provenance": "relevant_ptkbs",
+    "response_provenance": "citations",
+}
+
+
+def norm_dict(entry: dict) -> dict:
+    """Convert keys in the entry to match the expected format."""
+    normalized = {}
+    for k, v in entry.items():
+        # Check for direct mapping, then try lowercase mapping
+        new_key = KEY_MAPPINGS.get(k) or KEY_MAPPINGS.get(k.lower()) or k.lower()
+        normalized[new_key] = v
+    return normalized


 @define(kw_only=True)
@@ -47,7 +66,7 @@ class IkatConversationEntry:


 @define(kw_only=True)
-class IkatDatasetEntry:
+class IkatConversationTopic:
     """A query with past history"""

     number: str
@@ -60,14 +79,20 @@ class IkatDatasetEntry:
     """The personal knowledge base associated with the user"""

     responses: List[IkatConversationEntry] = field(
-        converter=lambda items: [IkatConversationEntry(**item) if isinstance(item, dict) else item for item in items]
+        converter=lambda items: [
+            IkatConversationEntry(**item) if isinstance(item, dict) else item
+            for item in map(norm_dict, items)
+        ]
     )
     """The list of responses to the query"""


-class IkatDataset(ConversationDataset, File):
+class IkatConversations(ConversationDataset, File):
+    """A dataset containing conversations from the IKAT project"""

-    def entries(self) -> Iterator[IkatDatasetEntry]:
+    """Keys to change in the dataset entries for compatibility across different years"""
+
+    def entries(self) -> Iterator[IkatConversationTopic]:
         """Reads all conversation entries from the dataset file."""
         with self.path.open("rt") as fp:
             raw_data = json.load(fp)
@@ -75,12 +100,13 @@ class IkatDataset(ConversationDataset, File):
             logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
             logging.debug(f"raw data has keys {raw_data[0].keys()}")

-            processed_data = []
             for entry in raw_data:
-                processed_data.append(IkatDatasetEntry(**{key.lower(): value for key, value in entry.items()}))
-
-            logging.debug(f"First parsed data sample: {processed_data[0]}")
-            return iter(processed_data)
+                try:
+                    normalized_entry = norm_dict(entry)
+                    yield IkatConversationTopic(**normalized_entry)
+                except Exception as e:
+                    logging.warning(f"Failed to parse entry: {e}")
+                    raise e

     def __iter__(self) -> Iterator[ConversationTree]:
         for entry in self.entries():
@@ -88,7 +114,7 @@ class IkatDataset(ConversationDataset, File):

             for turn in entry.responses:
                 turn: IkatConversationEntry = turn  # Ensure type is correct
-                query_id = f"{entry.number}#{turn.turn_id}"
+                query_id = f"{entry.number}_{turn.turn_id}"

                 # USER QUERY record
                 history.append(
@@ -117,4 +143,3 @@ class IkatDataset(ConversationDataset, File):
             # Ensure reverse if needed for compatibility (optional)
             history.reverse()
             yield SingleConversationTree(entry.number, history)
-
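
The new norm_dict helper is what lets one parser read the 2023, 2024 and 2025 topic files despite their different key spellings. A standalone check of its mapping (the sample entry is made up):

    KEY_MAPPINGS = {
        "turns": "responses",
        "utterance": "user_utterance",
        "ptkb_provenance": "relevant_ptkbs",
        "response_provenance": "citations",
    }


    def norm_dict(entry: dict) -> dict:
        # Direct mapping first, then lowercase mapping, then plain lowercasing
        return {
            KEY_MAPPINGS.get(k) or KEY_MAPPINGS.get(k.lower()) or k.lower(): v
            for k, v in entry.items()
        }


    raw = {"Turns": [], "utterance": "hi", "Number": "1-1"}
    assert norm_dict(raw) == {"responses": [], "user_utterance": "hi", "number": "1-1"}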
datamaestro_text/data/ir/__init__.py
@@ -1,10 +1,12 @@
 """Generic data types for information retrieval"""

 from abc import ABC, abstractmethod
+from enum import Enum
 from functools import cached_property
+import logging
 from pathlib import Path
 from attrs import define
-from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type
+from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING
 import random
 from experimaestro import Config
 from datamaestro.definitions import datatasks, Param, Meta
@@ -28,6 +30,9 @@ from .base import (  # noqa: F401
     AdhocAssessedTopic,
 )

+#: A adhoc run dictionary (query id -> doc id -> score)
+AdhocRunDict = dict[str, dict[str, float]]
+

 class Documents(Base):
     """A set of documents with identifiers
@@ -45,6 +50,22 @@ class Documents(Base):
     def iter_documents(self) -> Iterator[DocumentRecord]:
         return self.iter()

+    def iter_documents_from(self, start=0) -> Iterator[DocumentRecord]:
+        """Iterate over a range of documents
+
+        Can be specialized in a subclass for faster access
+
+        :param start: The starting document, defaults to 0
+        :return: An iterator
+        """
+        iter = self.iter()
+        if start > 0:
+            logging.info("skipping %d documents", start + 1)
+            for _ in range(start + 1):
+                next(iter)
+
+        return iter
+
     def iter_ids(self) -> Iterator[str]:
         """Iterates over document ids

@@ -68,6 +89,19 @@ class Documents(Base):
         ...


+class FileAccess(Enum):
+    """Defines how to access files (e.g. for document stores)"""
+
+    FILE = 0
+    """Direct file access"""
+
+    MMAP = 1
+    """Use mmap"""
+
+    MEMORY = 2
+    """Use memory"""
+
+
 class DocumentStore(Documents):
     """A document store

@@ -77,6 +111,10 @@ class DocumentStore(Documents):
     - return the number of documents
     """

+    file_access: Meta[FileAccess] = FileAccess.MMAP
+    """How to access the file collection (might not have any impact, depends on
+    the docstore)"""
+
     def docid_internal2external(self, docid: int):
         """Converts an internal collection ID (integer) to an external ID"""
         raise NotImplementedError(f"For class {self.__class__}")
@@ -168,7 +206,10 @@ class AdhocAssessments(Base, ABC):
 class AdhocRun(Base):
     """IR adhoc run"""

-    pass
+    @abstractmethod
+    def get_dict(self) -> "AdhocRunDict":
+        """Get the run as a dictionary query ID -> doc ID -> score"""
+        ...


 class AdhocResults(Base):
@@ -304,5 +345,4 @@ class PairwiseSampleDataset(Base, ABC):
     """Datasets where each record is a query with positive and negative samples"""

     @abstractmethod
-    def iter(self) -> Iterator[PairwiseSample]:
-        ...
+    def iter(self) -> Iterator[PairwiseSample]: ...
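
The new Documents.iter_documents_from hook lets stores resume iteration mid-collection (the irds- and LZ4-backed stores below override it with random access). Its generic contract — yield everything from index start onward — matches itertools.islice; a standalone sketch, not the datamaestro implementation:

    from itertools import islice
    from typing import Iterable, Iterator


    def iter_from(documents: Iterable[str], start: int = 0) -> Iterator[str]:
        # Lazily skip the first `start` documents, then yield the rest
        return islice(documents, start, None)


    docs = (f"doc-{i}" for i in range(5))
    assert list(iter_from(docs, start=2)) == ["doc-2", "doc-3", "doc-4"]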
datamaestro_text/data/ir/base.py
@@ -43,6 +43,7 @@ class IDItem(Item, ABC):

     id: str

+
 @define
 class UrlItem(Item):
     """An url item"""
@@ -70,7 +71,7 @@ class AdhocAssessedTopic:
     """List of assessments for this topic"""


-def create_record(*items: Item, id: str = None, text: str = None):
+def create_record(*items: Item, id: str = None, text: str = None) -> Record:
     """Easy creation of a text/id item"""
     extra_items = []
     if id is not None:
datamaestro_text/data/ir/formats.py
@@ -99,7 +99,7 @@ class WapoDocument(TextItem):
     body_media: Tuple[WapoDocMedia, ...]

     @cached_property
-    def text(self):
+    def text(self):
         return f"{self.title} {self.body_paras_html}"


@@ -132,11 +132,15 @@ class DprW100Doc(TextItem):
     text: str
     title: str

-    @define
+
+@define
 class MsMarcoV2Passage(TextItem):
     text: str
     spans: Tuple[Tuple[int, int], ...]
     msmarco_document_id: str
+
+
+@define
 class Touche2020(TextItem):
     text: str
     title: str
@@ -204,11 +208,13 @@ class TrecTopic(SimpleTextItem):

 TrecTopicRecord = record_type(IDItem, TrecTopic)

+
 @define
 class DprW100Query(TextItem):
     text: str
     answers: Tuple[str]

+
 @define
 class TrecBackgroundLinkingQuery(IDItem):
     query_id: str
datamaestro_text/data/ir/stores.py
@@ -1,12 +1,21 @@
-from collections import namedtuple
+import bz2
+from hashlib import md5, sha256
+import json
+import logging
+from pathlib import Path
 from typing import List, NamedTuple
+from datamaestro_text.utils.files import TQDMFileReader
 from experimaestro import Constant
-import attrs
-
 from datamaestro.record import Record
-from datamaestro_text.data.ir.base import IDItem
+from datamaestro_text.data.ir.base import (
+    DocumentRecord,
+    IDItem,
+    SimpleTextItem,
+    UrlItem,
+)
 from datamaestro_text.datasets.irds.data import LZ4DocumentStore
 from datamaestro_text.data.ir.formats import OrConvQADocument
+from tqdm import tqdm


 class OrConvQADocumentStore(LZ4DocumentStore):
@@ -27,3 +36,89 @@ class OrConvQADocumentStore(LZ4DocumentStore):
         fields = data._asdict()
         del fields["id"]
         return Record(OrConvQADocument(**fields), IDItem(data.id))
+
+
+class IKatClueWeb22DocumentStore(LZ4DocumentStore):
+    @staticmethod
+    def generator(path: Path, checksums_file: Path, passages_hashes: Path):
+        """Returns an iterator over iKAT 2022-25 documents
+
+        :param path: The folder containing the files
+        """
+
+        def __iter__():
+            errors = False
+
+            assert checksums_file.is_file(), f"{checksums_file} does not exist"
+            assert passages_hashes.is_file(), f"{passages_hashes} does not exist"
+
+            # Get the list of files
+            with checksums_file.open("rt") as fp:
+                files = []
+                for line in fp:
+                    checksum, filename = line.strip().split()
+                    files.append((checksum, filename))
+                    if not (path / filename).is_file():
+                        logging.error("File %s does not exist", path / filename)
+                        errors = True
+
+            assert not errors, "Errors detected, stopping"
+
+            # Check the SHA256 sums
+            match checksums_file.suffix:
+                case ".sha256sums":
+                    hasher_factory = sha256
+                case _:
+                    raise NotImplementedError(
+                        f"Cannot handle {checksums_file.suffix} checksum files"
+                    )
+
+            for checksum, filename in tqdm(files):
+                logging.info("Checking %s", filename)
+                hasher = hasher_factory()
+                with (path / filename).open("rb") as fp:
+                    while data := fp.read(2**20):
+                        hasher.update(data)
+
+                file_checksum = hasher.hexdigest()
+                assert file_checksum == checksum, (
+                    f"Expected {checksum}, " f"got {file_checksum} for {filename}"
+                )
+
+            # Get the MD5 hashes of all the passages
+            logging.info("Reading the hashes of all passages")
+            with TQDMFileReader(passages_hashes, "rt", bz2.open) as fp:
+                passage_checksums = {}
+                for line in tqdm(fp):
+                    doc_id, passage_no, checksum = line.strip().split()
+                    passage_checksums[f"{doc_id}:{passage_no}"] = checksum  # noqa: E231
+
+            # Read the files
+            logging.info("Starting to read the files")
+            for _, filename in tqdm(files):
+                with TQDMFileReader(path / filename, "rt", bz2.open) as jsonl_fp:
+                    for line in jsonl_fp:
+                        data = json.loads(line)
+                        expected = passage_checksums[data["id"]]
+                        computed = md5(data["contents"].encode("utf-8")).hexdigest()
+                        assert expected == computed, (
+                            f"Expected {expected}, "
+                            f"got {computed} for passage {data['id']} in {filename}"
+                        )
+                        yield IKatClueWeb22DocumentStore.Document(**data)
+
+        return __iter__
+
+    class Document(NamedTuple):
+        id: str
+        contents: str
+        url: str
+
+    data_cls = Document
+    lookup_field: Constant[str] = "id"
+    index_fields: Constant[List[str]] = ["id"]
+
+    def converter(self, data):
+        return DocumentRecord(
+            IDItem(data.id), SimpleTextItem(data.contents), UrlItem(data.url)
+        )
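
Before any passage is parsed, the generator above verifies each archive against its .sha256sums manifest and each passage body against an MD5 table. The file-level check, reduced to a standalone sketch (paths are illustrative):

    import hashlib
    from pathlib import Path


    def verify_sha256sums(folder: Path, manifest: Path) -> None:
        # Each manifest line is "<hexdigest> <filename>"
        for line in manifest.read_text().splitlines():
            expected, filename = line.split()
            hasher = hashlib.sha256()
            with (folder / filename).open("rb") as fp:
                while chunk := fp.read(2**20):  # 1 MiB blocks, as in the store
                    hasher.update(chunk)
            assert hasher.hexdigest() == expected, f"checksum mismatch for {filename}"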
datamaestro_text/data/ir/trec.py
@@ -1,9 +1,9 @@
+import re
 from typing import Dict, List, Optional
-from datamaestro.data import Base
 from experimaestro import documentation, Param, Meta
 from pathlib import Path
-from datamaestro.record import Record
 from datamaestro_text.data.ir import (
+    AdhocRunDict,
     Documents,
     Topics,
     AdhocAssessments,
@@ -47,6 +47,11 @@ class TrecAdhocAssessments(AdhocAssessments):
 class TrecAdhocRun(AdhocRun):
     path: Param[Path]

+    def get_dict(self) -> AdhocRunDict:
+        import datamaestro_text.interfaces.trec as trec
+
+        return trec.parse_run(self.path)
+

 class TrecAdhocResults(AdhocResults):
     """Adhoc results (TREC format)"""
@@ -62,8 +67,6 @@ class TrecAdhocResults(AdhocResults):

     def get_results(self) -> Dict[str, float]:
         """Returns the results as a dictionary {metric_name: value}"""
-        import re
-
         re_spaces = re.compile(r"\s+")

         results = {}
datamaestro_text/datasets/irds/data.py
@@ -122,7 +122,14 @@ class Documents(ir.DocumentStore, IRDSId):
         formats.Touche2020, "doc_id", "text", "title", "stance", "url"
     ),
     _irds.beir.BeirSciDoc: tuple_constructor(
-        formats.SciDocs, "doc_id", "text", "title", "authors", "year", "cited_by", "references"
+        formats.SciDocs,
+        "doc_id",
+        "text",
+        "title",
+        "authors",
+        "year",
+        "cited_by",
+        "references",
     ),
     _irds.msmarco_document.MsMarcoDocument: tuple_constructor(
         formats.MsMarcoDocument, "doc_id", "url", "title", "body"
@@ -198,13 +205,29 @@ class Documents(ir.DocumentStore, IRDSId):
         for doc in self.dataset.docs_iter():
            yield self.converter(self.document_recordtype, doc)

+    def iter_documents_from(self, start=0):
+        for doc in self.dataset.docs_iter()[start:]:
+            yield self.converter(self.document_recordtype, doc)
+
     @property
     def documentcount(self):
         return self.dataset.docs_count()

     @cached_property
     def store(self):
-        return self.dataset.docs_store()
+        kwargs = {}
+        try:
+            # Translate to ir datasets docstore options
+            import ir_datasets.indices as ir_indices
+            file_access = {
+                ir.FileAccess.MMAP: ir_indices.FileAccess.MMAP,
+                ir.FileAccess.FILE: ir_indices.FileAccess.FILE,
+                ir.FileAccess.MEMORY: ir_indices.FileAccess.MEMORY
+            }[self.file_access]
+            kwargs = {"options": ir_indices.DocstoreOptions(file_access=file_access)}
+        except ImportError:
+            logging.warning("This version of ir-datasets cannot handle docstore options")
+        return self.dataset.docs_store(**kwargs)

     @cached_property
     def _docs(self):
@@ -244,7 +267,7 @@ if hasattr(_irds, "miracl"):
     )


-class LZ4DocumentStore(ir.DocumentStore):
+class LZ4DocumentStore(ir.DocumentStore, ABC):
     """A LZ4-based document store"""

     path: Param[Path]
@@ -253,7 +276,7 @@ class LZ4DocumentStore(ir.DocumentStore, ABC):
     lookup_field: Param[str]

     # Extra indexed fields (e.g. URLs)
-    index_fields: List[str]
+    index_fields: List[str] = []

     @cached_property
     def store(self):
@@ -285,6 +308,9 @@ class LZ4DocumentStore(ir.DocumentStore, ABC):
         """Returns an iterator over documents"""
         return map(self.converter, self.store.__iter__())

+    def iter_documents_from(self, start=0):
+        return map(self.converter, self.store.__iter__()[start:])
+
     @cached_property
     def documentcount(self):
         if self.count:
@@ -386,7 +412,13 @@ class Topics(ir.TopicsStore, IRDSId):
         formats.TrecTopic, "query_id", "text", "description", "narrative"
     ),
     _irds.beir.BeirSciQuery: tuple_constructor(
-        formats.SciDocsTopic, "query_id", "text", "authors", "year", "cited_by", "references"
+        formats.SciDocsTopic,
+        "query_id",
+        "text",
+        "authors",
+        "year",
+        "cited_by",
+        "references",
     ),
     _irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
         formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
@@ -400,10 +432,7 @@ class Topics(ir.TopicsStore, IRDSId):
         "description",
     ),
     _irds.dpr_w100.DprW100Query: tuple_constructor(
-        formats.DprW100Query,
-        "query_id",
-        "text",
-        "answers"
+        formats.DprW100Query, "query_id", "text", "answers"
     ),
 }
@@ -435,11 +464,12 @@ class Topics(ir.TopicsStore, IRDSId):
     def iter(self) -> Iterator[TopicRecord]:
         """Returns an iterator over topics"""
         return self.handler.iter()
-
+
+
 class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
     def __init__(self, dataset):
         self.dataset = dataset
-
+
     @cached_property
     def ext2records(self):
         return {record[IDItem].id: record for record in self.records}
@@ -462,10 +492,12 @@ class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
         records = []

         for query in self.dataset.dataset.queries_iter():
-            topic = Record(
+            topic = Record(
                 IDItem(query.query_id),
                 # Following BEIR documentation, we use title of documents as queries: https://github.com/beir-cellar/beir/blob/main/examples/dataset/README.md#queries-and-qrels
-                SimpleTextItem(self.dataset.dataset.docs_store().get(query.doc_id).title),
+                SimpleTextItem(
+                    self.dataset.dataset.docs_store().get(query.doc_id).title
+                ),
                 UrlItem(query.url),
             )
             records.append(topic)
@@ -477,11 +509,10 @@ class TrecBackgroundLinkingTopicsHandler(TopicsHandler):


 Topics.HANDLERS.update(
-    {
-        _irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler
-    }
+    {_irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler}
 )

+
 class CastTopicsHandler(TopicsHandler):
     def __init__(self, dataset):
         self.dataset = dataset
datamaestro_text/interfaces/trec.py
@@ -1,7 +1,7 @@
-from attrs import define
 from pathlib import Path
 from typing import Iterator, Optional
 import re
+from datamaestro_text.data.ir import AdhocRunDict
 from datamaestro_text.data.ir.base import (
     AdhocAssessedTopic,
     TopicRecord,
@@ -10,6 +10,33 @@ from datamaestro_text.data.ir.base import (
 )
 from datamaestro_text.data.ir.formats import TrecTopicRecord, TrecTopic

+# --- Runs
+
+
+def parse_run(path: Path) -> AdhocRunDict:
+    results = {}
+    with path.open("rt") as f:
+        for line in f:
+            query_id, _q0, doc_id, _rank, score, _model_id = re.split(
+                r"\s+", line.strip()
+            )
+            results.setdefault(query_id, {})[doc_id] = score
+
+    return results
+
+
+def write_run_dict(run: AdhocRunDict, run_path: Path):
+    """Write run dict"""
+    with run_path.open("wt") as f:
+        for query_id, scored_documents in run.items():
+            scored_documents = list(
+                [(doc_id, score) for doc_id, score in scored_documents.items()]
+            )
+            scored_documents.sort(key=lambda x: x[1], reverse=True)
+            for ix, (doc_id, score) in enumerate(scored_documents):
+                f.write(f"{query_id} Q0 {doc_id} {ix + 1} {score} run\n")
+
+
 # --- Assessments

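A TREC run line has six whitespace-separated columns: query_id, Q0, doc_id, rank, score, run_tag. A minimal round trip through the two helpers above (the temporary path is illustrative; note that parse_run keeps scores as strings):

    from pathlib import Path
    from tempfile import TemporaryDirectory

    from datamaestro_text.interfaces.trec import parse_run, write_run_dict

    run = {"q1": {"d7": 3.2, "d2": 1.5}}

    with TemporaryDirectory() as tmp:
        path = Path(tmp) / "run.txt"
        write_run_dict(run, path)  # writes "q1 Q0 d7 1 3.2 run" etc.
        parsed = parse_run(path)
        assert parsed["q1"]["d7"] == "3.2"  # scores are not converted back to float
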
datamaestro_text/utils/files.py
@@ -1,3 +1,5 @@
+import os
+from tqdm import tqdm
 import gzip
 from pathlib import Path

@@ -6,3 +8,104 @@ def auto_open(path: Path, mode: str):
     if path.suffix == ".gz":
         return gzip.open(path, mode)
     return path.open(mode)
+
+
+class CountingWrapper:
+    """Wrap a file object to count the actual compressed bytes read."""
+
+    def __init__(self, file_obj):
+        self.file_obj = file_obj
+        self.bytes_read = 0
+
+    def read(self, size=-1):
+        data = self.file_obj.read(size)
+        self.bytes_read += len(data)
+        return data
+
+    def readline(self, size=-1):
+        data = self.file_obj.readline(size)
+        self.bytes_read += len(data)
+        return data
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        line = self.readline()
+        if not line:
+            raise StopIteration
+        return line
+
+    def close(self):
+        self.file_obj.close()
+
+    def __getattr__(self, attr):
+        return getattr(self.file_obj, attr)
+
+
+class TQDMBytesReader:
+    def __init__(self, file_obj, total_size, **tqdm_kwargs):
+        self.file_obj = CountingWrapper(file_obj)
+        self.tqdm = tqdm(
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            **tqdm_kwargs,
+        )
+
+    def _update_progress(self):
+        delta = self.file_obj.bytes_read - self.tqdm.n
+        if delta > 0:
+            self.tqdm.update(delta)
+
+    def read(self, size=-1):
+        data = self.file_obj.read(size)
+        self._update_progress()
+        return data
+
+    def readline(self, size=-1):
+        line = self.file_obj.readline(size)
+        self._update_progress()
+        return line
+
+    def readlines(self, hint=-1):
+        lines = self.file_obj.readlines(hint)
+        self._update_progress()
+        return lines
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        line = self.readline()
+        if not line:
+            raise StopIteration
+        return line
+
+    def close(self):
+        self.tqdm.close()
+        self.file_obj.close()
+
+    def __getattr__(self, attr):
+        # Delegate any other attribute to the underlying file object
+        return getattr(self.file_obj, attr)
+
+
+class TQDMFileReader:
+    def __init__(self, filepath, mode="rt", file_opener=open, **tqdm_kwargs):
+        self.filepath = filepath
+        self.mode = mode
+        self.file_opener = file_opener
+        self.tqdm_kwargs = tqdm_kwargs
+
+    def __enter__(self):
+        self.file_obj = self.file_opener(self.filepath, self.mode)
+        total_size = os.path.getsize(self.filepath)  # this is compressed size
+        self.reader = TQDMBytesReader(
+            self.file_obj, total_size=total_size, **self.tqdm_kwargs
+        )
+        return self.reader
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.reader.close()
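
A typical use of the reader above, mirroring how stores.py consumes bz2-compressed JSONL (the file name is illustrative); progress is reported against the on-disk compressed size:

    import bz2

    from datamaestro_text.utils.files import TQDMFileReader

    with TQDMFileReader("passages.jsonl.bz2", "rt", bz2.open) as fp:
        for line in fp:
            ...  # process one JSON record per line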
datamaestro_text/utils/iter.py
@@ -82,3 +82,8 @@ class LazyList(Sequence):
         # Convert the iterable to a list if it hasn't been already
         if self.materialized_list is None:
             self.materialized_list = list(self.iterable)
+
+    def reverse(self):
+        """Reverse the list in place, materializing it if necessary"""
+        self._materialize()
+        self.materialized_list.reverse()
datamaestro_text/version.py
@@ -1,7 +1,14 @@
 # file generated by setuptools-scm
 # don't change, don't track in version control

-__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]

 TYPE_CHECKING = False
 if TYPE_CHECKING:
@@ -9,13 +16,19 @@ if TYPE_CHECKING:
     from typing import Union

     VERSION_TUPLE = Tuple[Union[int, str], ...]
+    COMMIT_ID = Union[str, None]
 else:
     VERSION_TUPLE = object
+    COMMIT_ID = object

 version: str
 __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
+commit_id: COMMIT_ID
+__commit_id__: COMMIT_ID

-__version__ = version = '2025.6.30'
-__version_tuple__ = version_tuple = (2025, 6, 30)
+__version__ = version = '2025.9.11'
+__version_tuple__ = version_tuple = (2025, 9, 11)
+
+__commit_id__ = commit_id = 'gadcc9bd27'
{datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datamaestro-text
-Version: 2025.6.30
+Version: 2025.9.11
 Summary: Datamaestro module for text-related datasets
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
 License: GPL-3
@@ -15,10 +15,10 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Requires-Python: >=3.8
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: datamaestro>=1.4.2
+Requires-Dist: datamaestro>=1.5.0
 Requires-Dist: ir_datasets>=0.5.8
 Requires-Dist: attrs
 Provides-Extra: dev
{datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.9.11.dist-info}/RECORD
@@ -1,21 +1,21 @@
 datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
-datamaestro_text/version.py,sha256=roAk0Y2ehJXQqbh_xSb4D-q2BlNYSoIoP3QPIZIy72s,519
+datamaestro_text/version.py,sha256=aHTcqRU_FAq8moUGgwqhCMrhMi8VBbk38TX-uMF8p20,720
 datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
 datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
-datamaestro_text/config/com/sentiment140.py,sha256=3cZfqs395gY14gGojRC_RTYEdG5l8RUTSdF9li8ynBc,1283
+datamaestro_text/config/com/sentiment140.py,sha256=itfBEgcOniECXKOw8I2dhzyS9LOMsltMLfKK6NGRpVY,1293
 datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
-datamaestro_text/config/com/github/ikat.py,sha256=aozSgFcVK_vYZokD9YdF187aa3WwTkc6_Cx6NJ9I_74,1337
-datamaestro_text/config/com/github/aagohary/canard.py,sha256=DREIKU2FFrfwibjU48Y1AsKTUzhwxmD4Kuykd4bDIus,1762
-datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=oYI0SUxEYzGoL2IbRrnze2cQuWwENwNk4ID9NQuI2Tw,3061
+datamaestro_text/config/com/github/ikat.py,sha256=DCayX-t2OBeW5bOJvRxoQgIH3vy-__mYzdmVcnayAkk,4230
+datamaestro_text/config/com/github/aagohary/canard.py,sha256=5fLwCLNBGM_7--naTCDayAMYLvK3yTD8auaEf-dqrb4,1768
+datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=zP3w7A9KSvJVCo44OaB1az1pDKWxE6qXS4qFm3hqg3Y,3064
 datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=MFJYaxJoqJQ6kMDHa2PIE0zoxYTA8Kyl26-vzFoMML0,3032
 datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
 datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
-datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=tVldwVh0pQpXXifqcMNEN9cLO1HXkXoEhToSazSx5RE,11643
+datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=hN2KOdi6ToHlodozqsYAOtxaqiUGkGGtRtb3RFSgnEU,11645
 datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=hCFjZg9t1AHET05qU31uatiAOD5EEzzWG9y_W90jcOE,861
 datamaestro_text/config/edu/cornell/nlvr.yaml,sha256=9Yk5VZMncSmrP7JNuGXqExksgX5nQ_Zfnlps8hWze3Q,921
 datamaestro_text/config/edu/stanford/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamaestro_text/config/edu/stanford/aclimdb.py,sha256=lsPDxnp_rWOCpBte6pZ0_LVaC33w5mmgfGh51rcTgt8,643
+datamaestro_text/config/edu/stanford/aclimdb.py,sha256=QtriReAVsbJlxkgfJWQCZdCeJ9LswYnOR9mFrgghL9c,647
 datamaestro_text/config/edu/stanford/glove.py,sha256=bXxwiJqT8alPs5MwwxVuY0xBO8g1QWhm2L3AKVskTlA,2391
 datamaestro_text/config/edu/stanford/im2p.yaml,sha256=JoToNyEPpmwdyLFedCBot5ypyw7p9rzi12mGXJuZin0,2909
 datamaestro_text/config/edu/upenn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -29,17 +29,17 @@ datamaestro_text/config/gov/nist/trec/adhoc.py,sha256=vcFaLlZ-chwDt013MYI8bYZ2ug
 datamaestro_text/config/gov/nist/trec/clueweb.yaml,sha256=sm4UbdtMzWoDVPsewtVDS7Vj2jBOdgp18Xqo1X4ysQc,792
 datamaestro_text/config/gov/nist/trec/deeplearning.yaml,sha256=QGM7PtXLJRttNdOPE16o7-k3e5tA9HgcaM_-qFDV_5Q,2125
 datamaestro_text/config/gov/nist/trec/index.yaml,sha256=oSUhUmtukf5oXqUbJLhae8xZx7Uac5V6uZoUB-RJ7Sw,2711
-datamaestro_text/config/gov/nist/trec/tipster.py,sha256=t0w9lOBfvwt6YCYCyDj9fo1QiBXfs0q8qzKl74f40jU,5398
+datamaestro_text/config/gov/nist/trec/tipster.py,sha256=DirpnHpS10e27LcL7v9ksKreKVy7EgfVhyztV49VRds,5364
 datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJolWCts_VETACha3hfQ,563
 datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
-datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=LJYexk2ssZ5SqKIKLjPdybS9ju4g9fGIFy5eybtsvYQ,1224
+datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=rg_qAnMrXYUZhQYxA12r_Npl0ggyfTLJQjdSCjU0QxM,1228
 datamaestro_text/config/io/metamind/research/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamaestro_text/config/io/metamind/research/wikitext.py,sha256=jWpCadMYJnX2I_Ls230lB8gwBvV51SOhABUznL8_dCA,2301
+datamaestro_text/config/io/metamind/research/wikitext.py,sha256=DjyBmG74JvuMt9RpMwuLAnxzOdByIWsk4VnXgkJp1NM,2307
 datamaestro_text/config/net/mattmahoney/enwiki.yaml,sha256=HCUn3s0AydXX3BjJ6yUXY0vGLGWSBkOCaDhQ4PA2Adg,2452
 datamaestro_text/config/org/acm/recsys/cb2014.yaml,sha256=5SAK3Am1k0HFugSSCIQN5mLPBfr1zZZAkhLrSH5pHQc,1274
 datamaestro_text/config/org/cocodataset/index.yaml,sha256=KISJChMeKwlZbSnHmRcGMsm6jbcFGVe1aA4GhP2fzqw,474
-datamaestro_text/config/org/grouplens/movielens.py,sha256=e78JoiHuwP6gbj7Q84UwPTvjtN_ehfD5AhvuUt8DiH4,1824
-datamaestro_text/config/org/universaldependencies/french.py,sha256=W_gDtfinjlw08qohX_PWvzQlacDwRFB7PeOzO33mRVU,2208
+datamaestro_text/config/org/grouplens/movielens.py,sha256=tV6OSTDdtjll1dQBCsYIlsBbtOO-MCiLles2aj0MgDA,1840
+datamaestro_text/config/org/universaldependencies/french.py,sha256=etedb3_SC-fV5Oa2rM4_smZk6t4CPiNvU4C4keUFZHY,2214
 datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml,sha256=IBy82CDNNLjJPNPzues1EgDXu0A5WDvUFeVNSOyrIpI,1137
 datamaestro_text/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/data/embeddings.py,sha256=qdeZ4m6drFNkqW_4pKxJKdJHtOnDRs0P7QVJ7AO6xFE,1127
@@ -47,42 +47,42 @@ datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG
 datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
 datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
 datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
-datamaestro_text/data/conversation/base.py,sha256=j4ftXRblBWwt3AqhIS4avalqY9o7VX2C9Wrw_ZMPqek,6514
+datamaestro_text/data/conversation/base.py,sha256=BAqu5YOjlRcBOSiyvrjiTIuuc1OalM0BDtcEvJYhO2Y,7515
 datamaestro_text/data/conversation/canard.py,sha256=FI02rPNuzSsA_uXOcgcuQn31d3QahmHndzYCV1Zm8rk,3305
-datamaestro_text/data/conversation/ikat.py,sha256=AOqJk_LQdhaNnemsNy6vkcEVN3ULMf2twXqmcQQ0t_g,3489
+datamaestro_text/data/conversation/ikat.py,sha256=djxTv0vHLXIUPzfmwpB25fyUWJNVPqthTtDbQ6CzmQo,4363
 datamaestro_text/data/conversation/orconvqa.py,sha256=RL9cpK7QK2dsk_j2e4Th8wzGFq-G3ww_EjdM1eKh-8o,3764
 datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
-datamaestro_text/data/ir/__init__.py,sha256=EOVnRv9oKEEucMOgrFjhXxeUWEkaJkbXBndpkKhZaiY,8701
-datamaestro_text/data/ir/base.py,sha256=TSaY8UWoixqvKg1z5JfkXPnuZiVoChYEUvc7jjbpZqY,1495
+datamaestro_text/data/ir/__init__.py,sha256=ZRJrUeeUyD1ncMN5JINVvFJ2lDr3KsbgiiEBJkczSi0,9814
+datamaestro_text/data/ir/base.py,sha256=uwIiKn0ryK5DWUQsEegeTs6bye0uAOGl0XVm_cvV3ZU,1506
 datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
 datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
 datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
-datamaestro_text/data/ir/formats.py,sha256=K0mqmCGg0seneKo-Rt3cBfDVjEMS4_6t1MfNj4iW9Y4,3637
+datamaestro_text/data/ir/formats.py,sha256=Tbu5rrssq9m76a448-ixgs7a-KKvNCvXYkdv5FkFqp4,3641
 datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
-datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
-datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
+datamaestro_text/data/ir/stores.py,sha256=A4Ew0L4P6iLLmDKhxqjjVkHcz797BHf4d76YguGkB1A,4412
+datamaestro_text/data/ir/trec.py,sha256=IOtQRMUz8zx-dYEMR2NIIM6qXEUjsV0eVOhGvKIRJK4,1974
 datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
 datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
-datamaestro_text/datasets/irds/data.py,sha256=0V5nPMQeh1I3Sp36x_NE7xrbC_vbZAKhU5NONj95aok,22058
+datamaestro_text/datasets/irds/data.py,sha256=YlDbGFsh6_mCmk49F3bwdsLEbpHVvMv4gvc1H8KZnpo,23096
 datamaestro_text/datasets/irds/datasets.py,sha256=n5mvD2omVI4PkeuJgX13Cio6gLrgYjxq8ZUKKyqocEs,5625
 datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
 datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
 datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
 datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
-datamaestro_text/interfaces/trec.py,sha256=g5UIjOvhMBaib9mm280dkQLdtLtuId8bjfptaVi5Pew,2709
+datamaestro_text/interfaces/trec.py,sha256=GrP0N_Hcj5f73KS6CSzkyN4aaI-XoBJ19oVMpHVR3QM,3579
 datamaestro_text/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/test/test_datasets.py,sha256=hD1pe-CjEnOj0JNqDLasz_97ltFzMbYfyYBy1QyYdf8,202
 datamaestro_text/test/test_documented.py,sha256=Kl90fz_r_dsV0oXE1Mad34GiQyZ9dc6bfZKNtxP0O2s,453
 datamaestro_text/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/transforms/ir/__init__.py,sha256=eWxr0ZtfK9F9J4TachW5V8Ra5AhQJueDyOGR8vXi1uA,6553
 datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_p5ha0,3340
-datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1vRQ,170
-datamaestro_text/utils/iter.py,sha256=uzBmavBeqVyEvYmi9Ds2dqh0ywhJE-1SBO5v8AoIKcc,2537
+datamaestro_text/utils/files.py,sha256=jhcirufgTztEkx1hs2-qgScEcHnIcGF_BjCeuCSsNv0,2838
+datamaestro_text/utils/iter.py,sha256=QBajeSPLHvkeh6BCTZDSqWlOYNjwUDvgTTZ_YxJntXw,2701
 datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
 datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
-datamaestro_text-2025.6.30.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-datamaestro_text-2025.6.30.dist-info/METADATA,sha256=SHQDBJyeUeNlRUYIPvwhTInQtTV02LIpwOg2v1YVL3s,1847
-datamaestro_text-2025.6.30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datamaestro_text-2025.6.30.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
-datamaestro_text-2025.6.30.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
-datamaestro_text-2025.6.30.dist-info/RECORD,,
+datamaestro_text-2025.9.11.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+datamaestro_text-2025.9.11.dist-info/METADATA,sha256=ChGV_8bnixfGl91eG_3-Qwba8tjMwe2VPCwXdGxG_xM,1848
+datamaestro_text-2025.9.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datamaestro_text-2025.9.11.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
+datamaestro_text-2025.9.11.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
+datamaestro_text-2025.9.11.dist-info/RECORD,,