datamaestro-text 2025.9.11__py3-none-any.whl → 2026.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. datamaestro_text/__init__.py +1 -1
  2. datamaestro_text/config/com/github/ikat.py +0 -1
  3. datamaestro_text/config/com/oscar-corpus.py +1 -1
  4. datamaestro_text/config/com/smashwords/bookcorpus.py +1 -1
  5. datamaestro_text/config/edu/stanford/aclimdb.py +1 -1
  6. datamaestro_text/config/edu/stanford/glove.py +0 -1
  7. datamaestro_text/config/fr/granddebat.py +186 -0
  8. datamaestro_text/config/gov/nist/ir/covid.py +1 -2
  9. datamaestro_text/config/io/metamind/research/wikitext.py +1 -1
  10. datamaestro_text/data/conversation/__init__.py +6 -6
  11. datamaestro_text/data/conversation/base.py +4 -4
  12. datamaestro_text/data/conversation/canard.py +3 -4
  13. datamaestro_text/data/conversation/ikat.py +0 -1
  14. datamaestro_text/data/conversation/orconvqa.py +3 -3
  15. datamaestro_text/data/debate/__init__.py +5 -0
  16. datamaestro_text/data/debate/granddebat.py +68 -0
  17. datamaestro_text/data/embeddings.py +1 -0
  18. datamaestro_text/data/ir/__init__.py +1 -1
  19. datamaestro_text/data/ir/base.py +1 -1
  20. datamaestro_text/data/ir/csv.py +7 -8
  21. datamaestro_text/data/ir/data.py +1 -1
  22. datamaestro_text/data/ir/formats.py +2 -3
  23. datamaestro_text/data/ir/stores.py +1 -1
  24. datamaestro_text/data/text.py +1 -0
  25. datamaestro_text/datasets/__init__.py +1 -0
  26. datamaestro_text/datasets/irds/data.py +14 -20
  27. datamaestro_text/datasets/irds/datasets.py +1 -1
  28. datamaestro_text/download/tmdb.py +0 -1
  29. datamaestro_text/transforms/ir/__init__.py +13 -14
  30. datamaestro_text/utils/shuffle.py +1 -1
  31. datamaestro_text/version.py +3 -3
  32. {datamaestro_text-2025.9.11.dist-info → datamaestro_text-2026.2.2.dist-info}/METADATA +15 -17
  33. {datamaestro_text-2025.9.11.dist-info → datamaestro_text-2026.2.2.dist-info}/RECORD +36 -33
  34. {datamaestro_text-2025.9.11.dist-info → datamaestro_text-2026.2.2.dist-info}/WHEEL +1 -2
  35. datamaestro_text-2025.9.11.dist-info/top_level.txt +0 -1
  36. {datamaestro_text-2025.9.11.dist-info → datamaestro_text-2026.2.2.dist-info}/entry_points.txt +0 -0
  37. {datamaestro_text-2025.9.11.dist-info → datamaestro_text-2026.2.2.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
1
1
  import datamaestro
2
2
 
3
- from .version import version, version_tuple
3
+ from .version import version as version, version_tuple as version_tuple
4
4
 
5
5
 
6
6
  class Repository(datamaestro.Repository):
@@ -1,6 +1,5 @@
1
1
  # See documentation on https://datamaestro.readthedocs.io
2
2
 
3
- import bz2
4
3
  from datamaestro.download import reference
5
4
  from datamaestro.definitions import datatasks, datatags, dataset
6
5
  from datamaestro_text.data.conversation.base import ConversationUserTopics
@@ -1,4 +1,4 @@
1
- from datamaestro.definitions import argument, datatasks, datatags, dataset
1
+ from datamaestro.definitions import dataset
2
2
  from datamaestro.download.single import filedownloader
3
3
  from datamaestro_text.data.text import TextFile
4
4
  from datamaestro.utils import HashCheck
@@ -1,6 +1,6 @@
1
1
  # See documentation on https://datamaestro.readthedocs.io
2
2
 
3
- from datamaestro.definitions import argument, datatasks, datatags, dataset
3
+ from datamaestro.definitions import datatasks, datatags, dataset
4
4
  from datamaestro_text.data.text import TextFolder
5
5
  from datamaestro.download.archive import tardownloader
6
6
  from datamaestro.utils import HashCheck
@@ -1,5 +1,5 @@
1
1
  from datamaestro.data.ml import FolderBased, Supervised
2
- from datamaestro.definitions import datatasks, datatags, dataset
2
+ from datamaestro.definitions import dataset
3
3
  from datamaestro.download.archive import tardownloader
4
4
 
5
5
 
@@ -5,7 +5,6 @@ GloVe is an unsupervised learning algorithm for obtaining vector representations
5
5
  """
6
6
 
7
7
  from datamaestro.definitions import dataset
8
- from datamaestro.data import Base, Generic
9
8
  from datamaestro.download import reference
10
9
  from datamaestro.download.archive import zipdownloader
11
10
  from datamaestro.download.single import filedownloader
@@ -0,0 +1,186 @@
1
+ # See documentation on https://datamaestro.readthedocs.io
2
+
3
+ from pathlib import Path
4
+ from datamaestro.definitions import datatags, dataset
5
+ from datamaestro_text.data.debate import GrandDebatFile
6
+ from datamaestro.download.single import filedownloader
7
+ from datamaestro.utils import HashCheck
8
+ from datamaestro.stream import Transform
9
+ import io
10
+ import json
11
+ import ijson
12
+ import os
13
+ import threading
14
+
15
+
16
+ class JsonToJsonl(Transform):
17
+ """Transforms a JSON file with an array into a JSONL file with one line per
18
+ array element"""
19
+
20
+ def __call__(self, fileobj: io.IOBase) -> io.IOBase:
21
+ # Stream items from the top-level array into a read-end pipe.
22
+ try:
23
+ fileobj.seek(0)
24
+ except Exception:
25
+ pass
26
+
27
+ r_fd, w_fd = os.pipe()
28
+ r_file = os.fdopen(r_fd, "rb")
29
+ w_file = os.fdopen(w_fd, "wb")
30
+
31
+ def _writer(fin, fout):
32
+ try:
33
+ for item in ijson.items(fin, "item"):
34
+ line = json.dumps(item, ensure_ascii=False) + "\n"
35
+ fout.write(line.encode("utf-8"))
36
+ fout.close()
37
+ except Exception:
38
+ try:
39
+ fout.close()
40
+ except Exception:
41
+ pass
42
+
43
+ t = threading.Thread(target=_writer, args=(fileobj, w_file), daemon=True)
44
+ t.start()
45
+
46
+ return r_file
47
+
48
+
49
+ @filedownloader(
50
+ "la_transition_ecologique_2019_03_21.jsonl",
51
+ "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_TRANSITION_ECOLOGIQUE.json",
52
+ checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
53
+ transforms=JsonToJsonl(),
54
+ )
55
+ @datatags("politics", "debate", "french")
56
+ @dataset(
57
+ GrandDebatFile,
58
+ url="https://granddebat.fr",
59
+ )
60
+ def transition(la_transition_ecologique_2019_03_21: Path):
61
+ """Grand Débat National (transition écologique)
62
+
63
+ The *Grand Débat National* (GDN) is a country-wide citizen consultation held
64
+ in France in 2019.
65
+
66
+
67
+ The consultation prompted citizens to express their views across four main
68
+ themes: *Taxation and public spending*, *Organization of the state and
69
+ public services*, *Democracy and citizenship*, and *Ecological transition*.
70
+ A significant portion of this consultation involved online questionnaires,
71
+ each concluding with a critical open-ended prompt: "Do you have anything to
72
+ add about [theme]?".
73
+ """
74
+ return GrandDebatFile.C(path=la_transition_ecologique_2019_03_21)
75
+
76
+
77
+ @filedownloader(
78
+ "fiscalité_et_dépenses_publiques_2019_03_21.jsonl",
79
+ "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json",
80
+ checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
81
+ transforms=JsonToJsonl(),
82
+ )
83
+ @datatags("politics", "debate", "french")
84
+ @dataset(
85
+ GrandDebatFile,
86
+ url="https://granddebat.fr",
87
+ )
88
+ def fiscalité(fiscalité_et_dépenses_publiques_2019_03_21: Path):
89
+ """Grand Débat National (fiscalité et dépenses publiques)
90
+
91
+ The *Grand Débat National* (GDN) is a country-wide citizen consultation held
92
+ in France in 2019.
93
+
94
+
95
+ The consultation prompted citizens to express their views across four main
96
+ themes: *Taxation and public spending*, *Organization of the state and
97
+ public services*, *Democracy and citizenship*, and *Ecological transition*.
98
+ A significant portion of this consultation involved online questionnaires,
99
+ each concluding with a critical open-ended prompt: "Do you have anything to
100
+ add about [theme]?".
101
+ """
102
+ return GrandDebatFile.C(path=fiscalité_et_dépenses_publiques_2019_03_21)
103
+
104
+
105
+ @filedownloader(
106
+ "democratie_et_citoyennete_2019_03_21.jsonl",
107
+ "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/DEMOCRATIE_ET_CITOYENNETE.json",
108
+ checker=HashCheck("049aaeca7e51747f2da5d68143c760fa"),
109
+ transforms=JsonToJsonl(),
110
+ )
111
+ @datatags("politics", "debate", "french")
112
+ @dataset(
113
+ GrandDebatFile,
114
+ url="https://granddebat.fr",
115
+ )
116
+ def démocratie(democratie_et_citoyennete_2019_03_21: Path):
117
+ """Grand Débat National (démocratie et citoyenneté)
118
+
119
+ The *Grand Débat National* (GDN) is a country-wide citizen consultation held
120
+ in France in 2019.
121
+
122
+
123
+ The consultation prompted citizens to express their views across four main
124
+ themes: *Taxation and public spending*, *Organization of the state and
125
+ public services*, *Democracy and citizenship*, and *Ecological transition*.
126
+ A significant portion of this consultation involved online questionnaires,
127
+ each concluding with a critical open-ended prompt: "Do you have anything to
128
+ add about [theme]?".
129
+ """
130
+ return GrandDebatFile.C(path=democratie_et_citoyennete_2019_03_21)
131
+
132
+
133
+ @filedownloader(
134
+ "organisation_etat_services_publics_2019_03_21.jsonl",
135
+ "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json",
136
+ checker=HashCheck("0ccb7c401889f738b73b0caab897a68b"),
137
+ transforms=JsonToJsonl(),
138
+ )
139
+ @datatags("politics", "debate", "french")
140
+ @dataset(
141
+ GrandDebatFile,
142
+ url="https://granddebat.fr",
143
+ )
144
+ def organisation(organisation_etat_services_publics_2019_03_21: Path):
145
+ """Grand Débat National (organisation de l'État et des services publics)
146
+
147
+ The *Grand Débat National* (GDN) is a country-wide citizen consultation held
148
+ in France in 2019.
149
+
150
+
151
+ The consultation prompted citizens to express their views across four main
152
+ themes: *Taxation and public spending*, *Organization of the state and
153
+ public services*, *Democracy and citizenship*, and *Ecological transition*.
154
+ A significant portion of this consultation involved online questionnaires,
155
+ each concluding with a critical open-ended prompt: "Do you have anything to
156
+ add about [theme]?".
157
+ """
158
+ return GrandDebatFile.C(path=organisation_etat_services_publics_2019_03_21)
159
+
160
+
161
+ @filedownloader(
162
+ "les_evenements_2019_03_21.jsonl",
163
+ "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LES_EVENEMENTS.json",
164
+ checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
165
+ transforms=JsonToJsonl(),
166
+ )
167
+ @datatags("politics", "debate", "french")
168
+ @dataset(
169
+ GrandDebatFile,
170
+ url="https://granddebat.fr",
171
+ )
172
+ def evenements(les_evenements_2019_03_21: Path):
173
+ """Grand Débat National (événements)
174
+
175
+ The *Grand Débat National* (GDN) is a country-wide citizen consultation held
176
+ in France in 2019.
177
+
178
+
179
+ The consultation prompted citizens to express their views across four main
180
+ themes: *Taxation and public spending*, *Organization of the state and
181
+ public services*, *Democracy and citizenship*, and *Ecological transition*.
182
+ A significant portion of this consultation involved online questionnaires,
183
+ each concluding with a critical open-ended prompt: "Do you have anything to
184
+ add about [theme]?".
185
+ """
186
+ return GrandDebatFile.C(path=les_evenements_2019_03_21)
@@ -1,5 +1,4 @@
1
- """CORD-19 dataset
2
- """
1
+ """CORD-19 dataset"""
3
2
 
4
3
  from datamaestro.annotations.agreement import useragreement
5
4
  from datamaestro.definitions import datatasks, dataset
@@ -1,4 +1,4 @@
1
- from datamaestro.data import Base, File
1
+ from datamaestro.data import File
2
2
  from datamaestro.definitions import (
3
3
  datatasks,
4
4
  datatags,
@@ -1,8 +1,8 @@
1
1
  from .base import (
2
- AnswerEntry,
3
- ConversationDataset,
4
- ConversationHistory,
5
- ConversationHistoryItem,
6
- DecontextualizedItem,
7
- EntryType,
2
+ AnswerEntry as AnswerEntry,
3
+ ConversationDataset as ConversationDataset,
4
+ ConversationHistory as ConversationHistory,
5
+ ConversationHistoryItem as ConversationHistoryItem,
6
+ DecontextualizedItem as DecontextualizedItem,
7
+ EntryType as EntryType,
8
8
  )
@@ -2,13 +2,13 @@ from abc import ABC, abstractmethod
2
2
  from enum import Enum
3
3
  from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
4
4
  from experimaestro import Param
5
- from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
5
+ from typing import Dict, Iterator, List, Optional, Sequence, Tuple
6
6
  from attr import define
7
7
  from datamaestro.record import record_type
8
8
  from datamaestro.data import Base
9
9
  from datamaestro.record import Record, Item
10
10
  from datamaestro_text.data.ir import TopicRecord, Topics
11
- from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView
11
+ from datamaestro_text.utils.iter import FactoryIterable, LazyList
12
12
 
13
13
  # ---- Basic types
14
14
 
@@ -267,7 +267,7 @@ class ConversationUserTopics(Topics):
267
267
  """Returns an iterator over topics"""
268
268
  # Extracts topics from conversations, Each user query is a topic (can perform retrieval on it)
269
269
  # TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
270
-
270
+
271
271
  records: List[TopicRecord] = []
272
272
  for conversation in self.conversations.__iter__():
273
273
  nodes = [
@@ -279,4 +279,4 @@ class ConversationUserTopics(Topics):
279
279
  records.append(
280
280
  node.entry.update(ConversationHistoryItem(node.history()))
281
281
  )
282
- return iter(records)
282
+ return iter(records)
@@ -11,7 +11,6 @@ from datamaestro_text.data.conversation.base import (
11
11
  EntryType,
12
12
  )
13
13
  from datamaestro_text.data.ir import IDItem, SimpleTextItem
14
- import logging
15
14
 
16
15
 
17
16
  @define(kw_only=True)
@@ -82,9 +81,9 @@ class CanardDataset(ConversationDataset, File):
82
81
  )
83
82
  else:
84
83
  # The utterance before the last is the last user query
85
- assert (
86
- entry.history[-2] == history[-1][SimpleTextItem].text
87
- ), f"{entry.dialogue_id} {entry.history} / {history[-4:-1]}"
84
+ assert entry.history[-2] == history[-1][SimpleTextItem].text, (
85
+ f"{entry.dialogue_id} {entry.history} / {history[-4:-1]}"
86
+ )
88
87
 
89
88
  # The last utterance is the system side
90
89
  history.append(
@@ -5,7 +5,6 @@ import logging
5
5
  from datamaestro.data import File
6
6
  from datamaestro.record import Record
7
7
 
8
- from datamaestro_text.data.ir import Topics
9
8
  from datamaestro_text.data.ir.base import (
10
9
  IDItem,
11
10
  SimpleTextItem,
@@ -113,9 +113,9 @@ class OrConvQADataset(ConversationDataset, File):
113
113
  if relevance > 0:
114
114
  relevances[rank] = (entry.answer.answer_start, None)
115
115
 
116
- assert (
117
- len(relevances) <= 1
118
- ), f"Too many relevance labels ({len(relevances)}) for {entry.query_id}"
116
+ assert len(relevances) <= 1, (
117
+ f"Too many relevance labels ({len(relevances)}) for {entry.query_id}"
118
+ )
119
119
 
120
120
  history.append(
121
121
  Record(
@@ -0,0 +1,5 @@
1
+ """Data classes for debate datasets"""
2
+
3
+ from .granddebat import GrandDebatEntry, GrandDebatFile, GrandDebatResponse
4
+
5
+ __all__ = ["GrandDebatEntry", "GrandDebatFile", "GrandDebatResponse"]
@@ -0,0 +1,68 @@
1
+ """Data classes for the Grand Débat National dataset"""
2
+
3
+ import json
4
+ from dataclasses import dataclass, field
5
+ from typing import Iterator, List, Optional
6
+
7
+ from datamaestro.data import File
8
+
9
+
10
+ @dataclass
11
+ class GrandDebatResponse:
12
+ """A response to a question in the Grand Débat National"""
13
+
14
+ question_id: str
15
+ question_title: str
16
+ value: Optional[str]
17
+ formatted_value: Optional[str]
18
+
19
+
20
+ @dataclass
21
+ class GrandDebatEntry:
22
+ """An entry (contribution) in the Grand Débat National dataset"""
23
+
24
+ id: str
25
+ reference: str
26
+ title: str
27
+ created_at: str
28
+ published_at: str
29
+ updated_at: Optional[str]
30
+ trashed: bool
31
+ trashed_status: Optional[str]
32
+ author_id: str
33
+ author_type: str
34
+ author_zip_code: str
35
+ responses: List[GrandDebatResponse] = field(default_factory=list)
36
+
37
+
38
+ class GrandDebatFile(File):
39
+ """A Grand Débat National JSONL file with iteration support"""
40
+
41
+ def __iter__(self) -> Iterator[GrandDebatEntry]:
42
+ """Iterate over entries in the JSONL file"""
43
+ with self.path.open("r", encoding="utf-8") as f:
44
+ for line in f:
45
+ data = json.loads(line)
46
+ responses = [
47
+ GrandDebatResponse(
48
+ question_id=r["questionId"],
49
+ question_title=r["questionTitle"],
50
+ value=r.get("value"),
51
+ formatted_value=r.get("formattedValue"),
52
+ )
53
+ for r in data.get("responses", [])
54
+ ]
55
+ yield GrandDebatEntry(
56
+ id=data["id"],
57
+ reference=data["reference"],
58
+ title=data["title"],
59
+ created_at=data["createdAt"],
60
+ published_at=data["publishedAt"],
61
+ updated_at=data.get("updatedAt"),
62
+ trashed=data["trashed"],
63
+ trashed_status=data.get("trashedStatus"),
64
+ author_id=data["authorId"],
65
+ author_type=data["authorType"],
66
+ author_zip_code=data["authorZipCode"],
67
+ responses=responses,
68
+ )
@@ -20,6 +20,7 @@ class WordEmbeddings(Base):
20
20
 
21
21
  class WordEmbeddingsText(WordEmbeddings, File):
22
22
  """Word embeddings as a text word / values"""
23
+
23
24
  encoding: Meta[str] = "utf-8"
24
25
 
25
26
  def load(self):
@@ -6,7 +6,7 @@ from functools import cached_property
6
6
  import logging
7
7
  from pathlib import Path
8
8
  from attrs import define
9
- from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING
9
+ from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type
10
10
  import random
11
11
  from experimaestro import Config
12
12
  from datamaestro.definitions import datatasks, Param, Meta
@@ -1,7 +1,7 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from attrs import define
3
3
  from typing import List
4
- from datamaestro.record import Record, Item, record_type
4
+ from datamaestro.record import Record, Item
5
5
 
6
6
 
7
7
  TopicRecord = DocumentRecord = Record
@@ -1,27 +1,26 @@
1
1
  from functools import cached_property
2
2
  from pathlib import Path
3
- from typing import Iterator, Tuple, Type
4
3
 
5
- from experimaestro import Param, Option, Constant, Meta
6
- from datamaestro.definitions import argument
4
+ from experimaestro import Param, Meta
7
5
  from datamaestro.record import Record, RecordType
8
6
  import datamaestro_text.data.ir as ir
9
7
  from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
10
8
  from datamaestro_text.interfaces.plaintext import read_tsv
11
9
 
12
10
 
13
- @argument("path", type=Path)
14
- @argument("separator", type=str, default="\t", ignored=True)
15
11
  class AdhocRunWithText(ir.AdhocRun):
16
12
  "(qid, doc.id, query, passage)"
17
- pass
13
+
14
+ path: Meta[Path]
15
+ separator: Meta[str] = "\t"
18
16
 
19
17
 
20
- @argument("path", type=Path)
21
- @argument("separator", type=str, default="\t", ignored=True)
22
18
  class Topics(ir.Topics):
23
19
  "Pairs of query id - query using a separator"
24
20
 
21
+ path: Meta[Path]
22
+ separator: Meta[str] = "\t"
23
+
25
24
  def iter(self):
26
25
  return (
27
26
  Record(IDItem(qid), SimpleTextItem(title))
@@ -1 +1 @@
1
- from .base import *
1
+ from .base import * # noqa: F403
@@ -1,5 +1,5 @@
1
1
  from functools import cached_property
2
- from typing import ClassVar, Tuple, List
2
+ from typing import Tuple, List
3
3
  from attrs import define
4
4
  from datamaestro.record import record_type
5
5
  from ir_datasets.datasets.wapo import WapoDocMedia
@@ -11,9 +11,8 @@ from ir_datasets.datasets.cord19 import Cord19FullTextSection
11
11
  class DocumentWithTitle(TextItem):
12
12
  """Web document with title and body"""
13
13
 
14
- body: str
15
-
16
14
  title: str
15
+ body: str
17
16
 
18
17
  @cached_property
19
18
  def text(self):
@@ -82,7 +82,7 @@ class IKatClueWeb22DocumentStore(LZ4DocumentStore):
82
82
 
83
83
  file_checksum = hasher.hexdigest()
84
84
  assert file_checksum == checksum, (
85
- f"Expected {checksum}, " f"got {file_checksum} for {filename}"
85
+ f"Expected {checksum}, got {file_checksum} for {filename}"
86
86
  )
87
87
 
88
88
  # Get the MD5 hashes of all the passages
@@ -14,6 +14,7 @@ class TrainingText(Supervised):
14
14
 
15
15
  class TextFolder(Folder):
16
16
  "A folder composed of texts"
17
+
17
18
  pass
18
19
 
19
20
 
@@ -0,0 +1 @@
1
+ # IR datasets integration package
@@ -1,7 +1,6 @@
1
1
  import logging
2
2
  from abc import ABC, abstractmethod
3
- from dataclasses import dataclass
4
- from functools import partial
3
+ from functools import cached_property, partial
5
4
  from pathlib import Path
6
5
  from typing import Dict, Iterator, List, NamedTuple, Tuple, Type
7
6
 
@@ -9,7 +8,6 @@ import ir_datasets
9
8
  import ir_datasets.datasets as _irds
10
9
  from datamaestro.record import RecordType, record_type
11
10
  from experimaestro import Config, Meta, Option, Param
12
- from experimaestro.compat import cached_property
13
11
  from ir_datasets.formats import (
14
12
  GenericDoc,
15
13
  GenericDocPair,
@@ -112,6 +110,9 @@ class Documents(ir.DocumentStore, IRDSId):
112
110
  _irds.beir.BeirCordDoc: tuple_constructor(
113
111
  formats.CordDocument, "doc_id", "text", "title", "url", "pubmed_id"
114
112
  ),
113
+ _irds.miracl.MiraclDoc: tuple_constructor(
114
+ formats.DocumentWithTitle, "doc_id", "title", "text"
115
+ ),
115
116
  _irds.beir.BeirTitleDoc: tuple_constructor(
116
117
  formats.TitleDocument, "doc_id", "text", "title"
117
118
  ),
@@ -202,11 +203,11 @@ class Documents(ir.DocumentStore, IRDSId):
202
203
 
203
204
  def iter(self) -> Iterator[ir.DocumentRecord]:
204
205
  """Returns an iterator over adhoc documents"""
205
- for doc in self.dataset.docs_iter():
206
+ for doc in self._docs:
206
207
  yield self.converter(self.document_recordtype, doc)
207
208
 
208
209
  def iter_documents_from(self, start=0):
209
- for doc in self.dataset.docs_iter()[start:]:
210
+ for doc in self._docs[start:]:
210
211
  yield self.converter(self.document_recordtype, doc)
211
212
 
212
213
  @property
@@ -219,19 +220,22 @@ class Documents(ir.DocumentStore, IRDSId):
219
220
  try:
220
221
  # Translate to ir datasets docstore options
221
222
  import ir_datasets.indices as ir_indices
223
+
222
224
  file_access = {
223
225
  ir.FileAccess.MMAP: ir_indices.FileAccess.MMAP,
224
226
  ir.FileAccess.FILE: ir_indices.FileAccess.FILE,
225
- ir.FileAccess.MEMORY: ir_indices.FileAccess.MEMORY
227
+ ir.FileAccess.MEMORY: ir_indices.FileAccess.MEMORY,
226
228
  }[self.file_access]
227
229
  kwargs = {"options": ir_indices.DocstoreOptions(file_access=file_access)}
228
230
  except ImportError:
229
- logging.warning("This version of ir-datasets cannot handle docstore options")
231
+ logging.warning(
232
+ "This version of ir-datasets cannot handle docstore options"
233
+ )
230
234
  return self.dataset.docs_store(**kwargs)
231
235
 
232
- @cached_property
236
+ @property
233
237
  def _docs(self):
234
- return self.dataset.docs_iter()
238
+ return iter(self.store)
235
239
 
236
240
  def docid_internal2external(self, ix: int):
237
241
  return self._docs[ix].doc_id
@@ -261,12 +265,6 @@ class Documents(ir.DocumentStore, IRDSId):
261
265
  return converter
262
266
 
263
267
 
264
- if hasattr(_irds, "miracl"):
265
- Documents.CONVERTERS[_irds.miracl.MiraclDoc] = tuple_constructor(
266
- formats.DocumentWithTitle, "doc_id", "text", "title"
267
- )
268
-
269
-
270
268
  class LZ4DocumentStore(ir.DocumentStore, ABC):
271
269
  """A LZ4-based document store"""
272
270
 
@@ -614,11 +612,7 @@ class Cast2022TopicsHandler(CastTopicsHandler):
614
612
  records = []
615
613
  nodes: Dict[str, ConversationTreeNode] = {}
616
614
 
617
- for (
618
- query
619
- ) in (
620
- self.dataset.dataset.queries_iter()
621
- ): # type: _irds.trec_cast.Cast2022Query
615
+ for query in self.dataset.dataset.queries_iter(): # type: _irds.trec_cast.Cast2022Query
622
616
  parent = nodes[query.parent_id] if query.parent_id else None
623
617
 
624
618
  if query.participant == "User":
@@ -103,7 +103,7 @@ class TrainingTripletsDataset(Dataset):
103
103
  SUFFIX = "docpairs"
104
104
 
105
105
  def _prepare(self, download=False) -> Documents:
106
- return TrainingTriplets(
106
+ return TrainingTriplets.C(
107
107
  id=self.fullid,
108
108
  )
109
109
 
@@ -10,7 +10,6 @@ from collections import namedtuple
10
10
 
11
11
  from datamaestro.download import Download
12
12
  from datamaestro.definitions import AbstractDataset
13
- from datamaestro.utils import TemporaryDirectory
14
13
 
15
14
  APIKEY_KEY = "org.themoviedb.apikey"
16
15
 
@@ -2,7 +2,6 @@ import logging
2
2
  import gzip
3
3
  from abc import ABC, abstractmethod
4
4
  from pathlib import Path
5
- from typing import Type
6
5
  from experimaestro import Config, Task, Param, Annotated, pathgenerator, Option, tqdm
7
6
  import numpy as np
8
7
  from datamaestro.record import RecordType
@@ -131,26 +130,26 @@ class ShuffledTrainingTripletsLines(Task):
131
130
 
132
131
  def __validate__(self):
133
132
  if self.topic_ids:
134
- assert self.data.topic_recordtype.has(
135
- ir.IDItem
136
- ), f"No topic ID in the source data ({self.data.topic_recordtype})"
133
+ assert self.data.topic_recordtype.has(ir.IDItem), (
134
+ f"No topic ID in the source data ({self.data.topic_recordtype})"
135
+ )
137
136
  else:
138
- assert self.data.topic_recordtype.has(
139
- ir.TextItem
140
- ), f"No topic text in the source data ({self.data.topic_recordtype})"
137
+ assert self.data.topic_recordtype.has(ir.TextItem), (
138
+ f"No topic text in the source data ({self.data.topic_recordtype})"
139
+ )
141
140
 
142
141
  if self.doc_ids:
143
- assert self.data.document_recordtype.has(
144
- ir.IDItem
145
- ), "No doc ID in the source data"
142
+ assert self.data.document_recordtype.has(ir.IDItem), (
143
+ "No doc ID in the source data"
144
+ )
146
145
  else:
147
- assert self.data.document_recordtype.has(
148
- ir.TextItem
149
- ), "No doc text in the source data"
146
+ assert self.data.document_recordtype.has(ir.TextItem), (
147
+ "No doc text in the source data"
148
+ )
150
149
 
151
150
  def task_outputs(self, dep):
152
151
  return dep(
153
- ir.TrainingTripletsLines(
152
+ ir.TrainingTripletsLines.C(
154
153
  id="",
155
154
  path=self.path,
156
155
  topic_ids=self.topic_ids,
@@ -50,7 +50,7 @@ def shuffle(
50
50
  *,
51
51
  memory=MEMORY,
52
52
  random=None,
53
- tmp_path: Optional[Path] = None
53
+ tmp_path: Optional[Path] = None,
54
54
  ):
55
55
  """Shuffle using temporary file"""
56
56
  if random is None:
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '2025.9.11'
32
- __version_tuple__ = version_tuple = (2025, 9, 11)
31
+ __version__ = version = '2026.2.2'
32
+ __version_tuple__ = version_tuple = (2026, 2, 2)
33
33
 
34
- __commit_id__ = commit_id = 'gadcc9bd27'
34
+ __commit_id__ = commit_id = None
@@ -1,33 +1,31 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.9.11
3
+ Version: 2026.2.2
4
4
  Summary: Datamaestro module for text-related datasets
5
+ Project-URL: Homepage, https://github.com/experimaestro/datamaestro_text
6
+ Project-URL: Documentation, https://datamaestro-text.readthedocs.io/en/latest/
7
+ Project-URL: Repository, https://github.com/experimaestro/datamaestro_text
8
+ Project-URL: Bug Tracker, https://github.com/experimaestro/datamaestro_text/issues
5
9
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
- License: GPL-3
7
- Project-URL: homepage, https://github.com/experimaestro/datamaestro_text
8
- Project-URL: documentation, https://datamaestro-text.readthedocs.io/en/latest/
9
- Project-URL: repository, https://github.com/experimaestro/datamaestro_text
10
- Keywords: dataset manager,information retrieval,experiments
10
+ License: GPL-3.0-or-later
11
+ License-File: LICENSE
12
+ Keywords: dataset manager,experiments,information retrieval
11
13
  Classifier: Development Status :: 4 - Beta
12
14
  Classifier: Intended Audience :: Science/Research
13
15
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
14
16
  Classifier: Operating System :: OS Independent
15
17
  Classifier: Programming Language :: Python
16
18
  Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
17
22
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
23
  Requires-Python: >=3.10
19
- Description-Content-Type: text/markdown
20
- License-File: LICENSE
21
- Requires-Dist: datamaestro>=1.5.0
22
- Requires-Dist: ir_datasets>=0.5.8
23
24
  Requires-Dist: attrs
24
- Provides-Extra: dev
25
- Requires-Dist: pytest; extra == "dev"
26
- Requires-Dist: docutils; extra == "dev"
27
- Requires-Dist: sphobjinv; extra == "dev"
28
- Requires-Dist: flake8; extra == "dev"
29
- Requires-Dist: sphinx; extra == "dev"
30
- Dynamic: license-file
25
+ Requires-Dist: datamaestro>=1.6.2
26
+ Requires-Dist: experimaestro
27
+ Requires-Dist: ir-datasets>=0.5.8
28
+ Description-Content-Type: text/markdown
31
29
 
32
30
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
33
31
 
@@ -1,29 +1,30 @@
1
- datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
2
- datamaestro_text/version.py,sha256=aHTcqRU_FAq8moUGgwqhCMrhMi8VBbk38TX-uMF8p20,720
1
+ datamaestro_text/__init__.py,sha256=MP7ShYx32k5irdgml1PjnmSofzioYQh9rzUEcHs5eys,276
2
+ datamaestro_text/version.py,sha256=PcJXzZYuv0SaBM1rOymP9IhKDJxqcLKUPHINlOD-hL0,710
3
3
  datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
5
- datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
5
+ datamaestro_text/config/com/oscar-corpus.py,sha256=gEWz8Nxpv7VXU8X-vfRZLwPfq0KXtkGSNtsfoqfcUI0,702
6
6
  datamaestro_text/config/com/sentiment140.py,sha256=itfBEgcOniECXKOw8I2dhzyS9LOMsltMLfKK6NGRpVY,1293
7
7
  datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
8
- datamaestro_text/config/com/github/ikat.py,sha256=DCayX-t2OBeW5bOJvRxoQgIH3vy-__mYzdmVcnayAkk,4230
8
+ datamaestro_text/config/com/github/ikat.py,sha256=nAmBre9zNlnGhx-C50EvLGvHqtoB7Ce-mZUZqM_ymO8,4219
9
9
  datamaestro_text/config/com/github/aagohary/canard.py,sha256=5fLwCLNBGM_7--naTCDayAMYLvK3yTD8auaEf-dqrb4,1768
10
10
  datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=zP3w7A9KSvJVCo44OaB1az1pDKWxE6qXS4qFm3hqg3Y,3064
11
11
  datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=MFJYaxJoqJQ6kMDHa2PIE0zoxYTA8Kyl26-vzFoMML0,3032
12
12
  datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
13
13
  datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
14
14
  datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=hN2KOdi6ToHlodozqsYAOtxaqiUGkGGtRtb3RFSgnEU,11645
15
- datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=hCFjZg9t1AHET05qU31uatiAOD5EEzzWG9y_W90jcOE,861
15
+ datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=LgUcnR-z99kTrZj6QaCLuLrj1bG-wHMM5GlVNmbrY2k,851
16
16
  datamaestro_text/config/edu/cornell/nlvr.yaml,sha256=9Yk5VZMncSmrP7JNuGXqExksgX5nQ_Zfnlps8hWze3Q,921
17
17
  datamaestro_text/config/edu/stanford/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- datamaestro_text/config/edu/stanford/aclimdb.py,sha256=QtriReAVsbJlxkgfJWQCZdCeJ9LswYnOR9mFrgghL9c,647
19
- datamaestro_text/config/edu/stanford/glove.py,sha256=bXxwiJqT8alPs5MwwxVuY0xBO8g1QWhm2L3AKVskTlA,2391
18
+ datamaestro_text/config/edu/stanford/aclimdb.py,sha256=gv_4IauUCURbMzMWpSMyx3qgOAXVQuBwKR-mMaKExpc,626
19
+ datamaestro_text/config/edu/stanford/glove.py,sha256=FiVYbzQMD11CiKfklrggtm7YXBCevyTXXwhehRd65H8,2348
20
20
  datamaestro_text/config/edu/stanford/im2p.yaml,sha256=JoToNyEPpmwdyLFedCBot5ypyw7p9rzi12mGXJuZin0,2909
21
21
  datamaestro_text/config/edu/upenn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  datamaestro_text/config/edu/upenn/ldc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
23
  datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=YEU3kIXzv71Vjg9bUoXyQ-vSLvC-y4LlE3mJQf38XuY,1589
24
+ datamaestro_text/config/fr/granddebat.py,sha256=JRLC3q6o-XhJECjAh40w2p40pCSRw9K3-YMDUpdNwMM,7016
24
25
  datamaestro_text/config/gov/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
26
  datamaestro_text/config/gov/nist/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
- datamaestro_text/config/gov/nist/ir/covid.py,sha256=wn2E7sQ8M6pAucVD4sKJYImyzKUKphyiDFJD0oYRCbg,4004
27
+ datamaestro_text/config/gov/nist/ir/covid.py,sha256=i9xxZcrKeX1gezK_TE68oropMF9PKHX2ofyREEUWYPY,4003
27
28
  datamaestro_text/config/gov/nist/trec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
29
  datamaestro_text/config/gov/nist/trec/adhoc.py,sha256=vcFaLlZ-chwDt013MYI8bYZ2ug39jPaeimsiok_sqfU,11035
29
30
  datamaestro_text/config/gov/nist/trec/clueweb.yaml,sha256=sm4UbdtMzWoDVPsewtVDS7Vj2jBOdgp18Xqo1X4ysQc,792
@@ -34,7 +35,7 @@ datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJo
34
35
  datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
35
36
  datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=rg_qAnMrXYUZhQYxA12r_Npl0ggyfTLJQjdSCjU0QxM,1228
36
37
  datamaestro_text/config/io/metamind/research/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
- datamaestro_text/config/io/metamind/research/wikitext.py,sha256=DjyBmG74JvuMt9RpMwuLAnxzOdByIWsk4VnXgkJp1NM,2307
38
+ datamaestro_text/config/io/metamind/research/wikitext.py,sha256=jw_CbBbradIUp_mrhG-z3rfa4_0ybvIBSkDqJvGLCCI,2301
38
39
  datamaestro_text/config/net/mattmahoney/enwiki.yaml,sha256=HCUn3s0AydXX3BjJ6yUXY0vGLGWSBkOCaDhQ4PA2Adg,2452
39
40
  datamaestro_text/config/org/acm/recsys/cb2014.yaml,sha256=5SAK3Am1k0HFugSSCIQN5mLPBfr1zZZAkhLrSH5pHQc,1274
40
41
  datamaestro_text/config/org/cocodataset/index.yaml,sha256=KISJChMeKwlZbSnHmRcGMsm6jbcFGVe1aA4GhP2fzqw,474
@@ -42,47 +43,49 @@ datamaestro_text/config/org/grouplens/movielens.py,sha256=tV6OSTDdtjll1dQBCsYIls
42
43
  datamaestro_text/config/org/universaldependencies/french.py,sha256=etedb3_SC-fV5Oa2rM4_smZk6t4CPiNvU4C4keUFZHY,2214
43
44
  datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml,sha256=IBy82CDNNLjJPNPzues1EgDXu0A5WDvUFeVNSOyrIpI,1137
44
45
  datamaestro_text/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
- datamaestro_text/data/embeddings.py,sha256=qdeZ4m6drFNkqW_4pKxJKdJHtOnDRs0P7QVJ7AO6xFE,1127
46
+ datamaestro_text/data/embeddings.py,sha256=YMoNLyVvaOt86bq_8X71_Fgu7EYYI71vr67xSQsi57I,1128
46
47
  datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG3IHkkPJX9Y,279
47
48
  datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
48
- datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
49
- datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
50
- datamaestro_text/data/conversation/base.py,sha256=BAqu5YOjlRcBOSiyvrjiTIuuc1OalM0BDtcEvJYhO2Y,7515
51
- datamaestro_text/data/conversation/canard.py,sha256=FI02rPNuzSsA_uXOcgcuQn31d3QahmHndzYCV1Zm8rk,3305
52
- datamaestro_text/data/conversation/ikat.py,sha256=djxTv0vHLXIUPzfmwpB25fyUWJNVPqthTtDbQ6CzmQo,4363
53
- datamaestro_text/data/conversation/orconvqa.py,sha256=RL9cpK7QK2dsk_j2e4Th8wzGFq-G3ww_EjdM1eKh-8o,3764
49
+ datamaestro_text/data/text.py,sha256=Lln4eoegU9B27oS-2mv3eEQC6MyRBgVhoewQ2-YNxEQ,497
50
+ datamaestro_text/data/conversation/__init__.py,sha256=Kk7FxPz_0oGO2PtIa8zH7UBqbCUsywTHfA-yKd_KO6c,284
51
+ datamaestro_text/data/conversation/base.py,sha256=gF_-izQ1ijX7w49pKQvjfjUVzrX3VSHXxcqVIPWmAfY,7488
52
+ datamaestro_text/data/conversation/canard.py,sha256=aYpkHzuJWGT3-myFNUjCYAtvG3gVh_d3Zc5lyiasQ04,3290
53
+ datamaestro_text/data/conversation/ikat.py,sha256=hoGqHUWyT8BhC_ouUmnwoh93B2jGLHn8uc6npKP4Sl8,4319
54
+ datamaestro_text/data/conversation/orconvqa.py,sha256=zNp02jyYgny0qtIFOMjmrUy7hG8VKWcELHWrg3FBCc0,3764
54
55
  datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
55
- datamaestro_text/data/ir/__init__.py,sha256=ZRJrUeeUyD1ncMN5JINVvFJ2lDr3KsbgiiEBJkczSi0,9814
56
- datamaestro_text/data/ir/base.py,sha256=uwIiKn0ryK5DWUQsEegeTs6bye0uAOGl0XVm_cvV3ZU,1506
56
+ datamaestro_text/data/debate/__init__.py,sha256=PzCV3Bd9fmonE-OQp4VtK1NglH42-iv34WAWUIU-eYk,187
57
+ datamaestro_text/data/debate/granddebat.py,sha256=4-HMfgvF2bPru56D3hkA1E2bN3dgIUmcvX9eOIXroLA,2176
58
+ datamaestro_text/data/ir/__init__.py,sha256=oYI7eIScg-olxPh95XBgTK-E2PunieXvqQPlrRlHU8M,9799
59
+ datamaestro_text/data/ir/base.py,sha256=ksluGOOzOwbdZ2SPnwiDMMUhBa6P1Ti2sr6Ch5xXUgg,1493
57
60
  datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
58
- datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
59
- datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
60
- datamaestro_text/data/ir/formats.py,sha256=Tbu5rrssq9m76a448-ixgs7a-KKvNCvXYkdv5FkFqp4,3641
61
+ datamaestro_text/data/ir/csv.py,sha256=0jnaV-wKLgslH7izR-xP_RX7l90vykQTn3bPhaCFR-c,1027
62
+ datamaestro_text/data/ir/data.py,sha256=6ASVsyVVfiSd1m8C8QTrxVLnFVmtoW3d9c9nQ07zlbY,34
63
+ datamaestro_text/data/ir/formats.py,sha256=rKflCuY8UBpXC3nltBqzC4waWYoxuyP91xJvG7p690Y,3630
61
64
  datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
62
- datamaestro_text/data/ir/stores.py,sha256=A4Ew0L4P6iLLmDKhxqjjVkHcz797BHf4d76YguGkB1A,4412
65
+ datamaestro_text/data/ir/stores.py,sha256=rdOwYCG_NzHSsUQpJ1aneiA2SDWrcfdi16aY-df852U,4408
63
66
  datamaestro_text/data/ir/trec.py,sha256=IOtQRMUz8zx-dYEMR2NIIM6qXEUjsV0eVOhGvKIRJK4,1974
64
67
  datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
68
+ datamaestro_text/datasets/__init__.py,sha256=ORn-Q1gGibg-N5grVc7MqOYfExels3FRI51oQ4xI1QA,34
65
69
  datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
66
- datamaestro_text/datasets/irds/data.py,sha256=YlDbGFsh6_mCmk49F3bwdsLEbpHVvMv4gvc1H8KZnpo,23096
67
- datamaestro_text/datasets/irds/datasets.py,sha256=n5mvD2omVI4PkeuJgX13Cio6gLrgYjxq8ZUKKyqocEs,5625
70
+ datamaestro_text/datasets/irds/data.py,sha256=sIU7_rt4I1E9rjkIGcpNfbD5mtO97vxFsUDmouRMDV4,22914
71
+ datamaestro_text/datasets/irds/datasets.py,sha256=CJ8MA44XCwIQGZTzYIJnR-qFm890rUZZB7C3lKIwNyY,5627
68
72
  datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
69
73
  datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
70
- datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
74
+ datamaestro_text/download/tmdb.py,sha256=sfnSUJwGSjBsLNVVhT30db2m0R8mrRkDZpbpBUt7GMg,3960
71
75
  datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
72
76
  datamaestro_text/interfaces/trec.py,sha256=GrP0N_Hcj5f73KS6CSzkyN4aaI-XoBJ19oVMpHVR3QM,3579
73
77
  datamaestro_text/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
78
  datamaestro_text/test/test_datasets.py,sha256=hD1pe-CjEnOj0JNqDLasz_97ltFzMbYfyYBy1QyYdf8,202
75
79
  datamaestro_text/test/test_documented.py,sha256=Kl90fz_r_dsV0oXE1Mad34GiQyZ9dc6bfZKNtxP0O2s,453
76
80
  datamaestro_text/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
- datamaestro_text/transforms/ir/__init__.py,sha256=eWxr0ZtfK9F9J4TachW5V8Ra5AhQJueDyOGR8vXi1uA,6553
81
+ datamaestro_text/transforms/ir/__init__.py,sha256=7D6wurKVQf-f2mu1I3tT-baQbKo7yRCxW8pOHh-MSjM,6539
78
82
  datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_p5ha0,3340
79
83
  datamaestro_text/utils/files.py,sha256=jhcirufgTztEkx1hs2-qgScEcHnIcGF_BjCeuCSsNv0,2838
80
84
  datamaestro_text/utils/iter.py,sha256=QBajeSPLHvkeh6BCTZDSqWlOYNjwUDvgTTZ_YxJntXw,2701
81
85
  datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
82
- datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
83
- datamaestro_text-2025.9.11.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
84
- datamaestro_text-2025.9.11.dist-info/METADATA,sha256=ChGV_8bnixfGl91eG_3-Qwba8tjMwe2VPCwXdGxG_xM,1848
85
- datamaestro_text-2025.9.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
86
- datamaestro_text-2025.9.11.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
87
- datamaestro_text-2025.9.11.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
88
- datamaestro_text-2025.9.11.dist-info/RECORD,,
86
+ datamaestro_text/utils/shuffle.py,sha256=xXzgBQ8An7tKboxI0z123Tl6ywXI4S0tWf8MnfOon0c,3491
87
+ datamaestro_text-2026.2.2.dist-info/METADATA,sha256=cHXRhpnNO6sliuE09Jg-eHJtr2kl1Z4Dy3mE1RCGELA,1886
88
+ datamaestro_text-2026.2.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
89
+ datamaestro_text-2026.2.2.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
90
+ datamaestro_text-2026.2.2.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
91
+ datamaestro_text-2026.2.2.dist-info/RECORD,,
@@ -1,5 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
-
@@ -1 +0,0 @@
1
- datamaestro_text