datamaestro-text 2024.3.10__py3-none-any.whl → 2025.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,87 @@
1
+ # See documentation on https://datamaestro.readthedocs.io
2
+
3
+ import re
4
+ import json
5
+ from pathlib import Path
6
+ from datamaestro.definitions import datatasks, datatags, dataset
7
+ from datamaestro.data.ml import Supervised
8
+ from datamaestro.download import reference
9
+ from datamaestro.download.archive import zipdownloader
10
+ from datamaestro.download.wayback import wayback_documents
11
+ from datamaestro.utils import HashCheck
12
+ from datamaestro_text.data.conversation.qrecc import QReCCDataset
13
+ from datamaestro_text.datasets.irds.data import (
14
+ LZ4JSONLDocumentStore,
15
+ SimpleJsonDocument,
16
+ )
17
+ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
18
+
19
+
20
+ @datatags("conversation", "context", "query")
21
+ @datatasks("query rewriting")
22
+ @zipdownloader(
23
+ "data",
24
+ "https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
25
+ checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
26
+ )
27
+ @dataset(
28
+ Supervised[QReCCDataset, None, QReCCDataset],
29
+ url="https://github.com/apple/ml-qrecc",
30
+ doi="https://doi.org/10.48550/arXiv.2010.04898",
31
+ id="",
32
+ )
33
+ def main(data: Path):
34
+ """Open-Domain Question Answering Goes Conversational via Question Rewriting
35
+
36
+ We introduce QReCC (Question Rewriting in Conversational Context), an
37
+ end-to-end open-domain question answering dataset comprising of 14K
38
+ conversations with 81K question-answer pairs. The goal of this dataset is to
39
+ provide a challenging benchmark for end-to-end conversational question
40
+ answering that includes the individual subtasks of question rewriting,
41
+ passage retrieval and reading comprehension
42
+ """
43
+ return {
44
+ "train": QReCCDataset(path=data / "qrecc_train.json"),
45
+ "test": QReCCDataset(path=data / "qrecc_test.json"),
46
+ }
47
+
48
+
49
+ @dataset(
50
+ url="https://github.com/apple/ml-qrecc",
51
+ doi="https://doi.org/10.48550/arXiv.2010.04898",
52
+ )
53
+ class Content(LZ4JSONLDocumentStore):
54
+ """Content of the URLs mentioned in QReCC"""
55
+
56
+ @staticmethod
57
+ def __create_dataset__(dataset, options=None):
58
+ ds = reference(reference=main).setup(dataset, options)
59
+ documents_path = wayback_documents(
60
+ "20191127", lambda: Content._urls(ds), name="wayback.jsonl"
61
+ ).setup(dataset, options)
62
+
63
+ store_path = lz4docstore_builder(
64
+ "store",
65
+ lambda: Content._documents(documents_path),
66
+ SimpleJsonDocument,
67
+ "id",
68
+ ).setup(dataset, options)
69
+
70
+ return LZ4JSONLDocumentStore(jsonl_path=store_path)
71
+
72
+ @staticmethod
73
+ def _documents(path: Path):
74
+ """Iterates over documents from wayback"""
75
+ with path.open("rt") as fp:
76
+ for line in fp:
77
+ yield SimpleJsonDocument(**json.loads(line))
78
+
79
+ @staticmethod
80
+ def _urls(supervised: Supervised[QReCCDataset, None, QReCCDataset]):
81
+ urls = set()
82
+ for ds in [supervised.train, supervised.test]:
83
+ for entry in ds.entries():
84
+ if entry.answer_url:
85
+ url = re.sub("#.*$", "", entry.answer_url)
86
+ urls.add(url)
87
+ return urls
@@ -1,11 +1,9 @@
1
1
  # See documentation on https://datamaestro.readthedocs.io
2
2
 
3
- from collections import namedtuple
4
3
  import gzip
5
4
  import json
6
5
  from pathlib import Path
7
- from typing import Iterator, NamedTuple
8
- import attrs
6
+ from typing import Iterator
9
7
  from datamaestro.definitions import datatasks, datatags, dataset
10
8
  from datamaestro.download.single import filedownloader
11
9
  from datamaestro.utils import HashCheck
@@ -14,10 +12,7 @@ from datamaestro.utils import HashCheck
14
12
  from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
15
13
  from datamaestro.data.ml import Supervised
16
14
 
17
- from datamaestro_text.data.ir import DocumentStore
18
- from datamaestro_text.data.ir.formats import OrConvQADocument
19
15
  from datamaestro_text.data.ir.stores import OrConvQADocumentStore
20
- from datamaestro_text.datasets.irds.data import LZ4DocumentStore
21
16
  from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader
22
17
 
23
18
 
@@ -63,7 +58,9 @@ def preprocessed(train, dev, test):
63
58
  def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
64
59
  with gzip.open(source, "rt") as fp:
65
60
  for line in fp:
66
- yield OrConvQADocumentStore.NAMED_TUPLE(**json.loads(line))
61
+ data = json.loads(line)
62
+ data["body"] = data.pop("text")
63
+ yield OrConvQADocumentStore.NAMED_TUPLE(**data)
67
64
 
68
65
 
69
66
  @lz4docstore_downloader(
@@ -1,6 +1,6 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from enum import Enum
3
- from typing import Dict, Generic, Iterator, List, Optional, Sequence
3
+ from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
4
4
  from attr import define
5
5
  from datamaestro.data import Base
6
6
  from datamaestro.record import Record, Item
@@ -61,6 +61,20 @@ class AnswerEntry(Item):
61
61
  """The system answer"""
62
62
 
63
63
 
64
+ @define
65
+ class AnswerDocumentID(Item):
66
+ """An answer as a document ID"""
67
+
68
+ document_id: str
69
+
70
+
71
+ @define
72
+ class AnswerDocumentURL(Item):
73
+ """An answer as a document URL"""
74
+
75
+ url: str
76
+
77
+
64
78
  @define
65
79
  class RetrievedEntry(Item):
66
80
  """List of system-retrieved documents and their relevance"""
@@ -68,8 +82,8 @@ class RetrievedEntry(Item):
68
82
  documents: List[str]
69
83
  """List of retrieved documents"""
70
84
 
71
- document_relevances: Optional[List[str]] = None
72
- """List of retrieved documents and their relevance status"""
85
+ relevant_documents: Optional[Dict[int, Tuple[Optional[int], Optional[int]]]] = None
86
+ """Relevant documents (optional): maps the rank of a relevant document to its (start, stop) position, each bound being optional"""
73
87
 
74
88
 
75
89
  @define
@@ -95,56 +109,99 @@ class ConversationHistoryItem(Item):
95
109
 
96
110
 
97
111
  class ConversationNode:
112
+ @abstractmethod
98
113
  def entry(self) -> Record:
99
114
  """The current conversation entry"""
100
115
  ...
101
116
 
117
+ @abstractmethod
102
118
  def history(self) -> ConversationHistory:
103
119
  """Preceding conversation entries, from most recent to more ancient"""
104
120
  ...
105
121
 
122
+ @abstractmethod
123
+ def parent(self) -> Optional["ConversationNode"]:
124
+ ...
125
+
126
+ @abstractmethod
127
+ def children(self) -> List["ConversationNode"]:
128
+ ...
129
+
106
130
 
107
- class ConversationTree:
131
+ class ConversationTree(ABC):
132
+ @abstractmethod
133
+ def root(self) -> ConversationNode:
134
+ ...
135
+
136
+ @abstractmethod
108
137
  def __iter__(self) -> Iterator[ConversationNode]:
109
138
  """Iterates over conversation nodes"""
110
- pass
139
+ ...
111
140
 
112
141
 
113
142
  # ---- A conversation tree
114
143
 
115
144
 
116
- class SingleConversationTree(ConversationTree):
145
+ class SingleConversationTree(ConversationTree, ABC):
117
146
  """Simple conversations, based on a sequence of entries"""
118
147
 
119
148
  id: str
120
- history: Sequence[Record]
149
+ history: List[Record]
121
150
 
122
151
  def __init__(self, id: Optional[str], history: List[Record]):
123
152
  """Create a simple conversation
124
153
 
125
- :param history: The entries, in reverse order (i.e. more ancient first)
154
+ :param history: The entries, in **reverse** chronological order (i.e. most recent first)
126
155
  """
127
156
  self.history = history or []
157
+ self.id = id
128
158
 
129
159
  def add(self, entry: Record):
130
160
  self.history.insert(0, entry)
131
161
 
132
162
  def __iter__(self) -> Iterator[ConversationNode]:
133
- for ix in range(len(self.history)):
163
+ """Iterates over the conversation (starting with the beginning)"""
164
+ for ix in reversed(range(len(self.history))):
134
165
  yield SingleConversationTreeNode(self, ix)
135
166
 
167
+ def root(self):
168
+ return SingleConversationTreeNode(self, len(self.history) - 1)
169
+
136
170
 
137
171
  @define
138
172
  class SingleConversationTreeNode(ConversationNode):
139
173
  tree: SingleConversationTree
140
174
  index: int
141
175
 
176
+ @property
142
177
  def entry(self) -> Record:
143
178
  return self.tree.history[self.index]
144
179
 
180
+ @entry.setter
181
+ def entry(self, record: Record):
182
+ try:
183
+ self.tree.history[self.index] = record
184
+ except Exception as e:
185
+ print(e)
186
+ raise
187
+
145
188
  def history(self) -> Sequence[Record]:
146
189
  return self.tree.history[self.index + 1 :]
147
190
 
191
+ def parent(self) -> Optional[ConversationNode]:  # the preceding (older) entry, or None at the root
192
+ return (
193
+ SingleConversationTreeNode(self.tree, self.index + 1)
194
+ if self.index < len(self.tree.history) - 1
195
+ else None  # the root has no parent; None (not an empty list) matches the Optional return type
196
+ )
197
+
198
+ def children(self) -> List[ConversationNode]:
199
+ return (
200
+ [SingleConversationTreeNode(self.tree, self.index - 1)]
201
+ if self.index > 0
202
+ else []
203
+ )
204
+
148
205
 
149
206
  class ConversationTreeNode(ConversationNode, ConversationTree):
150
207
  """A conversation tree node"""
@@ -178,6 +235,15 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
178
235
  for child in self.children:
179
236
  yield from child
180
237
 
238
+ def parent(self) -> Optional[ConversationNode]:
239
+ return self.parent
240
+
241
+ def children(self) -> List[ConversationNode]:
242
+ return self.children
243
+
244
+ def root(self):
245
+ return self
246
+
181
247
 
182
248
  class ConversationDataset(Base, ABC):
183
249
  """A dataset made of conversations"""
@@ -186,4 +252,4 @@ class ConversationDataset(Base, ABC):
186
252
  def __iter__(self) -> Iterator[ConversationTree]:
187
253
  """Return an iterator over conversations"""
188
254
  for i in range(len(self)):
189
- return self.get(i)
255
+ yield self.get(i)
@@ -102,16 +102,26 @@ class OrConvQADataset(ConversationDataset, File):
102
102
  # Add to current
103
103
  history.append(
104
104
  Record(
105
- IDItem(query_no),
105
+ IDItem(entry.query_id),
106
106
  SimpleTextItem(entry.query),
107
107
  SimpleDecontextualizedItem(entry.rewrite),
108
108
  EntryType.USER_QUERY,
109
109
  )
110
110
  )
111
+
112
+ relevances = {}
113
+ for rank, relevance in enumerate(entry.retrieval_labels):
114
+ if relevance > 0:
115
+ relevances[rank] = (entry.answer.answer_start, None)
116
+
117
+ assert (
118
+ len(relevances) <= 1
119
+ ), f"Too many relevance labels ({len(relevances)}) for {entry.query_id}"
120
+
111
121
  history.append(
112
122
  Record(
113
123
  AnswerEntry(entry.answer.text),
114
- RetrievedEntry(entry.evidences, entry.retrieval_labels),
124
+ RetrievedEntry(entry.evidences, relevances),
115
125
  EntryType.SYSTEM_ANSWER,
116
126
  )
117
127
  )
@@ -0,0 +1,99 @@
1
+ from typing import Iterator, List, Optional
2
+ from attr import define
3
+ import json
4
+ from datamaestro.data import File
5
+ from datamaestro.record import Record
6
+
7
+ from datamaestro_text.data.ir.base import (
8
+ IDItem,
9
+ SimpleTextItem,
10
+ )
11
+
12
+
13
+ from .base import (
14
+ AnswerDocumentURL,
15
+ AnswerEntry,
16
+ ConversationTree,
17
+ EntryType,
18
+ SimpleDecontextualizedItem,
19
+ SingleConversationTree,
20
+ )
21
+ from . import ConversationDataset
22
+
23
+
24
+ @define(kw_only=True)
25
+ class QReCCDatasetEntry:
26
+ """A query with past history"""
27
+
28
+ conversation_no: int
29
+ """Conversation ID"""
30
+
31
+ turn_no: int
32
+ """The turn in the conversation"""
33
+
34
+ conversation_source: str
35
+ """Conversation source"""
36
+
37
+ question: str
38
+ """The last issued query"""
39
+
40
+ rewrite: str
41
+ """Manually rewritten query"""
42
+
43
+ context: List[str]
44
+ """The list of queries asked by the user"""
45
+
46
+ answer: str
47
+ """The answer"""
48
+
49
+ answer_url: str
50
+ """The URL containing the answer"""
51
+
52
+
53
+ class QReCCDataset(ConversationDataset, File):
54
+ def entries(self) -> Iterator[QReCCDatasetEntry]:
55
+ """Iterates over re-written query with their context"""
56
+ with self.path.open("rt") as fp:
57
+ data = json.load(fp)
58
+
59
+ data = [
60
+ QReCCDatasetEntry(**{key.lower(): value for key, value in entry.items()})
61
+ for entry in data
62
+ ]
63
+ return iter(data)
64
+
65
+ def __iter__(self) -> Iterator[ConversationTree]:
66
+ history: List[Record] = []
67
+ current_id: Optional[str] = None
68
+
69
+ for entry in self.entries():
70
+ # Creates a new conversation if needed
71
+ if entry.conversation_no != current_id:
72
+ if current_id is not None:
73
+ history.reverse()
74
+ yield SingleConversationTree(current_id, history)
75
+
76
+ current_id = entry.conversation_no
77
+ history = []
78
+
79
+ # Add to current
80
+ history.append(
81
+ Record(
82
+ IDItem(f"{entry.conversation_no}#{entry.turn_no}"),
83
+ SimpleTextItem(entry.question),
84
+ AnswerDocumentURL(entry.answer_url),
85
+ SimpleDecontextualizedItem(entry.rewrite),
86
+ EntryType.USER_QUERY,
87
+ )
88
+ )
89
+
90
+ history.append(
91
+ Record(
92
+ AnswerEntry(entry.answer),
93
+ EntryType.SYSTEM_ANSWER,
94
+ )
95
+ )
96
+
97
+ # Yields the last one
98
+ history.reverse()
99
+ yield SingleConversationTree(current_id, history)
@@ -25,6 +25,7 @@ from .base import ( # noqa: F401
25
25
  create_record,
26
26
  # Other things
27
27
  AdhocAssessment,
28
+ AdhocAssessedTopic,
28
29
  )
29
30
 
30
31
 
@@ -83,7 +84,7 @@ class DocumentStore(Documents):
83
84
  def document_int(self, internal_docid: int) -> DocumentRecord:
84
85
  """Returns a document given its internal ID"""
85
86
  docid = self.docid_internal2external(internal_docid)
86
- return self.document(docid)
87
+ return self.document_ext(docid)
87
88
 
88
89
  def document_ext(self, docid: str) -> DocumentRecord:
89
90
  """Returns a document given its external ID"""
@@ -159,7 +160,7 @@ class TopicsStore(Topics):
159
160
  class AdhocAssessments(Base, ABC):
160
161
  """Ad-hoc assessments (qrels)"""
161
162
 
162
- def iter(self) -> Iterator[AdhocAssessment]:
163
+ def iter(self) -> Iterator[AdhocAssessedTopic]:
163
164
  """Returns an iterator over assessments"""
164
165
  raise NotImplementedError(f"For class {self.__class__}")
165
166
 
@@ -1,5 +1,5 @@
1
1
  from functools import cached_property
2
- from typing import ClassVar, Tuple
2
+ from typing import ClassVar, Tuple, List
3
3
  from attrs import define
4
4
  from datamaestro.record import record_type
5
5
  from ir_datasets.datasets.wapo import WapoDocMedia
@@ -117,7 +117,6 @@ class TweetDoc(TextItem):
117
117
 
118
118
  @define
119
119
  class OrConvQADocument(TextItem):
120
- id: str
121
120
  title: str
122
121
  body: str
123
122
  aid: str
@@ -127,12 +126,21 @@ class OrConvQADocument(TextItem):
127
126
  def text(self):
128
127
  return f"{self.title} {self.body}"
129
128
 
129
+ @define
130
+ class Touche2020(TextItem):
131
+ text: str
132
+ title: str
133
+ stance: str
134
+ url: str
130
135
 
131
136
  @define
132
- class TrecTopic(TextItem):
137
+ class SciDocs(TextItem):
133
138
  text: str
134
- query: str
135
- narrative: str
139
+ title: str
140
+ authors: List[str]
141
+ year: int
142
+ cited_by: List[str]
143
+ references: List[str]
136
144
 
137
145
 
138
146
  @define
@@ -167,6 +175,13 @@ class TrecMb14Query(TextItem):
167
175
  def get_text(self):
168
176
  return f"{self.query}"
169
177
 
178
+ @define
179
+ class SciDocsTopic(TextItem):
180
+ text: str
181
+ authors: List[str]
182
+ year: int
183
+ cited_by: List[str]
184
+ references: List[str]
170
185
 
171
186
  @define()
172
187
  class TrecTopic(SimpleTextItem):
@@ -1,16 +1,21 @@
1
1
  from collections import namedtuple
2
- from typing import List
2
+ from typing import List, NamedTuple
3
3
  from experimaestro import Constant
4
4
  import attrs
5
5
 
6
+ from datamaestro.record import Record
7
+ from datamaestro_text.data.ir.base import IDItem
6
8
  from datamaestro_text.datasets.irds.data import LZ4DocumentStore
7
9
  from datamaestro_text.data.ir.formats import OrConvQADocument
8
10
 
9
11
 
10
12
  class OrConvQADocumentStore(LZ4DocumentStore):
11
- NAMED_TUPLE = namedtuple(
12
- "OrConvQADocument", [a.name for a in attrs.fields(OrConvQADocument)]
13
- )
13
+ class NAMED_TUPLE(NamedTuple):
14
+ id: str
15
+ title: str
16
+ body: str
17
+ aid: str
18
+ bid: int
14
19
 
15
20
  lookup_field: Constant[str] = "id"
16
21
  fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
@@ -18,5 +23,7 @@ class OrConvQADocumentStore(LZ4DocumentStore):
18
23
 
19
24
  data_cls = NAMED_TUPLE
20
25
 
21
- def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
22
- return OrConvQADocument(**data._asdict())
26
+ def converter(self, data: NAMED_TUPLE) -> Record:
27
+ fields = data._asdict()
28
+ del fields["id"]
29
+ return Record(OrConvQADocument(**fields), IDItem(data.id))
@@ -1,35 +1,44 @@
1
+ import logging
1
2
  from abc import ABC, abstractmethod
3
+ from dataclasses import dataclass
2
4
  from functools import partial
3
- import logging
4
5
  from pathlib import Path
5
- from typing import Iterator, Tuple, Type, List
6
+ from typing import Dict, Iterator, List, NamedTuple, Tuple, Type
7
+
6
8
  import ir_datasets
7
- from ir_datasets.indices import PickleLz4FullStore
9
+ import ir_datasets.datasets as _irds
10
+ from datamaestro.record import RecordType, record_type
11
+ from experimaestro import Config, Meta, Option, Param
12
+ from experimaestro.compat import cached_property
8
13
  from ir_datasets.formats import (
9
14
  GenericDoc,
10
- GenericQuery,
11
15
  GenericDocPair,
16
+ GenericQuery,
12
17
  TrecParsedDoc,
13
18
  TrecQuery,
14
19
  )
15
- import ir_datasets.datasets as _irds
16
- from experimaestro import Config, Param
17
- from experimaestro.compat import cached_property
18
- from experimaestro import Option
19
- from datamaestro.record import RecordType, record_type
20
+ from ir_datasets.indices import PickleLz4FullStore
21
+
20
22
  import datamaestro_text.data.ir as ir
23
+ import datamaestro_text.data.ir.formats as formats
24
+ from datamaestro_text.data.conversation.base import (
25
+ AnswerDocumentID,
26
+ AnswerEntry,
27
+ ConversationHistoryItem,
28
+ ConversationTreeNode,
29
+ DecontextualizedDictItem,
30
+ EntryType,
31
+ )
21
32
  from datamaestro_text.data.ir.base import (
22
- Record,
23
- TopicRecord,
24
- DocumentRecord,
25
- SimpleTextItem,
26
33
  AdhocAssessedTopic,
27
- SimpleAdhocAssessment,
34
+ DocumentRecord,
28
35
  IDItem,
36
+ Record,
37
+ SimpleAdhocAssessment,
38
+ SimpleTextItem,
39
+ TopicRecord,
29
40
  create_record,
30
41
  )
31
- import datamaestro_text.data.ir.formats as formats
32
-
33
42
 
34
43
  # Interface between ir_datasets and datamaestro:
35
44
  # provides adapted data types
@@ -108,6 +117,12 @@ class Documents(ir.DocumentStore, IRDSId):
108
117
  _irds.beir.BeirTitleUrlDoc: tuple_constructor(
109
118
  formats.TitleUrlDocument, "doc_id", "text", "title", "url"
110
119
  ),
120
+ _irds.beir.BeirToucheDoc: tuple_constructor(
121
+ formats.Touche2020, "doc_id", "text", "title", "stance", "url"
122
+ ),
123
+ _irds.beir.BeirSciDoc: tuple_constructor(
124
+ formats.SciDocs, "doc_id", "text", "title", "authors", "year", "cited_by", "references"
125
+ ),
111
126
  _irds.msmarco_document.MsMarcoDocument: tuple_constructor(
112
127
  formats.MsMarcoDocument, "doc_id", "url", "title", "body"
113
128
  ),
@@ -215,20 +230,6 @@ if hasattr(_irds, "miracl"):
215
230
  )
216
231
 
217
232
 
218
- # Fix while PR https://github.com/allenai/ir_datasets/pull/252
219
- # is not in.
220
- class DMPickleLz4FullStore(PickleLz4FullStore):
221
- def get_many(self, doc_ids, field=None):
222
- result = {}
223
- field_idx = self._doc_cls._fields.index(field) if field is not None else None
224
- for doc in self.get_many_iter(doc_ids):
225
- if field is not None:
226
- result[getattr(doc, self._id_field)] = doc[field_idx]
227
- else:
228
- result[getattr(doc, self._id_field)] = doc
229
- return result
230
-
231
-
232
233
  class LZ4DocumentStore(ir.DocumentStore):
233
234
  """A LZ4-based document store"""
234
235
 
@@ -242,7 +243,7 @@ class LZ4DocumentStore(ir.DocumentStore):
242
243
 
243
244
  @cached_property
244
245
  def store(self):
245
- return DMPickleLz4FullStore(
246
+ return PickleLz4FullStore(
246
247
  self.path, None, self.data_cls, self.lookup_field, self.index_fields
247
248
  )
248
249
 
@@ -254,33 +255,48 @@ class LZ4DocumentStore(ir.DocumentStore):
254
255
  return getattr(self._docs[ix], self.store._id_field)
255
256
 
256
257
  def document_ext(self, docid: str) -> DocumentRecord:
257
- return self.converter(self.document_recordtype, self.store.get(docid))
258
+ return self.converter(self.store.get(docid))
258
259
 
259
260
  def documents_ext(self, docids: List[str]) -> DocumentRecord:
260
261
  """Returns documents given their external IDs (optimized for batch)"""
261
262
  retrieved = self.store.get_many(docids)
262
- return [
263
- self.converter(self.document_recordtype, retrieved[docid])
264
- for docid in docids
265
- ]
263
+ return [self.converter(retrieved[docid]) for docid in docids]
266
264
 
265
+ @abstractmethod
267
266
  def converter(self, data):
268
- """Converts a document from LZ4 tuples to any other format"""
269
- # By default, use identity
270
- return data
267
+ """Converts a document from LZ4 tuples to a document record"""
268
+ ...
271
269
 
272
270
  def iter(self) -> Iterator[DocumentRecord]:
273
271
  """Returns an iterator over documents"""
274
- return map(
275
- partial(self.converter, self.document_recordtype), self.store.__iter__()
276
- )
272
+ return map(self.converter, self.store.__iter__())
277
273
 
274
+ @cached_property
278
275
  def documentcount(self):
279
276
  if self.count:
280
277
  return self.count
281
278
  return self.store.count()
282
279
 
283
280
 
281
+ class SimpleJsonDocument(NamedTuple):
282
+ id: str
283
+ text: str
284
+
285
+
286
+ class LZ4JSONLDocumentStore(LZ4DocumentStore):
287
+ jsonl_path: Meta[Path]
288
+ """json-l based document store
289
+
290
+ Each line is of the form
291
+ ```json
292
+ { "id": "...", "text": "..." }
293
+ ```
294
+ """
295
+
296
+ def converter(self, data):  # NOTE(review): rows are presumably SimpleJsonDocument named tuples - confirm
297
+ return DocumentRecord(IDItem(data.id), SimpleTextItem(data.text))  # attribute access: tuples reject string keys
298
+
299
+
284
300
  class TopicsHandler(ABC):
285
301
  @abstractmethod
286
302
  def topic_int(self, internal_topic_id: int) -> TopicRecord:
@@ -352,6 +368,12 @@ class Topics(ir.TopicsStore, IRDSId):
352
368
  TrecQuery: tuple_constructor(
353
369
  formats.TrecTopic, "query_id", "title", "description", "narrative"
354
370
  ),
371
+ _irds.beir.BeirToucheQuery: tuple_constructor(
372
+ formats.TrecTopic, "query_id", "text", "description", "narrative"
373
+ ),
374
+ _irds.beir.BeirSciQuery: tuple_constructor(
375
+ formats.SciDocsTopic, "query_id", "text", "authors", "year", "cited_by", "references"
376
+ ),
355
377
  _irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
356
378
  formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
357
379
  ),
@@ -395,99 +417,190 @@ class Topics(ir.TopicsStore, IRDSId):
395
417
  return self.handler.iter()
396
418
 
397
419
 
398
- if hasattr(_irds.trec_cast, "Cast2022Query"):
399
- from datamaestro_text.data.conversation.base import (
400
- ConversationTreeNode,
401
- DecontextualizedDictItem,
402
- RetrievedEntry,
403
- ConversationHistoryItem,
404
- )
420
+ class CastTopicsHandler(TopicsHandler):
421
+ def __init__(self, dataset):
422
+ self.dataset = dataset
423
+
424
+ @cached_property
425
+ def ext2records(self):
426
+ return {record[IDItem].id: record for record in self.records}
427
+
428
+ def topic_int(self, internal_topic_id: int) -> TopicRecord:
429
+ """Returns a topic given its internal ID"""
430
+ return self.records[internal_topic_id]
405
431
 
406
- class CastTopicsHandler(TopicsHandler):
407
- def __init__(self, dataset):
408
- self.dataset = dataset
409
-
410
- @property
411
- @abstractmethod
412
- def records(self):
413
- ...
414
-
415
- @cached_property
416
- def ext2records(self):
417
- return {record[IDItem].id: record for record in self.records}
418
-
419
- def topic_int(self, internal_topic_id: int) -> TopicRecord:
420
- """Returns a document given its internal ID"""
421
- return self.records[internal_topic_id]
422
-
423
- def topic_ext(self, external_topic_id: str) -> TopicRecord:
424
- """Returns a document given its external ID"""
425
- return self.ext2records[external_topic_id]
426
-
427
- def iter(self) -> Iterator[ir.TopicRecord]:
428
- """Returns an iterator over topics"""
429
- return iter(self.records)
430
-
431
- class Cast2020TopicsHandler(CastTopicsHandler):
432
- @cached_property
433
- def records(self):
434
- try:
435
- topic_number = None
436
- node = None
437
- conversation = []
438
- records = []
439
-
440
- for (
441
- query
442
- ) in (
443
- self.dataset.dataset.queries_iter()
444
- ): # type: _irds.trec_cast.Cast2020Query
445
- decontextualized = DecontextualizedDictItem(
446
- "manual",
447
- {
448
- "manual": query.manual_rewritten_utterance,
449
- "auto": query.automatic_rewritten_utterance,
450
- },
432
+ def topic_ext(self, external_topic_id: str) -> TopicRecord:
433
+ """Returns a topic given its external ID"""
434
+ return self.ext2records[external_topic_id]
435
+
436
+ def iter(self) -> Iterator[ir.TopicRecord]:
437
+ """Returns an iterator over topics"""
438
+ return iter(self.records)
439
+
440
+ @cached_property
441
+ def records(self):
442
+ try:
443
+ topic_number = None
444
+ node = None
445
+ conversation = []
446
+ records = []
447
+
448
+ for query in self.dataset.dataset.queries_iter():
449
+ decontextualized = DecontextualizedDictItem(
450
+ "manual",
451
+ {
452
+ "manual": query.manual_rewritten_utterance,
453
+ "auto": query.automatic_rewritten_utterance,
454
+ },
455
+ )
456
+
457
+ is_new_conversation = topic_number != query.topic_number
458
+
459
+ topic = Record(
460
+ IDItem(query.query_id),
461
+ SimpleTextItem(query.raw_utterance),
462
+ decontextualized,
463
+ ConversationHistoryItem(
464
+ [] if is_new_conversation else node.conversation(False)
465
+ ),
466
+ EntryType.USER_QUERY,
467
+ )
468
+
469
+ if is_new_conversation:
470
+ conversation = []
471
+ node = ConversationTreeNode(topic)
472
+ topic_number = query.topic_number
473
+ else:
474
+ node = node.add(ConversationTreeNode(topic))
475
+
476
+ records.append(topic)
477
+
478
+ conversation.append(node)
479
+ node = node.add(
480
+ ConversationTreeNode(
481
+ Record(
482
+ AnswerDocumentID(self.get_canonical_result_id(query)),
483
+ EntryType.SYSTEM_ANSWER,
484
+ )
451
485
  )
486
+ )
487
+ conversation.append(node)
488
+ except Exception:
489
+ logging.exception("Error while computing topic records")
490
+ raise
491
+
492
+ return records
493
+
494
+ @staticmethod
495
+ def get_canonical_result_id(query=None):  # `records` passes the current query; subclasses override this hook
496
+ return None  # base handler: no canonical result is known for the query
497
+
498
+
499
+ class Cast2020TopicsHandler(CastTopicsHandler):
500
+ @staticmethod
501
+ def get_canonical_result_id(query: _irds.trec_cast.Cast2020Query):
502
+ return query.manual_canonical_result_id
503
+
504
+
505
+ class Cast2021TopicsHandler(CastTopicsHandler):
506
+ @staticmethod
507
+ def get_canonical_result_id(query: _irds.trec_cast.Cast2021Query):
508
+ return query.canonical_result_id
509
+
510
+
511
+ class Cast2022TopicsHandler(CastTopicsHandler):
512
+ def __init__(self, dataset):
513
+ self.dataset = dataset
514
+
515
+ @cached_property
516
+ def records(self):
517
+ try:
518
+ records = []
519
+ nodes: Dict[str, ConversationTreeNode] = {}
520
+
521
+ for (
522
+ query
523
+ ) in (
524
+ self.dataset.dataset.queries_iter()
525
+ ): # type: _irds.trec_cast.Cast2022Query
526
+ parent = nodes[query.parent_id] if query.parent_id else None
527
+
528
+ if query.participant == "User":
452
529
  topic = Record(
453
530
  IDItem(query.query_id),
454
531
  SimpleTextItem(query.raw_utterance),
455
- decontextualized,
532
+ DecontextualizedDictItem(
533
+ "manual",
534
+ {
535
+ "manual": query.manual_rewritten_utterance,
536
+ },
537
+ ),
456
538
  ConversationHistoryItem(
457
- node.conversation(False) if node else []
539
+ parent.conversation(False) if parent else []
458
540
  ),
541
+ EntryType.USER_QUERY,
459
542
  )
460
-
461
- if topic_number == query.topic_number:
462
- node = node.add(ConversationTreeNode(topic))
463
- else:
464
- conversation = []
465
- node = ConversationTreeNode(topic)
466
- topic_number = query.topic_number
467
-
543
+ node = ConversationTreeNode(topic)
468
544
  records.append(topic)
469
-
470
- conversation.append(node)
471
- node = node.add(
472
- ConversationTreeNode(
473
- Record(RetrievedEntry(query.manual_canonical_result_id))
545
+ else:
546
+ node = ConversationTreeNode(
547
+ Record(
548
+ AnswerEntry(query.response),
549
+ EntryType.SYSTEM_ANSWER,
474
550
  )
475
551
  )
476
- conversation.append(node)
477
- except Exception:
478
- logging.exception("Error while computing topic records")
479
- raise
480
-
481
- return records
482
-
483
- Topics.HANDLERS.update(
484
- {
485
- # _irds.trec_cast.Cast2019Query: Cast2019TopicsHandler,
486
- _irds.trec_cast.Cast2020Query: Cast2020TopicsHandler,
487
- # _irds.trec_cast.Cast2021Query: Cast2021TopicsHandler,
488
- # _irds.trec_cast.Cast2022Query: Cast2022TopicsHandler
489
- }
490
- )
552
+
553
+ nodes[query.query_id] = node
554
+ if parent:
555
+ parent.add(node)
556
+ except Exception:
557
+ logging.exception("Error while computing topic records")
558
+ raise
559
+
560
+ return records
561
+
562
+
563
+ Topics.HANDLERS.update(
564
+ {
565
+ # _irds.trec_cast.Cast2019Query: Cast2019TopicsHandler,
566
+ _irds.trec_cast.Cast2020Query: Cast2020TopicsHandler,
567
+ _irds.trec_cast.Cast2021Query: Cast2021TopicsHandler,
568
+ _irds.trec_cast.Cast2022Query: Cast2022TopicsHandler,
569
+ }
570
+ )
571
+
572
+
573
+ class CastDocHandler:
574
+ def check(self, cls):
575
+ assert issubclass(cls, _irds.trec_cast.CastDoc)
576
+
577
+ @cached_property
578
+ def target_cls(self):
579
+ return formats.TitleUrlDocument
580
+
581
+ def __call__(self, _, doc: _irds.trec_cast.CastDoc):
582
+ return Record(
583
+ IDItem(doc.doc_id), formats.SimpleTextItem(" ".join(doc.passages))
584
+ )
585
+
586
+
587
+ class CastPassageDocHandler:
588
+ def check(self, cls):
589
+ assert issubclass(cls, _irds.trec_cast.CastPassageDoc)
590
+
591
+ @cached_property
592
+ def target_cls(self):
593
+ return formats.TitleUrlDocument
594
+
595
+ def __call__(self, _, doc: _irds.trec_cast.CastPassageDoc):
596
+ return Record(
597
+ IDItem(doc.doc_id),
598
+ formats.TitleUrlDocument(doc.text, doc.title, doc.url),
599
+ )
600
+
601
+
602
+ Documents.CONVERTERS[_irds.trec_cast.CastDoc] = CastDocHandler()
603
+ Documents.CONVERTERS[_irds.trec_cast.CastPassageDoc] = CastPassageDocHandler()
491
604
 
492
605
 
493
606
  class Adhoc(ir.Adhoc, IRDSId):
@@ -1,13 +1,13 @@
1
1
  import logging
2
2
  from typing import Optional, Type, Callable, Iterator
3
3
  from ir_datasets.indices import PickleLz4FullStore
4
- from datamaestro.download import Download
4
+ from datamaestro.download import Resource
5
5
  from datamaestro.utils import FileChecker
6
6
  from pathlib import Path
7
7
  import urllib3
8
8
 
9
9
 
10
- class lz4docstore_downloader(Download):
10
+ class lz4docstore_downloader(Resource):
11
11
  """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents"""
12
12
 
13
13
  def __init__(
@@ -69,3 +69,59 @@ class lz4docstore_downloader(Download):
69
69
 
70
70
  # All good!
71
71
  (destination / "done").touch()
72
+
73
+
74
class lz4docstore_builder(Resource):
    """Builds an ir_datasets ``PickleLz4FullStore`` from a stream of documents.

    The store is written to ``<definition.datapath>/<varname>``; an empty
    ``done`` file inside that directory marks a completed build.
    """

    def __init__(
        self,
        name: str,
        iter_factory: Callable[[], Iterator],
        doc_cls: Type,
        lookup_field: str,
        *,
        count_hint: Optional[int] = None,
    ):
        """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents

        :param name: The name of the variable for path construction
        :param iter_factory: Iterator over documents
        :param doc_cls: The class of documents (must be a dataclass because of how ir-datasets works)
        :param lookup_field: Which field to use for lookup
        :param count_hint: Number of documents (hint), defaults to None
        """
        super().__init__(name)
        self.iter_factory = iter_factory
        self.doc_cls = doc_cls
        self.lookup_field = lookup_field
        self.count_hint = count_hint

    def prepare(self) -> Path:
        """Return the directory that holds (or will hold) the document store."""
        return self.definition.datapath / self.varname

    def download(self, force: bool = False):
        """Build the document store unless a completed build already exists.

        :param force: Run the build even if the ``done`` marker is present
        :return: ``True`` when an existing build is reused, ``None`` after
            running the build
        """
        destination = self.definition.datapath / self.varname
        done_marker = destination / "done"

        # Early exit: a previous build completed and no rebuild was requested
        if done_marker.is_file() and not force:
            return True

        # Creates the directory if needed; parents=True because the dataset
        # data path itself may not exist yet
        destination.mkdir(parents=True, exist_ok=True)

        # Drop any stale marker first, so that a failed forced rebuild is not
        # mistaken for a completed one on the next call
        done_marker.unlink(missing_ok=True)

        logging.info("Building the document index")

        # Builds the LZ4 store; the factory is passed as-is since it is
        # already a zero-argument callable
        store = PickleLz4FullStore(
            destination,
            self.iter_factory,
            self.doc_cls,
            lookup_field=self.lookup_field,
            index_fields=[self.lookup_field],
            key_field_prefix=None,
            size_hint=None,
            count_hint=self.count_hint,
        )
        store.build()

        # All good!
        done_marker.touch()
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '2024.3.10'
16
- __version_tuple__ = version_tuple = (2024, 3, 10)
15
+ __version__ = version = '2025.1.7'
16
+ __version_tuple__ = version_tuple = (2025, 1, 7)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2024.3.10
3
+ Version: 2025.1.7
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -18,8 +18,8 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
18
  Requires-Python: >=3.8
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: datamaestro >=1.1.0
22
- Requires-Dist: ir-datasets
21
+ Requires-Dist: datamaestro>=1.2.1
22
+ Requires-Dist: ir_datasets>=0.5.8
23
23
  Requires-Dist: attrs
24
24
 
25
25
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
@@ -1,12 +1,13 @@
1
1
  datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
2
- datamaestro_text/version.py,sha256=Bq97zWpOGRS-xWJRpvk6kRdLhLjS83bAhj3DIaONmi8,419
2
+ datamaestro_text/version.py,sha256=k65KHkS4PD3NjDDCJgWcTwRBE4yVcKs32B8SUxv2DvE,417
3
3
  datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
5
5
  datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
6
6
  datamaestro_text/config/com/sentiment140.py,sha256=iRV_rSjQcr9WNjjQ4EdznaEMGLADV_rwpeRx7ycQi0k,1383
7
7
  datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
8
8
  datamaestro_text/config/com/github/aagohary/canard.py,sha256=tIwb_KxMUR3st7rzQUkt6rIjolTl8uKvDq6t795b1nY,1468
9
- datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=BytI8euqX04RlTCM8LvYKNKm9SVUTClSnszE3QUhGR8,3196
9
+ datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=_lufJlg-4zaQyjr5Ae-X-9hXzpl2JFjfxh_RhnBbva4,3068
10
+ datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=Tg3hbnvilxW_Lwt7fpKvGMtu-6mc9oNIHM-LX6JTR1U,3026
10
11
  datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
11
12
  datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
12
13
  datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=ouNn2nivS6mUMaCyMzqxNv1YMoPrSEX-UcSZpG1v_uw,11645
@@ -45,23 +46,24 @@ datamaestro_text/data/recommendation.py,sha256=MatelpJQiCMpNDuaQVBkRY809J1qiQo-8
45
46
  datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
46
47
  datamaestro_text/data/text.py,sha256=pOI8nrEd6RoQA28DVH1JufHTunr9vG3FQzwElR8YirI,499
47
48
  datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
48
- datamaestro_text/data/conversation/base.py,sha256=2invpOXa2DnrfWO0kdpohSw1Feb__obySSUtu7W4CYc,4883
49
+ datamaestro_text/data/conversation/base.py,sha256=zXX5sPMoVtDf4WTkLx45IvZz6lgKPgcUZFu6N4lVBlc,6457
49
50
  datamaestro_text/data/conversation/canard.py,sha256=IMxu5NfytWZLZ_cyT8UFOICbDE82HLJoNBT_6j36Faw,1808
50
- datamaestro_text/data/conversation/orconvqa.py,sha256=TsaeJkxgNobyCNaRH8ZdAVNIAAfiMIxuRq_XDRzyC-I,3457
51
- datamaestro_text/data/ir/__init__.py,sha256=FwK6U6Yw3UjZjqZoaE1Dfe7UQktO5CFeyHCLfmxC3fE,8670
51
+ datamaestro_text/data/conversation/orconvqa.py,sha256=VG3GV5_IPQcvla9rrQPypDNcZYmKNMgozmr2oudeLA4,3802
52
+ datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
53
+ datamaestro_text/data/ir/__init__.py,sha256=EOVnRv9oKEEucMOgrFjhXxeUWEkaJkbXBndpkKhZaiY,8701
52
54
  datamaestro_text/data/ir/base.py,sha256=Cw8egjChpx4ksUwp-vTA70B2OWxROH7FIeJylsXggAk,1429
53
55
  datamaestro_text/data/ir/cord19.py,sha256=7xDIzsy63WrA9lxxyNOMu5ECRymu5x23EzYG977nS6Y,1440
54
56
  datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
55
57
  datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
56
- datamaestro_text/data/ir/formats.py,sha256=wgjXIkNJjqRbHEMkkXyXRRMnxnho45jfUbPsJCazkZk,2866
58
+ datamaestro_text/data/ir/formats.py,sha256=rU9uJhdFDdlnQ2qfFowK_--1WVOH1fVgfXWOPPDF_FY,3160
57
59
  datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
58
- datamaestro_text/data/ir/stores.py,sha256=JdeDhPxAQOM5_1Pqi_HGoPNUbe63_zMaz-NRs24RS94,687
60
+ datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
59
61
  datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
60
62
  datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
61
63
  datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
62
- datamaestro_text/datasets/irds/data.py,sha256=VjvqEvMY3VfuX4Kx7YdoVOoS_fIrMR_3RIIf_PdErsc,16785
64
+ datamaestro_text/datasets/irds/data.py,sha256=BTsFZdjFJ0XdlYRh2rcshAoeknHrypf7be2BGUbWWFk,19931
63
65
  datamaestro_text/datasets/irds/datasets.py,sha256=qtN-nX2_OV9FD339aZjwVL1jFf6I4T6MbNCcuRAjmtU,5682
64
- datamaestro_text/datasets/irds/helpers.py,sha256=KC-2nQPCIl4VnbfDkAkr4iFlhkknn8zvbADlClWZvwU,2207
66
+ datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
65
67
  datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
66
68
  datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
67
69
  datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
@@ -76,9 +78,9 @@ datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1v
76
78
  datamaestro_text/utils/iter.py,sha256=2_UZ8y9Ma4k5U9ZD4w55Zfb6NGrKM1L4G40OygRm1is,2459
77
79
  datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
78
80
  datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
79
- datamaestro_text-2024.3.10.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
80
- datamaestro_text-2024.3.10.dist-info/METADATA,sha256=MCmmREa3bheRgoqsHnhxZ3QvvuiGOWwqgrRGVQw67pw,1604
81
- datamaestro_text-2024.3.10.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
82
- datamaestro_text-2024.3.10.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
83
- datamaestro_text-2024.3.10.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
84
- datamaestro_text-2024.3.10.dist-info/RECORD,,
81
+ datamaestro_text-2025.1.7.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
82
+ datamaestro_text-2025.1.7.dist-info/METADATA,sha256=-wsBcUcnEnOMiHb1ROIf43r55ZNNNjpIemuFXm5hHUY,1609
83
+ datamaestro_text-2025.1.7.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
84
+ datamaestro_text-2025.1.7.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
85
+ datamaestro_text-2025.1.7.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
86
+ datamaestro_text-2025.1.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.42.0)
2
+ Generator: setuptools (75.7.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5