datamaestro-text 2023.12.5.1__py3-none-any.whl → 2023.12.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,11 @@
1
1
  # See documentation on https://datamaestro.readthedocs.io
2
2
 
3
+ from collections import namedtuple
4
+ import gzip
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Iterator, NamedTuple
8
+ import attrs
3
9
  from datamaestro.definitions import datatasks, datatags, dataset
4
10
  from datamaestro.download.single import filedownloader
5
11
  from datamaestro.utils import HashCheck
@@ -8,6 +14,12 @@ from datamaestro.utils import HashCheck
8
14
  from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
9
15
  from datamaestro.data.ml import Supervised
10
16
 
17
+ from datamaestro_text.data.ir import DocumentStore
18
+ from datamaestro_text.data.ir.formats import OrConvQADocument
19
+ from datamaestro_text.data.ir.stores import OrConvQADocumentStore
20
+ from datamaestro_text.datasets.irds.data import LZ4DocumentStore
21
+ from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader
22
+
11
23
 
12
24
  @datatags("conversation", "context", "query")
13
25
  @datatasks("query rewriting")
@@ -31,14 +43,13 @@ from datamaestro.data.ml import Supervised
31
43
  url="https://github.com/prdwb/orconvqa-release",
32
44
  )
33
45
  def preprocessed(train, dev, test):
34
- """Question-in-context rewriting
46
+ """Open-Retrieval Conversational Question Answering datasets
47
+
48
+ OrConvQA is an aggregation of three existing datasets:
35
49
 
36
- CANARD is a dataset for question-in-context rewriting that consists of
37
- questions each given in a dialog context together with a context-independent
38
- rewriting of the question. The context of each question is the dialog
39
- utterances that precede the question. CANARD can be used to evaluate
40
- question rewriting models that handle important linguistic phenomena such as
41
- co-reference and ellipsis resolution.
50
+ 1. the QuAC dataset that offers information-seeking conversations,
51
+ 2. the CANARD dataset that consists of context-independent rewrites of QuAC questions, and
52
+ 3. the Wikipedia corpus that serves as the knowledge source of answering questions.
42
53
 
43
54
  Each dataset is an instance of :class:`datamaestro_text.data.conversation.OrConvQADataset`
44
55
  """
@@ -47,3 +58,35 @@ def preprocessed(train, dev, test):
47
58
  "validation": OrConvQADataset(path=dev),
48
59
  "test": OrConvQADataset(path=test),
49
60
  }
61
+
62
+
63
def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
    """Iterates over the documents of a gzip-compressed JSON-lines file

    Each line of ``source`` is parsed as a JSON object whose keys are passed
    as keyword arguments to ``OrConvQADocumentStore.NAMED_TUPLE``.

    :param source: path of the gzip-compressed file to read
    :returns: an iterator yielding one named tuple per line of the file
    """
    # Decode explicitly as UTF-8 (the standard encoding for JSON text) so that
    # the result does not depend on the locale's default encoding
    with gzip.open(source, "rt", encoding="utf-8") as fp:
        for line in fp:
            yield OrConvQADocumentStore.NAMED_TUPLE(**json.loads(line))
67
+
68
+
69
@lz4docstore_downloader(
    "all_blocks",
    "https://ciir.cs.umass.edu/downloads/ORConvQA/all_blocks.txt.gz",
    orConvQADocumentReader,
    OrConvQADocumentStore.NAMED_TUPLE,
    "id",
    checker=HashCheck("1095a3408690e7bbe4d8a87a2bae6356"),
    size=5_086_902_800,
    count_hint=11_377_951,
)
@dataset(
    OrConvQADocumentStore,
    url="https://github.com/prdwb/orconvqa-release",
)
def passages(all_blocks):
    """orConvQA wikipedia files

    OrConvQA is an aggregation of three existing datasets:

    1. the QuAC dataset that offers information-seeking conversations,
    2. the CANARD dataset that consists of context-independent rewrites of QuAC questions, and
    3. the Wikipedia corpus that serves as the knowledge source of answering questions.
    """
    # `all_blocks` is the value of the "all_blocks" resource declared by the
    # `lz4docstore_downloader` decorator above; the document count is the
    # same number as the `count_hint` given to that downloader
    return {"path": all_blocks, "count": 11_377_951}
@@ -130,6 +130,20 @@ class TweetDoc(IDHolder, Document):
130
130
  return f"{self.text}"
131
131
 
132
132
 
133
@define
class OrConvQADocument(IDHolder, Document):
    """Document with the fields ``id``, ``title``, ``text``, ``aid`` and ``bid``

    NOTE(review): ``aid`` and ``bid`` presumably stand for the article and
    block identifiers of the OrConvQA passages; confirm against the data.
    """

    # The declaration order matters: OrConvQADocumentStore.NAMED_TUPLE is
    # built from the attrs fields of this class
    id: str
    title: str
    text: str
    aid: str
    bid: int

    has_text: ClassVar[bool] = True

    def get_text(self):
        """Returns the title and the text separated by a single space"""
        return "{} {}".format(self.title, self.text)
145
+
146
+
133
147
  @define
134
148
  class TrecTopic(GenericTopic):
135
149
  text: str
@@ -0,0 +1,22 @@
1
+ from collections import namedtuple
2
+ from typing import List
3
+ from experimaestro import Constant
4
+ import attrs
5
+
6
+ from datamaestro_text.datasets.irds.data import LZ4DocumentStore
7
+ from datamaestro_text.data.ir.formats import OrConvQADocument
8
+
9
+
10
class OrConvQADocumentStore(LZ4DocumentStore):
    """LZ4 document store whose records are converted to :class:`OrConvQADocument`"""

    # Record type of the store: one field per attrs field of
    # OrConvQADocument, in the same order
    NAMED_TUPLE = namedtuple(
        "OrConvQADocument",
        [attribute.name for attribute in attrs.fields(OrConvQADocument)],
    )

    lookup_field: Constant[str] = "id"
    fields: Constant[List[str]] = [*NAMED_TUPLE._fields]
    index_fields: Constant[List[str]] = ["id"]

    data_cls = NAMED_TUPLE

    def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
        """Builds an :class:`OrConvQADocument` from a stored named tuple"""
        values = data._asdict()
        return OrConvQADocument(**values)
@@ -1,7 +1,9 @@
1
1
  import logging
2
- from typing import Any, Iterator, Tuple, Type, List
2
+ from pathlib import Path
3
+ from typing import Any, Iterator, NamedTuple, Tuple, Type, List
3
4
  import attrs
4
5
  import ir_datasets
6
+ from ir_datasets.indices import PickleLz4FullStore
5
7
  from ir_datasets.formats import (
6
8
  GenericDoc,
7
9
  GenericQuery,
@@ -10,7 +12,7 @@ from ir_datasets.formats import (
10
12
  TrecQuery,
11
13
  )
12
14
  import ir_datasets.datasets as _irds
13
- from experimaestro import Config
15
+ from experimaestro import Config, Param
14
16
  from experimaestro.compat import cached_property
15
17
  from experimaestro import Option
16
18
  import datamaestro_text.data.ir as ir
@@ -208,6 +210,67 @@ if hasattr(_irds, "miracl"):
208
210
  )
209
211
 
210
212
 
213
+ # Fix while PR https://github.com/allenai/ir_datasets/pull/252
214
+ # is not in.
215
class DMPickleLz4FullStore(PickleLz4FullStore):
    """``PickleLz4FullStore`` with an overridden :meth:`get_many`"""

    def get_many(self, doc_ids, field=None):
        """Returns a dictionary of the requested documents

        :param doc_ids: identifiers passed to ``get_many_iter``
        :param field: name of a field of the document class; when given, only
            the value of this field is kept for each document
        :returns: a dictionary whose keys are the values of the store ID field
            and whose values are the documents (or the requested field)
        """
        if field is None:
            return {
                getattr(doc, self._id_field): doc
                for doc in self.get_many_iter(doc_ids)
            }

        # The position of the field is resolved once, before iterating
        field_idx = self._doc_cls._fields.index(field)
        return {
            getattr(doc, self._id_field): doc[field_idx]
            for doc in self.get_many_iter(doc_ids)
        }
225
+
226
+
227
class LZ4DocumentStore(ir.DocumentStore):
    """A LZ4-based document store

    Documents are read through a :class:`DMPickleLz4FullStore` located at
    ``path``. The ``data_cls`` attribute is read by :meth:`store` but is not
    defined here: it has to be provided by subclasses.
    """

    #: Path of the LZ4 store
    path: Param[Path]

    #: Lookup field
    lookup_field: Param[str]

    #: Extra indexed fields (e.g. URLs)
    index_fields: List[str]

    @cached_property
    def store(self):
        """The underlying LZ4 store (created on first access)"""
        return DMPickleLz4FullStore(
            self.path, None, self.data_cls, self.lookup_field, self.index_fields
        )

    @cached_property
    def _docs(self):
        """Result of iterating the store (created on first access)"""
        return self.store.__iter__()

    def docid_internal2external(self, ix: int):
        """Returns the external ID of the document at internal index ``ix``"""
        return getattr(self._docs[ix], self.store._id_field)

    def document_ext(self, docid: str) -> Document:
        """Returns the (converted) document given its external ID"""
        return self.converter(self.store.get(docid))

    def documents_ext(self, docids: List[str]) -> List[Document]:
        """Returns documents given their external IDs (optimized for batch)

        The returned list follows the order of ``docids``.
        """
        retrieved = self.store.get_many(docids)
        return [self.converter(retrieved[docid]) for docid in docids]

    def converter(self, data):
        """Converts a document from LZ4 tuples to any other format"""
        # By default, use identity
        return data

    def iter(self) -> Iterator[Document]:
        """Returns an iterator over (converted) documents"""
        return map(self.converter, self.store.__iter__())

    def documentcount(self):
        """Returns ``self.count`` when it is set, the store count otherwise"""
        if self.count:
            return self.count
        return self.store.count()
272
+
273
+
211
274
  @attrs.define()
212
275
  class IRDSQueryWrapper(ir.Topic):
213
276
  query: Any
@@ -0,0 +1,71 @@
1
+ import logging
2
+ from typing import Optional, Type, Callable, Iterator
3
+ from ir_datasets.indices import PickleLz4FullStore
4
+ from datamaestro.download import Download
5
+ from datamaestro.utils import FileChecker
6
+ from pathlib import Path
7
+ import urllib3
8
+
9
+
10
class lz4docstore_downloader(Download):
    """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents"""

    def __init__(
        self,
        varname: str,
        url: str,
        iter_factory: Callable[[Path], Iterator],
        doc_cls: Type,
        lookup_field: str,
        *,
        count_hint: Optional[int] = None,
        size: Optional[int] = None,
        checker: Optional[FileChecker] = None,
    ):
        """Creates the resource

        :param varname: name of the resource (given to ``Download``)
        :param url: URL of the file containing the documents
        :param iter_factory: called with the path of the downloaded file,
            returns an iterator over the documents to store
        :param doc_cls: document class given to ``PickleLz4FullStore``
        :param lookup_field: name of the field used for lookups (it is also
            the only indexed field)
        :param count_hint: passed as ``count_hint`` to ``PickleLz4FullStore``
        :param size: passed as ``size`` when downloading the URL
        :param checker: when set, its ``check`` method is called with the
            path of the downloaded file before building the store
        """
        super().__init__(varname)
        self.iter_factory = iter_factory
        self.url = url
        self.doc_cls = doc_cls
        self.size = size
        self.lookup_field = lookup_field
        self.count_hint = count_hint
        self.checker = checker

        # The store name is the last component of the URL path without its
        # final suffix (e.g. `all_blocks.txt` for `.../all_blocks.txt.gz`)
        p = urllib3.util.parse_url(self.url)
        assert p is not None
        self.name = Path(p.path).with_suffix("").name

    def prepare(self):
        """Returns the path of the document store"""
        return self.definition.datapath / self.name

    def download(self, force=False):
        """Downloads the file and builds the LZ4 store

        :param force: rebuilds the store even if the ``done`` marker exists
        :returns: True when the store was already built, None otherwise
        """
        # Creates directory if needed (including missing parents, so that a
        # dataset whose data path does not exist yet can be downloaded)
        destination = self.definition.datapath / self.name
        destination.mkdir(parents=True, exist_ok=True)

        # Early exit
        if (destination / "done").is_file() and not force:
            return True

        # Download (cache)
        logging.info("Building the document index")
        with self.context.downloadURL(self.url, size=self.size) as file:
            # Checks the file
            if self.checker:
                self.checker.check(file.path)

            # Builds the LZ4 store
            store = PickleLz4FullStore(
                destination,
                lambda: self.iter_factory(Path(file.path)),
                self.doc_cls,
                lookup_field=self.lookup_field,
                index_fields=[self.lookup_field],
                key_field_prefix=None,
                size_hint=None,
                count_hint=self.count_hint,
            )
            store.build()

        # All good!
        (destination / "done").touch()
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '2023.12.5.1'
16
- __version_tuple__ = version_tuple = (2023, 12, 5, 1)
15
+ __version__ = version = '2023.12.12'
16
+ __version_tuple__ = version_tuple = (2023, 12, 12)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2023.12.5.1
3
+ Version: 2023.12.12
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -1,12 +1,12 @@
1
1
  datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
2
- datamaestro_text/version.py,sha256=dmMi7UY_HMKPQFgxCPaECvzfrSKLdDUboEx4k8N_wnE,424
2
+ datamaestro_text/version.py,sha256=VZXVckR_vXa9FiYA1ju8Nq6CTGMqdvgq-SfQ3rz-1S0,421
3
3
  datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
5
5
  datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
6
6
  datamaestro_text/config/com/sentiment140.py,sha256=iRV_rSjQcr9WNjjQ4EdznaEMGLADV_rwpeRx7ycQi0k,1383
7
7
  datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
8
8
  datamaestro_text/config/com/github/aagohary/canard.py,sha256=tIwb_KxMUR3st7rzQUkt6rIjolTl8uKvDq6t795b1nY,1468
9
- datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=0umr_mo9N1av7b3V9eOnHVFFQNEtJkXatLdgZL0KXP4,1767
9
+ datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=BytI8euqX04RlTCM8LvYKNKm9SVUTClSnszE3QUhGR8,3196
10
10
  datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
11
11
  datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
12
12
  datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=2PP9-HmJAqxFmQkvVcDwfGKpg_gJOQZd1q5ocoi12No,11755
@@ -52,13 +52,15 @@ datamaestro_text/data/ir/base.py,sha256=7FUh4ursVdLMaqUEngZ-TSFki_3xxdEihpVe09hl
52
52
  datamaestro_text/data/ir/cord19.py,sha256=JN31EQeg0UFAJlIkg0Ie0_pq-f-oS1OstZGJLJBeKyY,1130
53
53
  datamaestro_text/data/ir/csv.py,sha256=vgBNOeayEALwO01LmrzVOEVbs_MWJn3eIm-o0KiXjiE,1836
54
54
  datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
55
- datamaestro_text/data/ir/formats.py,sha256=6344Tj2yTxQ5KW-YtkBbdbCgWTbSsO6f0AaJlvvibqM,3248
55
+ datamaestro_text/data/ir/formats.py,sha256=sQ08vvuHxPMUJMQZjNpwjUZ9BMJNdzlOqSB-PahdZ70,3474
56
56
  datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
57
+ datamaestro_text/data/ir/stores.py,sha256=JdeDhPxAQOM5_1Pqi_HGoPNUbe63_zMaz-NRs24RS94,687
57
58
  datamaestro_text/data/ir/trec.py,sha256=n98_O_sPPdU2i037fAboD4lB_I7C-RJrOLmmkg3osL8,1741
58
59
  datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
59
60
  datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
60
- datamaestro_text/datasets/irds/data.py,sha256=hDctKswyzD_VrCRcD6pNIoKiiwvapWQBUwxzdFHesIM,9348
61
+ datamaestro_text/datasets/irds/data.py,sha256=1DRhDk4kBJLSWCX3LoJaHULy-YR0DaIj38s_n3x2YEM,11342
61
62
  datamaestro_text/datasets/irds/datasets.py,sha256=4tNTmlcF2OmUttCMyz5YTepi91pvaZB4syy5u-jAKh4,5556
63
+ datamaestro_text/datasets/irds/helpers.py,sha256=KC-2nQPCIl4VnbfDkAkr4iFlhkknn8zvbADlClWZvwU,2207
62
64
  datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
63
65
  datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
64
66
  datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
@@ -72,9 +74,9 @@ datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1v
72
74
  datamaestro_text/utils/iter.py,sha256=-m0Y_0YjSlEVbotzZYIA0Ca0Hq0G_bF9GfAZR2yxrAk,520
73
75
  datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
74
76
  datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
75
- datamaestro_text-2023.12.5.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
76
- datamaestro_text-2023.12.5.1.dist-info/METADATA,sha256=rs3ITl4eS2SqYOWKDaG4Ak3GlvFKF_cBWiJ78FHx7XE,1580
77
- datamaestro_text-2023.12.5.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
78
- datamaestro_text-2023.12.5.1.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
79
- datamaestro_text-2023.12.5.1.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
80
- datamaestro_text-2023.12.5.1.dist-info/RECORD,,
77
+ datamaestro_text-2023.12.12.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
78
+ datamaestro_text-2023.12.12.dist-info/METADATA,sha256=YYoyl-_XL58GbzeG3MV5vYhdc9WbK_HCTtvx1rvRvu8,1579
79
+ datamaestro_text-2023.12.12.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
80
+ datamaestro_text-2023.12.12.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
81
+ datamaestro_text-2023.12.12.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
82
+ datamaestro_text-2023.12.12.dist-info/RECORD,,