PyPI - datamaestro-text - Versions diffs - 2023.12.5.1__py3-none-any.whl → 2023.12.12__py3-none-any.whl - Mend

datamaestro-text 2023.12.5.1__py3-none-any.whl → 2023.12.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

datamaestro_text/config/com/github/prdwb/orconvqa.py CHANGED Viewed

@@ -1,5 +1,11 @@
 # See documentation on https://datamaestro.readthedocs.io
+from collections import namedtuple
+import gzip
+import json
+from pathlib import Path
+from typing import Iterator, NamedTuple
+import attrs
 from datamaestro.definitions import datatasks, datatags, dataset
 from datamaestro.download.single import filedownloader
 from datamaestro.utils import HashCheck
@@ -8,6 +14,12 @@ from datamaestro.utils import HashCheck
 from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
 from datamaestro.data.ml import Supervised
+from datamaestro_text.data.ir import DocumentStore
+from datamaestro_text.data.ir.formats import OrConvQADocument
+from datamaestro_text.data.ir.stores import OrConvQADocumentStore
+from datamaestro_text.datasets.irds.data import LZ4DocumentStore
+from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader
 @datatags("conversation", "context", "query")
 @datatasks("query rewriting")
@@ -31,14 +43,13 @@ from datamaestro.data.ml import Supervised
     url="https://github.com/prdwb/orconvqa-release",
 )
 def preprocessed(train, dev, test):
-    """Question-in-context rewriting
+    """Open-Retrieval Conversational Question Answering datasets
+    OrConvQA is an aggregation of three existing datasets:
-    CANARD is a dataset for question-in-context rewriting that consists of
-    questions each given in a dialog context together with a context-independent
-    rewriting of the question. The context of each question is the dialog
-    utterances that precede the question. CANARD can be used to evaluate
-    question rewriting models that handle important linguistic phenomena such as
-    co-reference and ellipsis resolution.
+    1. the QuAC dataset that offers information-seeking conversations,
+    2. the CANARD dataset that consists of context-independent rewrites of QuAC questions, and
+    3. the Wikipedia corpus that serves as the knowledge source of answering questions.
     Each dataset is an instance of :class:`datamaestro_text.data.conversation.OrConvQADataset`
     """
@@ -47,3 +58,35 @@ def preprocessed(train, dev, test):
         "validation": OrConvQADataset(path=dev),
         "test": OrConvQADataset(path=test),
     }
+def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
+    with gzip.open(source, "rt") as fp:
+        for line in fp:
+            yield OrConvQADocumentStore.NAMED_TUPLE(**json.loads(line))
+# Downloads the gzip-compressed passage file and builds an LZ4 document store
+# from it; the positional arguments are, in order: variable name, URL,
+# document iterator factory, document class and lookup field.
+@lz4docstore_downloader(
+    "all_blocks",
+    "https://ciir.cs.umass.edu/downloads/ORConvQA/all_blocks.txt.gz",
+    orConvQADocumentReader,
+    OrConvQADocumentStore.NAMED_TUPLE,
+    "id",
+    checker=HashCheck("1095a3408690e7bbe4d8a87a2bae6356"),
+    size=5_086_902_800,
+    count_hint=11_377_951,
+)
+@dataset(
+    OrConvQADocumentStore,
+    url="https://github.com/prdwb/orconvqa-release",
+)
+def passages(all_blocks):
+    """OrConvQA Wikipedia passages
+    OrConvQA is an aggregation of three existing datasets:
+    1. the QuAC dataset that offers information-seeking conversations,
+    2. the CANARD dataset that consists of context-independent rewrites of QuAC questions, and
+    3. the Wikipedia corpus that serves as the knowledge source of answering questions.
+    """
+    # "count" is the same value as the ``count_hint`` given to the downloader
+    return {"path": all_blocks, "count": 11_377_951}

datamaestro_text/data/ir/formats.py CHANGED Viewed

@@ -130,6 +130,20 @@ class TweetDoc(IDHolder, Document):
         return f"{self.text}"
+@define
+class OrConvQADocument(IDHolder, Document):
+    """A document with an identifier, a title and a text.
+
+    NOTE(review): the meaning of ``aid`` and ``bid`` is not visible here;
+    presumably article and block identifiers -- confirm against the dataset.
+    """
+    id: str
+    title: str
+    text: str
+    aid: str
+    bid: int
+    # Declared as a ClassVar, i.e. a class-level flag and not a document field
+    has_text: ClassVar[bool] = True
+    def get_text(self):
+        """Return the title followed by the text, separated by one space."""
+        return f"{self.title} {self.text}"
 @define
 class TrecTopic(GenericTopic):
     text: str

datamaestro_text/data/ir/stores.py ADDED Viewed

@@ -0,0 +1,22 @@
+from collections import namedtuple
+from typing import List
+from experimaestro import Constant
+import attrs
+from datamaestro_text.datasets.irds.data import LZ4DocumentStore
+from datamaestro_text.data.ir.formats import OrConvQADocument
+class OrConvQADocumentStore(LZ4DocumentStore):
+    """LZ4 document store whose records are converted to :class:`OrConvQADocument`."""
+    # Tuple type of the stored records: one field per attrs field of
+    # OrConvQADocument
+    NAMED_TUPLE = namedtuple(
+        "OrConvQADocument", [a.name for a in attrs.fields(OrConvQADocument)]
+    )
+    # Records are looked up through their "id" field
+    lookup_field: Constant[str] = "id"
+    # Names of all the fields of a stored record
+    fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
+    # Only the "id" field is indexed
+    index_fields: Constant[List[str]] = ["id"]
+    # Record class of the store
+    data_cls = NAMED_TUPLE
+    def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
+        """Build an :class:`OrConvQADocument` from a stored named tuple."""
+        return OrConvQADocument(**data._asdict())

datamaestro_text/datasets/irds/data.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import logging
-from typing import Any, Iterator, Tuple, Type, List
+from pathlib import Path
+from typing import Any, Iterator, NamedTuple, Tuple, Type, List
 import attrs
 import ir_datasets
+from ir_datasets.indices import PickleLz4FullStore
 from ir_datasets.formats import (
     GenericDoc,
     GenericQuery,
@@ -10,7 +12,7 @@ from ir_datasets.formats import (
     TrecQuery,
 )
 import ir_datasets.datasets as _irds
-from experimaestro import Config
+from experimaestro import Config, Param
 from experimaestro.compat import cached_property
 from experimaestro import Option
 import datamaestro_text.data.ir as ir
@@ -208,6 +210,67 @@ if hasattr(_irds, "miracl"):
     )
+# Fix while PR https://github.com/allenai/ir_datasets/pull/252
+# is not in.
+class DMPickleLz4FullStore(PickleLz4FullStore):
+    def get_many(self, doc_ids, field=None):
+        result = {}
+        field_idx = self._doc_cls._fields.index(field) if field is not None else None
+        for doc in self.get_many_iter(doc_ids):
+            if field is not None:
+                result[getattr(doc, self._id_field)] = doc[field_idx]
+            else:
+                result[getattr(doc, self._id_field)] = doc
+        return result
+class LZ4DocumentStore(ir.DocumentStore):
+    """A LZ4-based document store
+
+    The ``store`` property reads ``self.data_cls`` (the record class), which is
+    not declared in this class and has to be provided elsewhere, e.g. by a
+    subclass.
+    """
+    #: Path given to the underlying store
+    path: Param[Path]
+    #: Lookup field
+    lookup_field: Param[str]
+    # Extra indexed fields (e.g. URLs)
+    index_fields: List[str]
+    @cached_property
+    def store(self):
+        """The underlying ``DMPickleLz4FullStore`` (created once, then cached)"""
+        # The second argument is None: no document iterator is supplied here
+        return DMPickleLz4FullStore(
+            self.path, None, self.data_cls, self.lookup_field, self.index_fields
+        )
+    @cached_property
+    def _docs(self):
+        """Cached iterator object of the store, indexed by internal position"""
+        return self.store.__iter__()
+    def docid_internal2external(self, ix: int):
+        """Returns the external ID of the document at internal index ``ix``"""
+        return getattr(self._docs[ix], self.store._id_field)
+    def document_ext(self, docid: str) -> Document:
+        """Returns the (converted) document given its external ID"""
+        return self.converter(self.store.get(docid))
+    def documents_ext(self, docids: List[str]) -> List[Document]:
+        """Returns documents given their external IDs (optimized for batch)"""
+        retrieved = self.store.get_many(docids)
+        # Rebuild the list so that it follows the order of ``docids``
+        return [self.converter(retrieved[docid]) for docid in docids]
+    def converter(self, data):
+        """Converts a document from LZ4 tuples to any other format"""
+        # By default, use identity
+        return data
+    def iter(self) -> Iterator[Document]:
+        """Returns an iterator over documents"""
+        return map(self.converter, self.store.__iter__())
+    def documentcount(self):
+        """Returns ``self.count`` when it is set (truthy), otherwise the store count"""
+        if self.count:
+            return self.count
+        return self.store.count()
 @attrs.define()
 class IRDSQueryWrapper(ir.Topic):
     query: Any

datamaestro_text/datasets/irds/helpers.py ADDED Viewed

@@ -0,0 +1,71 @@
+import logging
+from typing import Optional, Type, Callable, Iterator
+from ir_datasets.indices import PickleLz4FullStore
+from datamaestro.download import Download
+from datamaestro.utils import FileChecker
+from pathlib import Path
+import urllib3
+class lz4docstore_downloader(Download):
+    """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents"""
+    def __init__(
+        self,
+        varname: str,
+        url: str,
+        iter_factory: Callable[[Path], Iterator],
+        doc_cls: Type,
+        lookup_field: str,
+        *,
+        count_hint: Optional[int] = None,
+        size: Optional[int] = None,
+        checker: FileChecker = None,
+    ):
+        super().__init__(varname)
+        self.iter_factory = iter_factory
+        self.url = url
+        self.doc_cls = doc_cls
+        self.size = size
+        self.lookup_field = lookup_field
+        self.count_hint = count_hint
+        self.checker = checker
+        p = urllib3.util.parse_url(self.url)
+        assert p is not None
+        self.name = Path(p.path).with_suffix("").name
+    def prepare(self):
+        return self.definition.datapath / self.name
+    def download(self, force=False):
+        # Creates directory if needed
+        destination = self.definition.datapath / self.name
+        destination.mkdir(exist_ok=True)
+        # Early exit
+        if (destination / "done").is_file() and not force:
+            return True
+        # Download (cache)
+        logging.info("Building the document index")
+        with self.context.downloadURL(self.url, size=self.size) as file:
+            # Checks the file
+            if self.checker:
+                self.checker.check(file.path)
+            # Builds the LZ4 store
+            store = PickleLz4FullStore(
+                destination,
+                lambda: self.iter_factory(Path(file.path)),
+                self.doc_cls,
+                lookup_field=self.lookup_field,
+                index_fields=[self.lookup_field],
+                key_field_prefix=None,
+                size_hint=None,
+                count_hint=self.count_hint,
+            )
+            store.build()
+            # All good!
+            (destination / "done").touch()

datamaestro_text/version.py CHANGED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '2023.12.5.1'
-__version_tuple__ = version_tuple = (2023, 12, 5, 1)
+__version__ = version = '2023.12.12'
+__version_tuple__ = version_tuple = (2023, 12, 12)

{datamaestro_text-2023.12.5.1.dist-info → datamaestro_text-2023.12.12.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamaestro-text
-Version: 2023.12.5.1
+Version: 2023.12.12
 Summary: Datamaestro module for text-related datasets
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
 License: GPL-3

{datamaestro_text-2023.12.5.1.dist-info → datamaestro_text-2023.12.12.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,12 @@
 datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
-datamaestro_text/version.py,sha256=dmMi7UY_HMKPQFgxCPaECvzfrSKLdDUboEx4k8N_wnE,424
+datamaestro_text/version.py,sha256=VZXVckR_vXa9FiYA1ju8Nq6CTGMqdvgq-SfQ3rz-1S0,421
 datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
 datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
 datamaestro_text/config/com/sentiment140.py,sha256=iRV_rSjQcr9WNjjQ4EdznaEMGLADV_rwpeRx7ycQi0k,1383
 datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
 datamaestro_text/config/com/github/aagohary/canard.py,sha256=tIwb_KxMUR3st7rzQUkt6rIjolTl8uKvDq6t795b1nY,1468
-datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=0umr_mo9N1av7b3V9eOnHVFFQNEtJkXatLdgZL0KXP4,1767
+datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=BytI8euqX04RlTCM8LvYKNKm9SVUTClSnszE3QUhGR8,3196
 datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
 datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
 datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=2PP9-HmJAqxFmQkvVcDwfGKpg_gJOQZd1q5ocoi12No,11755
@@ -52,13 +52,15 @@ datamaestro_text/data/ir/base.py,sha256=7FUh4ursVdLMaqUEngZ-TSFki_3xxdEihpVe09hl
 datamaestro_text/data/ir/cord19.py,sha256=JN31EQeg0UFAJlIkg0Ie0_pq-f-oS1OstZGJLJBeKyY,1130
 datamaestro_text/data/ir/csv.py,sha256=vgBNOeayEALwO01LmrzVOEVbs_MWJn3eIm-o0KiXjiE,1836
 datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
-datamaestro_text/data/ir/formats.py,sha256=6344Tj2yTxQ5KW-YtkBbdbCgWTbSsO6f0AaJlvvibqM,3248
+datamaestro_text/data/ir/formats.py,sha256=sQ08vvuHxPMUJMQZjNpwjUZ9BMJNdzlOqSB-PahdZ70,3474
 datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
+datamaestro_text/data/ir/stores.py,sha256=JdeDhPxAQOM5_1Pqi_HGoPNUbe63_zMaz-NRs24RS94,687
 datamaestro_text/data/ir/trec.py,sha256=n98_O_sPPdU2i037fAboD4lB_I7C-RJrOLmmkg3osL8,1741
 datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
 datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
-datamaestro_text/datasets/irds/data.py,sha256=hDctKswyzD_VrCRcD6pNIoKiiwvapWQBUwxzdFHesIM,9348
+datamaestro_text/datasets/irds/data.py,sha256=1DRhDk4kBJLSWCX3LoJaHULy-YR0DaIj38s_n3x2YEM,11342
 datamaestro_text/datasets/irds/datasets.py,sha256=4tNTmlcF2OmUttCMyz5YTepi91pvaZB4syy5u-jAKh4,5556
+datamaestro_text/datasets/irds/helpers.py,sha256=KC-2nQPCIl4VnbfDkAkr4iFlhkknn8zvbADlClWZvwU,2207
 datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
 datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
 datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
@@ -72,9 +74,9 @@ datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1v
 datamaestro_text/utils/iter.py,sha256=-m0Y_0YjSlEVbotzZYIA0Ca0Hq0G_bF9GfAZR2yxrAk,520
 datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
 datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
-datamaestro_text-2023.12.5.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-datamaestro_text-2023.12.5.1.dist-info/METADATA,sha256=rs3ITl4eS2SqYOWKDaG4Ak3GlvFKF_cBWiJ78FHx7XE,1580
-datamaestro_text-2023.12.5.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-datamaestro_text-2023.12.5.1.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
-datamaestro_text-2023.12.5.1.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
-datamaestro_text-2023.12.5.1.dist-info/RECORD,,
+datamaestro_text-2023.12.12.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+datamaestro_text-2023.12.12.dist-info/METADATA,sha256=YYoyl-_XL58GbzeG3MV5vYhdc9WbK_HCTtvx1rvRvu8,1579
+datamaestro_text-2023.12.12.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamaestro_text-2023.12.12.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
+datamaestro_text-2023.12.12.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
+datamaestro_text-2023.12.12.dist-info/RECORD,,

{datamaestro_text-2023.12.5.1.dist-info → datamaestro_text-2023.12.12.dist-info}/LICENSE RENAMED Viewed

File without changes

{datamaestro_text-2023.12.5.1.dist-info → datamaestro_text-2023.12.12.dist-info}/WHEEL RENAMED Viewed

File without changes

{datamaestro_text-2023.12.5.1.dist-info → datamaestro_text-2023.12.12.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{datamaestro_text-2023.12.5.1.dist-info → datamaestro_text-2023.12.12.dist-info}/top_level.txt RENAMED Viewed

File without changes

datamaestro-text 2023.12.5.1__py3-none-any.whl → 2023.12.12__py3-none-any.whl

datamaestro-text 2023.12.5.1__py3-none-any.whl → 2023.12.12__py3-none-any.whl