PyPI - datamaestro-text - Versions diffs - 2024.5.31__tar.gz → 2025.4.3__tar.gz - Mend

datamaestro-text 2024.5.31__tar.gz → 2025.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

{datamaestro_text-2024.5.31/src/datamaestro_text.egg-info → datamaestro_text-2025.4.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: datamaestro-text
-Version: 2024.5.31
+Version: 2025.4.3
 Summary: Datamaestro module for text-related datasets
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
 License: GPL-3
@@ -18,9 +18,10 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: datamaestro>=1.1.1
-Requires-Dist: ir_datasets
+Requires-Dist: datamaestro>=1.2.1
+Requires-Dist: ir_datasets>=0.5.8
 Requires-Dist: attrs
+Dynamic: license-file
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)

datamaestro_text-2025.4.3/requirements.txt ADDED Viewed

@@ -0,0 +1,3 @@
+datamaestro>=1.2.1
+ir_datasets>=0.5.8
+attrs

datamaestro_text-2025.4.3/src/datamaestro_text/config/com/github/apple/ml-qrecc.py ADDED Viewed

@@ -0,0 +1,87 @@
+# See documentation on https://datamaestro.readthedocs.io
+import re
+import json
+from pathlib import Path
+from datamaestro.definitions import datatasks, datatags, dataset
+from datamaestro.data.ml import Supervised
+from datamaestro.download import reference
+from datamaestro.download.archive import zipdownloader
+from datamaestro.download.wayback import wayback_documents
+from datamaestro.utils import HashCheck
+from datamaestro_text.data.conversation.qrecc import QReCCDataset
+from datamaestro_text.datasets.irds.data import (
+    LZ4JSONLDocumentStore,
+    SimpleJsonDocument,
+)
+from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
+@datatags("conversation", "context", "query")
+@datatasks("query rewriting")
+@zipdownloader(
+    "data",
+    "https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
+    checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
+)
+@dataset(
+    Supervised[QReCCDataset, None, QReCCDataset],
+    url="https://github.com/apple/ml-qrecc",
+    doi="https://doi.org/10.48550/arXiv.2010.04898",
+    id="",
+)
+def main(data: Path):
+    """Open-Domain Question Answering Goes Conversational via Question Rewriting
+    We introduce QReCC (Question Rewriting in Conversational Context), an
+    end-to-end open-domain question answering dataset comprising of 14K
+    conversations with 81K question-answer pairs. The goal of this dataset is to
+    provide a challenging benchmark for end-to-end conversational question
+    answering that includes the individual subtasks of question rewriting,
+    passage retrieval and reading comprehension
+    """
+    return {
+        "train": QReCCDataset(path=data / "qrecc_train.json"),
+        "test": QReCCDataset(path=data / "qrecc_test.json"),
+    }
+@dataset(
+    url="https://github.com/apple/ml-qrecc",
+    doi="https://doi.org/10.48550/arXiv.2010.04898",
+)
+class Content(LZ4JSONLDocumentStore):
+    """QReCC mentionned URLs content"""
+    @staticmethod
+    def __create_dataset__(dataset, options=None):
+        ds = reference(reference=main).setup(dataset, options)
+        documents_path = wayback_documents(
+            "20191127", lambda: Content._urls(ds), name="wayback.jsonl"
+        ).setup(dataset, options)
+        store_path = lz4docstore_builder(
+            "store",
+            lambda: Content._documents(documents_path),
+            SimpleJsonDocument,
+            "id",
+        ).setup(dataset, options)
+        return LZ4JSONLDocumentStore(jsonl_path=store_path)
+    @staticmethod
+    def _documents(path: Path):
+        """Iterates over documents from wayback"""
+        with path.open("rt") as fp:
+            for line in fp:
+                yield SimpleJsonDocument(**json.loads(line))
+    @staticmethod
+    def _urls(supervised: Supervised[QReCCDataset, None, QReCCDataset]):
+        urls = set()
+        for ds in [supervised.train, supervised.test]:
+            for entry in ds.entries():
+                if entry.answer_url:
+                    url = re.sub("#.*$", "", entry.answer_url)
+                    urls.add(url)
+        return urls

{datamaestro_text-2024.5.31 → datamaestro_text-2025.4.3}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py RENAMED Viewed

@@ -1,11 +1,11 @@
 """MS MARCO (Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. A variant of this task will be the part of TREC and AFIRM 2019. For Updates about TREC 2019 please follow This Repository Passage Reranking task Task Given a query q and a the 1000 most relevant passages P = p1, p2, p3,... p1000, as retrieved by BM25 a succeful system is expected to rerank the most relevant passage as high as possible. For this task not all 1000 relevant items have a human labeled relevant passage. Evaluation will be done using MRR.
-  **Publication**:
-  Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
-  MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
+**Publication**:
+Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
+MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
-  See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
+See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
 """
 from datamaestro.annotations.agreement import useragreement
@@ -35,6 +35,7 @@ http://www.msmarco.org/dataset.aspx""",
 # --- Document collection
 # TODO: Not ideal since it would be better to have small versions right away
 # instead of downloading again the MS Marco Collection
 @lua
@@ -43,10 +44,10 @@ http://www.msmarco.org/dataset.aspx""",
     url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
     checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
 )
-@dataset(Folder, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
-def collection_etc(data):
+@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
+def collection_etc(data) -> Folder:
     """Documents and some more files"""
-    return {"path": data}
+    return Folder(path=data)
 @lua

{datamaestro_text-2024.5.31 → datamaestro_text-2025.4.3}/src/datamaestro_text/config/com/sentiment140.py RENAMED Viewed

@@ -4,10 +4,6 @@ from datamaestro.download.archive import zipdownloader
 from datamaestro.data.ml import Supervised
 from datamaestro.utils import HashCheck
-# name: Sentiment140
-# web: http://help.sentiment140.com/for-students/
-# description: |
 @zipdownloader(

{datamaestro_text-2024.5.31 → datamaestro_text-2025.4.3}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py RENAMED Viewed

@@ -1,10 +1,9 @@
 """The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""
-from datamaestro.definitions import DatafolderPath
-from datamaestro.data import Base
-from datamaestro_text.data.ir.trec import TipsterCollection
-from datamaestro.definitions import argument, datatasks, datatags, dataset
+from datamaestro.context import DatafolderPath
+from datamaestro.definitions import dataset
 from datamaestro.download.links import links, linkfolder
+from datamaestro_text.data.ir.trec import TipsterCollection
 URL = "https://catalog.ldc.upenn.edu/LDC2002T31"

{datamaestro_text-2024.5.31 → datamaestro_text-2025.4.3}/src/datamaestro_text/config/gov/nist/trec/tipster.py RENAMED Viewed

@@ -22,8 +22,8 @@ from datamaestro_text.data.ir.trec import TipsterCollection
 from datamaestro.download.links import linkfolder
 from datamaestro.definitions import (
     dataset,
-    DatafolderPath,
 )
+from datamaestro.context import DatafolderPath
 # Store meta-information
 TIPSTER = dataset(TipsterCollection, url="https://catalog.ldc.upenn.edu/LDC93T3A")

{datamaestro_text-2024.5.31 → datamaestro_text-2025.4.3}/src/datamaestro_text/data/conversation/base.py RENAMED Viewed

@@ -188,7 +188,7 @@ class SingleConversationTreeNode(ConversationNode):
     def history(self) -> Sequence[Record]:
         return self.tree.history[self.index + 1 :]
-    def parent(self) -> ConversationNode | None:
+    def parent(self) -> Optional[ConversationNode]:
         return (
             SingleConversationTreeNode(self.tree, self.index + 1)
             if self.index < len(self.tree.history) - 1
@@ -235,7 +235,7 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
         for child in self.children:
             yield from child
-    def parent(self) -> ConversationNode | None:
+    def parent(self) -> Optional[ConversationNode]:
         return self.parent
     def children(self) -> List[ConversationNode]:

{datamaestro_text-2024.5.31 → datamaestro_text-2025.4.3}/src/datamaestro_text/data/embeddings.py RENAMED Viewed

@@ -1,5 +1,5 @@
-from pathlib import Path
-from datamaestro.data import Base, File, argument
+from experimaestro import Meta
+from datamaestro.data import Base, File
 from datamaestro.definitions import datatags
 import numpy as np
 from typing import Tuple, List
@@ -18,9 +18,9 @@ class WordEmbeddings(Base):
         raise NotImplementedError()
-@argument("encoding", str, ignored=True, default="utf-8")
 class WordEmbeddingsText(WordEmbeddings, File):
     """Word embeddings as a text word / values"""
+    encoding: Meta[str] = "utf-8"
     def load(self):
         words = []

{datamaestro_text-2024.5.31 → datamaestro_text-2025.4.3}/src/datamaestro_text/data/ir/cord19.py RENAMED Viewed

@@ -1,7 +1,8 @@
 from csv import DictReader
 from typing import Iterator
-from datamaestro.data import File, documentation
+from experimaestro import documentation
+from datamaestro.data import File
 from datamaestro.record import Record
 from datamaestro_text.data.ir import Documents, TopicRecord, Topics, IDItem
 from datamaestro_text.data.ir.formats import (

{datamaestro_text-2024.5.31 → datamaestro_text-2025.4.3}/src/datamaestro_text/data/ir/formats.py RENAMED Viewed

@@ -1,5 +1,5 @@
 from functools import cached_property
-from typing import ClassVar, Tuple
+from typing import ClassVar, Tuple, List
 from attrs import define
 from datamaestro.record import record_type
 from ir_datasets.datasets.wapo import WapoDocMedia
@@ -117,7 +117,6 @@ class TweetDoc(TextItem):
 @define
 class OrConvQADocument(TextItem):
-    id: str
     title: str
     body: str
     aid: str
@@ -127,12 +126,21 @@ class OrConvQADocument(TextItem):
     def text(self):
         return f"{self.title} {self.body}"
+@define
+class Touche2020(TextItem):
+    text: str
+    title: str
+    stance: str
+    url: str
 @define
-class TrecTopic(TextItem):
+class SciDocs(TextItem):
     text: str
-    query: str
-    narrative: str
+    title: str
+    authors: List[str]
+    year: int
+    cited_by: List[str]
+    references: List[str]
 @define
@@ -167,6 +175,13 @@ class TrecMb14Query(TextItem):
     def get_text(self):
         return f"{self.query}"
+@define
+class SciDocsTopic(TextItem):
+    text: str
+    authors: List[str]
+    year: int
+    cited_by: List[str]
+    references: List[str]
 @define()
 class TrecTopic(SimpleTextItem):

{datamaestro_text-2024.5.31 → datamaestro_text-2025.4.3}/src/datamaestro_text/data/ir/stores.py RENAMED Viewed

@@ -1,17 +1,21 @@
 from collections import namedtuple
-from typing import List
+from typing import List, NamedTuple
 from experimaestro import Constant
 import attrs
 from datamaestro.record import Record
+from datamaestro_text.data.ir.base import IDItem
 from datamaestro_text.datasets.irds.data import LZ4DocumentStore
 from datamaestro_text.data.ir.formats import OrConvQADocument
 class OrConvQADocumentStore(LZ4DocumentStore):
-    NAMED_TUPLE = namedtuple(
-        "OrConvQADocument", [a.name for a in attrs.fields(OrConvQADocument)]
-    )
+    class NAMED_TUPLE(NamedTuple):
+        id: str
+        title: str
+        body: str
+        aid: str
+        bid: int
     lookup_field: Constant[str] = "id"
     fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
@@ -19,5 +23,7 @@ class OrConvQADocumentStore(LZ4DocumentStore):
     data_cls = NAMED_TUPLE
-    def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
-        return Record(OrConvQADocument(**data._asdict()))
+    def converter(self, data: NAMED_TUPLE) -> Record:
+        fields = data._asdict()
+        del fields["id"]
+        return Record(OrConvQADocument(**fields), IDItem(data.id))

datamaestro_text-2025.4.3/src/datamaestro_text/data/recommendation.py ADDED Viewed

@@ -0,0 +1,13 @@
+from experimaestro import Param
+from datamaestro.data import Base, File
+import datamaestro.data.csv as csv
+class RatedItems(Base):
+    ratings: Param[File]
+class Movielens(RatedItems):
+    links: Param[csv.Generic]
+    movies: Param[csv.Generic]
+    tags: Param[csv.Generic]

{datamaestro_text-2024.5.31 → datamaestro_text-2025.4.3}/src/datamaestro_text/data/text.py RENAMED Viewed

@@ -1,15 +1,15 @@
-from pathlib import Path
-from datamaestro.data import Base, Folder, File, argument
+from typing import Optional
+from experimaestro import Param
+from datamaestro.data import Base, Folder, File
 from datamaestro.data.ml import Supervised
-@argument("train", type=Base)
-@argument("test", type=Base, required=False)
-@argument("validation", type=Base, required=False)
 class TrainingText(Supervised):
     """ "A dataset used for training with a train and a test"""
-    pass
+    train: Param[Base]
+    test: Param[Optional[Base]] = None
+    validation: Param[Optional[Base]] = None
 class TextFolder(Folder):

datamaestro-text 2024.5.31__tar.gz → 2025.4.3__tar.gz

datamaestro-text 2024.5.31__tar.gz → 2025.4.3__tar.gz