PyPI - datamaestro-text - Versions diffs - 2025.1.7__tar.gz → 2025.5.13__tar.gz - Mend

datamaestro-text 2025.1.7__tar.gz → 2025.5.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (126) hide show

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/.flake8 RENAMED Viewed

@@ -1,5 +1,5 @@
 [flake8]
-ignore = E203, E266, E501, W503, F403, F401
+ignore = E203, E266, E501, W503, F403, F401, E704
 max-line-length = 79
 max-complexity = 18
 select = B,C,E,F,W,T4,B9

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/.github/workflows/pytest.yml RENAMED Viewed

@@ -15,20 +15,18 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8, 3.9, "3.10", "3.11"]
+        python-version: ["3.10", "3.11", "3.12"]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install flake8 pytest
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-        SETUPTOOLS_SCM_PRETEND_VERSION="0.1-dev" pip install --no-dependencies -e .
+        SETUPTOOLS_SCM_PRETEND_VERSION="0.1-dev" pip install -e '.[dev]'
     - name: Lint with flake8
       run: |
         # stop the build if there are Python syntax errors or undefined names

{datamaestro_text-2025.1.7/src/datamaestro_text.egg-info → datamaestro_text-2025.5.13}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: datamaestro-text
-Version: 2025.1.7
+Version: 2025.5.13
 Summary: Datamaestro module for text-related datasets
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
 License: GPL-3
@@ -18,9 +18,16 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: datamaestro>=1.2.1
+Requires-Dist: datamaestro>=1.4.2
 Requires-Dist: ir_datasets>=0.5.8
 Requires-Dist: attrs
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: docutils; extra == "dev"
+Requires-Dist: sphobjinv; extra == "dev"
+Requires-Dist: flake8; extra == "dev"
+Requires-Dist: sphinx; extra == "dev"
+Dynamic: license-file
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/pyproject.toml RENAMED Viewed

@@ -27,6 +27,16 @@ homepage = "https://github.com/experimaestro/datamaestro_text"
 documentation = "https://datamaestro-text.readthedocs.io/en/latest/"
 repository = "https://github.com/experimaestro/datamaestro_text"
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "docutils",
+    "sphobjinv",
+    "flake8",
+    "sphinx"
+]
 [tool.setuptools_scm]
 write_to = "src/datamaestro_text/version.py"
 fallback_version = "0.0.0-dev"

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/requirements.txt RENAMED Viewed

@@ -1,3 +1,3 @@
-datamaestro>=1.2.1
+datamaestro>=1.4.2
 ir_datasets>=0.5.8
 attrs

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/github/aagohary/canard.py RENAMED Viewed

@@ -1,7 +1,5 @@
-# See documentation on https://datamaestro.readthedocs.io
 from datamaestro.definitions import datatasks, datatags, dataset
-from datamaestro.download.archive import zipdownloader
+from datamaestro.download.single import filedownloader
 from datamaestro.utils import HashCheck
 from datamaestro.data.ml import Supervised
@@ -10,14 +8,23 @@ from datamaestro_text.data.conversation.canard import CanardDataset
 @datatags("conversation", "context", "query")
 @datatasks("query rewriting")
-@zipdownloader(
-    "archive",
-    "https://obj.umiacs.umd.edu/elgohary/CANARD_Release.zip",
-    subpath="CANARD_Release",
-    checker=HashCheck("c9bba7c6bb898f669383415b54fd6ffd"),
+@filedownloader(
+    "train.json",
+    "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/train.json",
+    checker=HashCheck("73624ac646fb81e09b0fd7f01370ada3"),
+)
+@filedownloader(
+    "dev.json",
+    "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/dev.json",
+    checker=HashCheck("c84525631a83bc771c58ff31f4a9b601"),
+)
+@filedownloader(
+    "test.json",
+    "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/test.json",
+    checker=HashCheck("3fc14d0078e7a5056f5da571728f024e"),
 )
 @dataset(Supervised, url="https://sites.google.com/view/qanta/projects/canard", id="")
-def main(archive):
+def main(train, dev, test):
     """Question-in-context rewriting
     CANARD is a dataset for question-in-context rewriting that consists of
@@ -30,7 +37,7 @@ def main(archive):
     Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
     """
     return {
-        "train": CanardDataset(path=archive / "train.json"),
-        "validation": CanardDataset(path=archive / "dev.json"),
-        "test": CanardDataset(path=archive / "test.json"),
+        "train": CanardDataset(path=train),
+        "validation": CanardDataset(path=dev),
+        "test": CanardDataset(path=test),
     }

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py RENAMED Viewed

@@ -25,12 +25,11 @@ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
     checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
 )
 @dataset(
-    Supervised[QReCCDataset, None, QReCCDataset],
     url="https://github.com/apple/ml-qrecc",
     doi="https://doi.org/10.48550/arXiv.2010.04898",
     id="",
 )
-def main(data: Path):
+def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
     """Open-Domain Question Answering Goes Conversational via Question Rewriting
     We introduce QReCC (Question Rewriting in Conversational Context), an
@@ -40,10 +39,10 @@ def main(data: Path):
     answering that includes the individual subtasks of question rewriting,
     passage retrieval and reading comprehension
     """
-    return {
-        "train": QReCCDataset(path=data / "qrecc_train.json"),
-        "test": QReCCDataset(path=data / "qrecc_test.json"),
-    }
+    return Supervised(
+        train=QReCCDataset(path=data / "qrecc_train.json"),
+        test=QReCCDataset(path=data / "qrecc_test.json"),
+    )
 @dataset(
@@ -52,7 +51,6 @@ def main(data: Path):
 )
 class Content(LZ4JSONLDocumentStore):
     """QReCC mentionned URLs content"""
     @staticmethod
     def __create_dataset__(dataset, options=None):
         ds = reference(reference=main).setup(dataset, options)
@@ -67,7 +65,7 @@ class Content(LZ4JSONLDocumentStore):
             "id",
         ).setup(dataset, options)
-        return LZ4JSONLDocumentStore(jsonl_path=store_path)
+        return Content(jsonl_path=store_path)
     @staticmethod
     def _documents(path: Path):

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py RENAMED Viewed

@@ -1,11 +1,11 @@
 """MS MARCO (Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. A variant of this task will be the part of TREC and AFIRM 2019. For Updates about TREC 2019 please follow This Repository Passage Reranking task Task Given a query q and a the 1000 most relevant passages P = p1, p2, p3,... p1000, as retrieved by BM25 a succeful system is expected to rerank the most relevant passage as high as possible. For this task not all 1000 relevant items have a human labeled relevant passage. Evaluation will be done using MRR.
-  **Publication**:
-  Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
-  MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
+**Publication**:
+Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
+MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
-  See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
+See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
 """
 from datamaestro.annotations.agreement import useragreement
@@ -35,6 +35,7 @@ http://www.msmarco.org/dataset.aspx""",
 # --- Document collection
 # TODO: Not ideal since it would be better to have small versions right away
 # instead of downloading again the MS Marco Collection
 @lua
@@ -43,10 +44,10 @@ http://www.msmarco.org/dataset.aspx""",
     url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
     checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
 )
-@dataset(Folder, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
-def collection_etc(data):
+@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
+def collection_etc(data) -> Folder:
     """Documents and some more files"""
-    return {"path": data}
+    return Folder(path=data)
 @lua

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/com/sentiment140.py RENAMED Viewed

@@ -1,14 +1,9 @@
 from datamaestro.data.csv import Generic
-from datamaestro.definitions import argument, datatasks, datatags, dataset
+from datamaestro.definitions import datatasks, datatags, dataset
 from datamaestro.download.archive import zipdownloader
 from datamaestro.data.ml import Supervised
 from datamaestro.utils import HashCheck
-# name: Sentiment140
-# web: http://help.sentiment140.com/for-students/
-# description: |
 @zipdownloader(
     "dir",

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/stanford/glove.py RENAMED Viewed

@@ -11,6 +11,7 @@ from datamaestro.download.archive import zipdownloader
 from datamaestro.download.single import filedownloader
 from datamaestro_text.data.embeddings import WordEmbeddingsText
 # size: 822M
 # statistics:
 #   tokens: 6G

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py RENAMED Viewed

@@ -1,10 +1,9 @@
 """The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""
-from datamaestro.definitions import DatafolderPath
-from datamaestro.data import Base
-from datamaestro_text.data.ir.trec import TipsterCollection
-from datamaestro.definitions import argument, datatasks, datatags, dataset
+from datamaestro.context import DatafolderPath
+from datamaestro.definitions import dataset
 from datamaestro.download.links import links, linkfolder
+from datamaestro_text.data.ir.trec import TipsterCollection
 URL = "https://catalog.ldc.upenn.edu/LDC2002T31"

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/config/gov/nist/trec/tipster.py RENAMED Viewed

@@ -22,8 +22,8 @@ from datamaestro_text.data.ir.trec import TipsterCollection
 from datamaestro.download.links import linkfolder
 from datamaestro.definitions import (
     dataset,
-    DatafolderPath,
 )
+from datamaestro.context import DatafolderPath
 # Store meta-information
 TIPSTER = dataset(TipsterCollection, url="https://catalog.ldc.upenn.edu/LDC93T3A")

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/conversation/base.py RENAMED Viewed

@@ -129,6 +129,8 @@ class ConversationNode:
 class ConversationTree(ABC):
+    """Represents a conversation tree"""
     @abstractmethod
     def root(self) -> ConversationNode:
         ...
@@ -192,7 +194,7 @@ class SingleConversationTreeNode(ConversationNode):
         return (
             SingleConversationTreeNode(self.tree, self.index + 1)
             if self.index < len(self.tree.history) - 1
-            else []
+            else None
         )
     def children(self) -> List[ConversationNode]:
@@ -207,8 +209,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
     """A conversation tree node"""
     entry: Record
-    parent: Optional["ConversationTreeNode"]
-    children: List["ConversationTreeNode"]
+    _parent: Optional["ConversationTreeNode"]
+    _children: List["ConversationTreeNode"]
     def __init__(self, entry):
         self.entry = entry
@@ -216,8 +218,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
         self.children = []
     def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
-        self.children.append(node)
-        node.parent = self
+        self._children.append(node)
+        node._parent = self
         return node
     def conversation(self, skip_self: bool) -> ConversationHistory:
@@ -232,14 +234,14 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
     def __iter__(self) -> Iterator["ConversationTreeNode"]:
         """Iterates over all conversation tree nodes (pre-order)"""
         yield self.entry
-        for child in self.children:
+        for child in self._children:
             yield from child
     def parent(self) -> Optional[ConversationNode]:
-        return self.parent
+        return self._parent
     def children(self) -> List[ConversationNode]:
-        return self.children
+        return self._children
     def root(self):
         return self

datamaestro_text-2025.5.13/src/datamaestro_text/data/conversation/canard.py ADDED Viewed

@@ -0,0 +1,107 @@
+from typing import Iterator, List
+from attr import define
+import json
+from datamaestro.record import Record
+from datamaestro.data import File
+from datamaestro_text.data.conversation.base import (
+    ConversationDataset,
+    ConversationTree,
+    SingleConversationTree,
+    SimpleDecontextualizedItem,
+    EntryType,
+)
+from datamaestro_text.data.ir import IDItem, SimpleTextItem
+import logging
+@define(kw_only=True)
+class CanardConversation:
+    """A query with past history"""
+    history: List[str]
+    """The list of queries asked by the user"""
+    query: str
+    """The last issued query"""
+    rewrite: str
+    """Manually rewritten query"""
+    dialogue_id: str
+    """Conversation identifier"""
+    query_no: int
+    """Question number"""
+class CanardDataset(ConversationDataset, File):
+    """A dataset in the CANARD JSON format
+    The CANARD dataset is composed of
+    """
+    def entries(self) -> Iterator[CanardConversation]:
+        """Iterates over re-written query with their context"""
+        with self.path.open("rt") as fp:
+            data = json.load(fp)
+        for entry in data:
+            yield CanardConversation(
+                history=entry["History"],
+                query=entry["Question"],
+                rewrite=entry["Rewrite"],
+                dialogue_id=entry["QuAC_dialog_id"],
+                query_no=entry["Question_no"],
+            )
+    def __iter__(self) -> Iterator[ConversationTree]:
+        history: list[Record] = []
+        current_id = None
+        for entry in self.entries():
+            # Check if current conversation, otherwise we are OK
+            if current_id != entry.dialogue_id:
+                if current_id is not None:
+                    history.reverse()
+                    yield SingleConversationTree(current_id, history)
+                    history = []
+                current_id = entry.dialogue_id
+            if not history:
+                # First round
+                # The two first items are the wikipedia title and section,
+                # we interpret them as two user queries
+                assert len(entry.history) == 2
+                history.extend(
+                    Record(
+                        SimpleTextItem(text),
+                        EntryType.USER_QUERY,
+                    )
+                    for text in entry.history
+                )
+            else:
+                # The utterance before the last is the last user query
+                assert (
+                    entry.history[-2] == history[-1][SimpleTextItem].text
+                ), f"{entry.dialogue_id} {entry.history} / {history[-4:-1]}"
+                # The last utterance is the system side
+                history.append(
+                    Record(SimpleTextItem(entry.history[-1]), EntryType.SYSTEM_ANSWER)
+                )
+            assert len(entry.history) == len(history)
+            # Add to current
+            history.append(
+                Record(
+                    IDItem(f"{entry.dialogue_id}-{entry.query_no}"),
+                    SimpleTextItem(entry.query),
+                    SimpleDecontextualizedItem(entry.rewrite),
+                    EntryType.USER_QUERY,
+                )
+            )
+        if current_id:
+            yield SingleConversationTree(current_id, history)

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/conversation/orconvqa.py RENAMED Viewed

@@ -1,4 +1,3 @@
-from functools import cached_property
 from typing import Iterator, List, Optional
 from attr import define
 import json

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/embeddings.py RENAMED Viewed

@@ -1,5 +1,5 @@
-from pathlib import Path
-from datamaestro.data import Base, File, argument
+from experimaestro import Meta
+from datamaestro.data import Base, File
 from datamaestro.definitions import datatags
 import numpy as np
 from typing import Tuple, List
@@ -18,9 +18,9 @@ class WordEmbeddings(Base):
         raise NotImplementedError()
-@argument("encoding", str, ignored=True, default="utf-8")
 class WordEmbeddingsText(WordEmbeddings, File):
     """Word embeddings as a text word / values"""
+    encoding: Meta[str] = "utf-8"
     def load(self):
         words = []

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/base.py RENAMED Viewed

@@ -43,6 +43,12 @@ class IDItem(Item, ABC):
     id: str
+@define
+class UrlItem(Item):
+    """An url item"""
+    url: str
 @define
 class AdhocAssessment:

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/cord19.py RENAMED Viewed

@@ -1,7 +1,8 @@
 from csv import DictReader
 from typing import Iterator
-from datamaestro.data import File, documentation
+from experimaestro import documentation
+from datamaestro.data import File
 from datamaestro.record import Record
 from datamaestro_text.data.ir import Documents, TopicRecord, Topics, IDItem
 from datamaestro_text.data.ir.formats import (

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/ir/formats.py RENAMED Viewed

@@ -58,7 +58,7 @@ class NFCorpusDocument(TextItem):
     @cached_property
     def text(self):
-        return self.abstract
+        return f"{self.title} {self.abstract}"
 @define
@@ -99,8 +99,8 @@ class WapoDocument(TextItem):
     body_media: Tuple[WapoDocMedia, ...]
     @cached_property
-    def text(self):
-        return self.body
+    def text(self):
+        return f"{self.title} {self.body_paras_html}"
 @define
@@ -126,13 +126,24 @@ class OrConvQADocument(TextItem):
     def text(self):
         return f"{self.title} {self.body}"
 @define
+class DprW100Doc(TextItem):
+    text: str
+    title: str
+@define
+class MsMarcoV2Passage(TextItem):
+    text: str
+    spans: Tuple[Tuple[int, int], ...]
+    msmarco_document_id: str
 class Touche2020(TextItem):
     text: str
     title: str
     stance: str
     url: str
 @define
 class SciDocs(TextItem):
     text: str
@@ -175,7 +186,8 @@ class TrecMb14Query(TextItem):
     def get_text(self):
         return f"{self.query}"
-@define
+@define
 class SciDocsTopic(TextItem):
     text: str
     authors: List[str]
@@ -183,6 +195,7 @@ class SciDocsTopic(TextItem):
     cited_by: List[str]
     references: List[str]
 @define()
 class TrecTopic(SimpleTextItem):
     description: str
@@ -190,3 +203,17 @@ class TrecTopic(SimpleTextItem):
 TrecTopicRecord = record_type(IDItem, TrecTopic)
+@define
+class DprW100Query(TextItem):
+    text: str
+    answers: Tuple[str]
+@define
+class TrecBackgroundLinkingQuery(IDItem):
+    query_id: str
+    doc_id: str
+    url: str
+    def get_text(self):
+        raise NotImplementedError()

datamaestro_text-2025.5.13/src/datamaestro_text/data/recommendation.py ADDED Viewed

@@ -0,0 +1,13 @@
+from experimaestro import Param
+from datamaestro.data import Base, File
+import datamaestro.data.csv as csv
+class RatedItems(Base):
+    ratings: Param[File]
+class Movielens(RatedItems):
+    links: Param[csv.Generic]
+    movies: Param[csv.Generic]
+    tags: Param[csv.Generic]

{datamaestro_text-2025.1.7 → datamaestro_text-2025.5.13}/src/datamaestro_text/data/text.py RENAMED Viewed

@@ -1,15 +1,15 @@
-from pathlib import Path
-from datamaestro.data import Base, Folder, File, argument
+from typing import Optional
+from experimaestro import Param
+from datamaestro.data import Base, Folder, File
 from datamaestro.data.ml import Supervised
-@argument("train", type=Base)
-@argument("test", type=Base, required=False)
-@argument("validation", type=Base, required=False)
 class TrainingText(Supervised):
     """ "A dataset used for training with a train and a test"""
-    pass
+    train: Param[Base]
+    test: Param[Optional[Base]] = None
+    validation: Param[Optional[Base]] = None
 class TextFolder(Folder):

datamaestro-text 2025.1.7__tar.gz → 2025.5.13__tar.gz

datamaestro-text 2025.1.7__tar.gz → 2025.5.13__tar.gz