PyPI - datamaestro-text - Versions diffs - 2025.1.7__py3-none-any.whl → 2025.5.13__py3-none-any.whl - Mend

datamaestro-text 2025.1.7__py3-none-any.whl → 2025.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

datamaestro_text/config/com/github/aagohary/canard.py CHANGED Viewed

@@ -1,7 +1,5 @@
-# See documentation on https://datamaestro.readthedocs.io
 from datamaestro.definitions import datatasks, datatags, dataset
-from datamaestro.download.archive import zipdownloader
+from datamaestro.download.single import filedownloader
 from datamaestro.utils import HashCheck
 from datamaestro.data.ml import Supervised
@@ -10,14 +8,23 @@ from datamaestro_text.data.conversation.canard import CanardDataset
 @datatags("conversation", "context", "query")
 @datatasks("query rewriting")
-@zipdownloader(
-    "archive",
-    "https://obj.umiacs.umd.edu/elgohary/CANARD_Release.zip",
-    subpath="CANARD_Release",
-    checker=HashCheck("c9bba7c6bb898f669383415b54fd6ffd"),
+@filedownloader(
+    "train.json",
+    "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/train.json",
+    checker=HashCheck("73624ac646fb81e09b0fd7f01370ada3"),
+)
+@filedownloader(
+    "dev.json",
+    "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/dev.json",
+    checker=HashCheck("c84525631a83bc771c58ff31f4a9b601"),
+)
+@filedownloader(
+    "test.json",
+    "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/test.json",
+    checker=HashCheck("3fc14d0078e7a5056f5da571728f024e"),
 )
 @dataset(Supervised, url="https://sites.google.com/view/qanta/projects/canard", id="")
-def main(archive):
+def main(train, dev, test):
     """Question-in-context rewriting
     CANARD is a dataset for question-in-context rewriting that consists of
@@ -30,7 +37,7 @@ def main(archive):
     Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
     """
     return {
-        "train": CanardDataset(path=archive / "train.json"),
-        "validation": CanardDataset(path=archive / "dev.json"),
-        "test": CanardDataset(path=archive / "test.json"),
+        "train": CanardDataset(path=train),
+        "validation": CanardDataset(path=dev),
+        "test": CanardDataset(path=test),
     }

datamaestro_text/config/com/github/apple/ml-qrecc.py CHANGED Viewed

@@ -25,12 +25,11 @@ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
     checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
 )
 @dataset(
-    Supervised[QReCCDataset, None, QReCCDataset],
     url="https://github.com/apple/ml-qrecc",
     doi="https://doi.org/10.48550/arXiv.2010.04898",
     id="",
 )
-def main(data: Path):
+def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
     """Open-Domain Question Answering Goes Conversational via Question Rewriting
     We introduce QReCC (Question Rewriting in Conversational Context), an
@@ -40,10 +39,10 @@ def main(data: Path):
     answering that includes the individual subtasks of question rewriting,
     passage retrieval and reading comprehension
     """
-    return {
-        "train": QReCCDataset(path=data / "qrecc_train.json"),
-        "test": QReCCDataset(path=data / "qrecc_test.json"),
-    }
+    return Supervised(
+        train=QReCCDataset(path=data / "qrecc_train.json"),
+        test=QReCCDataset(path=data / "qrecc_test.json"),
+    )
 @dataset(
@@ -52,7 +51,6 @@ def main(data: Path):
 )
 class Content(LZ4JSONLDocumentStore):
     """QReCC mentionned URLs content"""
     @staticmethod
     def __create_dataset__(dataset, options=None):
         ds = reference(reference=main).setup(dataset, options)
@@ -67,7 +65,7 @@ class Content(LZ4JSONLDocumentStore):
             "id",
         ).setup(dataset, options)
-        return LZ4JSONLDocumentStore(jsonl_path=store_path)
+        return Content(jsonl_path=store_path)
     @staticmethod
     def _documents(path: Path):

datamaestro_text/config/com/microsoft/msmarco/passage.py CHANGED Viewed

@@ -1,11 +1,11 @@
 """MS MARCO (Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. A variant of this task will be the part of TREC and AFIRM 2019. For Updates about TREC 2019 please follow This Repository Passage Reranking task Task Given a query q and a the 1000 most relevant passages P = p1, p2, p3,... p1000, as retrieved by BM25 a succeful system is expected to rerank the most relevant passage as high as possible. For this task not all 1000 relevant items have a human labeled relevant passage. Evaluation will be done using MRR.
-  **Publication**:
-  Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
-  MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
+**Publication**:
+Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
+MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
-  See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
+See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
 """
 from datamaestro.annotations.agreement import useragreement
@@ -35,6 +35,7 @@ http://www.msmarco.org/dataset.aspx""",
 # --- Document collection
 # TODO: Not ideal since it would be better to have small versions right away
 # instead of downloading again the MS Marco Collection
 @lua
@@ -43,10 +44,10 @@ http://www.msmarco.org/dataset.aspx""",
     url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
     checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
 )
-@dataset(Folder, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
-def collection_etc(data):
+@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
+def collection_etc(data) -> Folder:
     """Documents and some more files"""
-    return {"path": data}
+    return Folder(path=data)
 @lua

datamaestro_text/config/com/sentiment140.py CHANGED Viewed

@@ -1,14 +1,9 @@
 from datamaestro.data.csv import Generic
-from datamaestro.definitions import argument, datatasks, datatags, dataset
+from datamaestro.definitions import datatasks, datatags, dataset
 from datamaestro.download.archive import zipdownloader
 from datamaestro.data.ml import Supervised
 from datamaestro.utils import HashCheck
-# name: Sentiment140
-# web: http://help.sentiment140.com/for-students/
-# description: |
 @zipdownloader(
     "dir",

datamaestro_text/config/edu/stanford/glove.py CHANGED Viewed

@@ -11,6 +11,7 @@ from datamaestro.download.archive import zipdownloader
 from datamaestro.download.single import filedownloader
 from datamaestro_text.data.embeddings import WordEmbeddingsText
 # size: 822M
 # statistics:
 #   tokens: 6G

datamaestro_text/config/edu/upenn/ldc/aquaint.py CHANGED Viewed

@@ -1,10 +1,9 @@
 """The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""
-from datamaestro.definitions import DatafolderPath
-from datamaestro.data import Base
-from datamaestro_text.data.ir.trec import TipsterCollection
-from datamaestro.definitions import argument, datatasks, datatags, dataset
+from datamaestro.context import DatafolderPath
+from datamaestro.definitions import dataset
 from datamaestro.download.links import links, linkfolder
+from datamaestro_text.data.ir.trec import TipsterCollection
 URL = "https://catalog.ldc.upenn.edu/LDC2002T31"

datamaestro_text/config/gov/nist/trec/tipster.py CHANGED Viewed

@@ -22,8 +22,8 @@ from datamaestro_text.data.ir.trec import TipsterCollection
 from datamaestro.download.links import linkfolder
 from datamaestro.definitions import (
     dataset,
-    DatafolderPath,
 )
+from datamaestro.context import DatafolderPath
 # Store meta-information
 TIPSTER = dataset(TipsterCollection, url="https://catalog.ldc.upenn.edu/LDC93T3A")

datamaestro_text/data/conversation/base.py CHANGED Viewed

@@ -129,6 +129,8 @@ class ConversationNode:
 class ConversationTree(ABC):
+    """Represents a conversation tree"""
     @abstractmethod
     def root(self) -> ConversationNode:
         ...
@@ -192,7 +194,7 @@ class SingleConversationTreeNode(ConversationNode):
         return (
             SingleConversationTreeNode(self.tree, self.index + 1)
             if self.index < len(self.tree.history) - 1
-            else []
+            else None
         )
     def children(self) -> List[ConversationNode]:
@@ -207,8 +209,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
     """A conversation tree node"""
     entry: Record
-    parent: Optional["ConversationTreeNode"]
-    children: List["ConversationTreeNode"]
+    _parent: Optional["ConversationTreeNode"]
+    _children: List["ConversationTreeNode"]
     def __init__(self, entry):
         self.entry = entry
@@ -216,8 +218,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
         self.children = []
     def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
-        self.children.append(node)
-        node.parent = self
+        self._children.append(node)
+        node._parent = self
         return node
     def conversation(self, skip_self: bool) -> ConversationHistory:
@@ -232,14 +234,14 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
     def __iter__(self) -> Iterator["ConversationTreeNode"]:
         """Iterates over all conversation tree nodes (pre-order)"""
         yield self.entry
-        for child in self.children:
+        for child in self._children:
             yield from child
     def parent(self) -> Optional[ConversationNode]:
-        return self.parent
+        return self._parent
     def children(self) -> List[ConversationNode]:
-        return self.children
+        return self._children
     def root(self):
         return self

datamaestro_text/data/conversation/canard.py CHANGED Viewed

@@ -1,12 +1,17 @@
 from typing import Iterator, List
 from attr import define
 import json
+from datamaestro.record import Record
 from datamaestro.data import File
-from .base import (
+from datamaestro_text.data.conversation.base import (
+    ConversationDataset,
     ConversationTree,
     SingleConversationTree,
+    SimpleDecontextualizedItem,
+    EntryType,
 )
-from . import ConversationDataset
+from datamaestro_text.data.ir import IDItem, SimpleTextItem
+import logging
 @define(kw_only=True)
@@ -30,7 +35,10 @@ class CanardConversation:
 class CanardDataset(ConversationDataset, File):
-    """A dataset in the CANARD JSON format"""
+    """A dataset in the CANARD JSON format
+    The CANARD dataset is composed of
+    """
     def entries(self) -> Iterator[CanardConversation]:
         """Iterates over re-written query with their context"""
@@ -47,22 +55,53 @@ class CanardDataset(ConversationDataset, File):
             )
     def __iter__(self) -> Iterator[ConversationTree]:
-        history = []
+        history: list[Record] = []
         current_id = None
         for entry in self.entries():
-            # Check if current conversation
-            if current_id != entry.dialogue_id and current_id is not None:
-                history.reverse()
-                yield SingleConversationTree(current_id, history)
+            # Check if current conversation, otherwise we are OK
+            if current_id != entry.dialogue_id:
+                if current_id is not None:
+                    history.reverse()
+                    yield SingleConversationTree(current_id, history)
+                    history = []
+                current_id = entry.dialogue_id
+            if not history:
+                # First round
+                # The two first items are the wikipedia title and section,
+                # we interpret them as two user queries
+                assert len(entry.history) == 2
+                history.extend(
+                    Record(
+                        SimpleTextItem(text),
+                        EntryType.USER_QUERY,
+                    )
+                    for text in entry.history
+                )
+            else:
+                # The utterance before the last is the last user query
+                assert (
+                    entry.history[-2] == history[-1][SimpleTextItem].text
+                ), f"{entry.dialogue_id} {entry.history} / {history[-4:-1]}"
+                # The last utterance is the system side
+                history.append(
+                    Record(SimpleTextItem(entry.history[-1]), EntryType.SYSTEM_ANSWER)
+                )
+            assert len(entry.history) == len(history)
             # Add to current
             history.append(
-                # FIXME: not working anymore
-                CanardEntry(
-                    query=entry.query,
-                    decontextualized_query=entry.rewrite,
+                Record(
+                    IDItem(f"{entry.dialogue_id}-{entry.query_no}"),
+                    SimpleTextItem(entry.query),
+                    SimpleDecontextualizedItem(entry.rewrite),
+                    EntryType.USER_QUERY,
                 )
             )
-        yield current
+        if current_id:
+            yield SingleConversationTree(current_id, history)

datamaestro_text/data/conversation/orconvqa.py CHANGED Viewed

@@ -1,4 +1,3 @@
-from functools import cached_property
 from typing import Iterator, List, Optional
 from attr import define
 import json

datamaestro_text/data/embeddings.py CHANGED Viewed

@@ -1,5 +1,5 @@
-from pathlib import Path
-from datamaestro.data import Base, File, argument
+from experimaestro import Meta
+from datamaestro.data import Base, File
 from datamaestro.definitions import datatags
 import numpy as np
 from typing import Tuple, List
@@ -18,9 +18,9 @@ class WordEmbeddings(Base):
         raise NotImplementedError()
-@argument("encoding", str, ignored=True, default="utf-8")
 class WordEmbeddingsText(WordEmbeddings, File):
     """Word embeddings as a text word / values"""
+    encoding: Meta[str] = "utf-8"
     def load(self):
         words = []

datamaestro_text/data/ir/base.py CHANGED Viewed

@@ -43,6 +43,12 @@ class IDItem(Item, ABC):
     id: str
+@define
+class UrlItem(Item):
+    """An url item"""
+    url: str
 @define
 class AdhocAssessment:

datamaestro_text/data/ir/cord19.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from csv import DictReader
 from typing import Iterator
-from datamaestro.data import File, documentation
+from experimaestro import documentation
+from datamaestro.data import File
 from datamaestro.record import Record
 from datamaestro_text.data.ir import Documents, TopicRecord, Topics, IDItem
 from datamaestro_text.data.ir.formats import (

datamaestro_text/data/ir/formats.py CHANGED Viewed

@@ -58,7 +58,7 @@ class NFCorpusDocument(TextItem):
     @cached_property
     def text(self):
-        return self.abstract
+        return f"{self.title} {self.abstract}"
 @define
@@ -99,8 +99,8 @@ class WapoDocument(TextItem):
     body_media: Tuple[WapoDocMedia, ...]
     @cached_property
-    def text(self):
-        return self.body
+    def text(self):
+        return f"{self.title} {self.body_paras_html}"
 @define
@@ -126,13 +126,24 @@ class OrConvQADocument(TextItem):
     def text(self):
         return f"{self.title} {self.body}"
 @define
+class DprW100Doc(TextItem):
+    text: str
+    title: str
+@define
+class MsMarcoV2Passage(TextItem):
+    text: str
+    spans: Tuple[Tuple[int, int], ...]
+    msmarco_document_id: str
 class Touche2020(TextItem):
     text: str
     title: str
     stance: str
     url: str
 @define
 class SciDocs(TextItem):
     text: str
@@ -175,7 +186,8 @@ class TrecMb14Query(TextItem):
     def get_text(self):
         return f"{self.query}"
-@define
+@define
 class SciDocsTopic(TextItem):
     text: str
     authors: List[str]
@@ -183,6 +195,7 @@ class SciDocsTopic(TextItem):
     cited_by: List[str]
     references: List[str]
 @define()
 class TrecTopic(SimpleTextItem):
     description: str
@@ -190,3 +203,17 @@ class TrecTopic(SimpleTextItem):
 TrecTopicRecord = record_type(IDItem, TrecTopic)
+@define
+class DprW100Query(TextItem):
+    text: str
+    answers: Tuple[str]
+@define
+class TrecBackgroundLinkingQuery(IDItem):
+    query_id: str
+    doc_id: str
+    url: str
+    def get_text(self):
+        raise NotImplementedError()

datamaestro_text/data/recommendation.py CHANGED Viewed

@@ -1,14 +1,13 @@
-from datamaestro.data import Base, File, argument
+from experimaestro import Param
+from datamaestro.data import Base, File
 import datamaestro.data.csv as csv
-@argument("ratings", type=File)
 class RatedItems(Base):
-    pass
+    ratings: Param[File]
-@argument("links", type=csv.Generic)
-@argument("movies", type=csv.Generic)
-@argument("tags", type=csv.Generic)
 class Movielens(RatedItems):
-    pass
+    links: Param[csv.Generic]
+    movies: Param[csv.Generic]
+    tags: Param[csv.Generic]

datamaestro_text/data/text.py CHANGED Viewed

@@ -1,15 +1,15 @@
-from pathlib import Path
-from datamaestro.data import Base, Folder, File, argument
+from typing import Optional
+from experimaestro import Param
+from datamaestro.data import Base, Folder, File
 from datamaestro.data.ml import Supervised
-@argument("train", type=Base)
-@argument("test", type=Base, required=False)
-@argument("validation", type=Base, required=False)
 class TrainingText(Supervised):
     """ "A dataset used for training with a train and a test"""
-    pass
+    train: Param[Base]
+    test: Param[Optional[Base]] = None
+    validation: Param[Optional[Base]] = None
 class TextFolder(Folder):

datamaestro_text/datasets/irds/data.py CHANGED Viewed

@@ -37,6 +37,7 @@ from datamaestro_text.data.ir.base import (
     SimpleAdhocAssessment,
     SimpleTextItem,
     TopicRecord,
+    UrlItem,
     create_record,
 )
@@ -165,6 +166,19 @@ class Documents(ir.DocumentStore, IRDSId):
             "source",
             "source_content_type",
         ),
+        _irds.dpr_w100.DprW100Doc: tuple_constructor(
+            formats.DprW100Doc,
+            "doc_id",
+            "text",
+            "title",
+        ),
+        _irds.msmarco_passage_v2.MsMarcoV2Passage: tuple_constructor(
+            formats.MsMarcoV2Passage,
+            "doc_id",
+            "text",
+            "spans",
+            "msmarco_document_id",
+        ),
     }
     """Wraps an ir datasets collection -- and provide a default text
@@ -385,6 +399,12 @@ class Topics(ir.TopicsStore, IRDSId):
             "tweet_time",
             "description",
         ),
+        _irds.dpr_w100.DprW100Query: tuple_constructor(
+            formats.DprW100Query,
+            "query_id",
+            "text",
+            "answers"
+        ),
     }
     HANDLERS = {
@@ -415,7 +435,52 @@ class Topics(ir.TopicsStore, IRDSId):
     def iter(self) -> Iterator[TopicRecord]:
         """Returns an iterator over topics"""
         return self.handler.iter()
+class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
+    def __init__(self, dataset):
+        self.dataset = dataset
+    @cached_property
+    def ext2records(self):
+        return {record[IDItem].id: record for record in self.records}
+    def topic_int(self, internal_topic_id: int) -> TopicRecord:
+        """Returns a document given its internal ID"""
+        return self.records[internal_topic_id]
+    def topic_ext(self, external_topic_id: str) -> TopicRecord:
+        """Returns a document given its external ID"""
+        return self.ext2records[external_topic_id]
+    def iter(self) -> Iterator[ir.TopicRecord]:
+        """Returns an iterator over topics"""
+        return iter(self.records)
+    @cached_property
+    def records(self):
+        try:
+            records = []
+            for query in self.dataset.dataset.queries_iter():
+                topic =  Record(
+                    IDItem(query.query_id),
+                    # Following BEIR documentation, we use title of documents as queries: https://github.com/beir-cellar/beir/blob/main/examples/dataset/README.md#queries-and-qrels
+                    SimpleTextItem(self.dataset.dataset.docs_store().get(query.doc_id).title),
+                    UrlItem(query.url),
+                )
+                records.append(topic)
+        except Exception:
+            logging.exception("Error while computing topic records")
+            raise
+        return records
+Topics.HANDLERS.update(
+    {
+        _irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler
+    }
+)
 class CastTopicsHandler(TopicsHandler):
     def __init__(self, dataset):

datamaestro_text/datasets/irds/datasets.py CHANGED Viewed

@@ -116,10 +116,6 @@ class AdhocRunDataset(Dataset):
     def _prepare(self, download=False) -> AdhocRun:
         return AdhocRun(id=self.fullid)
-    @property
-    def configtype(self):
-        return AdhocRun
 class Collection(Dataset):
     base = Adhoc

datamaestro_text/utils/iter.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Callable, TypeVar, Iterator, List, Union
+from typing import Callable, Sequence, TypeVar, Iterator, List, Union
 T = TypeVar("T")
@@ -45,7 +45,7 @@ class RangeView:
         return RangeView(self.source, key)
-class LazyList:
+class LazyList(Sequence):
     """Iterable-based list
     The list is only materialized if needed"""
@@ -63,6 +63,9 @@ class LazyList:
         else:
             return iter(self.materialized_list)
+    def __len__(self):
+        return len(self.iterable)
     def __getitem__(self, index):
         # Materialize the list if accessing an index above the threshold or any slice
         if isinstance(index, slice) or index >= self.materialize_threshold:

datamaestro_text/version.py CHANGED Viewed

@@ -1,8 +1,13 @@
-# file generated by setuptools_scm
+# file generated by setuptools-scm
 # don't change, don't track in version control
+__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
 TYPE_CHECKING = False
 if TYPE_CHECKING:
-    from typing import Tuple, Union
+    from typing import Tuple
+    from typing import Union
     VERSION_TUPLE = Tuple[Union[int, str], ...]
 else:
     VERSION_TUPLE = object
@@ -12,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '2025.1.7'
-__version_tuple__ = version_tuple = (2025, 1, 7)
+__version__ = version = '2025.5.13'
+__version_tuple__ = version_tuple = (2025, 5, 13)

{datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: datamaestro-text
-Version: 2025.1.7
+Version: 2025.5.13
 Summary: Datamaestro module for text-related datasets
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
 License: GPL-3
@@ -18,9 +18,16 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: datamaestro>=1.2.1
+Requires-Dist: datamaestro>=1.4.2
 Requires-Dist: ir_datasets>=0.5.8
 Requires-Dist: attrs
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: docutils; extra == "dev"
+Requires-Dist: sphobjinv; extra == "dev"
+Requires-Dist: flake8; extra == "dev"
+Requires-Dist: sphinx; extra == "dev"
+Dynamic: license-file
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)

{datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/RECORD RENAMED Viewed

@@ -1,25 +1,25 @@
 datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
-datamaestro_text/version.py,sha256=k65KHkS4PD3NjDDCJgWcTwRBE4yVcKs32B8SUxv2DvE,417
+datamaestro_text/version.py,sha256=EsLzhbhZSIiOqGSyEpMlneQnIpzB12JreUxG8EMn7EE,519
 datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
 datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
-datamaestro_text/config/com/sentiment140.py,sha256=iRV_rSjQcr9WNjjQ4EdznaEMGLADV_rwpeRx7ycQi0k,1383
+datamaestro_text/config/com/sentiment140.py,sha256=3cZfqs395gY14gGojRC_RTYEdG5l8RUTSdF9li8ynBc,1283
 datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
-datamaestro_text/config/com/github/aagohary/canard.py,sha256=tIwb_KxMUR3st7rzQUkt6rIjolTl8uKvDq6t795b1nY,1468
-datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=_lufJlg-4zaQyjr5Ae-X-9hXzpl2JFjfxh_RhnBbva4,3068
+datamaestro_text/config/com/github/aagohary/canard.py,sha256=DREIKU2FFrfwibjU48Y1AsKTUzhwxmD4Kuykd4bDIus,1762
+datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=-iHKJocWZ8N9N-P8E45y4ewg3OT_23XonlDh5-NcH2g,3055
 datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=Tg3hbnvilxW_Lwt7fpKvGMtu-6mc9oNIHM-LX6JTR1U,3026
 datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
 datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
-datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=ouNn2nivS6mUMaCyMzqxNv1YMoPrSEX-UcSZpG1v_uw,11645
+datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=tVldwVh0pQpXXifqcMNEN9cLO1HXkXoEhToSazSx5RE,11643
 datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=hCFjZg9t1AHET05qU31uatiAOD5EEzzWG9y_W90jcOE,861
 datamaestro_text/config/edu/cornell/nlvr.yaml,sha256=9Yk5VZMncSmrP7JNuGXqExksgX5nQ_Zfnlps8hWze3Q,921
 datamaestro_text/config/edu/stanford/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/edu/stanford/aclimdb.py,sha256=lsPDxnp_rWOCpBte6pZ0_LVaC33w5mmgfGh51rcTgt8,643
-datamaestro_text/config/edu/stanford/glove.py,sha256=ykkQ7nYWqhmgc2TeohNMliYSiX831cYUygftkBTGIac,2390
+datamaestro_text/config/edu/stanford/glove.py,sha256=bXxwiJqT8alPs5MwwxVuY0xBO8g1QWhm2L3AKVskTlA,2391
 datamaestro_text/config/edu/stanford/im2p.yaml,sha256=JoToNyEPpmwdyLFedCBot5ypyw7p9rzi12mGXJuZin0,2909
 datamaestro_text/config/edu/upenn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/edu/upenn/ldc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=Y_biKee8LA8arsHiKOUlPBWfylDDM9k-x5UgN-uJdLE,1658
+datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=YEU3kIXzv71Vjg9bUoXyQ-vSLvC-y4LlE3mJQf38XuY,1589
 datamaestro_text/config/gov/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/gov/nist/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/gov/nist/ir/covid.py,sha256=wn2E7sQ8M6pAucVD4sKJYImyzKUKphyiDFJD0oYRCbg,4004
@@ -28,7 +28,7 @@ datamaestro_text/config/gov/nist/trec/adhoc.py,sha256=vcFaLlZ-chwDt013MYI8bYZ2ug
 datamaestro_text/config/gov/nist/trec/clueweb.yaml,sha256=sm4UbdtMzWoDVPsewtVDS7Vj2jBOdgp18Xqo1X4ysQc,792
 datamaestro_text/config/gov/nist/trec/deeplearning.yaml,sha256=QGM7PtXLJRttNdOPE16o7-k3e5tA9HgcaM_-qFDV_5Q,2125
 datamaestro_text/config/gov/nist/trec/index.yaml,sha256=oSUhUmtukf5oXqUbJLhae8xZx7Uac5V6uZoUB-RJ7Sw,2711
-datamaestro_text/config/gov/nist/trec/tipster.py,sha256=rmVFcwUPAfD529rneZUlCLBke-edYjrBIH3n02-qfvc,5371
+datamaestro_text/config/gov/nist/trec/tipster.py,sha256=t0w9lOBfvwt6YCYCyDj9fo1QiBXfs0q8qzKl74f40jU,5398
 datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJolWCts_VETACha3hfQ,563
 datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
 datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=LJYexk2ssZ5SqKIKLjPdybS9ju4g9fGIFy5eybtsvYQ,1224
@@ -41,28 +41,28 @@ datamaestro_text/config/org/grouplens/movielens.py,sha256=e78JoiHuwP6gbj7Q84UwPT
 datamaestro_text/config/org/universaldependencies/french.py,sha256=W_gDtfinjlw08qohX_PWvzQlacDwRFB7PeOzO33mRVU,2208
 datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml,sha256=IBy82CDNNLjJPNPzues1EgDXu0A5WDvUFeVNSOyrIpI,1137
 datamaestro_text/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamaestro_text/data/embeddings.py,sha256=AskX7Ggvkpqhb-Je_hBTFp_vfkiWzWtJH1gFQxuUTwM,1155
-datamaestro_text/data/recommendation.py,sha256=MatelpJQiCMpNDuaQVBkRY809J1qiQo-8QO9XD2J_dU,303
+datamaestro_text/data/embeddings.py,sha256=qdeZ4m6drFNkqW_4pKxJKdJHtOnDRs0P7QVJ7AO6xFE,1127
+datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG3IHkkPJX9Y,279
 datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
-datamaestro_text/data/text.py,sha256=pOI8nrEd6RoQA28DVH1JufHTunr9vG3FQzwElR8YirI,499
+datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
 datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
-datamaestro_text/data/conversation/base.py,sha256=zXX5sPMoVtDf4WTkLx45IvZz6lgKPgcUZFu6N4lVBlc,6457
-datamaestro_text/data/conversation/canard.py,sha256=IMxu5NfytWZLZ_cyT8UFOICbDE82HLJoNBT_6j36Faw,1808
-datamaestro_text/data/conversation/orconvqa.py,sha256=VG3GV5_IPQcvla9rrQPypDNcZYmKNMgozmr2oudeLA4,3802
+datamaestro_text/data/conversation/base.py,sha256=PUVRCSMBlV9bSayBl-vnzsYvyr2Tdv_zTadIC_Tswe0,6508
+datamaestro_text/data/conversation/canard.py,sha256=FI02rPNuzSsA_uXOcgcuQn31d3QahmHndzYCV1Zm8rk,3305
+datamaestro_text/data/conversation/orconvqa.py,sha256=RL9cpK7QK2dsk_j2e4Th8wzGFq-G3ww_EjdM1eKh-8o,3764
 datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
 datamaestro_text/data/ir/__init__.py,sha256=EOVnRv9oKEEucMOgrFjhXxeUWEkaJkbXBndpkKhZaiY,8701
-datamaestro_text/data/ir/base.py,sha256=Cw8egjChpx4ksUwp-vTA70B2OWxROH7FIeJylsXggAk,1429
-datamaestro_text/data/ir/cord19.py,sha256=7xDIzsy63WrA9lxxyNOMu5ECRymu5x23EzYG977nS6Y,1440
+datamaestro_text/data/ir/base.py,sha256=TSaY8UWoixqvKg1z5JfkXPnuZiVoChYEUvc7jjbpZqY,1495
+datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
 datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
 datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
-datamaestro_text/data/ir/formats.py,sha256=rU9uJhdFDdlnQ2qfFowK_--1WVOH1fVgfXWOPPDF_FY,3160
+datamaestro_text/data/ir/formats.py,sha256=K0mqmCGg0seneKo-Rt3cBfDVjEMS4_6t1MfNj4iW9Y4,3637
 datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
 datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
 datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
 datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
 datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
-datamaestro_text/datasets/irds/data.py,sha256=BTsFZdjFJ0XdlYRh2rcshAoeknHrypf7be2BGUbWWFk,19931
-datamaestro_text/datasets/irds/datasets.py,sha256=qtN-nX2_OV9FD339aZjwVL1jFf6I4T6MbNCcuRAjmtU,5682
+datamaestro_text/datasets/irds/data.py,sha256=0V5nPMQeh1I3Sp36x_NE7xrbC_vbZAKhU5NONj95aok,22058
+datamaestro_text/datasets/irds/datasets.py,sha256=yrJx3X7u7oYcHXsL8YmUrXsQhkiqkBC6LjeZA_Ldx5Q,5617
 datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
 datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
 datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
@@ -75,12 +75,12 @@ datamaestro_text/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
 datamaestro_text/transforms/ir/__init__.py,sha256=eWxr0ZtfK9F9J4TachW5V8Ra5AhQJueDyOGR8vXi1uA,6553
 datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_p5ha0,3340
 datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1vRQ,170
-datamaestro_text/utils/iter.py,sha256=2_UZ8y9Ma4k5U9ZD4w55Zfb6NGrKM1L4G40OygRm1is,2459
+datamaestro_text/utils/iter.py,sha256=uzBmavBeqVyEvYmi9Ds2dqh0ywhJE-1SBO5v8AoIKcc,2537
 datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
 datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
-datamaestro_text-2025.1.7.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-datamaestro_text-2025.1.7.dist-info/METADATA,sha256=-wsBcUcnEnOMiHb1ROIf43r55ZNNNjpIemuFXm5hHUY,1609
-datamaestro_text-2025.1.7.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
-datamaestro_text-2025.1.7.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
-datamaestro_text-2025.1.7.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
-datamaestro_text-2025.1.7.dist-info/RECORD,,
+datamaestro_text-2025.5.13.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+datamaestro_text-2025.5.13.dist-info/METADATA,sha256=EYppi8IJMqWU3ObzwSvM_PuOkC_pgwGAxwvaFx2dG3A,1847
+datamaestro_text-2025.5.13.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
+datamaestro_text-2025.5.13.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
+datamaestro_text-2025.5.13.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
+datamaestro_text-2025.5.13.dist-info/RECORD,,

{datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.7.0)
+Generator: setuptools (80.4.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info/licenses}/LICENSE RENAMED Viewed

File without changes

{datamaestro_text-2025.1.7.dist-info → datamaestro_text-2025.5.13.dist-info}/top_level.txt RENAMED Viewed

File without changes

datamaestro-text 2025.1.7__py3-none-any.whl → 2025.5.13__py3-none-any.whl

datamaestro-text 2025.1.7__py3-none-any.whl → 2025.5.13__py3-none-any.whl