PyPI - datamaestro-text - Versions diffs - 2024.5.31__py3-none-any.whl → 2025.1.7__py3-none-any.whl - Mend

datamaestro-text 2024.5.31__py3-none-any.whl → 2025.1.7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

datamaestro_text/config/com/github/apple/ml-qrecc.py CHANGED Viewed

@@ -1,11 +1,20 @@
 # See documentation on https://datamaestro.readthedocs.io
+import re
+import json
 from pathlib import Path
 from datamaestro.definitions import datatasks, datatags, dataset
 from datamaestro.data.ml import Supervised
+from datamaestro.download import reference
 from datamaestro.download.archive import zipdownloader
+from datamaestro.download.wayback import wayback_documents
 from datamaestro.utils import HashCheck
 from datamaestro_text.data.conversation.qrecc import QReCCDataset
+from datamaestro_text.datasets.irds.data import (
+    LZ4JSONLDocumentStore,
+    SimpleJsonDocument,
+)
+from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
 @datatags("conversation", "context", "query")
@@ -35,3 +44,44 @@ def main(data: Path):
         "train": QReCCDataset(path=data / "qrecc_train.json"),
         "test": QReCCDataset(path=data / "qrecc_test.json"),
     }
+@dataset(
+    url="https://github.com/apple/ml-qrecc",
+    doi="https://doi.org/10.48550/arXiv.2010.04898",
+)
+class Content(LZ4JSONLDocumentStore):
+    """QReCC mentionned URLs content"""
+    @staticmethod
+    def __create_dataset__(dataset, options=None):
+        ds = reference(reference=main).setup(dataset, options)
+        documents_path = wayback_documents(
+            "20191127", lambda: Content._urls(ds), name="wayback.jsonl"
+        ).setup(dataset, options)
+        store_path = lz4docstore_builder(
+            "store",
+            lambda: Content._documents(documents_path),
+            SimpleJsonDocument,
+            "id",
+        ).setup(dataset, options)
+        return LZ4JSONLDocumentStore(jsonl_path=store_path)
+    @staticmethod
+    def _documents(path: Path):
+        """Iterates over documents from wayback"""
+        with path.open("rt") as fp:
+            for line in fp:
+                yield SimpleJsonDocument(**json.loads(line))
+    @staticmethod
+    def _urls(supervised: Supervised[QReCCDataset, None, QReCCDataset]):
+        urls = set()
+        for ds in [supervised.train, supervised.test]:
+            for entry in ds.entries():
+                if entry.answer_url:
+                    url = re.sub("#.*$", "", entry.answer_url)
+                    urls.add(url)
+        return urls

datamaestro_text/data/conversation/base.py CHANGED Viewed

@@ -188,7 +188,7 @@ class SingleConversationTreeNode(ConversationNode):
     def history(self) -> Sequence[Record]:
         return self.tree.history[self.index + 1 :]
-    def parent(self) -> ConversationNode | None:
+    def parent(self) -> Optional[ConversationNode]:
         return (
             SingleConversationTreeNode(self.tree, self.index + 1)
             if self.index < len(self.tree.history) - 1
@@ -235,7 +235,7 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
         for child in self.children:
             yield from child
-    def parent(self) -> ConversationNode | None:
+    def parent(self) -> Optional[ConversationNode]:
         return self.parent
     def children(self) -> List[ConversationNode]:

datamaestro_text/data/ir/formats.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from functools import cached_property
-from typing import ClassVar, Tuple
+from typing import ClassVar, Tuple, List
 from attrs import define
 from datamaestro.record import record_type
 from ir_datasets.datasets.wapo import WapoDocMedia
@@ -117,7 +117,6 @@ class TweetDoc(TextItem):
 @define
 class OrConvQADocument(TextItem):
-    id: str
     title: str
     body: str
     aid: str
@@ -127,12 +126,21 @@ class OrConvQADocument(TextItem):
     def text(self):
         return f"{self.title} {self.body}"
+@define
+class Touche2020(TextItem):
+    text: str
+    title: str
+    stance: str
+    url: str
 @define
-class TrecTopic(TextItem):
+class SciDocs(TextItem):
     text: str
-    query: str
-    narrative: str
+    title: str
+    authors: List[str]
+    year: int
+    cited_by: List[str]
+    references: List[str]
 @define
@@ -167,6 +175,13 @@ class TrecMb14Query(TextItem):
     def get_text(self):
         return f"{self.query}"
+@define
+class SciDocsTopic(TextItem):
+    text: str
+    authors: List[str]
+    year: int
+    cited_by: List[str]
+    references: List[str]
 @define()
 class TrecTopic(SimpleTextItem):

datamaestro_text/data/ir/stores.py CHANGED Viewed

@@ -1,17 +1,21 @@
 from collections import namedtuple
-from typing import List
+from typing import List, NamedTuple
 from experimaestro import Constant
 import attrs
 from datamaestro.record import Record
+from datamaestro_text.data.ir.base import IDItem
 from datamaestro_text.datasets.irds.data import LZ4DocumentStore
 from datamaestro_text.data.ir.formats import OrConvQADocument
 class OrConvQADocumentStore(LZ4DocumentStore):
-    NAMED_TUPLE = namedtuple(
-        "OrConvQADocument", [a.name for a in attrs.fields(OrConvQADocument)]
-    )
+    class NAMED_TUPLE(NamedTuple):
+        id: str
+        title: str
+        body: str
+        aid: str
+        bid: int
     lookup_field: Constant[str] = "id"
     fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
@@ -19,5 +23,7 @@ class OrConvQADocumentStore(LZ4DocumentStore):
     data_cls = NAMED_TUPLE
-    def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
-        return Record(OrConvQADocument(**data._asdict()))
+    def converter(self, data: NAMED_TUPLE) -> Record:
+        fields = data._asdict()
+        del fields["id"]
+        return Record(OrConvQADocument(**fields), IDItem(data.id))

datamaestro_text/datasets/irds/data.py CHANGED Viewed

@@ -1,36 +1,44 @@
+import logging
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from functools import partial
-import logging
 from pathlib import Path
-from typing import Dict, Iterator, Tuple, Type, List
+from typing import Dict, Iterator, List, NamedTuple, Tuple, Type
 import ir_datasets
-from ir_datasets.indices import PickleLz4FullStore
+import ir_datasets.datasets as _irds
+from datamaestro.record import RecordType, record_type
+from experimaestro import Config, Meta, Option, Param
+from experimaestro.compat import cached_property
 from ir_datasets.formats import (
     GenericDoc,
-    GenericQuery,
     GenericDocPair,
+    GenericQuery,
     TrecParsedDoc,
     TrecQuery,
 )
-import ir_datasets.datasets as _irds
-from experimaestro import Config, Param
-from experimaestro.compat import cached_property
-from experimaestro import Option
-from datamaestro.record import RecordType, record_type
-from datamaestro_text.data.conversation.base import AnswerEntry
+from ir_datasets.indices import PickleLz4FullStore
 import datamaestro_text.data.ir as ir
+import datamaestro_text.data.ir.formats as formats
+from datamaestro_text.data.conversation.base import (
+    AnswerDocumentID,
+    AnswerEntry,
+    ConversationHistoryItem,
+    ConversationTreeNode,
+    DecontextualizedDictItem,
+    EntryType,
+)
 from datamaestro_text.data.ir.base import (
-    Record,
-    TopicRecord,
-    DocumentRecord,
-    SimpleTextItem,
     AdhocAssessedTopic,
-    SimpleAdhocAssessment,
+    DocumentRecord,
     IDItem,
+    Record,
+    SimpleAdhocAssessment,
+    SimpleTextItem,
+    TopicRecord,
     create_record,
 )
-import datamaestro_text.data.ir.formats as formats
 # Interface between ir_datasets and datamaestro:
 # provides adapted data types
@@ -109,6 +117,12 @@ class Documents(ir.DocumentStore, IRDSId):
         _irds.beir.BeirTitleUrlDoc: tuple_constructor(
             formats.TitleUrlDocument, "doc_id", "text", "title", "url"
         ),
+        _irds.beir.BeirToucheDoc: tuple_constructor(
+            formats.Touche2020, "doc_id", "text", "title", "stance", "url"
+        ),
+        _irds.beir.BeirSciDoc: tuple_constructor(
+            formats.SciDocs, "doc_id", "text", "title", "authors", "year", "cited_by", "references"
+        ),
         _irds.msmarco_document.MsMarcoDocument: tuple_constructor(
             formats.MsMarcoDocument, "doc_id", "url", "title", "body"
         ),
@@ -216,20 +230,6 @@ if hasattr(_irds, "miracl"):
     )
-# Fix while PR https://github.com/allenai/ir_datasets/pull/252
-# is not in.
-class DMPickleLz4FullStore(PickleLz4FullStore):
-    def get_many(self, doc_ids, field=None):
-        result = {}
-        field_idx = self._doc_cls._fields.index(field) if field is not None else None
-        for doc in self.get_many_iter(doc_ids):
-            if field is not None:
-                result[getattr(doc, self._id_field)] = doc[field_idx]
-            else:
-                result[getattr(doc, self._id_field)] = doc
-        return result
 class LZ4DocumentStore(ir.DocumentStore):
     """A LZ4-based document store"""
@@ -243,7 +243,7 @@ class LZ4DocumentStore(ir.DocumentStore):
     @cached_property
     def store(self):
-        return DMPickleLz4FullStore(
+        return PickleLz4FullStore(
             self.path, None, self.data_cls, self.lookup_field, self.index_fields
         )
@@ -262,10 +262,10 @@ class LZ4DocumentStore(ir.DocumentStore):
         retrieved = self.store.get_many(docids)
         return [self.converter(retrieved[docid]) for docid in docids]
+    @abstractmethod
     def converter(self, data):
-        """Converts a document from LZ4 tuples to any other format"""
-        # By default, use identity
-        return data
+        """Converts a document from LZ4 tuples to a document record"""
+        ...
     def iter(self) -> Iterator[DocumentRecord]:
         """Returns an iterator over documents"""
@@ -278,6 +278,25 @@ class LZ4DocumentStore(ir.DocumentStore):
         return self.store.count()
+class SimpleJsonDocument(NamedTuple):
+    id: str
+    text: str
+class LZ4JSONLDocumentStore(LZ4DocumentStore):
+    jsonl_path: Meta[Path]
+    """json-l based document store
+    Each line is of the form
+    ```json
+    { "id": "...", "text": "..." }
+    ```
+    """
+    def converter(self, data):
+        return DocumentRecord(IDItem(data["id"]), SimpleTextItem(data["text"]))
 class TopicsHandler(ABC):
     @abstractmethod
     def topic_int(self, internal_topic_id: int) -> TopicRecord:
@@ -349,6 +368,12 @@ class Topics(ir.TopicsStore, IRDSId):
         TrecQuery: tuple_constructor(
             formats.TrecTopic, "query_id", "title", "description", "narrative"
         ),
+        _irds.beir.BeirToucheQuery: tuple_constructor(
+            formats.TrecTopic, "query_id", "text", "description", "narrative"
+        ),
+        _irds.beir.BeirSciQuery: tuple_constructor(
+            formats.SciDocsTopic, "query_id", "text", "authors", "year", "cited_by", "references"
+        ),
         _irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
             formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
         ),
@@ -392,197 +417,190 @@ class Topics(ir.TopicsStore, IRDSId):
         return self.handler.iter()
-if hasattr(_irds.trec_cast, "Cast2022Query"):
-    from datamaestro_text.data.conversation.base import (
-        ConversationTreeNode,
-        DecontextualizedDictItem,
-        AnswerDocumentID,
-        ConversationHistoryItem,
-        EntryType,
-    )
+class CastTopicsHandler(TopicsHandler):
+    def __init__(self, dataset):
+        self.dataset = dataset
+    @cached_property
+    def ext2records(self):
+        return {record[IDItem].id: record for record in self.records}
+    def topic_int(self, internal_topic_id: int) -> TopicRecord:
+        """Returns a document given its internal ID"""
+        return self.records[internal_topic_id]
+    def topic_ext(self, external_topic_id: str) -> TopicRecord:
+        """Returns a document given its external ID"""
+        return self.ext2records[external_topic_id]
+    def iter(self) -> Iterator[ir.TopicRecord]:
+        """Returns an iterator over topics"""
+        return iter(self.records)
-    class CastTopicsHandler(TopicsHandler):
-        def __init__(self, dataset):
-            self.dataset = dataset
-        @property
-        @abstractmethod
-        def records(self):
-            ...
-        @cached_property
-        def ext2records(self):
-            return {record[IDItem].id: record for record in self.records}
-        def topic_int(self, internal_topic_id: int) -> TopicRecord:
-            """Returns a document given its internal ID"""
-            return self.records[internal_topic_id]
-        def topic_ext(self, external_topic_id: str) -> TopicRecord:
-            """Returns a document given its external ID"""
-            return self.ext2records[external_topic_id]
-        def iter(self) -> Iterator[ir.TopicRecord]:
-            """Returns an iterator over topics"""
-            return iter(self.records)
-        @cached_property
-        def records(self):
-            try:
-                topic_number = None
-                node = None
-                conversation = []
-                records = []
-                for query in self.dataset.dataset.queries_iter():
-                    decontextualized = DecontextualizedDictItem(
-                        "manual",
-                        {
-                            "manual": query.manual_rewritten_utterance,
-                            "auto": query.automatic_rewritten_utterance,
-                        },
+    @cached_property
+    def records(self):
+        try:
+            topic_number = None
+            node = None
+            conversation = []
+            records = []
+            for query in self.dataset.dataset.queries_iter():
+                decontextualized = DecontextualizedDictItem(
+                    "manual",
+                    {
+                        "manual": query.manual_rewritten_utterance,
+                        "auto": query.automatic_rewritten_utterance,
+                    },
+                )
+                is_new_conversation = topic_number != query.topic_number
+                topic = Record(
+                    IDItem(query.query_id),
+                    SimpleTextItem(query.raw_utterance),
+                    decontextualized,
+                    ConversationHistoryItem(
+                        [] if is_new_conversation else node.conversation(False)
+                    ),
+                    EntryType.USER_QUERY,
+                )
+                if is_new_conversation:
+                    conversation = []
+                    node = ConversationTreeNode(topic)
+                    topic_number = query.topic_number
+                else:
+                    node = node.add(ConversationTreeNode(topic))
+                records.append(topic)
+                conversation.append(node)
+                node = node.add(
+                    ConversationTreeNode(
+                        Record(
+                            AnswerDocumentID(self.get_canonical_result_id(query)),
+                            EntryType.SYSTEM_ANSWER,
+                        )
                     )
+                )
+                conversation.append(node)
+        except Exception:
+            logging.exception("Error while computing topic records")
+            raise
+        return records
+    @staticmethod
+    def get_canonical_result_id():
+        return None
-                    is_new_conversation = topic_number != query.topic_number
+class Cast2020TopicsHandler(CastTopicsHandler):
+    @staticmethod
+    def get_canonical_result_id(query: _irds.trec_cast.Cast2020Query):
+        return query.manual_canonical_result_id
+class Cast2021TopicsHandler(CastTopicsHandler):
+    @staticmethod
+    def get_canonical_result_id(query: _irds.trec_cast.Cast2021Query):
+        return query.canonical_result_id
+class Cast2022TopicsHandler(CastTopicsHandler):
+    def __init__(self, dataset):
+        self.dataset = dataset
+    @cached_property
+    def records(self):
+        try:
+            records = []
+            nodes: Dict[str, ConversationTreeNode] = {}
+            for (
+                query
+            ) in (
+                self.dataset.dataset.queries_iter()
+            ):  # type: _irds.trec_cast.Cast2022Query
+                parent = nodes[query.parent_id] if query.parent_id else None
+                if query.participant == "User":
                     topic = Record(
                         IDItem(query.query_id),
                         SimpleTextItem(query.raw_utterance),
-                        decontextualized,
+                        DecontextualizedDictItem(
+                            "manual",
+                            {
+                                "manual": query.manual_rewritten_utterance,
+                            },
+                        ),
                         ConversationHistoryItem(
-                            [] if is_new_conversation else node.conversation(False)
+                            parent.conversation(False) if parent else []
                         ),
                         EntryType.USER_QUERY,
                     )
-                    if is_new_conversation:
-                        conversation = []
-                        node = ConversationTreeNode(topic)
-                        topic_number = query.topic_number
-                    else:
-                        node = node.add(ConversationTreeNode(topic))
+                    node = ConversationTreeNode(topic)
                     records.append(topic)
-                    conversation.append(node)
-                    node = node.add(
-                        ConversationTreeNode(
-                            Record(
-                                AnswerDocumentID(self.get_canonical_result_id(query)),
-                                EntryType.SYSTEM_ANSWER,
-                            )
+                else:
+                    node = ConversationTreeNode(
+                        Record(
+                            AnswerEntry(query.response),
+                            EntryType.SYSTEM_ANSWER,
                         )
                     )
-                    conversation.append(node)
-            except Exception:
-                logging.exception("Error while computing topic records")
-                raise
-            return records
-        @staticmethod
-        def get_canonical_result_id():
-            return None
-    class Cast2020TopicsHandler(CastTopicsHandler):
-        @staticmethod
-        def get_canonical_result_id(query: _irds.trec_cast.Cast2020Query):
-            return query.manual_canonical_result_id
-    class Cast2021TopicsHandler(CastTopicsHandler):
-        @staticmethod
-        def get_canonical_result_id(query: _irds.trec_cast.Cast2021Query):
-            return query.canonical_result_id
-    class Cast2022TopicsHandler(CastTopicsHandler):
-        def __init__(self, dataset):
-            self.dataset = dataset
-        @cached_property
-        def records(self):
-            try:
-                records = []
-                nodes: Dict[str, ConversationTreeNode] = {}
-                for (
-                    query
-                ) in (
-                    self.dataset.dataset.queries_iter()
-                ):  # type: _irds.trec_cast.Cast2022Query
-                    parent = nodes[query.parent_id] if query.parent_id else None
-                    if query.participant == "User":
-                        topic = Record(
-                            IDItem(query.query_id),
-                            SimpleTextItem(query.raw_utterance),
-                            DecontextualizedDictItem(
-                                "manual",
-                                {
-                                    "manual": query.manual_rewritten_utterance,
-                                },
-                            ),
-                            ConversationHistoryItem(
-                                parent.conversation(False) if parent else []
-                            ),
-                            EntryType.USER_QUERY,
-                        )
-                        node = ConversationTreeNode(topic)
-                        records.append(topic)
-                    else:
-                        node = ConversationTreeNode(
-                            Record(
-                                AnswerEntry(query.response),
-                                EntryType.SYSTEM_ANSWER,
-                            )
-                        )
-                    nodes[query.query_id] = node
-                    if parent:
-                        parent.add(node)
-            except Exception:
-                logging.exception("Error while computing topic records")
-                raise
-            return records
-    Topics.HANDLERS.update(
-        {
-            # _irds.trec_cast.Cast2019Query: Cast2019TopicsHandler,
-            _irds.trec_cast.Cast2020Query: Cast2020TopicsHandler,
-            _irds.trec_cast.Cast2021Query: Cast2021TopicsHandler,
-            _irds.trec_cast.Cast2022Query: Cast2022TopicsHandler,
-        }
-    )
+                nodes[query.query_id] = node
+                if parent:
+                    parent.add(node)
+        except Exception:
+            logging.exception("Error while computing topic records")
+            raise
-    class CastDocHandler:
-        def check(self, cls):
-            assert issubclass(cls, _irds.trec_cast.CastDoc)
+        return records
-        @cached_property
-        def target_cls(self):
-            return formats.TitleUrlDocument
-        def __call__(self, _, doc: _irds.trec_cast.CastDoc):
-            return Record(
-                IDItem(doc.doc_id), formats.SimpleTextItem(" ".join(doc.passages))
-            )
+Topics.HANDLERS.update(
+    {
+        # _irds.trec_cast.Cast2019Query: Cast2019TopicsHandler,
+        _irds.trec_cast.Cast2020Query: Cast2020TopicsHandler,
+        _irds.trec_cast.Cast2021Query: Cast2021TopicsHandler,
+        _irds.trec_cast.Cast2022Query: Cast2022TopicsHandler,
+    }
+)
-    class CastPassageDocHandler:
-        def check(self, cls):
-            assert issubclass(cls, _irds.trec_cast.CastPassageDoc)
-        @cached_property
-        def target_cls(self):
-            return formats.TitleUrlDocument
+class CastDocHandler:
+    def check(self, cls):
+        assert issubclass(cls, _irds.trec_cast.CastDoc)
+    @cached_property
+    def target_cls(self):
+        return formats.TitleUrlDocument
+    def __call__(self, _, doc: _irds.trec_cast.CastDoc):
+        return Record(
+            IDItem(doc.doc_id), formats.SimpleTextItem(" ".join(doc.passages))
+        )
+class CastPassageDocHandler:
+    def check(self, cls):
+        assert issubclass(cls, _irds.trec_cast.CastPassageDoc)
+    @cached_property
+    def target_cls(self):
+        return formats.TitleUrlDocument
+    def __call__(self, _, doc: _irds.trec_cast.CastPassageDoc):
+        return Record(
+            IDItem(doc.doc_id),
+            formats.TitleUrlDocument(doc.text, doc.title, doc.url),
+        )
-        def __call__(self, _, doc: _irds.trec_cast.CastPassageDoc):
-            return Record(
-                IDItem(doc.doc_id),
-                formats.TitleUrlDocument(doc.text, doc.title, doc.url),
-            )
-    Documents.CONVERTERS[_irds.trec_cast.CastDoc] = CastDocHandler()
-    Documents.CONVERTERS[_irds.trec_cast.CastPassageDoc] = CastPassageDocHandler()
+Documents.CONVERTERS[_irds.trec_cast.CastDoc] = CastDocHandler()
+Documents.CONVERTERS[_irds.trec_cast.CastPassageDoc] = CastPassageDocHandler()
 class Adhoc(ir.Adhoc, IRDSId):

datamaestro_text/datasets/irds/helpers.py CHANGED Viewed

@@ -1,13 +1,13 @@
 import logging
 from typing import Optional, Type, Callable, Iterator
 from ir_datasets.indices import PickleLz4FullStore
-from datamaestro.download import Download
+from datamaestro.download import Resource
 from datamaestro.utils import FileChecker
 from pathlib import Path
 import urllib3
-class lz4docstore_downloader(Download):
+class lz4docstore_downloader(Resource):
     """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents"""
     def __init__(
@@ -69,3 +69,59 @@ class lz4docstore_downloader(Download):
             # All good!
             (destination / "done").touch()
+class lz4docstore_builder(Resource):
+    def __init__(
+        self,
+        name: str,
+        iter_factory: Callable[[], Iterator],
+        doc_cls: Type,
+        lookup_field: str,
+        *,
+        count_hint: Optional[int] = None,
+    ):
+        """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents
+        :param name: The name of the variable for path construction
+        :param iter_factory: Iterator over documents
+        :param doc_cls: The class of documents (must be a dataclass because of how ir-datasets works)
+        :param lookup_field: Which field to use for lookup
+        :param count_hint: Number of documents (hint), defaults to None
+        """
+        super().__init__(name)
+        self.iter_factory = iter_factory
+        self.doc_cls = doc_cls
+        self.lookup_field = lookup_field
+        self.count_hint = count_hint
+    def prepare(self):
+        return self.definition.datapath / self.varname
+    def download(self, force=False):
+        # Creates directory if needed
+        destination = self.definition.datapath / self.varname
+        destination.mkdir(exist_ok=True)
+        # Early exit
+        if (destination / "done").is_file() and not force:
+            return True
+        # Download (cache)
+        logging.info("Building the document index")
+        # Builds the LZ4 store
+        store = PickleLz4FullStore(
+            destination,
+            lambda: self.iter_factory(),
+            self.doc_cls,
+            lookup_field=self.lookup_field,
+            index_fields=[self.lookup_field],
+            key_field_prefix=None,
+            size_hint=None,
+            count_hint=self.count_hint,
+        )
+        store.build()
+        # All good!
+        (destination / "done").touch()

datamaestro_text/version.py CHANGED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '2024.5.31'
-__version_tuple__ = version_tuple = (2024, 5, 31)
+__version__ = version = '2025.1.7'
+__version_tuple__ = version_tuple = (2025, 1, 7)

{datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.1.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamaestro-text
-Version: 2024.5.31
+Version: 2025.1.7
 Summary: Datamaestro module for text-related datasets
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
 License: GPL-3
@@ -18,8 +18,8 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: datamaestro >=1.1.1
-Requires-Dist: ir-datasets
+Requires-Dist: datamaestro>=1.2.1
+Requires-Dist: ir_datasets>=0.5.8
 Requires-Dist: attrs
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)

{datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.1.7.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,12 @@
 datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
-datamaestro_text/version.py,sha256=fwtF2zbaLZ1a4gnsJGlnkD1w9QKZyClNJUGeq39EhTE,419
+datamaestro_text/version.py,sha256=k65KHkS4PD3NjDDCJgWcTwRBE4yVcKs32B8SUxv2DvE,417
 datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
 datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
 datamaestro_text/config/com/sentiment140.py,sha256=iRV_rSjQcr9WNjjQ4EdznaEMGLADV_rwpeRx7ycQi0k,1383
 datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
 datamaestro_text/config/com/github/aagohary/canard.py,sha256=tIwb_KxMUR3st7rzQUkt6rIjolTl8uKvDq6t795b1nY,1468
-datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=prljhI25mZn4NqUwu5sfntvvzLI1-Twpe_tJYjUoWDo,1444
+datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=_lufJlg-4zaQyjr5Ae-X-9hXzpl2JFjfxh_RhnBbva4,3068
 datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=Tg3hbnvilxW_Lwt7fpKvGMtu-6mc9oNIHM-LX6JTR1U,3026
 datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
 datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
@@ -46,7 +46,7 @@ datamaestro_text/data/recommendation.py,sha256=MatelpJQiCMpNDuaQVBkRY809J1qiQo-8
 datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
 datamaestro_text/data/text.py,sha256=pOI8nrEd6RoQA28DVH1JufHTunr9vG3FQzwElR8YirI,499
 datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
-datamaestro_text/data/conversation/base.py,sha256=FWA4-5corSZUuRMzpewOBXPDG2YR60j5geZmN-SaXrg,6451
+datamaestro_text/data/conversation/base.py,sha256=zXX5sPMoVtDf4WTkLx45IvZz6lgKPgcUZFu6N4lVBlc,6457
 datamaestro_text/data/conversation/canard.py,sha256=IMxu5NfytWZLZ_cyT8UFOICbDE82HLJoNBT_6j36Faw,1808
 datamaestro_text/data/conversation/orconvqa.py,sha256=VG3GV5_IPQcvla9rrQPypDNcZYmKNMgozmr2oudeLA4,3802
 datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
@@ -55,15 +55,15 @@ datamaestro_text/data/ir/base.py,sha256=Cw8egjChpx4ksUwp-vTA70B2OWxROH7FIeJylsXg
 datamaestro_text/data/ir/cord19.py,sha256=7xDIzsy63WrA9lxxyNOMu5ECRymu5x23EzYG977nS6Y,1440
 datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
 datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
-datamaestro_text/data/ir/formats.py,sha256=wgjXIkNJjqRbHEMkkXyXRRMnxnho45jfUbPsJCazkZk,2866
+datamaestro_text/data/ir/formats.py,sha256=rU9uJhdFDdlnQ2qfFowK_--1WVOH1fVgfXWOPPDF_FY,3160
 datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
-datamaestro_text/data/ir/stores.py,sha256=odp1XoCq-FakKICXsMBCxzJlx77j71QPKzyLnMg0xGA,733
+datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
 datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
 datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
 datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
-datamaestro_text/datasets/irds/data.py,sha256=EsqaY5UNbtQGMdEqUn5tlxW-k2LiaJ0jiD_6vVtZuU8,20261
+datamaestro_text/datasets/irds/data.py,sha256=BTsFZdjFJ0XdlYRh2rcshAoeknHrypf7be2BGUbWWFk,19931
 datamaestro_text/datasets/irds/datasets.py,sha256=qtN-nX2_OV9FD339aZjwVL1jFf6I4T6MbNCcuRAjmtU,5682
-datamaestro_text/datasets/irds/helpers.py,sha256=KC-2nQPCIl4VnbfDkAkr4iFlhkknn8zvbADlClWZvwU,2207
+datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
 datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
 datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
 datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
@@ -78,9 +78,9 @@ datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1v
 datamaestro_text/utils/iter.py,sha256=2_UZ8y9Ma4k5U9ZD4w55Zfb6NGrKM1L4G40OygRm1is,2459
 datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
 datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
-datamaestro_text-2024.5.31.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-datamaestro_text-2024.5.31.dist-info/METADATA,sha256=MGRuQbJdMtcfGAGdF0MqDiPcR7NABD7PhGIMVnf71aY,1604
-datamaestro_text-2024.5.31.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-datamaestro_text-2024.5.31.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
-datamaestro_text-2024.5.31.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
-datamaestro_text-2024.5.31.dist-info/RECORD,,
+datamaestro_text-2025.1.7.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+datamaestro_text-2025.1.7.dist-info/METADATA,sha256=-wsBcUcnEnOMiHb1ROIf43r55ZNNNjpIemuFXm5hHUY,1609
+datamaestro_text-2025.1.7.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
+datamaestro_text-2025.1.7.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
+datamaestro_text-2025.1.7.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
+datamaestro_text-2025.1.7.dist-info/RECORD,,

{datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.1.7.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.43.0)
+Generator: setuptools (75.7.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.1.7.dist-info}/LICENSE RENAMED Viewed

File without changes

{datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.1.7.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.1.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

datamaestro-text 2024.5.31__py3-none-any.whl → 2025.1.7__py3-none-any.whl

datamaestro-text 2024.5.31__py3-none-any.whl → 2025.1.7__py3-none-any.whl