PyPI - datamaestro-text - Versions diffs - 2024.3.10__py3-none-any.whl → 2025.1.7__py3-none-any.whl - Mend

datamaestro-text 2024.3.10 (py3-none-any.whl) → 2025.1.7 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

datamaestro_text/config/com/github/apple/ml-qrecc.py ADDED Viewed

@@ -0,0 +1,87 @@
+# See documentation on https://datamaestro.readthedocs.io
+import re
+import json
+from pathlib import Path
+from datamaestro.definitions import datatasks, datatags, dataset
+from datamaestro.data.ml import Supervised
+from datamaestro.download import reference
+from datamaestro.download.archive import zipdownloader
+from datamaestro.download.wayback import wayback_documents
+from datamaestro.utils import HashCheck
+from datamaestro_text.data.conversation.qrecc import QReCCDataset
+from datamaestro_text.datasets.irds.data import (
+    LZ4JSONLDocumentStore,
+    SimpleJsonDocument,
+)
+from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
+@datatags("conversation", "context", "query")
+@datatasks("query rewriting")
+@zipdownloader(
+    "data",
+    "https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
+    checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
+)
+@dataset(
+    Supervised[QReCCDataset, None, QReCCDataset],
+    url="https://github.com/apple/ml-qrecc",
+    doi="https://doi.org/10.48550/arXiv.2010.04898",
+    id="",
+)
+def main(data: Path):
+    """Open-Domain Question Answering Goes Conversational via Question Rewriting
+
+    We introduce QReCC (Question Rewriting in Conversational Context), an
+    end-to-end open-domain question answering dataset comprising 14K
+    conversations with 81K question-answer pairs. The goal of this dataset is to
+    provide a challenging benchmark for end-to-end conversational question
+    answering that includes the individual subtasks of question rewriting,
+    passage retrieval and reading comprehension.
+    """
+    # One QReCCDataset per split, both read from JSON files under `data`
+    return {
+        "train": QReCCDataset(path=data / "qrecc_train.json"),
+        "test": QReCCDataset(path=data / "qrecc_test.json"),
+    }
+@dataset(
+    url="https://github.com/apple/ml-qrecc",
+    doi="https://doi.org/10.48550/arXiv.2010.04898",
+)
+class Content(LZ4JSONLDocumentStore):
+    """QReCC mentionned URLs content"""
+    @staticmethod
+    def __create_dataset__(dataset, options=None):
+        ds = reference(reference=main).setup(dataset, options)
+        documents_path = wayback_documents(
+            "20191127", lambda: Content._urls(ds), name="wayback.jsonl"
+        ).setup(dataset, options)
+        store_path = lz4docstore_builder(
+            "store",
+            lambda: Content._documents(documents_path),
+            SimpleJsonDocument,
+            "id",
+        ).setup(dataset, options)
+        return LZ4JSONLDocumentStore(jsonl_path=store_path)
+    @staticmethod
+    def _documents(path: Path):
+        """Iterates over documents from wayback"""
+        with path.open("rt") as fp:
+            for line in fp:
+                yield SimpleJsonDocument(**json.loads(line))
+    @staticmethod
+    def _urls(supervised: Supervised[QReCCDataset, None, QReCCDataset]):
+        urls = set()
+        for ds in [supervised.train, supervised.test]:
+            for entry in ds.entries():
+                if entry.answer_url:
+                    url = re.sub("#.*$", "", entry.answer_url)
+                    urls.add(url)
+        return urls

datamaestro_text/config/com/github/prdwb/orconvqa.py CHANGED Viewed

@@ -1,11 +1,9 @@
 # See documentation on https://datamaestro.readthedocs.io
-from collections import namedtuple
 import gzip
 import json
 from pathlib import Path
-from typing import Iterator, NamedTuple
-import attrs
+from typing import Iterator
 from datamaestro.definitions import datatasks, datatags, dataset
 from datamaestro.download.single import filedownloader
 from datamaestro.utils import HashCheck
@@ -14,10 +12,7 @@ from datamaestro.utils import HashCheck
 from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
 from datamaestro.data.ml import Supervised
-from datamaestro_text.data.ir import DocumentStore
-from datamaestro_text.data.ir.formats import OrConvQADocument
 from datamaestro_text.data.ir.stores import OrConvQADocumentStore
-from datamaestro_text.datasets.irds.data import LZ4DocumentStore
 from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader
@@ -63,7 +58,9 @@ def preprocessed(train, dev, test):
 def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
     with gzip.open(source, "rt") as fp:
         for line in fp:
-            yield OrConvQADocumentStore.NAMED_TUPLE(**json.loads(line))
+            data = json.loads(line)
+            data["body"] = data.pop("text")
+            yield OrConvQADocumentStore.NAMED_TUPLE(**data)
 @lz4docstore_downloader(

datamaestro_text/data/conversation/base.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import Dict, Generic, Iterator, List, Optional, Sequence
+from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
 from attr import define
 from datamaestro.data import Base
 from datamaestro.record import Record, Item
@@ -61,6 +61,20 @@ class AnswerEntry(Item):
     """The system answer"""
+@define
+class AnswerDocumentID(Item):
+    """An answer as a document ID"""
+    # Identifier of the document that constitutes the answer
+    document_id: str
+@define
+class AnswerDocumentURL(Item):
+    """An answer as a document URL"""
+    # URL of the document that constitutes the answer
+    url: str
 @define
 class RetrievedEntry(Item):
     """List of system-retrieved documents and their relevance"""
@@ -68,8 +82,8 @@ class RetrievedEntry(Item):
     documents: List[str]
     """List of retrieved documents"""
-    document_relevances: Optional[List[str]] = None
-    """List of retrieved documents and their relevance status"""
+    relevant_documents: Optional[Dict[int, Tuple[Optional[int], Optional[int]]]] = None
+    """List of relevance status (optional), with start/stop position"""
 @define
@@ -95,56 +109,99 @@ class ConversationHistoryItem(Item):
 class ConversationNode:
+    @abstractmethod
     def entry(self) -> Record:
         """The current conversation entry"""
         ...
+    @abstractmethod
     def history(self) -> ConversationHistory:
         """Preceding conversation entries, from most recent to more ancient"""
         ...
+    @abstractmethod
+    def parent(self) -> Optional["ConversationNode"]:
+        ...
+    @abstractmethod
+    def children(self) -> List["ConversationNode"]:
+        ...
-class ConversationTree:
+class ConversationTree(ABC):
+    @abstractmethod
+    def root(self) -> ConversationNode:
+        ...
+    @abstractmethod
     def __iter__(self) -> Iterator[ConversationNode]:
         """Iterates over conversation nodes"""
-        pass
+        ...
 # ---- A conversation tree
-class SingleConversationTree(ConversationTree):
+class SingleConversationTree(ConversationTree, ABC):
     """Simple conversations, based on a sequence of entries"""
     id: str
-    history: Sequence[Record]
+    history: List[Record]
     def __init__(self, id: Optional[str], history: List[Record]):
         """Create a simple conversation
-        :param history: The entries, in reverse order (i.e. more ancient first)
+        :param history: The entries, in **reverse** order (i.e. more ancient first)
         """
         self.history = history or []
+        self.id = id
     def add(self, entry: Record):
         self.history.insert(0, entry)
     def __iter__(self) -> Iterator[ConversationNode]:
-        for ix in range(len(self.history)):
+        """Iterates over the conversation (starting with the beginning)"""
+        for ix in reversed(range(len(self.history))):
             yield SingleConversationTreeNode(self, ix)
+    def root(self):
+        return SingleConversationTreeNode(self, len(self.history) - 1)
 @define
 class SingleConversationTreeNode(ConversationNode):
     tree: SingleConversationTree
     index: int
+    @property
     def entry(self) -> Record:
         return self.tree.history[self.index]
+    @entry.setter
+    def entry(self, record: Record):
+        try:
+            self.tree.history[self.index] = record
+        except Exception as e:
+            print(e)
+            raise
     def history(self) -> Sequence[Record]:
         return self.tree.history[self.index + 1 :]
+    def parent(self) -> Optional[ConversationNode]:
+        return (
+            SingleConversationTreeNode(self.tree, self.index + 1)
+            if self.index < len(self.tree.history) - 1
+            else []
+        )
+    def children(self) -> List[ConversationNode]:
+        """Returns the node of the following entry (at ``index - 1``) as a
+        one-element list, or an empty list when this node is at index 0"""
+        return (
+            [SingleConversationTreeNode(self.tree, self.index - 1)]
+            if self.index > 0
+            else []
+        )
 class ConversationTreeNode(ConversationNode, ConversationTree):
     """A conversation tree node"""
@@ -178,6 +235,15 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
         for child in self.children:
             yield from child
+    def parent(self) -> Optional[ConversationNode]:
+        # NOTE(review): this method has the same name as the `self.parent` it
+        # returns. Unless `parent` is set as an instance attribute elsewhere in
+        # the class (not visible here), `self.parent` is this bound method and
+        # not a node -- confirm.
+        return self.parent
+    def children(self) -> List[ConversationNode]:
+        # NOTE(review): `self.children` is iterated as a collection in
+        # __iter__, so it looks like an instance attribute that shares its name
+        # with (and would shadow) this method -- confirm against the class body.
+        return self.children
+    def root(self):
+        """Returns this node itself as the root of the tree it represents"""
+        return self
 class ConversationDataset(Base, ABC):
     """A dataset made of conversations"""
@@ -186,4 +252,4 @@ class ConversationDataset(Base, ABC):
     def __iter__(self) -> Iterator[ConversationTree]:
         """Return an iterator over conversations"""
         for i in range(len(self)):
-            return self.get(i)
+            yield self.get(i)

datamaestro_text/data/conversation/orconvqa.py CHANGED Viewed

@@ -102,16 +102,26 @@ class OrConvQADataset(ConversationDataset, File):
             # Add to current
             history.append(
                 Record(
-                    IDItem(query_no),
+                    IDItem(entry.query_id),
                     SimpleTextItem(entry.query),
                     SimpleDecontextualizedItem(entry.rewrite),
                     EntryType.USER_QUERY,
                 )
             )
+            relevances = {}
+            for rank, relevance in enumerate(entry.retrieval_labels):
+                if relevance > 0:
+                    relevances[rank] = (entry.answer.answer_start, None)
+            assert (
+                len(relevances) <= 1
+            ), f"Too many relevance labels ({len(relevances)}) for {entry.query_id}"
             history.append(
                 Record(
                     AnswerEntry(entry.answer.text),
-                    RetrievedEntry(entry.evidences, entry.retrieval_labels),
+                    RetrievedEntry(entry.evidences, relevances),
                     EntryType.SYSTEM_ANSWER,
                 )
             )

datamaestro_text/data/conversation/qrecc.py ADDED Viewed

@@ -0,0 +1,99 @@
+from typing import Iterator, List, Optional
+from attr import define
+import json
+from datamaestro.data import File
+from datamaestro.record import Record
+from datamaestro_text.data.ir.base import (
+    IDItem,
+    SimpleTextItem,
+)
+from .base import (
+    AnswerDocumentURL,
+    AnswerEntry,
+    ConversationTree,
+    EntryType,
+    SimpleDecontextualizedItem,
+    SingleConversationTree,
+)
+from . import ConversationDataset
+@define(kw_only=True)
+class QReCCDatasetEntry:
+    """One QReCC entry: a question with its context, rewrite and answer
+
+    All fields are keyword-only; QReCCDataset.entries() builds instances from
+    the lower-cased keys of each JSON entry.
+    """
+    conversation_no: int
+    """Conversation ID"""
+    turn_no: int
+    """The turn in the conversation"""
+    conversation_source: str
+    """Conversation source"""
+    question: str
+    """The last issued query"""
+    rewrite: str
+    """Manually rewritten query"""
+    context: List[str]
+    """The list of queries asked by the user"""
+    answer: str
+    """The answer"""
+    answer_url: str
+    """The URL containing the answer"""
+class QReCCDataset(ConversationDataset, File):
+    def entries(self) -> Iterator[QReCCDatasetEntry]:
+        """Iterates over re-written query with their context"""
+        with self.path.open("rt") as fp:
+            data = json.load(fp)
+        data = [
+            QReCCDatasetEntry(**{key.lower(): value for key, value in entry.items()})
+            for entry in data
+        ]
+        return iter(data)
+    def __iter__(self) -> Iterator[ConversationTree]:
+        history: List[Record] = []
+        current_id: Optional[str] = None
+        for entry in self.entries():
+            # Creates a new conversation if needed
+            if entry.conversation_no != current_id:
+                if current_id is not None:
+                    history.reverse()
+                    yield SingleConversationTree(current_id, history)
+                current_id = entry.conversation_no
+                history = []
+            # Add to current
+            history.append(
+                Record(
+                    IDItem(f"{entry.conversation_no}#{entry.turn_no}"),
+                    SimpleTextItem(entry.question),
+                    AnswerDocumentURL(entry.answer_url),
+                    SimpleDecontextualizedItem(entry.rewrite),
+                    EntryType.USER_QUERY,
+                )
+            )
+            history.append(
+                Record(
+                    AnswerEntry(entry.answer),
+                    EntryType.SYSTEM_ANSWER,
+                )
+            )
+        # Yields the last one
+        history.reverse()
+        yield SingleConversationTree(current_id, history)

datamaestro_text/data/ir/__init__.py CHANGED Viewed

@@ -25,6 +25,7 @@ from .base import (  # noqa: F401
     create_record,
     # Other things
     AdhocAssessment,
+    AdhocAssessedTopic,
 )
@@ -83,7 +84,7 @@ class DocumentStore(Documents):
     def document_int(self, internal_docid: int) -> DocumentRecord:
         """Returns a document given its internal ID"""
         docid = self.docid_internal2external(internal_docid)
-        return self.document(docid)
+        return self.document_ext(docid)
     def document_ext(self, docid: str) -> DocumentRecord:
         """Returns a document given its external ID"""
@@ -159,7 +160,7 @@ class TopicsStore(Topics):
 class AdhocAssessments(Base, ABC):
     """Ad-hoc assessments (qrels)"""
-    def iter(self) -> Iterator[AdhocAssessment]:
+    def iter(self) -> Iterator[AdhocAssessedTopic]:
         """Returns an iterator over assessments"""
         raise NotImplementedError(f"For class {self.__class__}")

datamaestro_text/data/ir/formats.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from functools import cached_property
-from typing import ClassVar, Tuple
+from typing import ClassVar, Tuple, List
 from attrs import define
 from datamaestro.record import record_type
 from ir_datasets.datasets.wapo import WapoDocMedia
@@ -117,7 +117,6 @@ class TweetDoc(TextItem):
 @define
 class OrConvQADocument(TextItem):
-    id: str
     title: str
     body: str
     aid: str
@@ -127,12 +126,21 @@ class OrConvQADocument(TextItem):
     def text(self):
         return f"{self.title} {self.body}"
+@define
+class Touche2020(TextItem):
+    """Text item with a title, a stance and a URL in addition to its text"""
+    text: str
+    title: str
+    stance: str
+    url: str
 @define
-class TrecTopic(TextItem):
+class SciDocs(TextItem):
     text: str
-    query: str
-    narrative: str
+    title: str
+    authors: List[str]
+    year: int
+    cited_by: List[str]
+    references: List[str]
 @define
@@ -167,6 +175,13 @@ class TrecMb14Query(TextItem):
     def get_text(self):
         return f"{self.query}"
+@define
+class SciDocsTopic(TextItem):
+    """Topic with a text, authors, a year, and cited-by / reference lists"""
+    text: str
+    authors: List[str]
+    year: int
+    cited_by: List[str]
+    references: List[str]
 @define()
 class TrecTopic(SimpleTextItem):

datamaestro_text/data/ir/stores.py CHANGED Viewed

@@ -1,16 +1,21 @@
 from collections import namedtuple
-from typing import List
+from typing import List, NamedTuple
 from experimaestro import Constant
 import attrs
+from datamaestro.record import Record
+from datamaestro_text.data.ir.base import IDItem
 from datamaestro_text.datasets.irds.data import LZ4DocumentStore
 from datamaestro_text.data.ir.formats import OrConvQADocument
 class OrConvQADocumentStore(LZ4DocumentStore):
-    NAMED_TUPLE = namedtuple(
-        "OrConvQADocument", [a.name for a in attrs.fields(OrConvQADocument)]
-    )
+    class NAMED_TUPLE(NamedTuple):
+        id: str
+        title: str
+        body: str
+        aid: str
+        bid: int
     lookup_field: Constant[str] = "id"
     fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
@@ -18,5 +23,7 @@ class OrConvQADocumentStore(LZ4DocumentStore):
     data_cls = NAMED_TUPLE
-    def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
-        return OrConvQADocument(**data._asdict())
+    def converter(self, data: NAMED_TUPLE) -> Record:
+        fields = data._asdict()
+        del fields["id"]
+        return Record(OrConvQADocument(**fields), IDItem(data.id))

datamaestro_text/datasets/irds/data.py CHANGED Viewed

@@ -1,35 +1,44 @@
+import logging
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from functools import partial
-import logging
 from pathlib import Path
-from typing import Iterator, Tuple, Type, List
+from typing import Dict, Iterator, List, NamedTuple, Tuple, Type
 import ir_datasets
-from ir_datasets.indices import PickleLz4FullStore
+import ir_datasets.datasets as _irds
+from datamaestro.record import RecordType, record_type
+from experimaestro import Config, Meta, Option, Param
+from experimaestro.compat import cached_property
 from ir_datasets.formats import (
     GenericDoc,
-    GenericQuery,
     GenericDocPair,
+    GenericQuery,
     TrecParsedDoc,
     TrecQuery,
 )
-import ir_datasets.datasets as _irds
-from experimaestro import Config, Param
-from experimaestro.compat import cached_property
-from experimaestro import Option
-from datamaestro.record import RecordType, record_type
+from ir_datasets.indices import PickleLz4FullStore
 import datamaestro_text.data.ir as ir
+import datamaestro_text.data.ir.formats as formats
+from datamaestro_text.data.conversation.base import (
+    AnswerDocumentID,
+    AnswerEntry,
+    ConversationHistoryItem,
+    ConversationTreeNode,
+    DecontextualizedDictItem,
+    EntryType,
+)
 from datamaestro_text.data.ir.base import (
-    Record,
-    TopicRecord,
-    DocumentRecord,
-    SimpleTextItem,
     AdhocAssessedTopic,
-    SimpleAdhocAssessment,
+    DocumentRecord,
     IDItem,
+    Record,
+    SimpleAdhocAssessment,
+    SimpleTextItem,
+    TopicRecord,
     create_record,
 )
-import datamaestro_text.data.ir.formats as formats
 # Interface between ir_datasets and datamaestro:
 # provides adapted data types
@@ -108,6 +117,12 @@ class Documents(ir.DocumentStore, IRDSId):
         _irds.beir.BeirTitleUrlDoc: tuple_constructor(
             formats.TitleUrlDocument, "doc_id", "text", "title", "url"
         ),
+        _irds.beir.BeirToucheDoc: tuple_constructor(
+            formats.Touche2020, "doc_id", "text", "title", "stance", "url"
+        ),
+        _irds.beir.BeirSciDoc: tuple_constructor(
+            formats.SciDocs, "doc_id", "text", "title", "authors", "year", "cited_by", "references"
+        ),
         _irds.msmarco_document.MsMarcoDocument: tuple_constructor(
             formats.MsMarcoDocument, "doc_id", "url", "title", "body"
         ),
@@ -215,20 +230,6 @@ if hasattr(_irds, "miracl"):
     )
-# Fix while PR https://github.com/allenai/ir_datasets/pull/252
-# is not in.
-class DMPickleLz4FullStore(PickleLz4FullStore):
-    def get_many(self, doc_ids, field=None):
-        result = {}
-        field_idx = self._doc_cls._fields.index(field) if field is not None else None
-        for doc in self.get_many_iter(doc_ids):
-            if field is not None:
-                result[getattr(doc, self._id_field)] = doc[field_idx]
-            else:
-                result[getattr(doc, self._id_field)] = doc
-        return result
 class LZ4DocumentStore(ir.DocumentStore):
     """A LZ4-based document store"""
@@ -242,7 +243,7 @@ class LZ4DocumentStore(ir.DocumentStore):
     @cached_property
     def store(self):
-        return DMPickleLz4FullStore(
+        return PickleLz4FullStore(
             self.path, None, self.data_cls, self.lookup_field, self.index_fields
         )
@@ -254,33 +255,48 @@ class LZ4DocumentStore(ir.DocumentStore):
         return getattr(self._docs[ix], self.store._id_field)
     def document_ext(self, docid: str) -> DocumentRecord:
-        return self.converter(self.document_recordtype, self.store.get(docid))
+        return self.converter(self.store.get(docid))
     def documents_ext(self, docids: List[str]) -> DocumentRecord:
         """Returns documents given their external IDs (optimized for batch)"""
         retrieved = self.store.get_many(docids)
-        return [
-            self.converter(self.document_recordtype, retrieved[docid])
-            for docid in docids
-        ]
+        return [self.converter(retrieved[docid]) for docid in docids]
+    @abstractmethod
     def converter(self, data):
-        """Converts a document from LZ4 tuples to any other format"""
-        # By default, use identity
-        return data
+        """Converts a document from LZ4 tuples to a document record"""
+        ...
     def iter(self) -> Iterator[DocumentRecord]:
         """Returns an iterator over documents"""
-        return map(
-            partial(self.converter, self.document_recordtype), self.store.__iter__()
-        )
+        return map(self.converter, self.store.__iter__())
+    @cached_property
     def documentcount(self):
         if self.count:
             return self.count
         return self.store.count()
+class SimpleJsonDocument(NamedTuple):
+    """A document made of an ID and a text"""
+    id: str
+    text: str
+class LZ4JSONLDocumentStore(LZ4DocumentStore):
+    jsonl_path: Meta[Path]
+    """json-l based document store
+    Each line is of the form
+    ```json
+    { "id": "...", "text": "..." }
+    ```
+    """
+    def converter(self, data):
+        return DocumentRecord(IDItem(data["id"]), SimpleTextItem(data["text"]))
 class TopicsHandler(ABC):
     @abstractmethod
     def topic_int(self, internal_topic_id: int) -> TopicRecord:
@@ -352,6 +368,12 @@ class Topics(ir.TopicsStore, IRDSId):
         TrecQuery: tuple_constructor(
             formats.TrecTopic, "query_id", "title", "description", "narrative"
         ),
+        _irds.beir.BeirToucheQuery: tuple_constructor(
+            formats.TrecTopic, "query_id", "text", "description", "narrative"
+        ),
+        _irds.beir.BeirSciQuery: tuple_constructor(
+            formats.SciDocsTopic, "query_id", "text", "authors", "year", "cited_by", "references"
+        ),
         _irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
             formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
         ),
@@ -395,99 +417,190 @@ class Topics(ir.TopicsStore, IRDSId):
         return self.handler.iter()
-if hasattr(_irds.trec_cast, "Cast2022Query"):
-    from datamaestro_text.data.conversation.base import (
-        ConversationTreeNode,
-        DecontextualizedDictItem,
-        RetrievedEntry,
-        ConversationHistoryItem,
-    )
+class CastTopicsHandler(TopicsHandler):
+    def __init__(self, dataset):
+        self.dataset = dataset
+    @cached_property
+    def ext2records(self):
+        return {record[IDItem].id: record for record in self.records}
+    def topic_int(self, internal_topic_id: int) -> TopicRecord:
+        """Returns a document given its internal ID"""
+        return self.records[internal_topic_id]
-    class CastTopicsHandler(TopicsHandler):
-        def __init__(self, dataset):
-            self.dataset = dataset
-        @property
-        @abstractmethod
-        def records(self):
-            ...
-        @cached_property
-        def ext2records(self):
-            return {record[IDItem].id: record for record in self.records}
-        def topic_int(self, internal_topic_id: int) -> TopicRecord:
-            """Returns a document given its internal ID"""
-            return self.records[internal_topic_id]
-        def topic_ext(self, external_topic_id: str) -> TopicRecord:
-            """Returns a document given its external ID"""
-            return self.ext2records[external_topic_id]
-        def iter(self) -> Iterator[ir.TopicRecord]:
-            """Returns an iterator over topics"""
-            return iter(self.records)
-    class Cast2020TopicsHandler(CastTopicsHandler):
-        @cached_property
-        def records(self):
-            try:
-                topic_number = None
-                node = None
-                conversation = []
-                records = []
-                for (
-                    query
-                ) in (
-                    self.dataset.dataset.queries_iter()
-                ):  # type: _irds.trec_cast.Cast2020Query
-                    decontextualized = DecontextualizedDictItem(
-                        "manual",
-                        {
-                            "manual": query.manual_rewritten_utterance,
-                            "auto": query.automatic_rewritten_utterance,
-                        },
+    def topic_ext(self, external_topic_id: str) -> TopicRecord:
+        """Returns a document given its external ID"""
+        return self.ext2records[external_topic_id]
+    def iter(self) -> Iterator[ir.TopicRecord]:
+        """Returns an iterator over topics"""
+        return iter(self.records)
+    @cached_property
+    def records(self):
+        try:
+            topic_number = None
+            node = None
+            conversation = []
+            records = []
+            for query in self.dataset.dataset.queries_iter():
+                decontextualized = DecontextualizedDictItem(
+                    "manual",
+                    {
+                        "manual": query.manual_rewritten_utterance,
+                        "auto": query.automatic_rewritten_utterance,
+                    },
+                )
+                is_new_conversation = topic_number != query.topic_number
+                topic = Record(
+                    IDItem(query.query_id),
+                    SimpleTextItem(query.raw_utterance),
+                    decontextualized,
+                    ConversationHistoryItem(
+                        [] if is_new_conversation else node.conversation(False)
+                    ),
+                    EntryType.USER_QUERY,
+                )
+                if is_new_conversation:
+                    conversation = []
+                    node = ConversationTreeNode(topic)
+                    topic_number = query.topic_number
+                else:
+                    node = node.add(ConversationTreeNode(topic))
+                records.append(topic)
+                conversation.append(node)
+                node = node.add(
+                    ConversationTreeNode(
+                        Record(
+                            AnswerDocumentID(self.get_canonical_result_id(query)),
+                            EntryType.SYSTEM_ANSWER,
+                        )
                     )
+                )
+                conversation.append(node)
+        except Exception:
+            logging.exception("Error while computing topic records")
+            raise
+        return records
+    @staticmethod
+    def get_canonical_result_id():
+        return None
+class Cast2020TopicsHandler(CastTopicsHandler):
+    """Topics handler registered for Cast2020Query queries"""
+    @staticmethod
+    def get_canonical_result_id(query: _irds.trec_cast.Cast2020Query):
+        # The 2020 queries expose the ID as `manual_canonical_result_id`
+        return query.manual_canonical_result_id
+class Cast2021TopicsHandler(CastTopicsHandler):
+    """Topics handler registered for Cast2021Query queries"""
+    @staticmethod
+    def get_canonical_result_id(query: _irds.trec_cast.Cast2021Query):
+        # The 2021 queries expose the ID as `canonical_result_id`
+        return query.canonical_result_id
+class Cast2022TopicsHandler(CastTopicsHandler):
+    def __init__(self, dataset):
+        self.dataset = dataset
+    @cached_property
+    def records(self):
+        try:
+            records = []
+            nodes: Dict[str, ConversationTreeNode] = {}
+            for (
+                query
+            ) in (
+                self.dataset.dataset.queries_iter()
+            ):  # type: _irds.trec_cast.Cast2022Query
+                parent = nodes[query.parent_id] if query.parent_id else None
+                if query.participant == "User":
                     topic = Record(
                         IDItem(query.query_id),
                         SimpleTextItem(query.raw_utterance),
-                        decontextualized,
+                        DecontextualizedDictItem(
+                            "manual",
+                            {
+                                "manual": query.manual_rewritten_utterance,
+                            },
+                        ),
                         ConversationHistoryItem(
-                            node.conversation(False) if node else []
+                            parent.conversation(False) if parent else []
                         ),
+                        EntryType.USER_QUERY,
                     )
-                    if topic_number == query.topic_number:
-                        node = node.add(ConversationTreeNode(topic))
-                    else:
-                        conversation = []
-                        node = ConversationTreeNode(topic)
-                        topic_number = query.topic_number
+                    node = ConversationTreeNode(topic)
                     records.append(topic)
-                    conversation.append(node)
-                    node = node.add(
-                        ConversationTreeNode(
-                            Record(RetrievedEntry(query.manual_canonical_result_id))
+                else:
+                    node = ConversationTreeNode(
+                        Record(
+                            AnswerEntry(query.response),
+                            EntryType.SYSTEM_ANSWER,
                         )
                     )
-                    conversation.append(node)
-            except Exception:
-                logging.exception("Error while computing topic records")
-                raise
-            return records
-    Topics.HANDLERS.update(
-        {
-            # _irds.trec_cast.Cast2019Query: Cast2019TopicsHandler,
-            _irds.trec_cast.Cast2020Query: Cast2020TopicsHandler,
-            # _irds.trec_cast.Cast2021Query: Cast2021TopicsHandler,
-            # _irds.trec_cast.Cast2022Query: Cast2022TopicsHandler
-        }
-    )
+                nodes[query.query_id] = node
+                if parent:
+                    parent.add(node)
+        except Exception:
+            logging.exception("Error while computing topic records")
+            raise
+        return records
+Topics.HANDLERS.update(
+    {
+        # _irds.trec_cast.Cast2019Query: Cast2019TopicsHandler,
+        _irds.trec_cast.Cast2020Query: Cast2020TopicsHandler,
+        _irds.trec_cast.Cast2021Query: Cast2021TopicsHandler,
+        _irds.trec_cast.Cast2022Query: Cast2022TopicsHandler,
+    }
+)
+class CastDocHandler:
+    """Converts CastDoc documents into records (ID + text)"""
+    def check(self, cls):
+        # Only CastDoc (sub)classes are handled
+        assert issubclass(cls, _irds.trec_cast.CastDoc)
+    @cached_property
+    def target_cls(self):
+        # NOTE(review): __call__ below builds a SimpleTextItem, not a
+        # TitleUrlDocument -- confirm that this target class is intended
+        return formats.TitleUrlDocument
+    def __call__(self, _, doc: _irds.trec_cast.CastDoc):
+        # The record text is made of the passages joined with a single space
+        return Record(
+            IDItem(doc.doc_id), formats.SimpleTextItem(" ".join(doc.passages))
+        )
+class CastPassageDocHandler:
+    """Converts CastPassageDoc documents into records (ID + title/URL document)"""
+    def check(self, cls):
+        # Only CastPassageDoc (sub)classes are handled
+        assert issubclass(cls, _irds.trec_cast.CastPassageDoc)
+    @cached_property
+    def target_cls(self):
+        return formats.TitleUrlDocument
+    def __call__(self, _, doc: _irds.trec_cast.CastPassageDoc):
+        # The first argument is ignored; text, title and url come from the document
+        return Record(
+            IDItem(doc.doc_id),
+            formats.TitleUrlDocument(doc.text, doc.title, doc.url),
+        )
+Documents.CONVERTERS[_irds.trec_cast.CastDoc] = CastDocHandler()
+Documents.CONVERTERS[_irds.trec_cast.CastPassageDoc] = CastPassageDocHandler()
 class Adhoc(ir.Adhoc, IRDSId):

datamaestro_text/datasets/irds/helpers.py CHANGED Viewed

@@ -1,13 +1,13 @@
 import logging
 from typing import Optional, Type, Callable, Iterator
 from ir_datasets.indices import PickleLz4FullStore
-from datamaestro.download import Download
+from datamaestro.download import Resource
 from datamaestro.utils import FileChecker
 from pathlib import Path
 import urllib3
-class lz4docstore_downloader(Download):
+class lz4docstore_downloader(Resource):
     """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents"""
     def __init__(
@@ -69,3 +69,59 @@ class lz4docstore_downloader(Download):
             # All good!
             (destination / "done").touch()
+# Resource that builds a PickleLz4FullStore document store from a stream of
+# documents supplied by a caller-provided iterator factory. A "done" marker
+# file inside the store directory records a completed build.
+class lz4docstore_builder(Resource):
+    def __init__(
+        self,
+        name: str,
+        iter_factory: Callable[[], Iterator],
+        doc_cls: Type,
+        lookup_field: str,
+        *,
+        count_hint: Optional[int] = None,
+    ):
+        """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents
+        :param name: The name of the variable for path construction
+        :param iter_factory: Iterator over documents
+        :param doc_cls: The class of documents (must be a dataclass because of how ir-datasets works)
+        :param lookup_field: Which field to use for lookup
+        :param count_hint: Number of documents (hint), defaults to None
+        """
+        super().__init__(name)
+        # iter_factory is stored uncalled: it is a zero-argument callable that
+        # download() invokes to obtain the document iterator.
+        self.iter_factory = iter_factory
+        self.doc_cls = doc_cls
+        self.lookup_field = lookup_field
+        self.count_hint = count_hint
+    # Returns the store directory path; nothing is created or built here.
+    # presumably self.varname is set by Resource from `name` -- TODO confirm
+    def prepare(self):
+        return self.definition.datapath / self.varname
+    # Builds the store unless a previous build left its "done" marker.
+    # force=True rebuilds even when the marker is present.
+    # NOTE(review): returns True on the early exit but falls off the end
+    # (None) after a build -- confirm whether callers use the return value.
+    def download(self, force=False):
+        # Creates directory if needed
+        # NOTE(review): no parents=True, so this assumes
+        # self.definition.datapath already exists -- confirm.
+        destination = self.definition.datapath / self.varname
+        destination.mkdir(exist_ok=True)
+        # Early exit: the marker file means a previous build completed
+        if (destination / "done").is_file() and not force:
+            return True
+        # Nothing is downloaded here: the store is built locally from the
+        # documents yielded by iter_factory
+        logging.info("Building the document index")
+        # Builds the LZ4 store, indexed on the lookup field only
+        store = PickleLz4FullStore(
+            destination,
+            lambda: self.iter_factory(),
+            self.doc_cls,
+            lookup_field=self.lookup_field,
+            index_fields=[self.lookup_field],
+            key_field_prefix=None,
+            size_hint=None,
+            count_hint=self.count_hint,
+        )
+        store.build()
+        # All good! The marker is written only after build() has returned
+        (destination / "done").touch()

datamaestro_text/version.py CHANGED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '2024.3.10'
-__version_tuple__ = version_tuple = (2024, 3, 10)
+__version__ = version = '2025.1.7'
+__version_tuple__ = version_tuple = (2025, 1, 7)

{datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamaestro-text
-Version: 2024.3.10
+Version: 2025.1.7
 Summary: Datamaestro module for text-related datasets
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
 License: GPL-3
@@ -18,8 +18,8 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: datamaestro >=1.1.0
-Requires-Dist: ir-datasets
+Requires-Dist: datamaestro>=1.2.1
+Requires-Dist: ir_datasets>=0.5.8
 Requires-Dist: attrs
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)

{datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,13 @@
 datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
-datamaestro_text/version.py,sha256=Bq97zWpOGRS-xWJRpvk6kRdLhLjS83bAhj3DIaONmi8,419
+datamaestro_text/version.py,sha256=k65KHkS4PD3NjDDCJgWcTwRBE4yVcKs32B8SUxv2DvE,417
 datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
 datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
 datamaestro_text/config/com/sentiment140.py,sha256=iRV_rSjQcr9WNjjQ4EdznaEMGLADV_rwpeRx7ycQi0k,1383
 datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
 datamaestro_text/config/com/github/aagohary/canard.py,sha256=tIwb_KxMUR3st7rzQUkt6rIjolTl8uKvDq6t795b1nY,1468
-datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=BytI8euqX04RlTCM8LvYKNKm9SVUTClSnszE3QUhGR8,3196
+datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=_lufJlg-4zaQyjr5Ae-X-9hXzpl2JFjfxh_RhnBbva4,3068
+datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=Tg3hbnvilxW_Lwt7fpKvGMtu-6mc9oNIHM-LX6JTR1U,3026
 datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
 datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
 datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=ouNn2nivS6mUMaCyMzqxNv1YMoPrSEX-UcSZpG1v_uw,11645
@@ -45,23 +46,24 @@ datamaestro_text/data/recommendation.py,sha256=MatelpJQiCMpNDuaQVBkRY809J1qiQo-8
 datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
 datamaestro_text/data/text.py,sha256=pOI8nrEd6RoQA28DVH1JufHTunr9vG3FQzwElR8YirI,499
 datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
-datamaestro_text/data/conversation/base.py,sha256=2invpOXa2DnrfWO0kdpohSw1Feb__obySSUtu7W4CYc,4883
+datamaestro_text/data/conversation/base.py,sha256=zXX5sPMoVtDf4WTkLx45IvZz6lgKPgcUZFu6N4lVBlc,6457
 datamaestro_text/data/conversation/canard.py,sha256=IMxu5NfytWZLZ_cyT8UFOICbDE82HLJoNBT_6j36Faw,1808
-datamaestro_text/data/conversation/orconvqa.py,sha256=TsaeJkxgNobyCNaRH8ZdAVNIAAfiMIxuRq_XDRzyC-I,3457
-datamaestro_text/data/ir/__init__.py,sha256=FwK6U6Yw3UjZjqZoaE1Dfe7UQktO5CFeyHCLfmxC3fE,8670
+datamaestro_text/data/conversation/orconvqa.py,sha256=VG3GV5_IPQcvla9rrQPypDNcZYmKNMgozmr2oudeLA4,3802
+datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
+datamaestro_text/data/ir/__init__.py,sha256=EOVnRv9oKEEucMOgrFjhXxeUWEkaJkbXBndpkKhZaiY,8701
 datamaestro_text/data/ir/base.py,sha256=Cw8egjChpx4ksUwp-vTA70B2OWxROH7FIeJylsXggAk,1429
 datamaestro_text/data/ir/cord19.py,sha256=7xDIzsy63WrA9lxxyNOMu5ECRymu5x23EzYG977nS6Y,1440
 datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
 datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
-datamaestro_text/data/ir/formats.py,sha256=wgjXIkNJjqRbHEMkkXyXRRMnxnho45jfUbPsJCazkZk,2866
+datamaestro_text/data/ir/formats.py,sha256=rU9uJhdFDdlnQ2qfFowK_--1WVOH1fVgfXWOPPDF_FY,3160
 datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
-datamaestro_text/data/ir/stores.py,sha256=JdeDhPxAQOM5_1Pqi_HGoPNUbe63_zMaz-NRs24RS94,687
+datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
 datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
 datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
 datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
-datamaestro_text/datasets/irds/data.py,sha256=VjvqEvMY3VfuX4Kx7YdoVOoS_fIrMR_3RIIf_PdErsc,16785
+datamaestro_text/datasets/irds/data.py,sha256=BTsFZdjFJ0XdlYRh2rcshAoeknHrypf7be2BGUbWWFk,19931
 datamaestro_text/datasets/irds/datasets.py,sha256=qtN-nX2_OV9FD339aZjwVL1jFf6I4T6MbNCcuRAjmtU,5682
-datamaestro_text/datasets/irds/helpers.py,sha256=KC-2nQPCIl4VnbfDkAkr4iFlhkknn8zvbADlClWZvwU,2207
+datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
 datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
 datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
 datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
@@ -76,9 +78,9 @@ datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1v
 datamaestro_text/utils/iter.py,sha256=2_UZ8y9Ma4k5U9ZD4w55Zfb6NGrKM1L4G40OygRm1is,2459
 datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
 datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
-datamaestro_text-2024.3.10.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-datamaestro_text-2024.3.10.dist-info/METADATA,sha256=MCmmREa3bheRgoqsHnhxZ3QvvuiGOWwqgrRGVQw67pw,1604
-datamaestro_text-2024.3.10.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-datamaestro_text-2024.3.10.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
-datamaestro_text-2024.3.10.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
-datamaestro_text-2024.3.10.dist-info/RECORD,,
+datamaestro_text-2025.1.7.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+datamaestro_text-2025.1.7.dist-info/METADATA,sha256=-wsBcUcnEnOMiHb1ROIf43r55ZNNNjpIemuFXm5hHUY,1609
+datamaestro_text-2025.1.7.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
+datamaestro_text-2025.1.7.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
+datamaestro_text-2025.1.7.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
+datamaestro_text-2025.1.7.dist-info/RECORD,,

{datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.42.0)
+Generator: setuptools (75.7.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/LICENSE RENAMED Viewed

File without changes

{datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{datamaestro_text-2024.3.10.dist-info → datamaestro_text-2025.1.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

datamaestro-text 2024.3.10__py3-none-any.whl → 2025.1.7__py3-none-any.whl

datamaestro-text 2024.3.10__py3-none-any.whl → 2025.1.7__py3-none-any.whl