PyPI - datamaestro-text - Versions diffs - 2025.6.30__py3-none-any.whl → 2025.7.28__py3-none-any.whl - Mend

datamaestro-text 2025.6.30__py3-none-any.whl → 2025.7.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

datamaestro_text/config/com/github/ikat.py CHANGED Viewed

@@ -1,38 +1,121 @@
 # See documentation on https://datamaestro.readthedocs.io
+import bz2
+from datamaestro.download import reference
 from datamaestro.definitions import datatasks, datatags, dataset
-from datamaestro.data.ml import Supervised
-from datamaestro.data import Base
+from datamaestro_text.data.conversation.base import ConversationUserTopics
+from datamaestro_text.data.ir import Adhoc
 from datamaestro.utils import HashCheck
+from datamaestro.context import DatafolderPath
 from datamaestro.download.single import filedownloader
-from datamaestro_text.data.conversation.ikat import IkatDatasetEntry, IkatDataset
-from datamaestro_text.datasets.irds.data import (
-    SimpleJsonDocument,
-    LZ4JSONLDocumentStore,
-)
-import logging
+from datamaestro_text.data.conversation.ikat import IkatConversations
+from datamaestro.download.links import linkfolder
+from datamaestro_text.data.ir.stores import IKatClueWeb22DocumentStore
+from datamaestro_text.data.ir.trec import TrecAdhocAssessments
+from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
+@dataset(as_prepare=True)
+def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
+    # Prepares the iKAT ClueWeb22 passage collection: the JSONL folder is
+    # resolved first, then a document store is built from it and returned.
+    # Number of documents in the dataset
+    count = 116_838_987
+    # Folder holding the passage files, taken from the "jsonl" sub-path of the
+    # "gov.nist.trec.ikat.clueweb22" data folder (presumably configured by the
+    # user since nothing is downloaded here — TODO confirm)
+    jsonl_folder = linkfolder(
+        "documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
+    ).setup(dataset, options)
+    # Build the "store" resource from the generated documents, keyed on "id";
+    # the two extra files are the checksum lists used to verify the input
+    store_path = lz4docstore_builder(
+        "store",
+        IKatClueWeb22DocumentStore.generator(
+            jsonl_folder,
+            jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
+            jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
+        ),
+        IKatClueWeb22DocumentStore.Document,
+        "id",
+        count_hint=count,
+    ).setup(dataset, options)
+    # The document count is passed explicitly alongside the store path
+    return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
 @datatags("conversation", "context", "query")
-@datatasks("query rewriting")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
 @filedownloader(
-    "test.json",
+    "topics.json",
     "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
     checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
 )
 @dataset(
-    Base,
+    id="2025",
     url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
 )
-def main(test) -> Supervised[IkatDataset, None, IkatDataset]:
+def test_2025(topics, documents) -> Adhoc.C:
     """Question-in-context rewriting
-    iKAT is a test dataset for question-in-context rewriting that consists of
+    iKAT is a test dataset for question-in-context rewriting that consists of
     questions each given in a dialog context together with a context-independent
-    rewriting of the question.
-    One of the special features of iKAT is that it includes a Personal PKTB',
+    rewriting of the question.
     """
-    logging.info("Creating iKAT dataset from %s", test)
-    return IkatDataset.C(path=test)
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        # TODO: add when available
+        assessments=TrecAdhocAssessments.C(path="/to/do"),
+        documents=documents,
+    )
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "qrels",
+    "https://trec.nist.gov/data/ikat/2024-qrels.txt",
+    checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
+)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
+    checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
+)
+@dataset(
+    Adhoc,
+    id="2024",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
+)
+def test_2024(topics, qrels, documents) -> Adhoc.C:
+    """iKAT 2024 dataset"""
+    # `topics` and `qrels` match the two @filedownloader resources and
+    # `documents` the @reference above — presumably injected by datamaestro
+    # (TODO confirm). The topics are derived from the conversations file, the
+    # assessments are read from the qrels file.
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        assessments=TrecAdhocAssessments.C(path=qrels),
+        documents=documents,
+    )
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "qrels",
+    "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
+    checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
+)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
+    checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
+)
+@dataset(
+    Adhoc,
+    id="2023",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
+)
+def test_2023(topics, qrels, documents) -> Adhoc.C:
+    """iKAT 2023 dataset"""
+    # `topics` and `qrels` match the two @filedownloader resources and
+    # `documents` the @reference above — presumably injected by datamaestro
+    # (TODO confirm). The topics are derived from the conversations file, the
+    # assessments are read from the "all turns" qrels file.
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        assessments=TrecAdhocAssessments.C(path=qrels),
+        documents=documents,
+    )

datamaestro_text/config/com/sentiment140.py CHANGED Viewed

@@ -26,7 +26,7 @@ def english(dir):
     If you use this data, please cite Sentiment140 as your source.
     """
-    return {
-        "train": Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
-        "test": Generic(path=dir / "testdata.manual.2009.06.14.csv"),
-    }
+    return Supervised.C(
+        train=Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
+        test=Generic(path=dir / "testdata.manual.2009.06.14.csv"),
+    )

datamaestro_text/config/gov/nist/trec/tipster.py CHANGED Viewed

@@ -17,7 +17,6 @@ of three CD-ROMs of SGML encoded documents distributed by LDC plus queries and a
 See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data/intro_eng.html
 """
-from datamaestro.data import Base
 from datamaestro_text.data.ir.trec import TipsterCollection
 from datamaestro.download.links import linkfolder
 from datamaestro.definitions import (

datamaestro_text/data/conversation/base.py CHANGED Viewed

@@ -1,10 +1,13 @@
 from abc import ABC, abstractmethod
 from enum import Enum
+from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
+from experimaestro import Param
 from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
 from attr import define
+from datamaestro.record import record_type
 from datamaestro.data import Base
 from datamaestro.record import Record, Item
-from datamaestro_text.data.ir import TopicRecord
+from datamaestro_text.data.ir import TopicRecord, Topics
 from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView
 # ---- Basic types
@@ -120,20 +123,17 @@ class ConversationNode:
         ...
     @abstractmethod
-    def parent(self) -> Optional["ConversationNode"]:
-        ...
+    def parent(self) -> Optional["ConversationNode"]: ...
     @abstractmethod
-    def children(self) -> List["ConversationNode"]:
-        ...
+    def children(self) -> List["ConversationNode"]: ...
 class ConversationTree(ABC):
     """Represents a conversation tree"""
     @abstractmethod
-    def root(self) -> ConversationNode:
-        ...
+    def root(self) -> ConversationNode: ...
     @abstractmethod
     def __iter__(self) -> Iterator[ConversationNode]:
@@ -253,5 +253,30 @@ class ConversationDataset(Base, ABC):
     @abstractmethod
     def __iter__(self) -> Iterator[ConversationTree]:
         """Return an iterator over conversations"""
-        for i in range(len(self)):
-            yield self.get(i)
+        ...
+class ConversationUserTopics(Topics):
+    """Topics made of the user queries found in a conversation dataset"""
+    conversations: Param[ConversationDataset]
+    topic_recordtype = record_type(IDItem, SimpleTextItem)
+    def iter(self) -> Iterator[TopicRecord]:
+        """Iterate over one topic record per user query
+        Each user query can be used for retrieval, so each one is a topic. Its
+        entry is extended with a ``ConversationHistoryItem`` built from the
+        node history. All records are built before the iterator is returned.
+        """
+        # TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
+        topics: List[TopicRecord] = []
+        for tree in self.conversations.__iter__():
+            # Select the user-query nodes of the whole conversation first...
+            user_nodes = [
+                candidate
+                for candidate in tree
+                if candidate.entry[EntryType] == EntryType.USER_QUERY
+            ]
+            # ...then attach the conversation history to each of them
+            topics.extend(
+                user_node.entry.update(ConversationHistoryItem(user_node.history()))
+                for user_node in user_nodes
+            )
+        return iter(topics)

datamaestro_text/data/conversation/ikat.py CHANGED Viewed

@@ -1,10 +1,11 @@
-from typing import Iterator, List, Optional
+from typing import Iterator, List
 from attr import define, field
 import json
 import logging
 from datamaestro.data import File
 from datamaestro.record import Record
+from datamaestro_text.data.ir import Topics
 from datamaestro_text.data.ir.base import (
     IDItem,
     SimpleTextItem,
@@ -12,7 +13,6 @@ from datamaestro_text.data.ir.base import (
 from .base import (
-    AnswerDocumentURL,
     AnswerEntry,
     ConversationTree,
     EntryType,
@@ -21,6 +21,25 @@ from .base import (
 )
 from . import ConversationDataset
+# Keys to change in the dataset entries for compatibility across different years
+KEY_MAPPINGS = {
+    # Keys to replace: Target Key
+    "turns": "responses",
+    "utterance": "user_utterance",
+    "ptkb_provenance": "relevant_ptkbs",
+    "response_provenance": "citations",
+}
+def norm_dict(entry: dict) -> dict:
+    """Return a copy of ``entry`` whose keys follow the expected naming.
+    A key listed in ``KEY_MAPPINGS`` (as is, or once lowercased) is replaced
+    by its target name; any other key is simply lowercased. Values are kept
+    untouched.
+    """
+    def _rename(key):
+        lowered = key.lower()
+        return KEY_MAPPINGS.get(key) or KEY_MAPPINGS.get(lowered) or lowered
+    return {_rename(key): value for key, value in entry.items()}
 @define(kw_only=True)
@@ -47,7 +66,7 @@ class IkatConversationEntry:
 @define(kw_only=True)
-class IkatDatasetEntry:
+class IkatConversationTopic:
     """A query with past history"""
     number: str
@@ -60,14 +79,20 @@ class IkatDatasetEntry:
     """The personal knowledge base associated with the user"""
     responses: List[IkatConversationEntry] = field(
-        converter=lambda items: [IkatConversationEntry(**item) if isinstance(item, dict) else item for item in items]
+        converter=lambda items: [
+            IkatConversationEntry(**item) if isinstance(item, dict) else item
+            for item in map(norm_dict, items)
+        ]
     )
     """The list of responses to the query"""
-class IkatDataset(ConversationDataset, File):
+class IkatConversations(ConversationDataset, File):
+    """A dataset containing conversations from the IKAT project"""
-    def entries(self) -> Iterator[IkatDatasetEntry]:
+    """Keys to change in the dataset entries for compatibility across different years"""
+    def entries(self) -> Iterator[IkatConversationTopic]:
         """Reads all conversation entries from the dataset file."""
         with self.path.open("rt") as fp:
             raw_data = json.load(fp)
@@ -75,12 +100,13 @@ class IkatDataset(ConversationDataset, File):
         logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
         logging.debug(f"raw data has keys {raw_data[0].keys()}")
-        processed_data = []
         for entry in raw_data:
-            processed_data.append(IkatDatasetEntry(**{key.lower(): value for key, value in entry.items()}))
-        logging.debug(f"First parsed data sample: {processed_data[0]}")
-        return iter(processed_data)
+            try:
+                normalized_entry = norm_dict(entry)
+                yield IkatConversationTopic(**normalized_entry)
+            except Exception as e:
+                logging.warning(f"Failed to parse entry: {e}")
+                raise e
     def __iter__(self) -> Iterator[ConversationTree]:
         for entry in self.entries():
@@ -88,7 +114,7 @@ class IkatDataset(ConversationDataset, File):
             for turn in entry.responses:
                 turn: IkatConversationEntry = turn  # Ensure type is correct
-                query_id = f"{entry.number}#{turn.turn_id}"
+                query_id = f"{entry.number}_{turn.turn_id}"
                 # USER QUERY record
                 history.append(
@@ -117,4 +143,3 @@ class IkatDataset(ConversationDataset, File):
             # Ensure reverse if needed for compatibility (optional)
             history.reverse()
             yield SingleConversationTree(entry.number, history)

datamaestro_text/data/ir/__init__.py CHANGED Viewed

@@ -2,9 +2,10 @@
 from abc import ABC, abstractmethod
 from functools import cached_property
+import logging
 from pathlib import Path
 from attrs import define
-from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type
+from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING
 import random
 from experimaestro import Config
 from datamaestro.definitions import datatasks, Param, Meta
@@ -28,6 +29,9 @@ from .base import (  # noqa: F401
     AdhocAssessedTopic,
 )
+#: An adhoc run dictionary (query ID -> doc ID -> score)
+AdhocRunDict = dict[str, dict[str, float]]
 class Documents(Base):
     """A set of documents with identifiers
@@ -45,6 +49,22 @@ class Documents(Base):
     def iter_documents(self) -> Iterator[DocumentRecord]:
         return self.iter()
+    def iter_documents_from(self, start=0) -> Iterator[DocumentRecord]:
+        """Iterate over the documents, starting at a given position
+        Can be specialized in a subclass for faster access
+        :param start: The (0-based) index of the first document to return,
+            defaults to 0
+        :return: An iterator over the documents from position `start` onwards
+            (empty when the collection has `start` documents or fewer)
+        """
+        from itertools import islice
+        documents = self.iter()
+        if start > 0:
+            # Skip exactly `start` documents so that the first one returned
+            # has index `start`, like the subclass overrides that slice the
+            # underlying iterator with `[start:]`
+            logging.info("skipping %d documents", start)
+            return islice(documents, start, None)
+        return documents
     def iter_ids(self) -> Iterator[str]:
         """Iterates over document ids
@@ -168,7 +188,10 @@ class AdhocAssessments(Base, ABC):
 class AdhocRun(Base):
     """IR adhoc run"""
-    pass
+    # NOTE(review): @abstractmethod is only enforced when the class uses an
+    # ABCMeta-based metaclass — confirm that the base class provides one
+    @abstractmethod
+    def get_dict(self) -> "AdhocRunDict":
+        """Get the run as a dictionary query ID -> doc ID -> score
+        :return: A mapping from query ID to a mapping from document ID to
+            score (see ``AdhocRunDict``)
+        """
+        ...
 class AdhocResults(Base):

datamaestro_text/data/ir/base.py CHANGED Viewed

@@ -43,6 +43,7 @@ class IDItem(Item, ABC):
     id: str
 @define
 class UrlItem(Item):
     """An url item"""
@@ -70,7 +71,7 @@ class AdhocAssessedTopic:
     """List of assessments for this topic"""
-def create_record(*items: Item, id: str = None, text: str = None):
+def create_record(*items: Item, id: str = None, text: str = None) -> Record:
     """Easy creation of a text/id item"""
     extra_items = []
     if id is not None:

datamaestro_text/data/ir/formats.py CHANGED Viewed

@@ -99,7 +99,7 @@ class WapoDocument(TextItem):
     body_media: Tuple[WapoDocMedia, ...]
     @cached_property
-    def text(self):
+    def text(self):
         return f"{self.title} {self.body_paras_html}"
@@ -132,11 +132,15 @@ class DprW100Doc(TextItem):
     text: str
     title: str
-@define
+@define
 class MsMarcoV2Passage(TextItem):
     text: str
     spans: Tuple[Tuple[int, int], ...]
     msmarco_document_id: str
+@define
 class Touche2020(TextItem):
     text: str
     title: str
@@ -204,11 +208,13 @@ class TrecTopic(SimpleTextItem):
 TrecTopicRecord = record_type(IDItem, TrecTopic)
 @define
 class DprW100Query(TextItem):
     text: str
     answers: Tuple[str]
 @define
 class TrecBackgroundLinkingQuery(IDItem):
     query_id: str

datamaestro_text/data/ir/stores.py CHANGED Viewed

@@ -1,12 +1,21 @@
-from collections import namedtuple
+import bz2
+from hashlib import md5, sha256
+import json
+import logging
+from pathlib import Path
 from typing import List, NamedTuple
+from datamaestro_text.utils.files import TQDMFileReader
 from experimaestro import Constant
-import attrs
 from datamaestro.record import Record
-from datamaestro_text.data.ir.base import IDItem
+from datamaestro_text.data.ir.base import (
+    DocumentRecord,
+    IDItem,
+    SimpleTextItem,
+    UrlItem,
+)
 from datamaestro_text.datasets.irds.data import LZ4DocumentStore
 from datamaestro_text.data.ir.formats import OrConvQADocument
+from tqdm import tqdm
 class OrConvQADocumentStore(LZ4DocumentStore):
@@ -27,3 +36,89 @@ class OrConvQADocumentStore(LZ4DocumentStore):
         fields = data._asdict()
         del fields["id"]
         return Record(OrConvQADocument(**fields), IDItem(data.id))
+class IKatClueWeb22DocumentStore(LZ4DocumentStore):
+    """Document store for the iKAT ClueWeb22 passages (id, contents, url)"""
+    @staticmethod
+    def generator(path: Path, checksums_file: Path, passages_hashes: Path):
+        """Returns a function producing an iterator over iKAT 2022-25 documents
+        Nothing is read until the returned function is called and iterated.
+        The input is then verified before and while documents are produced:
+        every file listed in `checksums_file` must exist and match its
+        checksum, and every passage must match its MD5 hash.
+        :param path: The folder containing the files
+        :param checksums_file: Text file with one `checksum filename` pair per
+            line; only the `.sha256sums` suffix is supported
+        :param passages_hashes: bz2-compressed text file with one
+            `doc_id passage_no checksum` triple per line
+        :return: A generator function yielding
+            `IKatClueWeb22DocumentStore.Document` tuples
+        :raises AssertionError: if a file is missing or a checksum differs
+        :raises NotImplementedError: if the checksum file suffix is unknown
+        """
+        # NOTE(review): the integrity checks below rely on `assert`, which is
+        # removed when Python runs with -O
+        def __iter__():
+            errors = False
+            assert checksums_file.is_file(), f"{checksums_file} does not exist"
+            assert passages_hashes.is_file(), f"{passages_hashes} does not exist"
+            # Get the list of files
+            with checksums_file.open("rt") as fp:
+                files = []
+                for line in fp:
+                    # Whitespace-separated pair, so file names cannot contain
+                    # spaces
+                    checksum, filename = line.strip().split()
+                    files.append((checksum, filename))
+                    # Log every missing file before failing, not only the first
+                    if not (path / filename).is_file():
+                        logging.error("File %s does not exist", path / filename)
+                        errors = True
+            assert not errors, "Errors detected, stopping"
+            # Check the SHA256 sums
+            match checksums_file.suffix:
+                case ".sha256sums":
+                    hasher_factory = sha256
+                case _:
+                    raise NotImplementedError(
+                        f"Cannot handle {checksums_file.suffix} checksum files"
+                    )
+            for checksum, filename in tqdm(files):
+                logging.info("Checking %s", filename)
+                hasher = hasher_factory()
+                with (path / filename).open("rb") as fp:
+                    # Hash by chunks of 1 MiB to keep memory usage bounded
+                    while data := fp.read(2**20):
+                        hasher.update(data)
+                file_checksum = hasher.hexdigest()
+                assert file_checksum == checksum, (
+                    f"Expected {checksum}, " f"got {file_checksum} for {filename}"
+                )
+            # Get the MD5 hashes of all the passages
+            logging.info("Reading the hashes of all passages")
+            with TQDMFileReader(passages_hashes, "rt", bz2.open) as fp:
+                # All hashes are kept in memory, keyed by "doc_id:passage_no"
+                passage_checksums = {}
+                for line in tqdm(fp):
+                    doc_id, passage_no, checksum = line.strip().split()
+                    passage_checksums[f"{doc_id}:{passage_no}"] = checksum  # noqa: E231
+            # Read the files
+            logging.info("Starting to read the files")
+            for _, filename in tqdm(files):
+                # Each file is read as bz2-compressed text, one JSON object per
+                # line
+                with TQDMFileReader(path / filename, "rt", bz2.open) as jsonl_fp:
+                    for line in jsonl_fp:
+                        data = json.loads(line)
+                        # The passage "id" is used as the key of the hash table
+                        # (a KeyError is raised for an unknown passage)
+                        expected = passage_checksums[data["id"]]
+                        computed = md5(data["contents"].encode("utf-8")).hexdigest()
+                        assert expected == computed, (
+                            f"Expected {expected}, "
+                            f"got {computed} for passage {data['id']} in {filename}"
+                        )
+                        # The JSON keys must be exactly the Document fields
+                        yield IKatClueWeb22DocumentStore.Document(**data)
+        return __iter__
+    # Tuple built from each JSON line and converted by `converter` below
+    class Document(NamedTuple):
+        id: str
+        contents: str
+        url: str
+    data_cls = Document
+    # Documents are looked up and indexed by their "id" field
+    lookup_field: Constant[str] = "id"
+    index_fields: Constant[List[str]] = ["id"]
+    def converter(self, data):
+        """Converts a `Document` tuple into a record with ID, text and URL items"""
+        return DocumentRecord(
+            IDItem(data.id), SimpleTextItem(data.contents), UrlItem(data.url)
+        )

datamaestro_text/data/ir/trec.py CHANGED Viewed

@@ -1,9 +1,9 @@
+import re
 from typing import Dict, List, Optional
-from datamaestro.data import Base
 from experimaestro import documentation, Param, Meta
 from pathlib import Path
-from datamaestro.record import Record
 from datamaestro_text.data.ir import (
+    AdhocRunDict,
     Documents,
     Topics,
     AdhocAssessments,
@@ -47,6 +47,11 @@ class TrecAdhocAssessments(AdhocAssessments):
 class TrecAdhocRun(AdhocRun):
     path: Param[Path]
+    def get_dict(self) -> AdhocRunDict:
+        """Get the run stored at `path` as a dictionary
+        :return: The result of `trec.parse_run` on the run file
+            (query ID -> doc ID -> score)
+        """
+        # Imported locally — presumably to avoid a circular import, since the
+        # interfaces module itself imports from datamaestro_text.data.ir
+        import datamaestro_text.interfaces.trec as trec
+        return trec.parse_run(self.path)
 class TrecAdhocResults(AdhocResults):
     """Adhoc results (TREC format)"""
@@ -62,8 +67,6 @@ class TrecAdhocResults(AdhocResults):
     def get_results(self) -> Dict[str, float]:
         """Returns the results as a dictionary {metric_name: value}"""
-        import re
         re_spaces = re.compile(r"\s+")
         results = {}

datamaestro_text/datasets/irds/data.py CHANGED Viewed

@@ -122,7 +122,14 @@ class Documents(ir.DocumentStore, IRDSId):
             formats.Touche2020, "doc_id", "text", "title", "stance", "url"
         ),
         _irds.beir.BeirSciDoc: tuple_constructor(
-            formats.SciDocs, "doc_id", "text", "title", "authors", "year", "cited_by", "references"
+            formats.SciDocs,
+            "doc_id",
+            "text",
+            "title",
+            "authors",
+            "year",
+            "cited_by",
+            "references",
         ),
         _irds.msmarco_document.MsMarcoDocument: tuple_constructor(
             formats.MsMarcoDocument, "doc_id", "url", "title", "body"
@@ -198,6 +205,10 @@ class Documents(ir.DocumentStore, IRDSId):
         for doc in self.dataset.docs_iter():
             yield self.converter(self.document_recordtype, doc)
+    def iter_documents_from(self, start=0):
+        """Iterate over the documents, starting at index `start`
+        The iterator returned by `docs_iter()` is sliced with `[start:]`
+        (presumably faster than skipping documents one by one — TODO confirm)
+        :param start: The index of the first document, defaults to 0
+        """
+        for doc in self.dataset.docs_iter()[start:]:
+            yield self.converter(self.document_recordtype, doc)
     @property
     def documentcount(self):
         return self.dataset.docs_count()
@@ -244,7 +255,7 @@ if hasattr(_irds, "miracl"):
     )
-class LZ4DocumentStore(ir.DocumentStore):
+class LZ4DocumentStore(ir.DocumentStore, ABC):
     """A LZ4-based document store"""
     path: Param[Path]
@@ -253,7 +264,7 @@ class LZ4DocumentStore(ir.DocumentStore):
     lookup_field: Param[str]
     # Extra indexed fields (e.g. URLs)
-    index_fields: List[str]
+    index_fields: List[str] = []
     @cached_property
     def store(self):
@@ -285,6 +296,9 @@ class LZ4DocumentStore(ir.DocumentStore):
         """Returns an iterator over documents"""
         return map(self.converter, self.store.__iter__())
+    def iter_documents_from(self, start=0):
+        """Returns an iterator over documents, starting at index `start`
+        :param start: The index of the first document, defaults to 0
+        """
+        # Relies on the store iterator supporting slicing — assumes an
+        # ir_datasets LZ4 lookup iterator; TODO confirm
+        return map(self.converter, self.store.__iter__()[start:])
     @cached_property
     def documentcount(self):
         if self.count:
@@ -386,7 +400,13 @@ class Topics(ir.TopicsStore, IRDSId):
             formats.TrecTopic, "query_id", "text", "description", "narrative"
         ),
         _irds.beir.BeirSciQuery: tuple_constructor(
-            formats.SciDocsTopic, "query_id", "text", "authors", "year", "cited_by", "references"
+            formats.SciDocsTopic,
+            "query_id",
+            "text",
+            "authors",
+            "year",
+            "cited_by",
+            "references",
         ),
         _irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
             formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
@@ -400,10 +420,7 @@ class Topics(ir.TopicsStore, IRDSId):
             "description",
         ),
         _irds.dpr_w100.DprW100Query: tuple_constructor(
-            formats.DprW100Query,
-            "query_id",
-            "text",
-            "answers"
+            formats.DprW100Query, "query_id", "text", "answers"
         ),
     }
@@ -435,11 +452,12 @@ class Topics(ir.TopicsStore, IRDSId):
     def iter(self) -> Iterator[TopicRecord]:
         """Returns an iterator over topics"""
         return self.handler.iter()
 class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
     def __init__(self, dataset):
         self.dataset = dataset
     @cached_property
     def ext2records(self):
         return {record[IDItem].id: record for record in self.records}
@@ -462,10 +480,12 @@ class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
             records = []
             for query in self.dataset.dataset.queries_iter():
-                topic =  Record(
+                topic = Record(
                     IDItem(query.query_id),
                     # Following BEIR documentation, we use title of documents as queries: https://github.com/beir-cellar/beir/blob/main/examples/dataset/README.md#queries-and-qrels
-                    SimpleTextItem(self.dataset.dataset.docs_store().get(query.doc_id).title),
+                    SimpleTextItem(
+                        self.dataset.dataset.docs_store().get(query.doc_id).title
+                    ),
                     UrlItem(query.url),
                 )
                 records.append(topic)
@@ -477,11 +497,10 @@ class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
 Topics.HANDLERS.update(
-    {
-        _irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler
-    }
+    {_irds.wapo.TrecBackgroundLinkingQuery: TrecBackgroundLinkingTopicsHandler}
 )
 class CastTopicsHandler(TopicsHandler):
     def __init__(self, dataset):
         self.dataset = dataset

datamaestro_text/interfaces/trec.py CHANGED Viewed

@@ -1,7 +1,7 @@
-from attrs import define
 from pathlib import Path
 from typing import Iterator, Optional
 import re
+from datamaestro_text.data.ir import AdhocRunDict
 from datamaestro_text.data.ir.base import (
     AdhocAssessedTopic,
     TopicRecord,
@@ -10,6 +10,33 @@ from datamaestro_text.data.ir.base import (
 )
 from datamaestro_text.data.ir.formats import TrecTopicRecord, TrecTopic
+# --- Runs
+def parse_run(path: Path) -> AdhocRunDict:
+    """Parse a TREC-formatted run file
+    Each non-empty line holds six whitespace-separated fields:
+    ``query_id Q0 doc_id rank score run_id``. Only the query ID, the document
+    ID and the score are kept.
+    :param path: The path of the run file
+    :return: The run as a dictionary query ID -> doc ID -> score (float)
+    :raises ValueError: if a non-empty line does not have exactly six fields
+        or if its score is not a number
+    """
+    results = {}
+    with path.open("rt") as f:
+        for line in f:
+            fields = line.split()
+            if not fields:
+                # Tolerate blank lines (e.g. at the end of the file)
+                continue
+            query_id, _q0, doc_id, _rank, score, _model_id = fields
+            # Store numbers, as promised by AdhocRunDict: strings would be
+            # compared lexicographically when the run is sorted by score
+            results.setdefault(query_id, {})[doc_id] = float(score)
+    return results
+def write_run_dict(run: AdhocRunDict, run_path: Path):
+    """Write a run dictionary (query ID -> doc ID -> score) in TREC format
+    For every query, the documents are written by decreasing score, one line
+    per document, with a rank starting at 1 and ``run`` as the run name.
+    :param run: The run to write
+    :param run_path: The path of the file to (over)write
+    """
+    with run_path.open("wt") as out:
+        for query_id, doc_scores in run.items():
+            ranking = sorted(
+                doc_scores.items(), key=lambda pair: pair[1], reverse=True
+            )
+            for rank, (doc_id, score) in enumerate(ranking, start=1):
+                out.write(f"{query_id} Q0 {doc_id} {rank} {score} run\n")
 # --- Assessments

datamaestro_text/utils/files.py CHANGED Viewed

@@ -1,3 +1,5 @@
+import os
+from tqdm import tqdm
 import gzip
 from pathlib import Path
@@ -6,3 +8,104 @@ def auto_open(path: Path, mode: str):
     if path.suffix == ".gz":
         return gzip.open(path, mode)
     return path.open(mode)
+class CountingWrapper:
+    """File object proxy keeping track of how much data was read
+    ``bytes_read`` accumulates the length of whatever ``read`` and
+    ``readline`` return; every other attribute is forwarded to the wrapped
+    file object.
+    """
+    def __init__(self, file_obj):
+        self.file_obj = file_obj
+        self.bytes_read = 0
+    def read(self, size=-1):
+        chunk = self.file_obj.read(size)
+        self.bytes_read = self.bytes_read + len(chunk)
+        return chunk
+    def readline(self, size=-1):
+        line = self.file_obj.readline(size)
+        self.bytes_read = self.bytes_read + len(line)
+        return line
+    def __iter__(self):
+        return self
+    def __next__(self):
+        # An empty result marks the end of the file
+        if line := self.readline():
+            return line
+        raise StopIteration
+    def close(self):
+        self.file_obj.close()
+    def __getattr__(self, attr):
+        # Only called for names not found on the wrapper itself
+        return getattr(self.file_obj, attr)
+class TQDMBytesReader:
+    """File-like reader reporting its reading progress on a tqdm bar
+    The file object is wrapped in a ``CountingWrapper``; after each read the
+    bar is advanced up to the amount counted so far.
+    :param file_obj: The file object to read from
+    :param total_size: The total of the progress bar
+    :param tqdm_kwargs: Extra keyword arguments given to ``tqdm``
+    """
+    def __init__(self, file_obj, total_size, **tqdm_kwargs):
+        self.file_obj = CountingWrapper(file_obj)
+        self.tqdm = tqdm(
+            total=total_size,
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            **tqdm_kwargs,
+        )
+    def _update_progress(self):
+        """Advance the bar by what was read since the last update"""
+        delta = self.file_obj.bytes_read - self.tqdm.n
+        if delta > 0:
+            self.tqdm.update(delta)
+    def read(self, size=-1):
+        data = self.file_obj.read(size)
+        self._update_progress()
+        return data
+    def readline(self, size=-1):
+        line = self.file_obj.readline(size)
+        self._update_progress()
+        return line
+    def readlines(self, hint=-1):
+        """Read the remaining lines, stopping once `hint` is reached (if > 0)"""
+        # Go through readline() so that the data is counted: the counting
+        # wrapper only tracks read() and readline(), and a delegated
+        # readlines() would leave the progress bar untouched
+        lines = []
+        total = 0
+        for line in self:
+            lines.append(line)
+            total += len(line)
+            if hint is not None and 0 < hint <= total:
+                break
+        return lines
+    def __iter__(self):
+        return self
+    def __next__(self):
+        # An empty result marks the end of the file
+        line = self.readline()
+        if not line:
+            raise StopIteration
+        return line
+    def close(self):
+        self.tqdm.close()
+        self.file_obj.close()
+    def __getattr__(self, attr):
+        # Delegate any other attribute to the underlying file object
+        return getattr(self.file_obj, attr)
+class TQDMFileReader:
+    """Context manager opening a file whose reading progress is shown by tqdm
+    :param filepath: The path of the file to read
+    :param mode: The mode given to `file_opener`, defaults to "rt"
+    :param file_opener: Called with (filepath, mode) to open the file,
+        defaults to the builtin `open`
+    :param tqdm_kwargs: Extra keyword arguments given to ``tqdm``
+    """
+    def __init__(self, filepath, mode="rt", file_opener=open, **tqdm_kwargs):
+        self.filepath = filepath
+        self.mode = mode
+        self.file_opener = file_opener
+        self.tqdm_kwargs = tqdm_kwargs
+    def __enter__(self):
+        # Get the size first: if this fails, no file has been opened yet
+        total_size = os.path.getsize(self.filepath)  # this is compressed size
+        # NOTE(review): the total is the on-disk size while the reader counts
+        # what the opened file object returns (e.g. decompressed text with
+        # bz2.open in "rt" mode), so the bar may go past its total — confirm
+        self.file_obj = self.file_opener(self.filepath, self.mode)
+        try:
+            self.reader = TQDMBytesReader(
+                self.file_obj, total_size=total_size, **self.tqdm_kwargs
+            )
+        except BaseException:
+            # Do not leak the file handle if the reader cannot be created
+            self.file_obj.close()
+            raise
+        return self.reader
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Closes both the progress bar and the file
+        self.reader.close()

datamaestro_text/utils/iter.py CHANGED Viewed

@@ -82,3 +82,8 @@ class LazyList(Sequence):
         # Convert the iterable to a list if it hasn't been already
         if self.materialized_list is None:
             self.materialized_list = list(self.iterable)
+    def reverse(self):
+        """Reverse the list in place, materializing it if necessary
+        The underlying iterable is first converted to a list (if not done
+        already), then that list is reversed.
+        """
+        self._materialize()
+        self.materialized_list.reverse()

datamaestro_text/version.py CHANGED Viewed

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '2025.6.30'
-__version_tuple__ = version_tuple = (2025, 6, 30)
+__version__ = version = '2025.7.28'
+__version_tuple__ = version_tuple = (2025, 7, 28)

{datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datamaestro-text
-Version: 2025.6.30
+Version: 2025.7.28
 Summary: Datamaestro module for text-related datasets
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
 License: GPL-3
@@ -15,10 +15,10 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Requires-Python: >=3.8
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: datamaestro>=1.4.2
+Requires-Dist: datamaestro>=1.5.0
 Requires-Dist: ir_datasets>=0.5.8
 Requires-Dist: attrs
 Provides-Extra: dev

{datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/RECORD RENAMED Viewed

@@ -1,11 +1,11 @@
 datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
-datamaestro_text/version.py,sha256=roAk0Y2ehJXQqbh_xSb4D-q2BlNYSoIoP3QPIZIy72s,519
+datamaestro_text/version.py,sha256=rJQHFC3G5XDG0rUPZ6r1msOA_XCbSY-qMukJgu2nA1M,519
 datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
 datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
-datamaestro_text/config/com/sentiment140.py,sha256=3cZfqs395gY14gGojRC_RTYEdG5l8RUTSdF9li8ynBc,1283
+datamaestro_text/config/com/sentiment140.py,sha256=WKKLaD7psbj9fIaTBHDTzOZanO2mukaB1g7aeTN1jdU,1289
 datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
-datamaestro_text/config/com/github/ikat.py,sha256=aozSgFcVK_vYZokD9YdF187aa3WwTkc6_Cx6NJ9I_74,1337
+datamaestro_text/config/com/github/ikat.py,sha256=DCayX-t2OBeW5bOJvRxoQgIH3vy-__mYzdmVcnayAkk,4230
 datamaestro_text/config/com/github/aagohary/canard.py,sha256=DREIKU2FFrfwibjU48Y1AsKTUzhwxmD4Kuykd4bDIus,1762
 datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=oYI0SUxEYzGoL2IbRrnze2cQuWwENwNk4ID9NQuI2Tw,3061
 datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=MFJYaxJoqJQ6kMDHa2PIE0zoxYTA8Kyl26-vzFoMML0,3032
@@ -29,7 +29,7 @@ datamaestro_text/config/gov/nist/trec/adhoc.py,sha256=vcFaLlZ-chwDt013MYI8bYZ2ug
 datamaestro_text/config/gov/nist/trec/clueweb.yaml,sha256=sm4UbdtMzWoDVPsewtVDS7Vj2jBOdgp18Xqo1X4ysQc,792
 datamaestro_text/config/gov/nist/trec/deeplearning.yaml,sha256=QGM7PtXLJRttNdOPE16o7-k3e5tA9HgcaM_-qFDV_5Q,2125
 datamaestro_text/config/gov/nist/trec/index.yaml,sha256=oSUhUmtukf5oXqUbJLhae8xZx7Uac5V6uZoUB-RJ7Sw,2711
-datamaestro_text/config/gov/nist/trec/tipster.py,sha256=t0w9lOBfvwt6YCYCyDj9fo1QiBXfs0q8qzKl74f40jU,5398
+datamaestro_text/config/gov/nist/trec/tipster.py,sha256=DirpnHpS10e27LcL7v9ksKreKVy7EgfVhyztV49VRds,5364
 datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJolWCts_VETACha3hfQ,563
 datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
 datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=LJYexk2ssZ5SqKIKLjPdybS9ju4g9fGIFy5eybtsvYQ,1224
@@ -47,42 +47,42 @@ datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG
 datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
 datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
 datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
-datamaestro_text/data/conversation/base.py,sha256=j4ftXRblBWwt3AqhIS4avalqY9o7VX2C9Wrw_ZMPqek,6514
+datamaestro_text/data/conversation/base.py,sha256=BAqu5YOjlRcBOSiyvrjiTIuuc1OalM0BDtcEvJYhO2Y,7515
 datamaestro_text/data/conversation/canard.py,sha256=FI02rPNuzSsA_uXOcgcuQn31d3QahmHndzYCV1Zm8rk,3305
-datamaestro_text/data/conversation/ikat.py,sha256=AOqJk_LQdhaNnemsNy6vkcEVN3ULMf2twXqmcQQ0t_g,3489
+datamaestro_text/data/conversation/ikat.py,sha256=djxTv0vHLXIUPzfmwpB25fyUWJNVPqthTtDbQ6CzmQo,4363
 datamaestro_text/data/conversation/orconvqa.py,sha256=RL9cpK7QK2dsk_j2e4Th8wzGFq-G3ww_EjdM1eKh-8o,3764
 datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
-datamaestro_text/data/ir/__init__.py,sha256=EOVnRv9oKEEucMOgrFjhXxeUWEkaJkbXBndpkKhZaiY,8701
-datamaestro_text/data/ir/base.py,sha256=TSaY8UWoixqvKg1z5JfkXPnuZiVoChYEUvc7jjbpZqY,1495
+datamaestro_text/data/ir/__init__.py,sha256=jHoyIWyl0beUbg1gmkwNFf1cQRawB8p3SGfa17gniGM,9442
+datamaestro_text/data/ir/base.py,sha256=uwIiKn0ryK5DWUQsEegeTs6bye0uAOGl0XVm_cvV3ZU,1506
 datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
 datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
 datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
-datamaestro_text/data/ir/formats.py,sha256=K0mqmCGg0seneKo-Rt3cBfDVjEMS4_6t1MfNj4iW9Y4,3637
+datamaestro_text/data/ir/formats.py,sha256=Tbu5rrssq9m76a448-ixgs7a-KKvNCvXYkdv5FkFqp4,3641
 datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
-datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
-datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
+datamaestro_text/data/ir/stores.py,sha256=A4Ew0L4P6iLLmDKhxqjjVkHcz797BHf4d76YguGkB1A,4412
+datamaestro_text/data/ir/trec.py,sha256=IOtQRMUz8zx-dYEMR2NIIM6qXEUjsV0eVOhGvKIRJK4,1974
 datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
 datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
-datamaestro_text/datasets/irds/data.py,sha256=0V5nPMQeh1I3Sp36x_NE7xrbC_vbZAKhU5NONj95aok,22058
+datamaestro_text/datasets/irds/data.py,sha256=eUehp_80H1yyh7CVkM7mOWJtB9XHlmI-A7fLewXuaDE,22487
 datamaestro_text/datasets/irds/datasets.py,sha256=n5mvD2omVI4PkeuJgX13Cio6gLrgYjxq8ZUKKyqocEs,5625
 datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
 datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
 datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
 datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
-datamaestro_text/interfaces/trec.py,sha256=g5UIjOvhMBaib9mm280dkQLdtLtuId8bjfptaVi5Pew,2709
+datamaestro_text/interfaces/trec.py,sha256=GrP0N_Hcj5f73KS6CSzkyN4aaI-XoBJ19oVMpHVR3QM,3579
 datamaestro_text/test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/test/test_datasets.py,sha256=hD1pe-CjEnOj0JNqDLasz_97ltFzMbYfyYBy1QyYdf8,202
 datamaestro_text/test/test_documented.py,sha256=Kl90fz_r_dsV0oXE1Mad34GiQyZ9dc6bfZKNtxP0O2s,453
 datamaestro_text/transforms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/transforms/ir/__init__.py,sha256=eWxr0ZtfK9F9J4TachW5V8Ra5AhQJueDyOGR8vXi1uA,6553
 datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_p5ha0,3340
-datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1vRQ,170
-datamaestro_text/utils/iter.py,sha256=uzBmavBeqVyEvYmi9Ds2dqh0ywhJE-1SBO5v8AoIKcc,2537
+datamaestro_text/utils/files.py,sha256=jhcirufgTztEkx1hs2-qgScEcHnIcGF_BjCeuCSsNv0,2838
+datamaestro_text/utils/iter.py,sha256=QBajeSPLHvkeh6BCTZDSqWlOYNjwUDvgTTZ_YxJntXw,2701
 datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
 datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
-datamaestro_text-2025.6.30.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-datamaestro_text-2025.6.30.dist-info/METADATA,sha256=SHQDBJyeUeNlRUYIPvwhTInQtTV02LIpwOg2v1YVL3s,1847
-datamaestro_text-2025.6.30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datamaestro_text-2025.6.30.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
-datamaestro_text-2025.6.30.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
-datamaestro_text-2025.6.30.dist-info/RECORD,,
+datamaestro_text-2025.7.28.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+datamaestro_text-2025.7.28.dist-info/METADATA,sha256=M0V-4Q2_EBFIRnP0czVXvZC9t_qhhmVRbWSAry31SW4,1848
+datamaestro_text-2025.7.28.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datamaestro_text-2025.7.28.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
+datamaestro_text-2025.7.28.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
+datamaestro_text-2025.7.28.dist-info/RECORD,,

{datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/WHEEL RENAMED Viewed

File without changes

{datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{datamaestro_text-2025.6.30.dist-info → datamaestro_text-2025.7.28.dist-info}/top_level.txt RENAMED Viewed

File without changes

datamaestro-text 2025.6.30__py3-none-any.whl → 2025.7.28__py3-none-any.whl

datamaestro-text 2025.6.30py3-none-any.whl → 2025.7.28py3-none-any.whl