datamaestro-text 2024.5.31__py3-none-any.whl → 2025.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/config/com/github/apple/ml-qrecc.py +50 -0
- datamaestro_text/config/com/microsoft/msmarco/passage.py +8 -7
- datamaestro_text/config/com/sentiment140.py +0 -4
- datamaestro_text/config/edu/upenn/ldc/aquaint.py +3 -4
- datamaestro_text/config/gov/nist/trec/tipster.py +1 -1
- datamaestro_text/data/conversation/base.py +2 -2
- datamaestro_text/data/embeddings.py +3 -3
- datamaestro_text/data/ir/cord19.py +2 -1
- datamaestro_text/data/ir/formats.py +20 -5
- datamaestro_text/data/ir/stores.py +12 -6
- datamaestro_text/data/recommendation.py +6 -7
- datamaestro_text/data/text.py +6 -6
- datamaestro_text/datasets/irds/data.py +222 -204
- datamaestro_text/datasets/irds/helpers.py +58 -2
- datamaestro_text/version.py +9 -4
- {datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.4.3.dist-info}/METADATA +5 -4
- {datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.4.3.dist-info}/RECORD +21 -21
- {datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.4.3.dist-info}/WHEEL +1 -1
- {datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.4.3.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.4.3.dist-info/licenses}/LICENSE +0 -0
- {datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.4.3.dist-info}/top_level.txt +0 -0
datamaestro_text/config/com/github/apple/ml-qrecc.py
CHANGED
@@ -1,11 +1,20 @@
 # See documentation on https://datamaestro.readthedocs.io
 
+import re
+import json
 from pathlib import Path
 from datamaestro.definitions import datatasks, datatags, dataset
 from datamaestro.data.ml import Supervised
+from datamaestro.download import reference
 from datamaestro.download.archive import zipdownloader
+from datamaestro.download.wayback import wayback_documents
 from datamaestro.utils import HashCheck
 from datamaestro_text.data.conversation.qrecc import QReCCDataset
+from datamaestro_text.datasets.irds.data import (
+    LZ4JSONLDocumentStore,
+    SimpleJsonDocument,
+)
+from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
 
 
 @datatags("conversation", "context", "query")
@@ -35,3 +44,44 @@ def main(data: Path):
         "train": QReCCDataset(path=data / "qrecc_train.json"),
         "test": QReCCDataset(path=data / "qrecc_test.json"),
     }
+
+
+@dataset(
+    url="https://github.com/apple/ml-qrecc",
+    doi="https://doi.org/10.48550/arXiv.2010.04898",
+)
+class Content(LZ4JSONLDocumentStore):
+    """QReCC mentionned URLs content"""
+
+    @staticmethod
+    def __create_dataset__(dataset, options=None):
+        ds = reference(reference=main).setup(dataset, options)
+        documents_path = wayback_documents(
+            "20191127", lambda: Content._urls(ds), name="wayback.jsonl"
+        ).setup(dataset, options)
+
+        store_path = lz4docstore_builder(
+            "store",
+            lambda: Content._documents(documents_path),
+            SimpleJsonDocument,
+            "id",
+        ).setup(dataset, options)
+
+        return LZ4JSONLDocumentStore(jsonl_path=store_path)
+
+    @staticmethod
+    def _documents(path: Path):
+        """Iterates over documents from wayback"""
+        with path.open("rt") as fp:
+            for line in fp:
+                yield SimpleJsonDocument(**json.loads(line))
+
+    @staticmethod
+    def _urls(supervised: Supervised[QReCCDataset, None, QReCCDataset]):
+        urls = set()
+        for ds in [supervised.train, supervised.test]:
+            for entry in ds.entries():
+                if entry.answer_url:
+                    url = re.sub("#.*$", "", entry.answer_url)
+                    urls.add(url)
+        return urls
datamaestro_text/config/com/microsoft/msmarco/passage.py
CHANGED
@@ -1,11 +1,11 @@
 """MS MARCO (Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. A variant of this task will be the part of TREC and AFIRM 2019. For Updates about TREC 2019 please follow This Repository Passage Reranking task Task Given a query q and a the 1000 most relevant passages P = p1, p2, p3,... p1000, as retrieved by BM25 a succeful system is expected to rerank the most relevant passage as high as possible. For this task not all 1000 relevant items have a human labeled relevant passage. Evaluation will be done using MRR.
 
-
-
-
+**Publication**:
+Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
+MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
 
 
-
+See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
 """
 
 from datamaestro.annotations.agreement import useragreement
@@ -35,6 +35,7 @@ http://www.msmarco.org/dataset.aspx""",
 
 # --- Document collection
 
+
 # TODO: Not ideal since it would be better to have small versions right away
 # instead of downloading again the MS Marco Collection
 @lua
@@ -43,10 +44,10 @@ http://www.msmarco.org/dataset.aspx""",
     url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
     checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
 )
-@dataset(
-def collection_etc(data):
+@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
+def collection_etc(data) -> Folder:
     """Documents and some more files"""
-    return
+    return Folder(path=data)
 
 
 @lua
datamaestro_text/config/com/sentiment140.py
CHANGED
@@ -4,10 +4,6 @@ from datamaestro.download.archive import zipdownloader
 from datamaestro.data.ml import Supervised
 from datamaestro.utils import HashCheck
 
-# name: Sentiment140
-# web: http://help.sentiment140.com/for-students/
-
-# description: |
 
 
 @zipdownloader(
datamaestro_text/config/edu/upenn/ldc/aquaint.py
CHANGED
@@ -1,10 +1,9 @@
 """The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""
 
-from datamaestro.
-from datamaestro.
-from datamaestro_text.data.ir.trec import TipsterCollection
-from datamaestro.definitions import argument, datatasks, datatags, dataset
+from datamaestro.context import DatafolderPath
+from datamaestro.definitions import dataset
 from datamaestro.download.links import links, linkfolder
+from datamaestro_text.data.ir.trec import TipsterCollection
 
 
 URL = "https://catalog.ldc.upenn.edu/LDC2002T31"
datamaestro_text/config/gov/nist/trec/tipster.py
CHANGED
@@ -22,8 +22,8 @@ from datamaestro_text.data.ir.trec import TipsterCollection
 from datamaestro.download.links import linkfolder
 from datamaestro.definitions import (
     dataset,
-    DatafolderPath,
 )
+from datamaestro.context import DatafolderPath
 
 # Store meta-information
 TIPSTER = dataset(TipsterCollection, url="https://catalog.ldc.upenn.edu/LDC93T3A")
datamaestro_text/data/conversation/base.py
CHANGED
@@ -188,7 +188,7 @@ class SingleConversationTreeNode(ConversationNode):
     def history(self) -> Sequence[Record]:
         return self.tree.history[self.index + 1 :]
 
-    def parent(self) -> ConversationNode
+    def parent(self) -> Optional[ConversationNode]:
         return (
             SingleConversationTreeNode(self.tree, self.index + 1)
             if self.index < len(self.tree.history) - 1
@@ -235,7 +235,7 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
         for child in self.children:
             yield from child
 
-    def parent(self) -> ConversationNode
+    def parent(self) -> Optional[ConversationNode]:
         return self.parent
 
     def children(self) -> List[ConversationNode]:
datamaestro_text/data/embeddings.py
CHANGED
@@ -1,5 +1,5 @@
-from
-from datamaestro.data import Base, File
+from experimaestro import Meta
+from datamaestro.data import Base, File
 from datamaestro.definitions import datatags
 import numpy as np
 from typing import Tuple, List
@@ -18,9 +18,9 @@ class WordEmbeddings(Base):
         raise NotImplementedError()
 
 
-@argument("encoding", str, ignored=True, default="utf-8")
 class WordEmbeddingsText(WordEmbeddings, File):
     """Word embeddings as a text word / values"""
+    encoding: Meta[str] = "utf-8"
 
     def load(self):
         words = []
datamaestro_text/data/ir/cord19.py
CHANGED
@@ -1,7 +1,8 @@
 from csv import DictReader
 from typing import Iterator
 
-from
+from experimaestro import documentation
+from datamaestro.data import File
 from datamaestro.record import Record
 from datamaestro_text.data.ir import Documents, TopicRecord, Topics, IDItem
 from datamaestro_text.data.ir.formats import (
datamaestro_text/data/ir/formats.py
CHANGED
@@ -1,5 +1,5 @@
 from functools import cached_property
-from typing import ClassVar, Tuple
+from typing import ClassVar, Tuple, List
 from attrs import define
 from datamaestro.record import record_type
 from ir_datasets.datasets.wapo import WapoDocMedia
@@ -117,7 +117,6 @@ class TweetDoc(TextItem):
 
 @define
 class OrConvQADocument(TextItem):
-    id: str
     title: str
     body: str
     aid: str
@@ -127,12 +126,21 @@ class OrConvQADocument(TextItem):
     def text(self):
         return f"{self.title} {self.body}"
 
+@define
+class Touche2020(TextItem):
+    text: str
+    title: str
+    stance: str
+    url: str
 
 @define
-class
+class SciDocs(TextItem):
     text: str
-
-
+    title: str
+    authors: List[str]
+    year: int
+    cited_by: List[str]
+    references: List[str]
 
 
 @define
@@ -167,6 +175,13 @@ class TrecMb14Query(TextItem):
     def get_text(self):
         return f"{self.query}"
 
+@define
+class SciDocsTopic(TextItem):
+    text: str
+    authors: List[str]
+    year: int
+    cited_by: List[str]
+    references: List[str]
 
 @define()
 class TrecTopic(SimpleTextItem):
datamaestro_text/data/ir/stores.py
CHANGED
@@ -1,17 +1,21 @@
 from collections import namedtuple
-from typing import List
+from typing import List, NamedTuple
 from experimaestro import Constant
 import attrs
 
 from datamaestro.record import Record
+from datamaestro_text.data.ir.base import IDItem
 from datamaestro_text.datasets.irds.data import LZ4DocumentStore
 from datamaestro_text.data.ir.formats import OrConvQADocument
 
 
 class OrConvQADocumentStore(LZ4DocumentStore):
-    NAMED_TUPLE
-
-
+    class NAMED_TUPLE(NamedTuple):
+        id: str
+        title: str
+        body: str
+        aid: str
+        bid: int
 
     lookup_field: Constant[str] = "id"
     fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
@@ -19,5 +23,7 @@ class OrConvQADocumentStore(LZ4DocumentStore):
 
     data_cls = NAMED_TUPLE
 
-    def converter(self, data: NAMED_TUPLE) ->
-
+    def converter(self, data: NAMED_TUPLE) -> Record:
+        fields = data._asdict()
+        del fields["id"]
+        return Record(OrConvQADocument(**fields), IDItem(data.id))
datamaestro_text/data/recommendation.py
CHANGED
@@ -1,14 +1,13 @@
-from
+from experimaestro import Param
+from datamaestro.data import Base, File
 import datamaestro.data.csv as csv
 
 
-@argument("ratings", type=File)
 class RatedItems(Base):
-
+    ratings: Param[File]
 
 
-@argument("links", type=csv.Generic)
-@argument("movies", type=csv.Generic)
-@argument("tags", type=csv.Generic)
 class Movielens(RatedItems):
-
+    links: Param[csv.Generic]
+    movies: Param[csv.Generic]
+    tags: Param[csv.Generic]
datamaestro_text/data/text.py
CHANGED
@@ -1,15 +1,15 @@
-from
-from
+from typing import Optional
+from experimaestro import Param
+from datamaestro.data import Base, Folder, File
 from datamaestro.data.ml import Supervised
 
 
-@argument("train", type=Base)
-@argument("test", type=Base, required=False)
-@argument("validation", type=Base, required=False)
 class TrainingText(Supervised):
     """ "A dataset used for training with a train and a test"""
 
-
+    train: Param[Base]
+    test: Param[Optional[Base]] = None
+    validation: Param[Optional[Base]] = None
 
 
 class TextFolder(Folder):
datamaestro_text/datasets/irds/data.py
CHANGED
@@ -1,36 +1,44 @@
+import logging
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from functools import partial
-import logging
 from pathlib import Path
-from typing import Dict, Iterator, Tuple, Type
+from typing import Dict, Iterator, List, NamedTuple, Tuple, Type
+
 import ir_datasets
-
+import ir_datasets.datasets as _irds
+from datamaestro.record import RecordType, record_type
+from experimaestro import Config, Meta, Option, Param
+from experimaestro.compat import cached_property
 from ir_datasets.formats import (
     GenericDoc,
-    GenericQuery,
     GenericDocPair,
+    GenericQuery,
     TrecParsedDoc,
     TrecQuery,
 )
-
-
-from experimaestro.compat import cached_property
-from experimaestro import Option
-from datamaestro.record import RecordType, record_type
-from datamaestro_text.data.conversation.base import AnswerEntry
+from ir_datasets.indices import PickleLz4FullStore
+
 import datamaestro_text.data.ir as ir
+import datamaestro_text.data.ir.formats as formats
+from datamaestro_text.data.conversation.base import (
+    AnswerDocumentID,
+    AnswerEntry,
+    ConversationHistoryItem,
+    ConversationTreeNode,
+    DecontextualizedDictItem,
+    EntryType,
+)
 from datamaestro_text.data.ir.base import (
-    Record,
-    TopicRecord,
-    DocumentRecord,
-    SimpleTextItem,
     AdhocAssessedTopic,
-
+    DocumentRecord,
     IDItem,
+    Record,
+    SimpleAdhocAssessment,
+    SimpleTextItem,
+    TopicRecord,
     create_record,
 )
-import datamaestro_text.data.ir.formats as formats
-
 
 # Interface between ir_datasets and datamaestro:
 # provides adapted data types
@@ -109,6 +117,12 @@ class Documents(ir.DocumentStore, IRDSId):
         _irds.beir.BeirTitleUrlDoc: tuple_constructor(
             formats.TitleUrlDocument, "doc_id", "text", "title", "url"
         ),
+        _irds.beir.BeirToucheDoc: tuple_constructor(
+            formats.Touche2020, "doc_id", "text", "title", "stance", "url"
+        ),
+        _irds.beir.BeirSciDoc: tuple_constructor(
+            formats.SciDocs, "doc_id", "text", "title", "authors", "year", "cited_by", "references"
+        ),
         _irds.msmarco_document.MsMarcoDocument: tuple_constructor(
             formats.MsMarcoDocument, "doc_id", "url", "title", "body"
         ),
@@ -216,20 +230,6 @@ if hasattr(_irds, "miracl"):
|
|
|
216
230
|
)
|
|
217
231
|
|
|
218
232
|
|
|
219
|
-
# Fix while PR https://github.com/allenai/ir_datasets/pull/252
|
|
220
|
-
# is not in.
|
|
221
|
-
class DMPickleLz4FullStore(PickleLz4FullStore):
|
|
222
|
-
def get_many(self, doc_ids, field=None):
|
|
223
|
-
result = {}
|
|
224
|
-
field_idx = self._doc_cls._fields.index(field) if field is not None else None
|
|
225
|
-
for doc in self.get_many_iter(doc_ids):
|
|
226
|
-
if field is not None:
|
|
227
|
-
result[getattr(doc, self._id_field)] = doc[field_idx]
|
|
228
|
-
else:
|
|
229
|
-
result[getattr(doc, self._id_field)] = doc
|
|
230
|
-
return result
|
|
231
|
-
|
|
232
|
-
|
|
233
233
|
class LZ4DocumentStore(ir.DocumentStore):
|
|
234
234
|
"""A LZ4-based document store"""
|
|
235
235
|
|
|
@@ -243,7 +243,7 @@ class LZ4DocumentStore(ir.DocumentStore):
 
     @cached_property
     def store(self):
-        return
+        return PickleLz4FullStore(
             self.path, None, self.data_cls, self.lookup_field, self.index_fields
         )
 
@@ -262,10 +262,10 @@ class LZ4DocumentStore(ir.DocumentStore):
         retrieved = self.store.get_many(docids)
         return [self.converter(retrieved[docid]) for docid in docids]
 
+    @abstractmethod
     def converter(self, data):
-        """Converts a document from LZ4 tuples to
-
-        return data
+        """Converts a document from LZ4 tuples to a document record"""
+        ...
 
     def iter(self) -> Iterator[DocumentRecord]:
         """Returns an iterator over documents"""
@@ -278,6 +278,25 @@ class LZ4DocumentStore(ir.DocumentStore):
         return self.store.count()
 
 
+class SimpleJsonDocument(NamedTuple):
+    id: str
+    text: str
+
+
+class LZ4JSONLDocumentStore(LZ4DocumentStore):
+    jsonl_path: Meta[Path]
+    """json-l based document store
+
+    Each line is of the form
+    ```json
+    { "id": "...", "text": "..." }
+    ```
+    """
+
+    def converter(self, data):
+        return DocumentRecord(IDItem(data["id"]), SimpleTextItem(data["text"]))
+
+
 class TopicsHandler(ABC):
     @abstractmethod
     def topic_int(self, internal_topic_id: int) -> TopicRecord:
@@ -349,6 +368,12 @@ class Topics(ir.TopicsStore, IRDSId):
         TrecQuery: tuple_constructor(
             formats.TrecTopic, "query_id", "title", "description", "narrative"
         ),
+        _irds.beir.BeirToucheQuery: tuple_constructor(
+            formats.TrecTopic, "query_id", "text", "description", "narrative"
+        ),
+        _irds.beir.BeirSciQuery: tuple_constructor(
+            formats.SciDocsTopic, "query_id", "text", "authors", "year", "cited_by", "references"
+        ),
         _irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
             formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
         ),
@@ -392,197 +417,190 @@ class Topics(ir.TopicsStore, IRDSId):
         return self.handler.iter()
 
 
-
-
-
-
-
-
-
-
+class CastTopicsHandler(TopicsHandler):
+    def __init__(self, dataset):
+        self.dataset = dataset
+
+    @cached_property
+    def ext2records(self):
+        return {record[IDItem].id: record for record in self.records}
+
+    def topic_int(self, internal_topic_id: int) -> TopicRecord:
+        """Returns a document given its internal ID"""
+        return self.records[internal_topic_id]
+
+    def topic_ext(self, external_topic_id: str) -> TopicRecord:
+        """Returns a document given its external ID"""
+        return self.ext2records[external_topic_id]
+
+    def iter(self) -> Iterator[ir.TopicRecord]:
+        """Returns an iterator over topics"""
+        return iter(self.records)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @cached_property
+    def records(self):
+        try:
+            topic_number = None
+            node = None
+            conversation = []
+            records = []
+
+            for query in self.dataset.dataset.queries_iter():
+                decontextualized = DecontextualizedDictItem(
+                    "manual",
+                    {
+                        "manual": query.manual_rewritten_utterance,
+                        "auto": query.automatic_rewritten_utterance,
+                    },
+                )
+
+                is_new_conversation = topic_number != query.topic_number
+
+                topic = Record(
+                    IDItem(query.query_id),
+                    SimpleTextItem(query.raw_utterance),
+                    decontextualized,
+                    ConversationHistoryItem(
+                        [] if is_new_conversation else node.conversation(False)
+                    ),
+                    EntryType.USER_QUERY,
+                )
+
+                if is_new_conversation:
+                    conversation = []
+                    node = ConversationTreeNode(topic)
+                    topic_number = query.topic_number
+                else:
+                    node = node.add(ConversationTreeNode(topic))
+
+                records.append(topic)
+
+                conversation.append(node)
+                node = node.add(
+                    ConversationTreeNode(
+                        Record(
+                            AnswerDocumentID(self.get_canonical_result_id(query)),
+                            EntryType.SYSTEM_ANSWER,
+                        )
                     )
+                )
+                conversation.append(node)
+        except Exception:
+            logging.exception("Error while computing topic records")
+            raise
+
+        return records
+
+    @staticmethod
+    def get_canonical_result_id():
+        return None
+
 
-
+class Cast2020TopicsHandler(CastTopicsHandler):
+    @staticmethod
+    def get_canonical_result_id(query: _irds.trec_cast.Cast2020Query):
+        return query.manual_canonical_result_id
 
+
+class Cast2021TopicsHandler(CastTopicsHandler):
+    @staticmethod
+    def get_canonical_result_id(query: _irds.trec_cast.Cast2021Query):
+        return query.canonical_result_id
+
+
+class Cast2022TopicsHandler(CastTopicsHandler):
+    def __init__(self, dataset):
+        self.dataset = dataset
+
+    @cached_property
+    def records(self):
+        try:
+            records = []
+            nodes: Dict[str, ConversationTreeNode] = {}
+
+            for (
+                query
+            ) in (
+                self.dataset.dataset.queries_iter()
+            ):  # type: _irds.trec_cast.Cast2022Query
+                parent = nodes[query.parent_id] if query.parent_id else None
+
+                if query.participant == "User":
                     topic = Record(
                         IDItem(query.query_id),
                         SimpleTextItem(query.raw_utterance),
-
+                        DecontextualizedDictItem(
+                            "manual",
+                            {
+                                "manual": query.manual_rewritten_utterance,
+                            },
+                        ),
                         ConversationHistoryItem(
-
+                            parent.conversation(False) if parent else []
                         ),
                         EntryType.USER_QUERY,
                     )
-
-                if is_new_conversation:
-                    conversation = []
-                    node = ConversationTreeNode(topic)
-                    topic_number = query.topic_number
-                else:
-                    node = node.add(ConversationTreeNode(topic))
-
+                    node = ConversationTreeNode(topic)
                     records.append(topic)
-
-
-
-
-
-                            AnswerDocumentID(self.get_canonical_result_id(query)),
-                            EntryType.SYSTEM_ANSWER,
-                        )
+                else:
+                    node = ConversationTreeNode(
+                        Record(
+                            AnswerEntry(query.response),
+                            EntryType.SYSTEM_ANSWER,
                         )
                     )
-                conversation.append(node)
-        except Exception:
-            logging.exception("Error while computing topic records")
-            raise
-
-        return records
-
-    @staticmethod
-    def get_canonical_result_id():
-        return None
-
-class Cast2020TopicsHandler(CastTopicsHandler):
-    @staticmethod
-    def get_canonical_result_id(query: _irds.trec_cast.Cast2020Query):
-        return query.manual_canonical_result_id
-
-class Cast2021TopicsHandler(CastTopicsHandler):
-    @staticmethod
-    def get_canonical_result_id(query: _irds.trec_cast.Cast2021Query):
-        return query.canonical_result_id
-
-class Cast2022TopicsHandler(CastTopicsHandler):
-    def __init__(self, dataset):
-        self.dataset = dataset
-
-    @cached_property
-    def records(self):
-        try:
-            records = []
-            nodes: Dict[str, ConversationTreeNode] = {}
-
-            for (
-                query
-            ) in (
-                self.dataset.dataset.queries_iter()
-            ):  # type: _irds.trec_cast.Cast2022Query
-                parent = nodes[query.parent_id] if query.parent_id else None
-
-                if query.participant == "User":
-                    topic = Record(
-                        IDItem(query.query_id),
-                        SimpleTextItem(query.raw_utterance),
-                        DecontextualizedDictItem(
-                            "manual",
-                            {
-                                "manual": query.manual_rewritten_utterance,
-                            },
-                        ),
-                        ConversationHistoryItem(
-                            parent.conversation(False) if parent else []
-                        ),
-                        EntryType.USER_QUERY,
-                    )
-                    node = ConversationTreeNode(topic)
-                    records.append(topic)
-                else:
-                    node = ConversationTreeNode(
-                        Record(
-                            AnswerEntry(query.response),
-                            EntryType.SYSTEM_ANSWER,
-                        )
-                    )
 
-
-
-
-
-
-
-
-        return records
-
-Topics.HANDLERS.update(
-    {
-        # _irds.trec_cast.Cast2019Query: Cast2019TopicsHandler,
-        _irds.trec_cast.Cast2020Query: Cast2020TopicsHandler,
-        _irds.trec_cast.Cast2021Query: Cast2021TopicsHandler,
-        _irds.trec_cast.Cast2022Query: Cast2022TopicsHandler,
-    }
-)
+                nodes[query.query_id] = node
+                if parent:
+                    parent.add(node)
+        except Exception:
+            logging.exception("Error while computing topic records")
+            raise
 
-
-    def check(self, cls):
-        assert issubclass(cls, _irds.trec_cast.CastDoc)
+        return records
 
-    @cached_property
-    def target_cls(self):
-        return formats.TitleUrlDocument
 
-
-
-
-
+Topics.HANDLERS.update(
+    {
+        # _irds.trec_cast.Cast2019Query: Cast2019TopicsHandler,
+        _irds.trec_cast.Cast2020Query: Cast2020TopicsHandler,
+        _irds.trec_cast.Cast2021Query: Cast2021TopicsHandler,
+        _irds.trec_cast.Cast2022Query: Cast2022TopicsHandler,
+    }
+)
 
-class CastPassageDocHandler:
-    def check(self, cls):
-        assert issubclass(cls, _irds.trec_cast.CastPassageDoc)
 
-
-
-
+class CastDocHandler:
+    def check(self, cls):
+        assert issubclass(cls, _irds.trec_cast.CastDoc)
+
+    @cached_property
+    def target_cls(self):
+        return formats.TitleUrlDocument
+
+    def __call__(self, _, doc: _irds.trec_cast.CastDoc):
+        return Record(
+            IDItem(doc.doc_id), formats.SimpleTextItem(" ".join(doc.passages))
+        )
+
+
+class CastPassageDocHandler:
+    def check(self, cls):
+        assert issubclass(cls, _irds.trec_cast.CastPassageDoc)
+
+    @cached_property
+    def target_cls(self):
+        return formats.TitleUrlDocument
+
+    def __call__(self, _, doc: _irds.trec_cast.CastPassageDoc):
+        return Record(
+            IDItem(doc.doc_id),
+            formats.TitleUrlDocument(doc.text, doc.title, doc.url),
+        )
 
-    def __call__(self, _, doc: _irds.trec_cast.CastPassageDoc):
-        return Record(
-            IDItem(doc.doc_id),
-            formats.TitleUrlDocument(doc.text, doc.title, doc.url),
-        )
 
-
-
+Documents.CONVERTERS[_irds.trec_cast.CastDoc] = CastDocHandler()
+Documents.CONVERTERS[_irds.trec_cast.CastPassageDoc] = CastPassageDocHandler()
 
 
 class Adhoc(ir.Adhoc, IRDSId):
datamaestro_text/datasets/irds/helpers.py
CHANGED
@@ -1,13 +1,13 @@
 import logging
 from typing import Optional, Type, Callable, Iterator
 from ir_datasets.indices import PickleLz4FullStore
-from datamaestro.download import
+from datamaestro.download import Resource
 from datamaestro.utils import FileChecker
 from pathlib import Path
 import urllib3
 
 
-class lz4docstore_downloader(
+class lz4docstore_downloader(Resource):
     """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents"""
 
     def __init__(
@@ -69,3 +69,59 @@ class lz4docstore_downloader(Download):
 
         # All good!
         (destination / "done").touch()
+
+
+class lz4docstore_builder(Resource):
+    def __init__(
+        self,
+        name: str,
+        iter_factory: Callable[[], Iterator],
+        doc_cls: Type,
+        lookup_field: str,
+        *,
+        count_hint: Optional[int] = None,
+    ):
+        """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents
+
+        :param name: The name of the variable for path construction
+        :param iter_factory: Iterator over documents
+        :param doc_cls: The class of documents (must be a dataclass because of how ir-datasets works)
+        :param lookup_field: Which field to use for lookup
+        :param count_hint: Number of documents (hint), defaults to None
+        """
+        super().__init__(name)
+        self.iter_factory = iter_factory
+        self.doc_cls = doc_cls
+        self.lookup_field = lookup_field
+        self.count_hint = count_hint
+
+    def prepare(self):
+        return self.definition.datapath / self.varname
+
+    def download(self, force=False):
+        # Creates directory if needed
+        destination = self.definition.datapath / self.varname
+        destination.mkdir(exist_ok=True)
+
+        # Early exit
+        if (destination / "done").is_file() and not force:
+            return True
+
+        # Download (cache)
+        logging.info("Building the document index")
+
+        # Builds the LZ4 store
+        store = PickleLz4FullStore(
+            destination,
+            lambda: self.iter_factory(),
+            self.doc_cls,
+            lookup_field=self.lookup_field,
+            index_fields=[self.lookup_field],
+            key_field_prefix=None,
+            size_hint=None,
+            count_hint=self.count_hint,
+        )
+        store.build()
+
+        # All good!
+        (destination / "done").touch()
datamaestro_text/version.py
CHANGED
@@ -1,8 +1,13 @@
-# file generated by
+# file generated by setuptools-scm
 # don't change, don't track in version control
+
+__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
+
 TYPE_CHECKING = False
 if TYPE_CHECKING:
-    from typing import Tuple
+    from typing import Tuple
+    from typing import Union
+
     VERSION_TUPLE = Tuple[Union[int, str], ...]
 else:
     VERSION_TUPLE = object
@@ -12,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '
-__version_tuple__ = version_tuple = (
+__version__ = version = '2025.4.3'
+__version_tuple__ = version_tuple = (2025, 4, 3)
{datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.4.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: datamaestro-text
-Version:
+Version: 2025.4.3
 Summary: Datamaestro module for text-related datasets
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
 License: GPL-3
@@ -18,9 +18,10 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: datamaestro
-Requires-Dist:
+Requires-Dist: datamaestro>=1.2.1
+Requires-Dist: ir_datasets>=0.5.8
 Requires-Dist: attrs
+Dynamic: license-file
 
 [](https://github.com/pre-commit/pre-commit) [](https://badge.fury.io/py/datamaestro-text)
 
{datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.4.3.dist-info}/RECORD
CHANGED
@@ -1,16 +1,16 @@
 datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
-datamaestro_text/version.py,sha256=
+datamaestro_text/version.py,sha256=Ldmqy5wkUM54W7PO84xMGLTubji7Xl68QRaeaSSILS0,517
 datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
 datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
-datamaestro_text/config/com/sentiment140.py,sha256=
+datamaestro_text/config/com/sentiment140.py,sha256=bLxFY6xIOp3_9mn5H36V-jfa_vXdetRxi6sK4cghl9w,1294
 datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
 datamaestro_text/config/com/github/aagohary/canard.py,sha256=tIwb_KxMUR3st7rzQUkt6rIjolTl8uKvDq6t795b1nY,1468
-datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=
+datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=_lufJlg-4zaQyjr5Ae-X-9hXzpl2JFjfxh_RhnBbva4,3068
 datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=Tg3hbnvilxW_Lwt7fpKvGMtu-6mc9oNIHM-LX6JTR1U,3026
 datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
 datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
-datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=
+datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=tVldwVh0pQpXXifqcMNEN9cLO1HXkXoEhToSazSx5RE,11643
 datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=hCFjZg9t1AHET05qU31uatiAOD5EEzzWG9y_W90jcOE,861
 datamaestro_text/config/edu/cornell/nlvr.yaml,sha256=9Yk5VZMncSmrP7JNuGXqExksgX5nQ_Zfnlps8hWze3Q,921
 datamaestro_text/config/edu/stanford/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -19,7 +19,7 @@ datamaestro_text/config/edu/stanford/glove.py,sha256=ykkQ7nYWqhmgc2TeohNMliYSiX8
 datamaestro_text/config/edu/stanford/im2p.yaml,sha256=JoToNyEPpmwdyLFedCBot5ypyw7p9rzi12mGXJuZin0,2909
 datamaestro_text/config/edu/upenn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/edu/upenn/ldc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=
+datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=YEU3kIXzv71Vjg9bUoXyQ-vSLvC-y4LlE3mJQf38XuY,1589
 datamaestro_text/config/gov/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/gov/nist/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamaestro_text/config/gov/nist/ir/covid.py,sha256=wn2E7sQ8M6pAucVD4sKJYImyzKUKphyiDFJD0oYRCbg,4004
@@ -28,7 +28,7 @@ datamaestro_text/config/gov/nist/trec/adhoc.py,sha256=vcFaLlZ-chwDt013MYI8bYZ2ug
 datamaestro_text/config/gov/nist/trec/clueweb.yaml,sha256=sm4UbdtMzWoDVPsewtVDS7Vj2jBOdgp18Xqo1X4ysQc,792
 datamaestro_text/config/gov/nist/trec/deeplearning.yaml,sha256=QGM7PtXLJRttNdOPE16o7-k3e5tA9HgcaM_-qFDV_5Q,2125
 datamaestro_text/config/gov/nist/trec/index.yaml,sha256=oSUhUmtukf5oXqUbJLhae8xZx7Uac5V6uZoUB-RJ7Sw,2711
-datamaestro_text/config/gov/nist/trec/tipster.py,sha256=
+datamaestro_text/config/gov/nist/trec/tipster.py,sha256=t0w9lOBfvwt6YCYCyDj9fo1QiBXfs0q8qzKl74f40jU,5398
 datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJolWCts_VETACha3hfQ,563
 datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
 datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=LJYexk2ssZ5SqKIKLjPdybS9ju4g9fGIFy5eybtsvYQ,1224
@@ -41,29 +41,29 @@ datamaestro_text/config/org/grouplens/movielens.py,sha256=e78JoiHuwP6gbj7Q84UwPT
 datamaestro_text/config/org/universaldependencies/french.py,sha256=W_gDtfinjlw08qohX_PWvzQlacDwRFB7PeOzO33mRVU,2208
 datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml,sha256=IBy82CDNNLjJPNPzues1EgDXu0A5WDvUFeVNSOyrIpI,1137
 datamaestro_text/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamaestro_text/data/embeddings.py,sha256=
-datamaestro_text/data/recommendation.py,sha256=
+datamaestro_text/data/embeddings.py,sha256=qdeZ4m6drFNkqW_4pKxJKdJHtOnDRs0P7QVJ7AO6xFE,1127
+datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG3IHkkPJX9Y,279
 datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
-datamaestro_text/data/text.py,sha256=
+datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
 datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
-datamaestro_text/data/conversation/base.py,sha256=
+datamaestro_text/data/conversation/base.py,sha256=zXX5sPMoVtDf4WTkLx45IvZz6lgKPgcUZFu6N4lVBlc,6457
 datamaestro_text/data/conversation/canard.py,sha256=IMxu5NfytWZLZ_cyT8UFOICbDE82HLJoNBT_6j36Faw,1808
 datamaestro_text/data/conversation/orconvqa.py,sha256=VG3GV5_IPQcvla9rrQPypDNcZYmKNMgozmr2oudeLA4,3802
 datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
 datamaestro_text/data/ir/__init__.py,sha256=EOVnRv9oKEEucMOgrFjhXxeUWEkaJkbXBndpkKhZaiY,8701
 datamaestro_text/data/ir/base.py,sha256=Cw8egjChpx4ksUwp-vTA70B2OWxROH7FIeJylsXggAk,1429
-datamaestro_text/data/ir/cord19.py,sha256=
+datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
 datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
 datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
-datamaestro_text/data/ir/formats.py,sha256=
+datamaestro_text/data/ir/formats.py,sha256=rU9uJhdFDdlnQ2qfFowK_--1WVOH1fVgfXWOPPDF_FY,3160
 datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
-datamaestro_text/data/ir/stores.py,sha256=
+datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
 datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
 datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
 datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
-datamaestro_text/datasets/irds/data.py,sha256=
+datamaestro_text/datasets/irds/data.py,sha256=BTsFZdjFJ0XdlYRh2rcshAoeknHrypf7be2BGUbWWFk,19931
 datamaestro_text/datasets/irds/datasets.py,sha256=qtN-nX2_OV9FD339aZjwVL1jFf6I4T6MbNCcuRAjmtU,5682
-datamaestro_text/datasets/irds/helpers.py,sha256=
+datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
 datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
 datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
 datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
@@ -78,9 +78,9 @@ datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1v
 datamaestro_text/utils/iter.py,sha256=2_UZ8y9Ma4k5U9ZD4w55Zfb6NGrKM1L4G40OygRm1is,2459
 datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
 datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
-datamaestro_text-
-datamaestro_text-
-datamaestro_text-
-datamaestro_text-
-datamaestro_text-
-datamaestro_text-
+datamaestro_text-2025.4.3.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+datamaestro_text-2025.4.3.dist-info/METADATA,sha256=M1XG19GB1RLCTJ0xICe47LYDjHzLGFPUvHXg9-bmZZM,1631
+datamaestro_text-2025.4.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+datamaestro_text-2025.4.3.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
+datamaestro_text-2025.4.3.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
+datamaestro_text-2025.4.3.dist-info/RECORD,,
{datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.4.3.dist-info}/entry_points.txt
RENAMED
File without changes
{datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.4.3.dist-info/licenses}/LICENSE
RENAMED
File without changes
{datamaestro_text-2024.5.31.dist-info → datamaestro_text-2025.4.3.dist-info}/top_level.txt
RENAMED
File without changes