datamaestro-text 2026.2.2-py3-none-any.whl → 2026.2.3-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry; it is provided for informational purposes only.
- datamaestro_text/config/com/github/aagohary/canard.py +27 -24
- datamaestro_text/config/com/github/apple/ml-qrecc.py +30 -25
- datamaestro_text/config/com/github/ikat.py +76 -61
- datamaestro_text/config/com/github/prdwb/orconvqa.py +41 -37
- datamaestro_text/config/com/microsoft/msmarco/passage.py +278 -207
- datamaestro_text/config/com/oscar-corpus.py +13 -10
- datamaestro_text/config/com/sentiment140.py +17 -12
- datamaestro_text/config/com/smashwords/bookcorpus.py +13 -10
- datamaestro_text/config/edu/stanford/aclimdb.py +14 -9
- datamaestro_text/config/edu/stanford/glove.py +66 -31
- datamaestro_text/config/edu/upenn/ldc/aquaint.py +35 -17
- datamaestro_text/config/fr/granddebat.py +57 -48
- datamaestro_text/config/gov/nist/ir/covid.py +61 -50
- datamaestro_text/config/gov/nist/trec/adhoc.py +395 -255
- datamaestro_text/config/gov/nist/trec/tipster.py +170 -64
- datamaestro_text/config/io/github/thunlp/fewrel.py +20 -15
- datamaestro_text/config/io/metamind/research/wikitext.py +50 -32
- datamaestro_text/config/org/grouplens/movielens.py +28 -37
- datamaestro_text/config/org/universaldependencies/french.py +16 -11
- datamaestro_text/test/test_documented.py +2 -2
- datamaestro_text/version.py +2 -2
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/METADATA +2 -2
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/RECORD +26 -26
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/WHEEL +0 -0
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2026.2.2.dist-info → datamaestro_text-2026.2.3.dist-info}/licenses/LICENSE +0 -0
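Every configuration module below follows the same migration: downloads that 2026.2.2 declared as module-level decorators feeding a plain function are, in 2026.2.3, declared inside a class deriving from Dataset, with each download held as a class attribute and the typed configuration built by a config() method. A condensed sketch of the two styles, using the sentiment140 module from the first hunk below as the model (abridged for illustration, not the complete file):

# 2026.2.2 style: downloads were decorators and the function received
# the downloaded paths as arguments:
#
#     @zipdownloader("dir", URL, checker=HashCheck(MD5))
#     @dataset(...)
#     def english(dir): ...
#
# 2026.2.3 style: the dataset is a Dataset subclass; downloads are class
# attributes and config() assembles the configuration objects.
from datamaestro.data.csv import Generic
from datamaestro.data.ml import Supervised
from datamaestro.definitions import Dataset, dataset
from datamaestro.download.archive import ZipDownloader
from datamaestro.utils import HashCheck


@dataset(url="http://help.sentiment140.com/for-students/", size="228M")
class English(Dataset):
    DIR = ZipDownloader(
        "dir",
        "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip",
        checker=HashCheck("1647eb110dd2492512e27b9a70d5d1bc"),
    )

    def config(self) -> Supervised:
        # The .C(...) helpers build configuration instances from keyword arguments
        return Supervised.C(
            train=Generic.C(
                path=self.DIR.path / "training.1600000.processed.noemoticon.csv"
            ),
            test=Generic.C(path=self.DIR.path / "testdata.manual.2009.06.14.csv"),
        )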
datamaestro_text/config/com/sentiment140.py

@@ -1,19 +1,14 @@
 from datamaestro.data.csv import Generic
-from datamaestro.definitions import datatasks, datatags, dataset
-from datamaestro.download.archive import zipdownloader
+from datamaestro.definitions import Dataset, datatasks, datatags, dataset
+from datamaestro.download.archive import ZipDownloader
 from datamaestro.data.ml import Supervised
 from datamaestro.utils import HashCheck
 
 
-@zipdownloader(
-    "dir",
-    "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip",
-    checker=HashCheck("1647eb110dd2492512e27b9a70d5d1bc"),
-)
 @datatags("english", "sentiment", "text")
 @datatasks("sentiment analysis")
-@dataset(…
-def english(dir):
+@dataset(url="http://help.sentiment140.com/for-students/", size="228M")
+class English(Dataset):
     """Sentiment analysis dataset 140
 
     The data is a CSV with emoticons removed. Data file format has 6 fields:
@@ -26,7 +21,17 @@ def english(dir):
 
     If you use this data, please cite Sentiment140 as your source.
     """
-…
-…
-…
+
+    DIR = ZipDownloader(
+        "dir",
+        "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip",
+        checker=HashCheck("1647eb110dd2492512e27b9a70d5d1bc"),
     )
+
+    def config(self) -> Supervised:
+        return Supervised.C(
+            train=Generic.C(
+                path=self.DIR.path / "training.1600000.processed.noemoticon.csv"
+            ),
+            test=Generic.C(path=self.DIR.path / "testdata.manual.2009.06.14.csv"),
+        )
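For consumers the rewrite is transparent when datasets are loaded by identifier. A minimal usage sketch, assuming the identifier derived from the module path and the former function name (an assumption, to be checked with the datamaestro search command):

from datamaestro import prepare_dataset

# "com.sentiment140.english" is an assumed identifier (module path plus the
# former function name); verify it with `datamaestro search`.
ds = prepare_dataset("com.sentiment140.english")
print(ds.train.path)  # training.1600000.processed.noemoticon.csv
print(ds.test.path)   # testdata.manual.2009.06.14.csv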
datamaestro_text/config/com/smashwords/bookcorpus.py

@@ -1,23 +1,26 @@
 # See documentation on https://datamaestro.readthedocs.io
 
-from datamaestro.definitions import datatasks, datatags, dataset
+from datamaestro.definitions import Dataset, datatasks, datatags, dataset
 from datamaestro_text.data.text import TextFolder
-from datamaestro.download.archive import tardownloader
+from datamaestro.download.archive import TarDownloader
 from datamaestro.utils import HashCheck
 
 
 @datatags("text", "books", "English")
 @datatasks("language modeling")
-@tardownloader(
-    "folder",
-    "https://storage.googleapis.com/huggingface-nlp/datasets/bookcorpus/bookcorpus.tar.bz2",
-    checker=HashCheck("5c906ede3c5265f8934b62c275a754bc"),
-)
-@dataset(TextFolder, id="", url="https://yknzhu.wixsite.com/mbweb", size="4.3G")
-def main(folder):
+@dataset(id="", url="https://yknzhu.wixsite.com/mbweb", size="4.3G")
+class Main(Dataset):
     """Unpublished books from Smashwords
 
     The books are concatened in two files hosted on huggingface NLP storage.
     Each sentence is on a separate line and tokens are space separated.
     """
-…
+
+    FOLDER = TarDownloader(
+        "folder",
+        "https://storage.googleapis.com/huggingface-nlp/datasets/bookcorpus/bookcorpus.tar.bz2",
+        checker=HashCheck("5c906ede3c5265f8934b62c275a754bc"),
+    )
+
+    def config(self) -> TextFolder:
+        return TextFolder.C(path=self.FOLDER.path)
datamaestro_text/config/edu/stanford/aclimdb.py

@@ -1,16 +1,21 @@
 from datamaestro.data.ml import FolderBased, Supervised
-from datamaestro.definitions import dataset
-from datamaestro.download.archive import tardownloader
+from datamaestro.definitions import Dataset, dataset
+from datamaestro.download.archive import TarDownloader
 
 
-@…
-…
-def aclimdb(data):
+@dataset(url="http://ai.stanford.edu/~amaas/data/sentiment/", id="")
+class Aclimdb(Dataset):
     """Large Movie Review Dataset
 
     Paper http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
     """
-…
-…
-    "…
-…
+
+    DATA = TarDownloader(
+        "data", "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
+    )
+
+    def config(self) -> Supervised:
+        return Supervised.C(
+            train=FolderBased.C(path=self.DATA.path / "train", classes=["neg", "pos"]),
+            test=FolderBased.C(path=self.DATA.path / "test", classes=["neg", "pos"]),
+        )
datamaestro_text/config/edu/stanford/glove.py

@@ -4,10 +4,10 @@ GloVe is an unsupervised learning algorithm for obtaining vector representations
 and the resulting representations showcase interesting linear substructures of the word vector space.
 """
 
-from datamaestro.definitions import dataset
+from datamaestro.definitions import Dataset, dataset
 from datamaestro.download import reference
-from datamaestro.download.archive import zipdownloader
-from datamaestro.download.single import filedownloader
+from datamaestro.download.archive import ZipDownloader
+from datamaestro.download.single import FileDownloader
 from datamaestro_text.data.embeddings import WordEmbeddingsText
 
 
@@ -16,65 +16,100 @@ from datamaestro_text.data.embeddings import WordEmbeddingsText
 # tokens: 6G
 # vocabulary: 400K
 # cased: false
-@…
-…
-def glove_6b(embeddings):
+@dataset(id=".6b")
+class Glove6B(Dataset):
     """Embeddings for 6B words in various dimensions"""
-    return {"path": embeddings}
 
+    EMBEDDINGS = ZipDownloader(
+        "embeddings", "http://nlp.stanford.edu/data/glove.6B.zip"
+    )
 
-…
-…
-…
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(path=self.EMBEDDINGS.path)
+
+
+@dataset(id=".6b.50")
+class Glove6B50(Dataset):
     """Glove 6B - dimension 50"""
-…
+
+    DATA_6B = reference(varname="data_6b", reference=Glove6B)
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(
+            path=self.DATA_6B.prepare().path / "glove.6B.50d.txt"
+        )
 
 
-@…
-…
-def glove_6b_100(data_6b):
+@dataset(id=".6b.100")
+class Glove6B100(Dataset):
     """Glove 6B - dimension 100"""
-    return {"path": data_6b.path / "glove.6B.100d.txt"}
 
+    DATA_6B = reference(varname="data_6b", reference=Glove6B)
 
-…
-…
-…
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(
+            path=self.DATA_6B.prepare().path / "glove.6B.100d.txt"
+        )
+
+
+@dataset(id=".6b.200")
+class Glove6B200(Dataset):
     """Glove 6B - dimension 200"""
-…
+
+    DATA_6B = reference(varname="data_6b", reference=Glove6B)
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(
+            path=self.DATA_6B.prepare().path / "glove.6B.200d.txt"
+        )
 
 
 ...
 
 
-@…
-…
-def glove_6b_300(data_6b):
+@dataset(id=".6b.300")
+class Glove6B300(Dataset):
     """Glove 6B - dimension 200"""
-    return {"path": data_6b.path / "glove.6B.200d.txt"}
 
+    DATA_6B = reference(varname="data_6b", reference=Glove6B)
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(
+            path=self.DATA_6B.prepare().path / "glove.6B.200d.txt"
+        )
 
-…
-@dataset(…
+
+@dataset(id=".42b")
 # size: 2.03G
 # statistics:
 # cased: true
 # tokens: 42B
 # vocabulary: 2.2M
 # dimension: 300
-…
+class Glove42B(Dataset):
     """Glove embeddings trained on Common Crawl with 42B tokens"""
-    return {"path": embeddings}
 
+    EMBEDDINGS = FileDownloader(
+        "embeddings", "http://nlp.stanford.edu/data/glove.42B.300d.zip"
+    )
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(path=self.EMBEDDINGS.path)
 
-…
-@dataset(…
+
+@dataset(id=".840b")
 # size: 2.03G
 # statistics:
 # cased: true
 # tokens: 840G
 # vocabulary: 2.2M
 # dimension: 300
-…
+class Glove840B(Dataset):
     """Glove embeddings trained on Common Crawl with 840B tokens"""
-…
+
+    EMBEDDINGS = FileDownloader(
+        "embeddings", "http://nlp.stanford.edu/data/glove.840B.300d.zip"
+    )
+
+    def config(self) -> WordEmbeddingsText:
+        return WordEmbeddingsText.C(path=self.EMBEDDINGS.path)
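glove.py also shows how one dataset builds on another in the class-based style: the per-dimension datasets download nothing themselves, holding instead a reference to Glove6B and resolving a file inside the unzipped archive when config() runs. Condensed from the hunk above (Glove6B is the class defined earlier in the same module):

from datamaestro.definitions import Dataset, dataset
from datamaestro.download import reference
from datamaestro_text.data.embeddings import WordEmbeddingsText


@dataset(id=".6b.50")
class Glove6B50(Dataset):
    # Reuse the Glove6B download rather than fetching the archive again;
    # prepare() on the reference resolves (and, if needed, triggers) it.
    DATA_6B = reference(varname="data_6b", reference=Glove6B)

    def config(self) -> WordEmbeddingsText:
        return WordEmbeddingsText.C(
            path=self.DATA_6B.prepare().path / "glove.6B.50d.txt"
        )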
datamaestro_text/config/edu/upenn/ldc/aquaint.py

@@ -1,7 +1,7 @@
 """The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""
 
 from datamaestro.context import DatafolderPath
-from datamaestro.definitions import dataset
+from datamaestro.definitions import Dataset, dataset
 from datamaestro.download.links import links, linkfolder
 from datamaestro_text.data.ir.trec import TipsterCollection
 
@@ -9,29 +9,47 @@ from datamaestro_text.data.ir.trec import TipsterCollection
 URL = "https://catalog.ldc.upenn.edu/LDC2002T31"
 
 
-@…
-…
-def apw(documents):
+@dataset(url=URL, id=".apw")
+class Apw(Dataset):
     """Associated Press (1998-2000)"""
-    return {"path": documents}
 
+    DOCUMENTS = linkfolder(
+        "documents", [DatafolderPath("edu.upenn.ldc.aquaint", "APW")]
+    )
 
-…
-…
-…
+    def config(self) -> TipsterCollection:
+        return TipsterCollection.C(path=self.DOCUMENTS.path)
+
+
+@dataset(url=URL, id=".nyt")
+class Nyt(Dataset):
     """New York Times (1998-2000)"""
-…
+
+    DOCUMENTS = linkfolder(
+        "documents", [DatafolderPath("edu.upenn.ldc.aquaint", "NYT")]
+    )
+
+    def config(self) -> TipsterCollection:
+        return TipsterCollection.C(path=self.DOCUMENTS.path)
 
 
-@…
-…
-def xie(documents):
+@dataset(url=URL, id=".xie")
+class Xie(Dataset):
     """Xinhua News Agency newswires (1996-2000)"""
-    return {"path": documents}
 
+    DOCUMENTS = linkfolder(
+        "documents", [DatafolderPath("edu.upenn.ldc.aquaint", "XIE")]
+    )
 
-…
-…
-…
+    def config(self) -> TipsterCollection:
+        return TipsterCollection.C(path=self.DOCUMENTS.path)
+
+
+@dataset(url=URL, id="")
+class Aquaint(Dataset):
     """Aquaint documents"""
-…
+
+    DOCUMENTS = links("documents", apw=Apw, nyt=Nyt, xie=Xie)
+
+    def config(self) -> TipsterCollection:
+        return TipsterCollection.C(path=self.DOCUMENTS.path)
datamaestro_text/config/fr/granddebat.py

@@ -1,9 +1,8 @@
 # See documentation on https://datamaestro.readthedocs.io
 
-from …
-from datamaestro.definitions import datatags, dataset
+from datamaestro.definitions import Dataset, datatags, dataset
 from datamaestro_text.data.debate import GrandDebatFile
-from datamaestro.download.single import filedownloader
+from datamaestro.download.single import FileDownloader
 from datamaestro.utils import HashCheck
 from datamaestro.stream import Transform
 import io
@@ -46,18 +45,11 @@ class JsonToJsonl(Transform):
         return r_file
 
 
-@filedownloader(
-    "la_transition_ecologique_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_TRANSITION_ECOLOGIQUE.json",
-    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def transition(la_transition_ecologique_2019_03_21: Path):
+class Transition(Dataset):
     """Grand Débat National (transition écologique)
 
     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -71,21 +63,23 @@ def transition(la_transition_ecologique_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-…
+
+    FILE = FileDownloader(
+        "la_transition_ecologique_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_TRANSITION_ECOLOGIQUE.json",
+        checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)
 
 
-@filedownloader(
-    "fiscalité_et_dépenses_publiques_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json",
-    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def fiscalité(fiscalité_et_dépenses_publiques_2019_03_21: Path):
+class Fiscalité(Dataset):
     """Grand Débat National (fiscalité et dépenses publiques)
 
     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -99,21 +93,23 @@ def fiscalité(fiscalité_et_dépenses_publiques_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-…
+
+    FILE = FileDownloader(
+        "fiscalité_et_dépenses_publiques_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json",
+        checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)
 
 
-@filedownloader(
-    "democratie_et_citoyennete_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/DEMOCRATIE_ET_CITOYENNETE.json",
-    checker=HashCheck("049aaeca7e51747f2da5d68143c760fa"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def démocratie(democratie_et_citoyennete_2019_03_21: Path):
+class Démocratie(Dataset):
     """Grand Débat National (démocratie et citoyenneté)
 
     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -127,21 +123,23 @@ def démocratie(democratie_et_citoyennete_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-…
+
+    FILE = FileDownloader(
+        "democratie_et_citoyennete_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/DEMOCRATIE_ET_CITOYENNETE.json",
+        checker=HashCheck("049aaeca7e51747f2da5d68143c760fa"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)
 
 
-@filedownloader(
-    "organisation_etat_services_publics_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json",
-    checker=HashCheck("0ccb7c401889f738b73b0caab897a68b"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def organisation(organisation_etat_services_publics_2019_03_21: Path):
+class Organisation(Dataset):
     """Grand Débat National (organisation de l'État et des services publics)
 
     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -155,21 +153,23 @@ def organisation(organisation_etat_services_publics_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-…
+
+    FILE = FileDownloader(
+        "organisation_etat_services_publics_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json",
+        checker=HashCheck("0ccb7c401889f738b73b0caab897a68b"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)
 
 
-@filedownloader(
-    "les_evenements_2019_03_21.jsonl",
-    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LES_EVENEMENTS.json",
-    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
-    transforms=JsonToJsonl(),
-)
 @datatags("politics", "debate", "french")
 @dataset(
-    GrandDebatFile,
     url="https://granddebat.fr",
 )
-def evenements(les_evenements_2019_03_21: Path):
+class Evenements(Dataset):
     """Grand Débat National (événements)
 
     The *Grand Débat National* (GDN) is a country-wide citizen consultation held
@@ -183,4 +183,13 @@ def evenements(les_evenements_2019_03_21: Path):
     each concluding with a critical open-ended prompt: "Do you have anything to
     add about [theme]?".
     """
-…
+
+    FILE = FileDownloader(
+        "les_evenements_2019_03_21.jsonl",
+        "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LES_EVENEMENTS.json",
+        checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
+        transforms=JsonToJsonl(),
+    )
+
+    def config(self) -> GrandDebatFile:
+        return GrandDebatFile.C(path=self.FILE.path)
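All five Grand Débat datasets keep the same download-time conversion: each FileDownloader is given transforms=JsonToJsonl(), so the official JSON dump is stored locally as JSON Lines. A reading sketch, assuming standard JSONL output with one contribution per line (the path shown is hypothetical; the real file lives in the datamaestro data directory and is exposed through the GrandDebatFile configuration):

import json
from pathlib import Path

# Hypothetical local path for illustration only
path = Path("la_transition_ecologique_2019_03_21.jsonl")
with path.open(encoding="utf-8") as fp:
    for line in fp:
        contribution = json.loads(line)  # one JSON object per line
        # ... process the contribution dict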
datamaestro_text/config/gov/nist/ir/covid.py

@@ -1,9 +1,9 @@
 """CORD-19 dataset"""
 
 from datamaestro.annotations.agreement import useragreement
-from datamaestro.definitions import datatasks, dataset
+from datamaestro.definitions import Dataset, datatasks, dataset
 from datamaestro.download import reference
-from datamaestro.download.single import filedownloader
+from datamaestro.download.single import FileDownloader
 from datamaestro.utils import HashCheck
 from datamaestro_text.data.ir import Adhoc
 import datamaestro_text.data.ir.cord19 as d_cord19
@@ -14,19 +14,19 @@ cord19_lua = useragreement(
 
 By accessing, downloading or otherwise using any Journals, Articles, Metadata, Abstracts,
 Full-Texts or any other content types provided in the COVID-19 Open Research Dataset (CORD-19)
-Database (the …
+Database (the "Data"), You expressly acknowledge and agree to the following:
 
-…
+\u2022 AI2 grants to You a worldwide, perpetual, non-exclusive, non-transferablelicenseto use and
 make derivatives of the Datafor text and data mining only.
 
-…
+\u2022 AI2 warrants that it has the right to make the Data available to Youas provided for in and
 subject to this Agreement and in accordance with applicable law. EXCEPT FOR THE LIMITED WARRANTY
-IN THIS SECTION, THE DATA IS PROVIDED …
+IN THIS SECTION, THE DATA IS PROVIDED "AS IS", WITHOUT ANY WARRANTIES OF ANY KIND.
 
-…
-regulations with respect to AI2…
+\u2022 You agree to comply with all applicable local, state, national, and international laws and
+regulations with respect to AI2's license and Youruse of the Data.\u2022 Data provided by AI2 is
 from copyrighted sources of the respective copyright holders. You are solely responsible
-for Your and Your users…
+for Your and Your users' compliance with any copyright, patent or trademark restrictions
 and are referred to the copyright, patent or trademark notices appearing in the original
 sources, all of which are hereby incorporated by reference""",
     id="ai2.cord19",
@@ -34,62 +34,73 @@ sources, all of which are hereby incorporated by reference""",
 
 
 @cord19_lua
-@filedownloader(
-    "data.csv",
-    url="https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv",
-    checker=HashCheck("80d664e496b8b7e50a39c6f6bb92e0ef"),
-)
 @dataset(
-    d_cord19.Documents,
     url="https://ir.nist.gov/covidSubmit/index.html",
 )
-def cord19_round5_metadata(data):
+class Cord19Round5Metadata(Dataset):
     """Cord 19 metadata (round 5)
 
     Released on 2020-07-16
     """
-…
-…
-    "…
-…
-    "…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
+
+    DATA = FileDownloader(
+        "data.csv",
+        url="https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-07-16/metadata.csv",
+        checker=HashCheck("80d664e496b8b7e50a39c6f6bb92e0ef"),
+    )
+
+    def config(self) -> d_cord19.Documents:
+        return d_cord19.Documents.C(
+            path=self.DATA.path,
+            names_row=0,
+            # Number of documents
+            count=192509,
+        )
+
+
+@dataset()
+class Cord19Round5Topics(Dataset):
     """CORD-19 topics (round 5)"""
-    return {"path": data}
 
+    DATA = FileDownloader(
+        "data.xml",
+        url="https://ir.nist.gov/covidSubmit/data/topics-rnd5.xml",
+        checker=HashCheck("0307a37b6b9f1a5f233340a769d538ea"),
+    )
 
-…
-…
-…
-…
-)
-…
-def cord19_round5_assessments(data):
+    def config(self) -> d_cord19.Topics:
+        return d_cord19.Topics.C(path=self.DATA.path)
+
+
+@dataset()
+class Cord19Round5Assessments(Dataset):
     """CORD19 assessments (round 5)"""
-…
+
+    DATA = FileDownloader(
+        "data.ssv",
+        url="https://ir.nist.gov/covidSubmit/data/qrels-covid_d5_j0.5-5.txt",
+        checker=HashCheck("8138424a59daea0aba751c8a891e5f54"),
+    )
+
+    def config(self) -> TrecAdhocAssessments:
+        return TrecAdhocAssessments.C(path=self.DATA.path)
 
 
-@reference("collection", cord19_round5_metadata)
-@reference("topics", cord19_round5_topics)
-@reference("qrels", cord19_round5_assessments)
 @datatasks("information retrieval", "passage retrieval")
-@dataset(…
-…
+@dataset(url="https://ir.nist.gov/covidSubmit/data.html")
+class Cord19Round5(Dataset):
     """CORD-19 IR collection (round 5)
 
     This is the primary test collection for ad hoc retrieval that is the outcome of all five rounds of TREC-COVID. The test set, called TREC-COVID Complete, consists of the Round 5 document set (July 16 release of CORD-19); the final set of 50 topics; and the cumulative judgments from all assessing rounds with CORD-UIDs mapped to July 16 ids if necessary, previously judged documents no longer in the July 16 release removed, and the last judgments for documents judged multiple times due to significant content changes between rounds. Note that no TREC-COVID submissions correspond to this collection since all TREC-COVID submissions were subject to residual collection evaluation.
     """
-…
-…
-…
-…
-…
+
+    COLLECTION = reference(varname="collection", reference=Cord19Round5Metadata)
+    TOPICS = reference(varname="topics", reference=Cord19Round5Topics)
+    QRELS = reference(varname="qrels", reference=Cord19Round5Assessments)
+
+    def config(self) -> Adhoc:
+        return Adhoc.C(
+            documents=self.COLLECTION.prepare(),
+            topics=self.TOPICS.prepare(),
+            assessments=self.QRELS.prepare(),
+        )