PyPI - datamaestro-text - Versions diffs - 2025.7.28__tar.gz → 2026.1.1__tar.gz - Mend

datamaestro-text 2025.7.28__tar.gz → 2026.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (129) hide show

{datamaestro_text-2025.7.28/src/datamaestro_text.egg-info → datamaestro_text-2026.1.1}/PKG-INFO RENAMED Viewed

@@ -1,33 +1,37 @@
 Metadata-Version: 2.4
 Name: datamaestro-text
-Version: 2025.7.28
+Version: 2026.1.1
 Summary: Datamaestro module for text-related datasets
+Project-URL: Homepage, https://github.com/experimaestro/datamaestro_text
+Project-URL: Documentation, https://datamaestro-text.readthedocs.io/en/latest/
+Project-URL: Repository, https://github.com/experimaestro/datamaestro_text
+Project-URL: Bug Tracker, https://github.com/experimaestro/datamaestro_text/issues
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
-License: GPL-3
-Project-URL: homepage, https://github.com/experimaestro/datamaestro_text
-Project-URL: documentation, https://datamaestro-text.readthedocs.io/en/latest/
-Project-URL: repository, https://github.com/experimaestro/datamaestro_text
-Keywords: dataset manager,information retrieval,experiments
+License: GPL-3.0-or-later
+License-File: LICENSE
+Keywords: dataset manager,experiments,information retrieval
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: datamaestro>=1.5.0
-Requires-Dist: ir_datasets>=0.5.8
 Requires-Dist: attrs
+Requires-Dist: datamaestro>=1.6.2
+Requires-Dist: experimaestro
+Requires-Dist: ir-datasets>=0.5.8
 Provides-Extra: dev
-Requires-Dist: pytest; extra == "dev"
-Requires-Dist: docutils; extra == "dev"
-Requires-Dist: sphobjinv; extra == "dev"
-Requires-Dist: flake8; extra == "dev"
-Requires-Dist: sphinx; extra == "dev"
-Dynamic: license-file
+Requires-Dist: docutils; extra == 'dev'
+Requires-Dist: flake8; extra == 'dev'
+Requires-Dist: pytest; extra == 'dev'
+Requires-Dist: sphinx<8; extra == 'dev'
+Requires-Dist: sphobjinv; extra == 'dev'
+Description-Content-Type: text/markdown
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)

datamaestro_text-2026.1.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,87 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+[project]
+name = "datamaestro-text"
+authors = [
+    {name = "Benjamin Piwowarski", email = "benjamin@piwowarski.fr"}
+]
+description = "Datamaestro module for text-related datasets"
+readme = "README.md"
+license = {text = "GPL-3.0-or-later"}
+keywords = ["dataset manager", "information retrieval", "experiments"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+requires-python = ">=3.10"
+dynamic = ["version"]
+dependencies = [
+    "datamaestro>=1.6.2",
+    "ir_datasets>=0.5.8",
+    "attrs",
+    "experimaestro",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "docutils",
+    "sphobjinv",
+    "flake8",
+    "sphinx<8",
+]
+[project.urls]
+Homepage = "https://github.com/experimaestro/datamaestro_text"
+Documentation = "https://datamaestro-text.readthedocs.io/en/latest/"
+Repository = "https://github.com/experimaestro/datamaestro_text"
+"Bug Tracker" = "https://github.com/experimaestro/datamaestro_text/issues"
+[project.entry-points."datamaestro.repositories"]
+text = "datamaestro_text:Repository"
+irds = "datamaestro_text.datasets.irds:Repository"
+[tool.hatch.version]
+source = "vcs"
+[tool.hatch.version.raw-options]
+local_scheme = "no-local-version"
+[tool.hatch.build.hooks.vcs]
+version-file = "src/datamaestro_text/version.py"
+[tool.hatch.build.targets.sdist]
+include = [
+    "/src",
+    "/README.md",
+    "/LICENSE",
+    "/pyproject.toml",
+]
+[tool.hatch.build.targets.wheel]
+packages = ["src/datamaestro_text"]
+[tool.pytest.ini_options]
+junit_family = "xunit2"
+testpaths = ["src/datamaestro_text"]
+norecursedirs = ["node_modules"]
+[dependency-groups]
+dev = [
+    "docutils>=0.21.2",
+    "flake8>=7.3.0",
+    "git-cliff>=2.11.0",
+    "pytest>=8.4.1",
+    "sphinx>=7,<8",
+    "sphobjinv>=2.3.1.3",
+]

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/github/aagohary/canard.py RENAMED Viewed

@@ -37,7 +37,7 @@ def main(train, dev, test):
     Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
     """
     return {
-        "train": CanardDataset(path=train),
-        "validation": CanardDataset(path=dev),
-        "test": CanardDataset(path=test),
+        "train": CanardDataset.C(path=train),
+        "validation": CanardDataset.C(path=dev),
+        "test": CanardDataset.C(path=test),
     }

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py RENAMED Viewed

@@ -51,6 +51,7 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
 )
 class Content(LZ4JSONLDocumentStore):
     """QReCC mentionned URLs content"""
     @staticmethod
     def __create_dataset__(dataset, options=None):
         ds = reference(reference=main).setup(dataset, options)
@@ -65,7 +66,7 @@ class Content(LZ4JSONLDocumentStore):
             "id",
         ).setup(dataset, options)
-        return Content(jsonl_path=store_path)
+        return Content.C(jsonl_path=store_path)
     @staticmethod
     def _documents(path: Path):

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py RENAMED Viewed

@@ -47,7 +47,7 @@ http://www.msmarco.org/dataset.aspx""",
 @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
 def collection_etc(data) -> Folder:
     """Documents and some more files"""
-    return Folder(path=data)
+    return Folder.C(path=data)
 @lua

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/oscar-corpus.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from datamaestro.definitions import argument, datatasks, datatags, dataset
+from datamaestro.definitions import datatasks, datatags, dataset
 from datamaestro.download.single import filedownloader
 from datamaestro_text.data.text import TextFile
 from datamaestro.utils import HashCheck

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/sentiment140.py RENAMED Viewed

@@ -27,6 +27,6 @@ def english(dir):
     If you use this data, please cite Sentiment140 as your source.
     """
     return Supervised.C(
-        train=Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
-        test=Generic(path=dir / "testdata.manual.2009.06.14.csv"),
+        train=Generic.C(path=dir / "training.1600000.processed.noemoticon.csv"),
+        test=Generic.C(path=dir / "testdata.manual.2009.06.14.csv"),
     )

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/com/smashwords/bookcorpus.py RENAMED Viewed

@@ -1,6 +1,6 @@
 # See documentation on https://datamaestro.readthedocs.io
-from datamaestro.definitions import argument, datatasks, datatags, dataset
+from datamaestro.definitions import datatasks, datatags, dataset
 from datamaestro_text.data.text import TextFolder
 from datamaestro.download.archive import tardownloader
 from datamaestro.utils import HashCheck

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/edu/stanford/aclimdb.py RENAMED Viewed

@@ -11,6 +11,6 @@ def aclimdb(data):
     Paper http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
     """
     return {
-        "train": FolderBased(path=data / "train", classes=["neg", "pos"]),
-        "test": FolderBased(path=data / "test", classes=["neg", "pos"]),
+        "train": FolderBased.C(path=data / "train", classes=["neg", "pos"]),
+        "test": FolderBased.C(path=data / "test", classes=["neg", "pos"]),
     }

datamaestro_text-2026.1.1/src/datamaestro_text/config/fr/granddebat.py ADDED Viewed

@@ -0,0 +1,186 @@
+# See documentation on https://datamaestro.readthedocs.io
+from pathlib import Path
+from datamaestro.definitions import datatags, dataset
+from datamaestro_text.data.debate import GrandDebatFile
+from datamaestro.download.single import filedownloader
+from datamaestro.utils import HashCheck
+from datamaestro.stream import Transform
+import io
+import json
+import ijson
+import os
+import threading
+class JsonToJsonl(Transform):
+    """Transforms a JSON file with an array into a JSONL file with one line per
+    array element"""
+    def __call__(self, fileobj: io.IOBase) -> io.IOBase:
+        # Stream items from the top-level array into a read-end pipe.
+        try:
+            fileobj.seek(0)
+        except Exception:
+            pass
+        r_fd, w_fd = os.pipe()
+        r_file = os.fdopen(r_fd, "rb")
+        w_file = os.fdopen(w_fd, "wb")
+        def _writer(fin, fout):
+            try:
+                for item in ijson.items(fin, "item"):
+                    line = json.dumps(item, ensure_ascii=False) + "\n"
+                    fout.write(line.encode("utf-8"))
+                fout.close()
+            except Exception:
+                try:
+                    fout.close()
+                except Exception:
+                    pass
+        t = threading.Thread(target=_writer, args=(fileobj, w_file), daemon=True)
+        t.start()
+        return r_file
+@filedownloader(
+    "la_transition_ecologique_2019_03_21.jsonl",
+    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_TRANSITION_ECOLOGIQUE.json",
+    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
+    transforms=JsonToJsonl(),
+)
+@datatags("politics", "debate", "french")
+@dataset(
+    GrandDebatFile,
+    url="https://granddebat.fr",
+)
+def transition(la_transition_ecologique_2019_03_21: Path):
+    """Grand Débat National (transition écologique)
+    The *Grand Débat National* (GDN) is a country-wide citizen consultation held
+    in France in 2019.
+    The consultation prompted citizens to express their views across four main
+    themes: *Taxation and public spending*, *Organization of the state and
+    public services*, *Democracy and citizenship*, and *Ecological transition*.
+    A significant portion of this consultation involved online questionnaires,
+    each concluding with a critical open-ended prompt: "Do you have anything to
+    add about [theme]?".
+    """
+    return GrandDebatFile.C(path=la_transition_ecologique_2019_03_21)
+@filedownloader(
+    "fiscalité_et_dépenses_publiques_2019_03_21.jsonl",
+    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json",
+    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
+    transforms=JsonToJsonl(),
+)
+@datatags("politics", "debate", "french")
+@dataset(
+    GrandDebatFile,
+    url="https://granddebat.fr",
+)
+def fiscalité(fiscalité_et_dépenses_publiques_2019_03_21: Path):
+    """Grand Débat National (fiscalité et dépenses publiques)
+    The *Grand Débat National* (GDN) is a country-wide citizen consultation held
+    in France in 2019.
+    The consultation prompted citizens to express their views across four main
+    themes: *Taxation and public spending*, *Organization of the state and
+    public services*, *Democracy and citizenship*, and *Ecological transition*.
+    A significant portion of this consultation involved online questionnaires,
+    each concluding with a critical open-ended prompt: "Do you have anything to
+    add about [theme]?".
+    """
+    return GrandDebatFile.C(path=fiscalité_et_dépenses_publiques_2019_03_21)
+@filedownloader(
+    "democratie_et_citoyennete_2019_03_21.jsonl",
+    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/DEMOCRATIE_ET_CITOYENNETE.json",
+    checker=HashCheck("049aaeca7e51747f2da5d68143c760fa"),
+    transforms=JsonToJsonl(),
+)
+@datatags("politics", "debate", "french")
+@dataset(
+    GrandDebatFile,
+    url="https://granddebat.fr",
+)
+def démocratie(democratie_et_citoyennete_2019_03_21: Path):
+    """Grand Débat National (démocratie et citoyenneté)
+    The *Grand Débat National* (GDN) is a country-wide citizen consultation held
+    in France in 2019.
+    The consultation prompted citizens to express their views across four main
+    themes: *Taxation and public spending*, *Organization of the state and
+    public services*, *Democracy and citizenship*, and *Ecological transition*.
+    A significant portion of this consultation involved online questionnaires,
+    each concluding with a critical open-ended prompt: "Do you have anything to
+    add about [theme]?".
+    """
+    return GrandDebatFile.C(path=democratie_et_citoyennete_2019_03_21)
+@filedownloader(
+    "organisation_etat_services_publics_2019_03_21.jsonl",
+    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json",
+    checker=HashCheck("0ccb7c401889f738b73b0caab897a68b"),
+    transforms=JsonToJsonl(),
+)
+@datatags("politics", "debate", "french")
+@dataset(
+    GrandDebatFile,
+    url="https://granddebat.fr",
+)
+def organisation(organisation_etat_services_publics_2019_03_21: Path):
+    """Grand Débat National (organisation de l'État et des services publics)
+    The *Grand Débat National* (GDN) is a country-wide citizen consultation held
+    in France in 2019.
+    The consultation prompted citizens to express their views across four main
+    themes: *Taxation and public spending*, *Organization of the state and
+    public services*, *Democracy and citizenship*, and *Ecological transition*.
+    A significant portion of this consultation involved online questionnaires,
+    each concluding with a critical open-ended prompt: "Do you have anything to
+    add about [theme]?".
+    """
+    return GrandDebatFile.C(path=organisation_etat_services_publics_2019_03_21)
+@filedownloader(
+    "les_evenements_2019_03_21.jsonl",
+    "http://opendata.auth-6f31f706db6f4a24b55f42a6a79c5086.storage.sbg.cloud.ovh.net/2019-03-21/LES_EVENEMENTS.json",
+    checker=HashCheck("c4ed3a8b8c43d5806d1f090e03f7aa91"),
+    transforms=JsonToJsonl(),
+)
+@datatags("politics", "debate", "french")
+@dataset(
+    GrandDebatFile,
+    url="https://granddebat.fr",
+)
+def evenements(les_evenements_2019_03_21: Path):
+    """Grand Débat National (événements)
+    The *Grand Débat National* (GDN) is a country-wide citizen consultation held
+    in France in 2019.
+    The consultation prompted citizens to express their views across four main
+    themes: *Taxation and public spending*, *Organization of the state and
+    public services*, *Democracy and citizenship*, and *Ecological transition*.
+    A significant portion of this consultation involved online questionnaires,
+    each concluding with a critical open-ended prompt: "Do you have anything to
+    add about [theme]?".
+    """
+    return GrandDebatFile.C(path=les_evenements_2019_03_21)

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/io/github/thunlp/fewrel.py RENAMED Viewed

@@ -32,4 +32,4 @@ def v1(train, validation):
     Only the train and validation dataset are available. The test set is hidden
     for the leaderboard.
     """
-    return {"train": File(path=train), "validation": File(path=validation)}
+    return {"train": File.C(path=train), "validation": File.C(path=validation)}

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/io/metamind/research/wikitext.py RENAMED Viewed

@@ -30,9 +30,9 @@ def WikiText(data, type):
     https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
     """
     return {
-        "train": File(path=data / ("wiki.train.%s" % type)),
-        "validation": File(path=data / ("wiki.valid.%s" % type)),
-        "test": File(path=data / ("wiki.test.%s" % type)),
+        "train": File.C(path=data / ("wiki.train.%s" % type)),
+        "validation": File.C(path=data / ("wiki.valid.%s" % type)),
+        "test": File.C(path=data / ("wiki.test.%s" % type)),
     }

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/org/grouplens/movielens.py RENAMED Viewed

@@ -31,10 +31,10 @@ def small(ds) -> Movielens:
     100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users (as of 9/2018)
     """
     return {
-        "ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
-        "links": csv.Generic(path=ds / "links.csv", names_row=0),
-        "movies": csv.Generic(path=ds / "movies.csv", names_row=0),
-        "tags": csv.Generic(path=ds / "tags.csv", names_row=0),
+        "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
+        "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
+        "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
+        "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
     }
@@ -46,8 +46,8 @@ def full(ds) -> Movielens:
     27,000,000 ratings and 1,100,000 tag applications applied to 58,000 movies by 280,000 users (as of 9/2018)
     """
     return {
-        "ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
-        "links": csv.Generic(path=ds / "links.csv", names_row=0),
-        "movies": csv.Generic(path=ds / "movies.csv", names_row=0),
-        "tags": csv.Generic(path=ds / "tags.csv", names_row=0),
+        "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
+        "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
+        "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
+        "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
     }

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/config/org/universaldependencies/french.py RENAMED Viewed

@@ -34,9 +34,9 @@ def gsd(ds) -> Supervised:
     is updated since 2015 independently from the previous source.
     """
     return {
-        "train": CoNLL_U(path=ds / "fr_gsd-ud-train.conllu"),
-        "test": CoNLL_U(path=ds / "fr_gsd-ud-dev.conllu"),
-        "validation": CoNLL_U(path=ds / "fr_gsd-ud-test.conllu"),
+        "train": CoNLL_U.C(path=ds / "fr_gsd-ud-train.conllu"),
+        "test": CoNLL_U.C(path=ds / "fr_gsd-ud-dev.conllu"),
+        "validation": CoNLL_U.C(path=ds / "fr_gsd-ud-test.conllu"),
     }

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/conversation/base.py RENAMED Viewed

@@ -267,7 +267,7 @@ class ConversationUserTopics(Topics):
         """Returns an iterator over topics"""
         # Extracts topics from conversations, Each user query is a topic (can perform retrieval on it)
         # TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
         records: List[TopicRecord] = []
         for conversation in self.conversations.__iter__():
             nodes = [
@@ -279,4 +279,4 @@ class ConversationUserTopics(Topics):
                 records.append(
                     node.entry.update(ConversationHistoryItem(node.history()))
                 )
-        return iter(records)
+        return iter(records)

datamaestro_text-2026.1.1/src/datamaestro_text/data/debate/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Data classes for debate datasets"""
+from .granddebat import GrandDebatEntry, GrandDebatFile, GrandDebatResponse
+__all__ = ["GrandDebatEntry", "GrandDebatFile", "GrandDebatResponse"]

datamaestro_text-2026.1.1/src/datamaestro_text/data/debate/granddebat.py ADDED Viewed

@@ -0,0 +1,68 @@
+"""Data classes for the Grand Débat National dataset"""
+import json
+from dataclasses import dataclass, field
+from typing import Iterator, List, Optional
+from datamaestro.data import File
+@dataclass
+class GrandDebatResponse:
+    """A response to a question in the Grand Débat National"""
+    question_id: str
+    question_title: str
+    value: Optional[str]
+    formatted_value: Optional[str]
+@dataclass
+class GrandDebatEntry:
+    """An entry (contribution) in the Grand Débat National dataset"""
+    id: str
+    reference: str
+    title: str
+    created_at: str
+    published_at: str
+    updated_at: Optional[str]
+    trashed: bool
+    trashed_status: Optional[str]
+    author_id: str
+    author_type: str
+    author_zip_code: str
+    responses: List[GrandDebatResponse] = field(default_factory=list)
+class GrandDebatFile(File):
+    """A Grand Débat National JSONL file with iteration support"""
+    def __iter__(self) -> Iterator[GrandDebatEntry]:
+        """Iterate over entries in the JSONL file"""
+        with self.path.open("r", encoding="utf-8") as f:
+            for line in f:
+                data = json.loads(line)
+                responses = [
+                    GrandDebatResponse(
+                        question_id=r["questionId"],
+                        question_title=r["questionTitle"],
+                        value=r.get("value"),
+                        formatted_value=r.get("formattedValue"),
+                    )
+                    for r in data.get("responses", [])
+                ]
+                yield GrandDebatEntry(
+                    id=data["id"],
+                    reference=data["reference"],
+                    title=data["title"],
+                    created_at=data["createdAt"],
+                    published_at=data["publishedAt"],
+                    updated_at=data.get("updatedAt"),
+                    trashed=data["trashed"],
+                    trashed_status=data.get("trashedStatus"),
+                    author_id=data["authorId"],
+                    author_type=data["authorType"],
+                    author_zip_code=data["authorZipCode"],
+                    responses=responses,
+                )

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/__init__.py RENAMED Viewed

@@ -1,6 +1,7 @@
 """Generic data types for information retrieval"""
 from abc import ABC, abstractmethod
+from enum import Enum
 from functools import cached_property
 import logging
 from pathlib import Path
@@ -88,6 +89,19 @@ class Documents(Base):
         ...
+class FileAccess(Enum):
+    """Defines how to access files (e.g. for document stores)"""
+    FILE = 0
+    """Direct file access"""
+    MMAP = 1
+    """Use mmap"""
+    MEMORY = 2
+    """Use memory"""
 class DocumentStore(Documents):
     """A document store
@@ -97,6 +111,10 @@ class DocumentStore(Documents):
     - return the number of documents
     """
+    file_access: Meta[FileAccess] = FileAccess.MMAP
+    """How to access the file collection (might not have any impact, depends on
+    the docstore)"""
     def docid_internal2external(self, docid: int):
         """Converts an internal collection ID (integer) to an external ID"""
         raise NotImplementedError(f"For class {self.__class__}")
@@ -327,5 +345,4 @@ class PairwiseSampleDataset(Base, ABC):
     """Datasets where each record is a query with positive and negative samples"""
     @abstractmethod
-    def iter(self) -> Iterator[PairwiseSample]:
-        ...
+    def iter(self) -> Iterator[PairwiseSample]: ...

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/csv.py RENAMED Viewed

@@ -1,27 +1,26 @@
 from functools import cached_property
 from pathlib import Path
-from typing import Iterator, Tuple, Type
-from experimaestro import Param, Option, Constant, Meta
-from datamaestro.definitions import argument
+from experimaestro import Param, Meta
 from datamaestro.record import Record, RecordType
 import datamaestro_text.data.ir as ir
 from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
 from datamaestro_text.interfaces.plaintext import read_tsv
-@argument("path", type=Path)
-@argument("separator", type=str, default="\t", ignored=True)
 class AdhocRunWithText(ir.AdhocRun):
     "(qid, doc.id, query, passage)"
-    pass
+    path: Meta[Path]
+    separator: Meta[str] = "\t"
-@argument("path", type=Path)
-@argument("separator", type=str, default="\t", ignored=True)
 class Topics(ir.Topics):
     "Pairs of query id - query using a separator"
+    path: Meta[Path]
+    separator: Meta[str] = "\t"
     def iter(self):
         return (
             Record(IDItem(qid), SimpleTextItem(title))

{datamaestro_text-2025.7.28 → datamaestro_text-2026.1.1}/src/datamaestro_text/data/ir/formats.py RENAMED Viewed

@@ -10,10 +10,8 @@ from ir_datasets.datasets.cord19 import Cord19FullTextSection
 @define
 class DocumentWithTitle(TextItem):
     """Web document with title and body"""
-    body: str
     title: str
+    body: str
     @cached_property
     def text(self):

datamaestro-text 2025.7.28__tar.gz → 2026.1.1__tar.gz

datamaestro-text 2025.7.28__tar.gz → 2026.1.1__tar.gz