PyPI - datamaestro-text - Versions diffs - 2025.6.30__tar.gz → 2025.9.11__tar.gz - Mend

datamaestro-text 2025.6.30__tar.gz → 2025.9.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (129) hide show

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.github/workflows/pytest.yml RENAMED Viewed

@@ -25,7 +25,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip setuptools
         SETUPTOOLS_SCM_PRETEND_VERSION="0.1-dev" pip install -e '.[dev]'
     - name: Lint with flake8
       run: |

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.readthedocs.yml RENAMED Viewed

@@ -11,7 +11,7 @@ sphinx:
 build:
   os: ubuntu-20.04
   tools:
-    python: "3.9"
+    python: "3.10"
 # Install the package

{datamaestro_text-2025.6.30/src/datamaestro_text.egg-info → datamaestro_text-2025.9.11}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datamaestro-text
-Version: 2025.6.30
+Version: 2025.9.11
 Summary: Datamaestro module for text-related datasets
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
 License: GPL-3
@@ -15,10 +15,10 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Requires-Python: >=3.8
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: datamaestro>=1.4.2
+Requires-Dist: datamaestro>=1.5.0
 Requires-Dist: ir_datasets>=0.5.8
 Requires-Dist: attrs
 Provides-Extra: dev

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/conversation.rst RENAMED Viewed

@@ -26,6 +26,12 @@ Data classes
 .. autoclass:: ConversationTopic
+Conversational IR
+-----------------
+.. autoxpmconfig:: datamaestro_text.data.conversation.base.ConversationUserTopics
 Contextual query reformulation
 ------------------------------
@@ -34,7 +40,7 @@ Contextual query reformulation
 .. autoclass:: ContextualizedRewrittenQuery
     :members:
-CANARD Dataset
+CANARD Dataset
 .. autoxpmconfig:: datamaestro_text.data.conversation.canard.CanardDataset
     :members: iter
@@ -50,7 +56,7 @@ OrConvQA Dataset
 .. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
     :members:
-QReCC Dataset
+QReCC Dataset
 .. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
     :members:
@@ -61,11 +67,11 @@ QReCC Dataset
 iKAT Dataset
-.. autoclass:: datamaestro_text.data.conversation.ikat.IkatDatasetEntry
+.. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationTopic
     :members:
 .. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationEntry
     :members:
-.. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatDataset
+.. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatConversations
     :members: iter

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/ir.rst RENAMED Viewed

@@ -36,11 +36,17 @@ Documents
 .. autoxpmconfig:: datamaestro_text.data.ir.Documents
     :members: iter_documents, iter_ids, documentcount
+.. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
+.. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
+Dataset-specific documents
+**************************
 .. autoxpmconfig:: datamaestro_text.data.ir.cord19.Documents
 .. autoxpmconfig:: datamaestro_text.data.ir.trec.TipsterCollection
-.. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
 .. autoxpmconfig:: datamaestro_text.data.ir.stores.OrConvQADocumentStore
-.. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
+.. autoxpmconfig:: datamaestro_text.data.ir.stores.IKatClueWeb22DocumentStore
 Assessments
 -----------

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "datamaestro-text"
-requires-python = ">=3.8"
+requires-python = ">=3.10"
 keywords = ["dataset manager", "information retrieval", "experiments"]
 description = "Datamaestro module for text-related datasets"
 dynamic = ["version", "readme", "dependencies"]
@@ -44,6 +44,14 @@ fallback_version = "0.0.0-dev"
 [build-system]
 requires = ["setuptools", "setuptools-scm", "wheel"]  # PEP 508 specifications.
+[dependency-groups]
+dev = [
+    "docutils>=0.21.2",
+    "pytest>=8.4.1",
+    "sphinx>=8.1.3",
+    "sphobjinv>=2.3.1.3",
+]
 [project.entry-points."datamaestro.repositories"]
 text = "datamaestro_text:Repository"
 irds = "datamaestro_text.datasets.irds:Repository"

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/requirements.txt RENAMED Viewed

@@ -1,3 +1,3 @@
-datamaestro>=1.4.2
+datamaestro>=1.5.0
 ir_datasets>=0.5.8
 attrs

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/aagohary/canard.py RENAMED Viewed

@@ -37,7 +37,7 @@ def main(train, dev, test):
     Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
     """
     return {
-        "train": CanardDataset(path=train),
-        "validation": CanardDataset(path=dev),
-        "test": CanardDataset(path=test),
+        "train": CanardDataset.C(path=train),
+        "validation": CanardDataset.C(path=dev),
+        "test": CanardDataset.C(path=test),
     }

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py RENAMED Viewed

@@ -51,6 +51,7 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
 )
 class Content(LZ4JSONLDocumentStore):
     """QReCC mentionned URLs content"""
     @staticmethod
     def __create_dataset__(dataset, options=None):
         ds = reference(reference=main).setup(dataset, options)
@@ -65,7 +66,7 @@ class Content(LZ4JSONLDocumentStore):
             "id",
         ).setup(dataset, options)
-        return Content(jsonl_path=store_path)
+        return Content.C(jsonl_path=store_path)
     @staticmethod
     def _documents(path: Path):

datamaestro_text-2025.9.11/src/datamaestro_text/config/com/github/ikat.py ADDED Viewed

@@ -0,0 +1,121 @@
+# See documentation on https://datamaestro.readthedocs.io
+import bz2
+from datamaestro.download import reference
+from datamaestro.definitions import datatasks, datatags, dataset
+from datamaestro_text.data.conversation.base import ConversationUserTopics
+from datamaestro_text.data.ir import Adhoc
+from datamaestro.utils import HashCheck
+from datamaestro.context import DatafolderPath
+from datamaestro.download.single import filedownloader
+from datamaestro_text.data.conversation.ikat import IkatConversations
+from datamaestro.download.links import linkfolder
+from datamaestro_text.data.ir.stores import IKatClueWeb22DocumentStore
+from datamaestro_text.data.ir.trec import TrecAdhocAssessments
+from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
+@dataset(as_prepare=True)
+def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
+    # Number of documents in the dataset
+    count = 116_838_987
+    jsonl_folder = linkfolder(
+        "documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
+    ).setup(dataset, options)
+    store_path = lz4docstore_builder(
+        "store",
+        IKatClueWeb22DocumentStore.generator(
+            jsonl_folder,
+            jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
+            jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
+        ),
+        IKatClueWeb22DocumentStore.Document,
+        "id",
+        count_hint=count,
+    ).setup(dataset, options)
+    return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
+    checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
+)
+@dataset(
+    id="2025",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
+)
+def test_2025(topics, documents) -> Adhoc.C:
+    """Question-in-context rewriting
+    iKAT is a test dataset for question-in-context rewriting that consists of
+    questions each given in a dialog context together with a context-independent
+    rewriting of the question.
+    """
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        # TODO: add when available
+        assessments=TrecAdhocAssessments.C(path="/to/do"),
+        documents=documents,
+    )
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "qrels",
+    "https://trec.nist.gov/data/ikat/2024-qrels.txt",
+    checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
+)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
+    checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
+)
+@dataset(
+    Adhoc,
+    id="2024",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
+)
+def test_2024(topics, qrels, documents) -> Adhoc.C:
+    """iKAT 2024 dataset"""
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        assessments=TrecAdhocAssessments.C(path=qrels),
+        documents=documents,
+    )
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "qrels",
+    "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
+    checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
+)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
+    checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
+)
+@dataset(
+    Adhoc,
+    id="2023",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
+)
+def test_2023(topics, qrels, documents) -> Adhoc.C:
+    """iKAT 2023 dataset"""
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        assessments=TrecAdhocAssessments.C(path=qrels),
+        documents=documents,
+    )

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py RENAMED Viewed

@@ -47,7 +47,7 @@ http://www.msmarco.org/dataset.aspx""",
 @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
 def collection_etc(data) -> Folder:
     """Documents and some more files"""
-    return Folder(path=data)
+    return Folder.C(path=data)
 @lua

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/sentiment140.py RENAMED Viewed

@@ -26,7 +26,7 @@ def english(dir):
     If you use this data, please cite Sentiment140 as your source.
     """
-    return {
-        "train": Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
-        "test": Generic(path=dir / "testdata.manual.2009.06.14.csv"),
-    }
+    return Supervised.C(
+        train=Generic.C(path=dir / "training.1600000.processed.noemoticon.csv"),
+        test=Generic.C(path=dir / "testdata.manual.2009.06.14.csv"),
+    )

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/aclimdb.py RENAMED Viewed

@@ -11,6 +11,6 @@ def aclimdb(data):
     Paper http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
     """
     return {
-        "train": FolderBased(path=data / "train", classes=["neg", "pos"]),
-        "test": FolderBased(path=data / "test", classes=["neg", "pos"]),
+        "train": FolderBased.C(path=data / "train", classes=["neg", "pos"]),
+        "test": FolderBased.C(path=data / "test", classes=["neg", "pos"]),
     }

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/tipster.py RENAMED Viewed

@@ -17,7 +17,6 @@ of three CD-ROMs of SGML encoded documents distributed by LDC plus queries and a
 See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data/intro_eng.html
 """
-from datamaestro.data import Base
 from datamaestro_text.data.ir.trec import TipsterCollection
 from datamaestro.download.links import linkfolder
 from datamaestro.definitions import (

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/github/thunlp/fewrel.py RENAMED Viewed

@@ -32,4 +32,4 @@ def v1(train, validation):
     Only the train and validation dataset are available. The test set is hidden
     for the leaderboard.
     """
-    return {"train": File(path=train), "validation": File(path=validation)}
+    return {"train": File.C(path=train), "validation": File.C(path=validation)}

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/metamind/research/wikitext.py RENAMED Viewed

@@ -30,9 +30,9 @@ def WikiText(data, type):
     https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
     """
     return {
-        "train": File(path=data / ("wiki.train.%s" % type)),
-        "validation": File(path=data / ("wiki.valid.%s" % type)),
-        "test": File(path=data / ("wiki.test.%s" % type)),
+        "train": File.C(path=data / ("wiki.train.%s" % type)),
+        "validation": File.C(path=data / ("wiki.valid.%s" % type)),
+        "test": File.C(path=data / ("wiki.test.%s" % type)),
     }

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/grouplens/movielens.py RENAMED Viewed

@@ -31,10 +31,10 @@ def small(ds) -> Movielens:
     100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users (as of 9/2018)
     """
     return {
-        "ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
-        "links": csv.Generic(path=ds / "links.csv", names_row=0),
-        "movies": csv.Generic(path=ds / "movies.csv", names_row=0),
-        "tags": csv.Generic(path=ds / "tags.csv", names_row=0),
+        "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
+        "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
+        "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
+        "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
     }
@@ -46,8 +46,8 @@ def full(ds) -> Movielens:
     27,000,000 ratings and 1,100,000 tag applications applied to 58,000 movies by 280,000 users (as of 9/2018)
     """
     return {
-        "ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
-        "links": csv.Generic(path=ds / "links.csv", names_row=0),
-        "movies": csv.Generic(path=ds / "movies.csv", names_row=0),
-        "tags": csv.Generic(path=ds / "tags.csv", names_row=0),
+        "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
+        "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
+        "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
+        "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
     }

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/universaldependencies/french.py RENAMED Viewed

@@ -34,9 +34,9 @@ def gsd(ds) -> Supervised:
     is updated since 2015 independently from the previous source.
     """
     return {
-        "train": CoNLL_U(path=ds / "fr_gsd-ud-train.conllu"),
-        "test": CoNLL_U(path=ds / "fr_gsd-ud-dev.conllu"),
-        "validation": CoNLL_U(path=ds / "fr_gsd-ud-test.conllu"),
+        "train": CoNLL_U.C(path=ds / "fr_gsd-ud-train.conllu"),
+        "test": CoNLL_U.C(path=ds / "fr_gsd-ud-dev.conllu"),
+        "validation": CoNLL_U.C(path=ds / "fr_gsd-ud-test.conllu"),
     }

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/base.py RENAMED Viewed

@@ -1,10 +1,13 @@
 from abc import ABC, abstractmethod
 from enum import Enum
+from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
+from experimaestro import Param
 from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
 from attr import define
+from datamaestro.record import record_type
 from datamaestro.data import Base
 from datamaestro.record import Record, Item
-from datamaestro_text.data.ir import TopicRecord
+from datamaestro_text.data.ir import TopicRecord, Topics
 from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView
 # ---- Basic types
@@ -120,20 +123,17 @@ class ConversationNode:
         ...
     @abstractmethod
-    def parent(self) -> Optional["ConversationNode"]:
-        ...
+    def parent(self) -> Optional["ConversationNode"]: ...
     @abstractmethod
-    def children(self) -> List["ConversationNode"]:
-        ...
+    def children(self) -> List["ConversationNode"]: ...
 class ConversationTree(ABC):
     """Represents a conversation tree"""
     @abstractmethod
-    def root(self) -> ConversationNode:
-        ...
+    def root(self) -> ConversationNode: ...
     @abstractmethod
     def __iter__(self) -> Iterator[ConversationNode]:
@@ -253,5 +253,30 @@ class ConversationDataset(Base, ABC):
     @abstractmethod
     def __iter__(self) -> Iterator[ConversationTree]:
         """Return an iterator over conversations"""
-        for i in range(len(self)):
-            yield self.get(i)
+        ...
+class ConversationUserTopics(Topics):
+    """Extract user topics from conversations"""
+    conversations: Param[ConversationDataset]
+    topic_recordtype = record_type(IDItem, SimpleTextItem)
+    def iter(self) -> Iterator[TopicRecord]:
+        """Returns an iterator over topics"""
+        # Extracts topics from conversations, Each user query is a topic (can perform retrieval on it)
+        # TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
+        records: List[TopicRecord] = []
+        for conversation in self.conversations.__iter__():
+            nodes = [
+                node
+                for node in conversation
+                if node.entry[EntryType] == EntryType.USER_QUERY
+            ]
+            for node in nodes:
+                records.append(
+                    node.entry.update(ConversationHistoryItem(node.history()))
+                )
+        return iter(records)

{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/ikat.py RENAMED Viewed

@@ -1,10 +1,11 @@
-from typing import Iterator, List, Optional
+from typing import Iterator, List
 from attr import define, field
 import json
 import logging
 from datamaestro.data import File
 from datamaestro.record import Record
+from datamaestro_text.data.ir import Topics
 from datamaestro_text.data.ir.base import (
     IDItem,
     SimpleTextItem,
@@ -12,7 +13,6 @@ from datamaestro_text.data.ir.base import (
 from .base import (
-    AnswerDocumentURL,
     AnswerEntry,
     ConversationTree,
     EntryType,
@@ -21,6 +21,25 @@ from .base import (
 )
 from . import ConversationDataset
+# Keys to change in the dataset entries for compatibility across different years
+KEY_MAPPINGS = {
+    # Keys to replace: Target Key
+    "turns": "responses",
+    "utterance": "user_utterance",
+    "ptkb_provenance": "relevant_ptkbs",
+    "response_provenance": "citations",
+}
+def norm_dict(entry: dict) -> dict:
+    """Convert keys in the entry to match the expected format."""
+    normalized = {}
+    for k, v in entry.items():
+        # Check for direct mapping, then try lowercase mapping
+        new_key = KEY_MAPPINGS.get(k) or KEY_MAPPINGS.get(k.lower()) or k.lower()
+        normalized[new_key] = v
+    return normalized
 @define(kw_only=True)
@@ -47,7 +66,7 @@ class IkatConversationEntry:
 @define(kw_only=True)
-class IkatDatasetEntry:
+class IkatConversationTopic:
     """A query with past history"""
     number: str
@@ -60,14 +79,20 @@ class IkatDatasetEntry:
     """The personal knowledge base associated with the user"""
     responses: List[IkatConversationEntry] = field(
-        converter=lambda items: [IkatConversationEntry(**item) if isinstance(item, dict) else item for item in items]
+        converter=lambda items: [
+            IkatConversationEntry(**item) if isinstance(item, dict) else item
+            for item in map(norm_dict, items)
+        ]
     )
     """The list of responses to the query"""
-class IkatDataset(ConversationDataset, File):
+class IkatConversations(ConversationDataset, File):
+    """A dataset containing conversations from the IKAT project"""
-    def entries(self) -> Iterator[IkatDatasetEntry]:
+    """Keys to change in the dataset entries for compatibility across different years"""
+    def entries(self) -> Iterator[IkatConversationTopic]:
         """Reads all conversation entries from the dataset file."""
         with self.path.open("rt") as fp:
             raw_data = json.load(fp)
@@ -75,12 +100,13 @@ class IkatDataset(ConversationDataset, File):
         logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
         logging.debug(f"raw data has keys {raw_data[0].keys()}")
-        processed_data = []
         for entry in raw_data:
-            processed_data.append(IkatDatasetEntry(**{key.lower(): value for key, value in entry.items()}))
-        logging.debug(f"First parsed data sample: {processed_data[0]}")
-        return iter(processed_data)
+            try:
+                normalized_entry = norm_dict(entry)
+                yield IkatConversationTopic(**normalized_entry)
+            except Exception as e:
+                logging.warning(f"Failed to parse entry: {e}")
+                raise e
     def __iter__(self) -> Iterator[ConversationTree]:
         for entry in self.entries():
@@ -88,7 +114,7 @@ class IkatDataset(ConversationDataset, File):
             for turn in entry.responses:
                 turn: IkatConversationEntry = turn  # Ensure type is correct
-                query_id = f"{entry.number}#{turn.turn_id}"
+                query_id = f"{entry.number}_{turn.turn_id}"
                 # USER QUERY record
                 history.append(
@@ -117,4 +143,3 @@ class IkatDataset(ConversationDataset, File):
             # Ensure reverse if needed for compatibility (optional)
             history.reverse()
             yield SingleConversationTree(entry.number, history)

datamaestro-text 2025.6.30__tar.gz → 2025.9.11__tar.gz

datamaestro-text 2025.6.30__tar.gz → 2025.9.11__tar.gz