datamaestro-text 2025.6.11__tar.gz → 2025.6.30__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro_text-2025.6.11/src/datamaestro_text.egg-info → datamaestro_text-2025.6.30}/PKG-INFO +1 -1
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/conversation.rst +18 -2
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +3 -3
- datamaestro_text-2025.6.30/src/datamaestro_text/config/com/github/ikat.py +38 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/base.py +4 -4
- datamaestro_text-2025.6.30/src/datamaestro_text/data/conversation/ikat.py +120 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/version.py +2 -2
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30/src/datamaestro_text.egg-info}/PKG-INFO +1 -1
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/SOURCES.txt +2 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.circleci/config.yml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.flake8 +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.github/workflows/pytest.yml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.github/workflows/python-publish.yml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.gitignore +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.pre-commit-config.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.readthedocs.yml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/LICENSE +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/MANIFEST.in +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/Makefile +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/README.md +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/Makefile +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/make.bat +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/requirements.txt +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/embeddings.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/index.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/ir.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/nlp.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/recommendation.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/text.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/conf.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/conversation.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/embeddings.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/index.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/ir.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/irds.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/recommendation.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/text.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/index.rst +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/mkdocs.yml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/pyproject.toml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/requirements-dev.txt +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/requirements.txt +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/setup.cfg +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/ai/quac.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/sentiment140.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/canard.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/embeddings.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/base.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/cord19.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/csv.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/data.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/formats.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/huggingface.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/stores.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/trec.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/utils.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/recommendation.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/tagging.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/text.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/data.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/utils.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/download/tmdb.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/interfaces/plaintext.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/interfaces/trec.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/test/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/test/test_datasets.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/test/test_documented.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/transforms/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/__init__.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/files.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/iter.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/randomstream.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/shuffle.py +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/requires.txt +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/top_level.txt +0 -0
- {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/tox.ini +0 -0
|
@@ -34,9 +34,13 @@ Contextual query reformulation
|
|
|
34
34
|
.. autoclass:: ContextualizedRewrittenQuery
|
|
35
35
|
:members:
|
|
36
36
|
|
|
37
|
+
CANARD Dataset
|
|
38
|
+
|
|
37
39
|
.. autoxpmconfig:: datamaestro_text.data.conversation.canard.CanardDataset
|
|
38
40
|
:members: iter
|
|
39
41
|
|
|
42
|
+
OrConvQA Dataset
|
|
43
|
+
|
|
40
44
|
.. autoxpmconfig:: datamaestro_text.data.conversation.orconvqa.OrConvQADataset
|
|
41
45
|
:members: iter
|
|
42
46
|
|
|
@@ -46,10 +50,22 @@ Contextual query reformulation
|
|
|
46
50
|
.. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
|
|
47
51
|
:members:
|
|
48
52
|
|
|
53
|
+
QReCC Dataset
|
|
49
54
|
|
|
50
|
-
|
|
51
|
-
.. autoclass:: datamaestro_text.data.conversation.orconvqa.QReCCDatasetEntry
|
|
55
|
+
.. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
|
|
52
56
|
:members:
|
|
53
57
|
|
|
54
58
|
.. autoxpmconfig:: datamaestro_text.data.conversation.qrecc.QReCCDataset
|
|
55
59
|
:members: iter
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
iKAT Dataset
|
|
63
|
+
|
|
64
|
+
.. autoclass:: datamaestro_text.data.conversation.ikat.IkatDatasetEntry
|
|
65
|
+
:members:
|
|
66
|
+
|
|
67
|
+
.. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationEntry
|
|
68
|
+
:members:
|
|
69
|
+
|
|
70
|
+
.. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatDataset
|
|
71
|
+
:members: iter
|
|
@@ -39,9 +39,9 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
|
|
|
39
39
|
answering that includes the individual subtasks of question rewriting,
|
|
40
40
|
passage retrieval and reading comprehension
|
|
41
41
|
"""
|
|
42
|
-
return Supervised(
|
|
43
|
-
train=QReCCDataset(path=data / "qrecc_train.json"),
|
|
44
|
-
test=QReCCDataset(path=data / "qrecc_test.json"),
|
|
42
|
+
return Supervised.C(
|
|
43
|
+
train=QReCCDataset.C(path=data / "qrecc_train.json"),
|
|
44
|
+
test=QReCCDataset.C(path=data / "qrecc_test.json"),
|
|
45
45
|
)
|
|
46
46
|
|
|
47
47
|
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
+
|
|
3
|
+
from datamaestro.definitions import datatasks, datatags, dataset
|
|
4
|
+
from datamaestro.data.ml import Supervised
|
|
5
|
+
from datamaestro.data import Base
|
|
6
|
+
|
|
7
|
+
from datamaestro.utils import HashCheck
|
|
8
|
+
from datamaestro.download.single import filedownloader
|
|
9
|
+
from datamaestro_text.data.conversation.ikat import IkatDatasetEntry, IkatDataset
|
|
10
|
+
from datamaestro_text.datasets.irds.data import (
|
|
11
|
+
SimpleJsonDocument,
|
|
12
|
+
LZ4JSONLDocumentStore,
|
|
13
|
+
)
|
|
14
|
+
import logging
|
|
15
|
+
|
|
16
|
+
@datatags("conversation", "context", "query")
|
|
17
|
+
@datatasks("query rewriting")
|
|
18
|
+
@filedownloader(
|
|
19
|
+
"test.json",
|
|
20
|
+
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
|
|
21
|
+
checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
@dataset(
|
|
25
|
+
Base,
|
|
26
|
+
url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
def main(test) -> Supervised[IkatDataset, None, IkatDataset]:
|
|
30
|
+
"""Question-in-context rewriting
|
|
31
|
+
|
|
32
|
+
iKAT is a test dataset for question-in-context rewriting that consists of
|
|
33
|
+
questions each given in a dialog context together with a context-independent
|
|
34
|
+
rewriting of the question.
|
|
35
|
+
One of the special features of iKAT is that it includes a Personal Text Knowledge Base (PTKB).
|
|
36
|
+
"""
|
|
37
|
+
logging.info("Creating iKAT dataset from %s", test)
|
|
38
|
+
return IkatDataset.C(path=test)
|
|
@@ -214,8 +214,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
214
214
|
|
|
215
215
|
def __init__(self, entry):
|
|
216
216
|
self.entry = entry
|
|
217
|
-
self.
|
|
218
|
-
self.
|
|
217
|
+
self._parent = None
|
|
218
|
+
self._children = []
|
|
219
219
|
|
|
220
220
|
def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
|
|
221
221
|
self._children.append(node)
|
|
@@ -224,10 +224,10 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
224
224
|
|
|
225
225
|
def conversation(self, skip_self: bool) -> ConversationHistory:
|
|
226
226
|
def iterator():
|
|
227
|
-
current = self.parent if skip_self else self
|
|
227
|
+
current = self.parent() if skip_self else self
|
|
228
228
|
while current is not None:
|
|
229
229
|
yield current.entry
|
|
230
|
-
current = current.parent
|
|
230
|
+
current = current.parent()
|
|
231
231
|
|
|
232
232
|
return LazyList(FactoryIterable(iterator))
|
|
233
233
|
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from typing import Iterator, List, Optional
|
|
2
|
+
from attr import define, field
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from datamaestro.data import File
|
|
6
|
+
from datamaestro.record import Record
|
|
7
|
+
|
|
8
|
+
from datamaestro_text.data.ir.base import (
|
|
9
|
+
IDItem,
|
|
10
|
+
SimpleTextItem,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from .base import (
|
|
15
|
+
AnswerDocumentURL,
|
|
16
|
+
AnswerEntry,
|
|
17
|
+
ConversationTree,
|
|
18
|
+
EntryType,
|
|
19
|
+
SimpleDecontextualizedItem,
|
|
20
|
+
SingleConversationTree,
|
|
21
|
+
)
|
|
22
|
+
from . import ConversationDataset
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@define(kw_only=True)
|
|
27
|
+
class IkatConversationEntry:
|
|
28
|
+
"""A query with past history"""
|
|
29
|
+
|
|
30
|
+
turn_id: int
|
|
31
|
+
"""Turn number in the conversation"""
|
|
32
|
+
|
|
33
|
+
user_utterance: str
|
|
34
|
+
"""The last issued query"""
|
|
35
|
+
|
|
36
|
+
resolved_utterance: str
|
|
37
|
+
"""Manually rewritten query"""
|
|
38
|
+
|
|
39
|
+
response: str
|
|
40
|
+
"""The system response to the query"""
|
|
41
|
+
|
|
42
|
+
relevant_ptkbs: List[str]
|
|
43
|
+
"""The list of relevant personal knowledge bases for the query"""
|
|
44
|
+
|
|
45
|
+
citations: List[str]
|
|
46
|
+
"""The list of citations for the response"""
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@define(kw_only=True)
|
|
50
|
+
class IkatDatasetEntry:
|
|
51
|
+
"""A query with past history"""
|
|
52
|
+
|
|
53
|
+
number: str
|
|
54
|
+
"""Conversation ID"""
|
|
55
|
+
|
|
56
|
+
title: str
|
|
57
|
+
"""Title of the conversation"""
|
|
58
|
+
|
|
59
|
+
ptkb: str
|
|
60
|
+
"""The personal knowledge base associated with the user"""
|
|
61
|
+
|
|
62
|
+
responses: List[IkatConversationEntry] = field(
|
|
63
|
+
converter=lambda items: [IkatConversationEntry(**item) if isinstance(item, dict) else item for item in items]
|
|
64
|
+
)
|
|
65
|
+
"""The list of responses to the query"""
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class IkatDataset(ConversationDataset, File):
|
|
69
|
+
|
|
70
|
+
def entries(self) -> Iterator[IkatDatasetEntry]:
|
|
71
|
+
"""Reads all conversation entries from the dataset file."""
|
|
72
|
+
with self.path.open("rt") as fp:
|
|
73
|
+
raw_data = json.load(fp)
|
|
74
|
+
|
|
75
|
+
logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
|
|
76
|
+
logging.debug(f"raw data has keys {raw_data[0].keys()}")
|
|
77
|
+
|
|
78
|
+
processed_data = []
|
|
79
|
+
for entry in raw_data:
|
|
80
|
+
processed_data.append(IkatDatasetEntry(**{key.lower(): value for key, value in entry.items()}))
|
|
81
|
+
|
|
82
|
+
logging.debug(f"First parsed data sample: {processed_data[0]}")
|
|
83
|
+
return iter(processed_data)
|
|
84
|
+
|
|
85
|
+
def __iter__(self) -> Iterator[ConversationTree]:
|
|
86
|
+
for entry in self.entries():
|
|
87
|
+
history: List[Record] = []
|
|
88
|
+
|
|
89
|
+
for turn in entry.responses:
|
|
90
|
+
turn: IkatConversationEntry = turn # Ensure type is correct
|
|
91
|
+
query_id = f"{entry.number}#{turn.turn_id}"
|
|
92
|
+
|
|
93
|
+
# USER QUERY record
|
|
94
|
+
history.append(
|
|
95
|
+
Record(
|
|
96
|
+
IDItem(query_id),
|
|
97
|
+
SimpleTextItem(turn.user_utterance),
|
|
98
|
+
SimpleDecontextualizedItem(turn.resolved_utterance),
|
|
99
|
+
EntryType.USER_QUERY,
|
|
100
|
+
)
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
# Build citation info (stubbed relevance to match format)
|
|
104
|
+
relevances = {}
|
|
105
|
+
if turn.relevant_ptkbs:
|
|
106
|
+
# Example: just use first as relevant (can be improved)
|
|
107
|
+
relevances[0] = (0, None) # No position info in this structure
|
|
108
|
+
|
|
109
|
+
# SYSTEM ANSWER record
|
|
110
|
+
history.append(
|
|
111
|
+
Record(
|
|
112
|
+
AnswerEntry(turn.response),
|
|
113
|
+
EntryType.SYSTEM_ANSWER,
|
|
114
|
+
)
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
# Ensure reverse if needed for compatibility (optional)
|
|
118
|
+
history.reverse()
|
|
119
|
+
yield SingleConversationTree(entry.number, history)
|
|
120
|
+
|
|
@@ -17,5 +17,5 @@ __version__: str
|
|
|
17
17
|
__version_tuple__: VERSION_TUPLE
|
|
18
18
|
version_tuple: VERSION_TUPLE
|
|
19
19
|
|
|
20
|
-
__version__ = version = '2025.6.11'
|
|
21
|
-
__version_tuple__ = version_tuple = (2025, 6, 11)
|
|
20
|
+
__version__ = version = '2025.6.30'
|
|
21
|
+
__version_tuple__ = version_tuple = (2025, 6, 30)
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/SOURCES.txt
RENAMED
|
@@ -46,6 +46,7 @@ src/datamaestro_text/config/ai/quac.yaml
|
|
|
46
46
|
src/datamaestro_text/config/com/oscar-corpus.py
|
|
47
47
|
src/datamaestro_text/config/com/sentiment140.py
|
|
48
48
|
src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml
|
|
49
|
+
src/datamaestro_text/config/com/github/ikat.py
|
|
49
50
|
src/datamaestro_text/config/com/github/aagohary/canard.py
|
|
50
51
|
src/datamaestro_text/config/com/github/apple/ml-qrecc.py
|
|
51
52
|
src/datamaestro_text/config/com/github/prdwb/orconvqa.py
|
|
@@ -89,6 +90,7 @@ src/datamaestro_text/data/text.py
|
|
|
89
90
|
src/datamaestro_text/data/conversation/__init__.py
|
|
90
91
|
src/datamaestro_text/data/conversation/base.py
|
|
91
92
|
src/datamaestro_text/data/conversation/canard.py
|
|
93
|
+
src/datamaestro_text/data/conversation/ikat.py
|
|
92
94
|
src/datamaestro_text/data/conversation/orconvqa.py
|
|
93
95
|
src/datamaestro_text/data/conversation/qrecc.py
|
|
94
96
|
src/datamaestro_text/data/ir/__init__.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.github/workflows/python-publish.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/recommendation.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/conversation.rst
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/embeddings.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/recommendation.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/__init__.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/ai/quac.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/embeddings.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/__init__.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/base.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/cord19.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/csv.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/data.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/formats.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/stores.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/trec.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/tagging.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/data.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/download/tmdb.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/interfaces/trec.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/test/__init__.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/test/test_datasets.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/__init__.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/files.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/iter.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/randomstream.py
RENAMED
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/shuffle.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/requires.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|