datamaestro-text 2025.6.30__tar.gz → 2025.9.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.github/workflows/pytest.yml +1 -1
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.readthedocs.yml +1 -1
- {datamaestro_text-2025.6.30/src/datamaestro_text.egg-info → datamaestro_text-2025.9.11}/PKG-INFO +3 -3
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/conversation.rst +10 -4
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/ir.rst +8 -2
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/pyproject.toml +9 -1
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/requirements.txt +1 -1
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/aagohary/canard.py +3 -3
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +2 -1
- datamaestro_text-2025.9.11/src/datamaestro_text/config/com/github/ikat.py +121 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +1 -1
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/sentiment140.py +4 -4
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/aclimdb.py +2 -2
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -1
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +1 -1
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/metamind/research/wikitext.py +3 -3
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/grouplens/movielens.py +8 -8
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/universaldependencies/french.py +3 -3
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/base.py +34 -9
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/ikat.py +38 -13
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/__init__.py +44 -4
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/base.py +2 -1
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/formats.py +8 -2
- datamaestro_text-2025.9.11/src/datamaestro_text/data/ir/stores.py +124 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/trec.py +7 -4
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/data.py +47 -16
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/interfaces/trec.py +28 -1
- datamaestro_text-2025.9.11/src/datamaestro_text/utils/files.py +111 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/iter.py +5 -0
- datamaestro_text-2025.9.11/src/datamaestro_text/version.py +34 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11/src/datamaestro_text.egg-info}/PKG-INFO +3 -3
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/requires.txt +1 -1
- datamaestro_text-2025.6.30/src/datamaestro_text/config/com/github/ikat.py +0 -38
- datamaestro_text-2025.6.30/src/datamaestro_text/data/ir/stores.py +0 -29
- datamaestro_text-2025.6.30/src/datamaestro_text/utils/files.py +0 -8
- datamaestro_text-2025.6.30/src/datamaestro_text/version.py +0 -21
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.circleci/config.yml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.flake8 +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.github/workflows/python-publish.yml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.gitignore +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.pre-commit-config.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/LICENSE +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/MANIFEST.in +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/Makefile +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/README.md +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/Makefile +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/make.bat +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/requirements.txt +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/embeddings.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/index.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/nlp.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/recommendation.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/text.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/conf.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/conversation.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/embeddings.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/index.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/ir.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/irds.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/recommendation.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/text.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/index.rst +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/mkdocs.yml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/requirements-dev.txt +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/setup.cfg +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/ai/quac.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/canard.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/embeddings.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/cord19.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/csv.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/data.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/huggingface.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/utils.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/recommendation.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/tagging.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/text.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/utils.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/download/tmdb.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/interfaces/plaintext.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/test/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/test/test_datasets.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/test/test_documented.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/transforms/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/__init__.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/randomstream.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/shuffle.py +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/SOURCES.txt +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/top_level.txt +0 -0
- {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/tox.ini +0 -0
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.github/workflows/pytest.yml
@@ -25,7 +25,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
+          python -m pip install --upgrade pip setuptools
           SETUPTOOLS_SCM_PRETEND_VERSION="0.1-dev" pip install -e '.[dev]'
       - name: Lint with flake8
         run: |
{datamaestro_text-2025.6.30/src/datamaestro_text.egg-info → datamaestro_text-2025.9.11}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datamaestro-text
-Version: 2025.6.30
+Version: 2025.9.11
 Summary: Datamaestro module for text-related datasets
 Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
 License: GPL-3
@@ -15,10 +15,10 @@ Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Requires-Python: >=3.
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: datamaestro>=1.
+Requires-Dist: datamaestro>=1.5.0
 Requires-Dist: ir_datasets>=0.5.8
 Requires-Dist: attrs
 Provides-Extra: dev
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/conversation.rst
@@ -26,6 +26,12 @@ Data classes
 
 .. autoclass:: ConversationTopic
 
+Conversational IR
+-----------------
+
+.. autoxpmconfig:: datamaestro_text.data.conversation.base.ConversationUserTopics
+
+
 Contextual query reformulation
 ------------------------------
 
@@ -34,7 +40,7 @@ Contextual query reformulation
 .. autoclass:: ContextualizedRewrittenQuery
    :members:
 
-CANARD Dataset
+CANARD Dataset
 
 .. autoxpmconfig:: datamaestro_text.data.conversation.canard.CanardDataset
    :members: iter
@@ -50,7 +56,7 @@ OrConvQA Dataset
 .. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
    :members:
 
-QReCC Dataset
+QReCC Dataset
 
 .. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
    :members:
@@ -61,11 +67,11 @@ QReCC Dataset
 
 iKAT Dataset
 
-.. autoclass:: datamaestro_text.data.conversation.ikat.
+.. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationTopic
    :members:
 
 .. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationEntry
    :members:
 
-.. autoxpmconfig:: datamaestro_text.data.conversation.ikat.
+.. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatConversations
    :members: iter
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/ir.rst
@@ -36,11 +36,17 @@ Documents
 
 .. autoxpmconfig:: datamaestro_text.data.ir.Documents
    :members: iter_documents, iter_ids, documentcount
+.. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
+.. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
+
+
+Dataset-specific documents
+**************************
+
 .. autoxpmconfig:: datamaestro_text.data.ir.cord19.Documents
 .. autoxpmconfig:: datamaestro_text.data.ir.trec.TipsterCollection
-.. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
 .. autoxpmconfig:: datamaestro_text.data.ir.stores.OrConvQADocumentStore
-.. autoxpmconfig:: datamaestro_text.
+.. autoxpmconfig:: datamaestro_text.data.ir.stores.IKatClueWeb22DocumentStore
 
 Assessments
 -----------
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "datamaestro-text"
-requires-python = ">=3.
+requires-python = ">=3.10"
 keywords = ["dataset manager", "information retrieval", "experiments"]
 description = "Datamaestro module for text-related datasets"
 dynamic = ["version", "readme", "dependencies"]
@@ -44,6 +44,14 @@ fallback_version = "0.0.0-dev"
 [build-system]
 requires = ["setuptools", "setuptools-scm", "wheel"]  # PEP 508 specifications.
 
+[dependency-groups]
+dev = [
+    "docutils>=0.21.2",
+    "pytest>=8.4.1",
+    "sphinx>=8.1.3",
+    "sphobjinv>=2.3.1.3",
+]
+
 [project.entry-points."datamaestro.repositories"]
 text = "datamaestro_text:Repository"
 irds = "datamaestro_text.datasets.irds:Repository"
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/aagohary/canard.py
@@ -37,7 +37,7 @@ def main(train, dev, test):
     Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
     """
    return {
-        "train": CanardDataset(path=train),
-        "validation": CanardDataset(path=dev),
-        "test": CanardDataset(path=test),
+        "train": CanardDataset.C(path=train),
+        "validation": CanardDataset.C(path=dev),
+        "test": CanardDataset.C(path=test),
    }
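This CANARD hunk shows the change that recurs across the config modules in this release: data objects are no longer instantiated by calling the class directly but through the .C(...) constructor (the same pattern appears below for QReCC, MS MARCO, Sentiment140, ACL IMDB, FewRel, WikiText, MovieLens and Universal Dependencies French). A minimal sketch of the new style, reusing the File data class that appears later in this diff; the file name is made up for illustration:

from pathlib import Path

from datamaestro.data import File

# Old construction style, removed throughout this release:
#     dataset = File(path=Path("wiki.train.tokens"))
# New style: build the data configuration through its .C(...) constructor
dataset = File.C(path=Path("wiki.train.tokens"))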
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py
@@ -51,6 +51,7 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
     )
 class Content(LZ4JSONLDocumentStore):
     """QReCC mentionned URLs content"""
+
     @staticmethod
     def __create_dataset__(dataset, options=None):
         ds = reference(reference=main).setup(dataset, options)
@@ -65,7 +66,7 @@ class Content(LZ4JSONLDocumentStore):
             "id",
         ).setup(dataset, options)
 
-        return Content(jsonl_path=store_path)
+        return Content.C(jsonl_path=store_path)
 
     @staticmethod
     def _documents(path: Path):
datamaestro_text-2025.9.11/src/datamaestro_text/config/com/github/ikat.py
ADDED
@@ -0,0 +1,121 @@
+# See documentation on https://datamaestro.readthedocs.io
+
+import bz2
+from datamaestro.download import reference
+from datamaestro.definitions import datatasks, datatags, dataset
+from datamaestro_text.data.conversation.base import ConversationUserTopics
+from datamaestro_text.data.ir import Adhoc
+
+from datamaestro.utils import HashCheck
+from datamaestro.context import DatafolderPath
+from datamaestro.download.single import filedownloader
+from datamaestro_text.data.conversation.ikat import IkatConversations
+from datamaestro.download.links import linkfolder
+
+from datamaestro_text.data.ir.stores import IKatClueWeb22DocumentStore
+from datamaestro_text.data.ir.trec import TrecAdhocAssessments
+from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
+
+
+@dataset(as_prepare=True)
+def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
+    # Number of documents in the dataset
+    count = 116_838_987
+
+    jsonl_folder = linkfolder(
+        "documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
+    ).setup(dataset, options)
+    store_path = lz4docstore_builder(
+        "store",
+        IKatClueWeb22DocumentStore.generator(
+            jsonl_folder,
+            jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
+            jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
+        ),
+        IKatClueWeb22DocumentStore.Document,
+        "id",
+        count_hint=count,
+    ).setup(dataset, options)
+
+    return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
+
+
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
+    checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
+)
+@dataset(
+    id="2025",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
+)
+def test_2025(topics, documents) -> Adhoc.C:
+    """Question-in-context rewriting
+
+    iKAT is a test dataset for question-in-context rewriting that consists of
+    questions each given in a dialog context together with a context-independent
+    rewriting of the question.
+    """
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        # TODO: add when available
+        assessments=TrecAdhocAssessments.C(path="/to/do"),
+        documents=documents,
+    )
+
+
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "qrels",
+    "https://trec.nist.gov/data/ikat/2024-qrels.txt",
+    checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
+)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
+    checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
+)
+@dataset(
+    Adhoc,
+    id="2024",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
+)
+def test_2024(topics, qrels, documents) -> Adhoc.C:
+    """iKAT 2024 dataset"""
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        assessments=TrecAdhocAssessments.C(path=qrels),
+        documents=documents,
+    )
+
+
+@datatags("conversation", "context", "query")
+@datatasks("conversational search", "query rewriting")
+@reference("documents", clueweb22)
+@filedownloader(
+    "qrels",
+    "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
+    checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
+)
+@filedownloader(
+    "topics.json",
+    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
+    checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
+)
+@dataset(
+    Adhoc,
+    id="2023",
+    url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
+)
+def test_2023(topics, qrels, documents) -> Adhoc.C:
+    """iKAT 2023 dataset"""
+    return Adhoc.C(
+        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
+        assessments=TrecAdhocAssessments.C(path=qrels),
+        documents=documents,
+    )
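Once this configuration module ships, the new iKAT collections should be reachable through datamaestro's usual preparation entry point. A sketch under assumptions: prepare_dataset is datamaestro's Python-level loader, and the identifier below is only a guess derived from the config module path (com/github/ikat.py) and the @dataset id; the actual ID may differ.

from datamaestro import prepare_dataset

# Hypothetical identifier, derived from the module path and id="2024";
# the real dataset ID may differ.
ikat_2024 = prepare_dataset("com.github.ikat.2024")

# test_2024 above returns an Adhoc configuration, so the prepared object
# is expected to expose topics, assessments and documents.
print(ikat_2024.topics, ikat_2024.assessments, ikat_2024.documents)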
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py
@@ -47,7 +47,7 @@ http://www.msmarco.org/dataset.aspx""",
 @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
 def collection_etc(data) -> Folder:
     """Documents and some more files"""
-    return Folder(path=data)
+    return Folder.C(path=data)
 
 
 @lua
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/sentiment140.py
@@ -26,7 +26,7 @@ def english(dir):
 
     If you use this data, please cite Sentiment140 as your source.
     """
-    return
-
-
-
+    return Supervised.C(
+        train=Generic.C(path=dir / "training.1600000.processed.noemoticon.csv"),
+        test=Generic.C(path=dir / "testdata.manual.2009.06.14.csv"),
+    )
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/aclimdb.py
@@ -11,6 +11,6 @@ def aclimdb(data):
     Paper http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
     """
     return {
-        "train": FolderBased(path=data / "train", classes=["neg", "pos"]),
-        "test": FolderBased(path=data / "test", classes=["neg", "pos"]),
+        "train": FolderBased.C(path=data / "train", classes=["neg", "pos"]),
+        "test": FolderBased.C(path=data / "test", classes=["neg", "pos"]),
     }
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/tipster.py
@@ -17,7 +17,6 @@ of three CD-ROMs of SGML encoded documents distributed by LDC plus queries and a
 See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data/intro_eng.html
 """
 
-from datamaestro.data import Base
 from datamaestro_text.data.ir.trec import TipsterCollection
 from datamaestro.download.links import linkfolder
 from datamaestro.definitions import (
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/github/thunlp/fewrel.py
@@ -32,4 +32,4 @@ def v1(train, validation):
     Only the train and validation dataset are available. The test set is hidden
     for the leaderboard.
     """
-    return {"train": File(path=train), "validation": File(path=validation)}
+    return {"train": File.C(path=train), "validation": File.C(path=validation)}
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/metamind/research/wikitext.py
@@ -30,9 +30,9 @@ def WikiText(data, type):
     https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
     """
     return {
-        "train": File(path=data / ("wiki.train.%s" % type)),
-        "validation": File(path=data / ("wiki.valid.%s" % type)),
-        "test": File(path=data / ("wiki.test.%s" % type)),
+        "train": File.C(path=data / ("wiki.train.%s" % type)),
+        "validation": File.C(path=data / ("wiki.valid.%s" % type)),
+        "test": File.C(path=data / ("wiki.test.%s" % type)),
     }
 
 
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/grouplens/movielens.py
@@ -31,10 +31,10 @@ def small(ds) -> Movielens:
     100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users (as of 9/2018)
     """
     return {
-        "ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
-        "links": csv.Generic(path=ds / "links.csv", names_row=0),
-        "movies": csv.Generic(path=ds / "movies.csv", names_row=0),
-        "tags": csv.Generic(path=ds / "tags.csv", names_row=0),
+        "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
+        "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
+        "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
+        "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
     }
 
 
@@ -46,8 +46,8 @@ def full(ds) -> Movielens:
     27,000,000 ratings and 1,100,000 tag applications applied to 58,000 movies by 280,000 users (as of 9/2018)
     """
     return {
-        "ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
-        "links": csv.Generic(path=ds / "links.csv", names_row=0),
-        "movies": csv.Generic(path=ds / "movies.csv", names_row=0),
-        "tags": csv.Generic(path=ds / "tags.csv", names_row=0),
+        "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
+        "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
+        "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
+        "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
     }
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/universaldependencies/french.py
@@ -34,9 +34,9 @@ def gsd(ds) -> Supervised:
     is updated since 2015 independently from the previous source.
     """
     return {
-        "train": CoNLL_U(path=ds / "fr_gsd-ud-train.conllu"),
-        "test": CoNLL_U(path=ds / "fr_gsd-ud-dev.conllu"),
-        "validation": CoNLL_U(path=ds / "fr_gsd-ud-test.conllu"),
+        "train": CoNLL_U.C(path=ds / "fr_gsd-ud-train.conllu"),
+        "test": CoNLL_U.C(path=ds / "fr_gsd-ud-dev.conllu"),
+        "validation": CoNLL_U.C(path=ds / "fr_gsd-ud-test.conllu"),
     }
 
 
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/base.py
@@ -1,10 +1,13 @@
 from abc import ABC, abstractmethod
 from enum import Enum
+from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
+from experimaestro import Param
 from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
 from attr import define
+from datamaestro.record import record_type
 from datamaestro.data import Base
 from datamaestro.record import Record, Item
-from datamaestro_text.data.ir import TopicRecord
+from datamaestro_text.data.ir import TopicRecord, Topics
 from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView
 
 # ---- Basic types
@@ -120,20 +123,17 @@ class ConversationNode:
         ...
 
     @abstractmethod
-    def parent(self) -> Optional["ConversationNode"]:
-        ...
+    def parent(self) -> Optional["ConversationNode"]: ...
 
     @abstractmethod
-    def children(self) -> List["ConversationNode"]:
-        ...
+    def children(self) -> List["ConversationNode"]: ...
 
 
 class ConversationTree(ABC):
     """Represents a conversation tree"""
 
     @abstractmethod
-    def root(self) -> ConversationNode:
-        ...
+    def root(self) -> ConversationNode: ...
 
     @abstractmethod
     def __iter__(self) -> Iterator[ConversationNode]:
@@ -253,5 +253,30 @@ class ConversationDataset(Base, ABC):
     @abstractmethod
     def __iter__(self) -> Iterator[ConversationTree]:
         """Return an iterator over conversations"""
-
-
+        ...
+
+
+class ConversationUserTopics(Topics):
+    """Extract user topics from conversations"""
+
+    conversations: Param[ConversationDataset]
+
+    topic_recordtype = record_type(IDItem, SimpleTextItem)
+
+    def iter(self) -> Iterator[TopicRecord]:
+        """Returns an iterator over topics"""
+        # Extracts topics from conversations, Each user query is a topic (can perform retrieval on it)
+        # TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
+
+        records: List[TopicRecord] = []
+        for conversation in self.conversations.__iter__():
+            nodes = [
+                node
+                for node in conversation
+                if node.entry[EntryType] == EntryType.USER_QUERY
+            ]
+            for node in nodes:
+                records.append(
+                    node.entry.update(ConversationHistoryItem(node.history()))
+                )
+        return iter(records)
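The new ConversationUserTopics class turns a conversation dataset into a plain topics collection: every user query becomes a topic record carrying its identifier, its text and a ConversationHistoryItem with the preceding turns. A minimal construction sketch, mirroring what config/com/github/ikat.py does earlier in this diff (the local file name is made up, and in practice the configuration is normally obtained through datamaestro's prepare mechanism rather than built by hand):

from pathlib import Path

from datamaestro_text.data.conversation.base import ConversationUserTopics
from datamaestro_text.data.conversation.ikat import IkatConversations

# Wrap an iKAT topics file so each user utterance can be used as an
# ad-hoc retrieval topic, with the earlier turns attached as history.
topics = ConversationUserTopics.C(
    conversations=IkatConversations.C(path=Path("2024_test_topics.json"))
)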
{datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/ikat.py
@@ -1,10 +1,11 @@
-from typing import Iterator, List
+from typing import Iterator, List
 from attr import define, field
 import json
 import logging
 from datamaestro.data import File
 from datamaestro.record import Record
 
+from datamaestro_text.data.ir import Topics
 from datamaestro_text.data.ir.base import (
     IDItem,
     SimpleTextItem,
@@ -12,7 +13,6 @@ from datamaestro_text.data.ir.base import (
 
 
 from .base import (
-    AnswerDocumentURL,
     AnswerEntry,
     ConversationTree,
     EntryType,
@@ -21,6 +21,25 @@ from .base import (
 )
 from . import ConversationDataset
 
+# Keys to change in the dataset entries for compatibility across different years
+
+KEY_MAPPINGS = {
+    # Keys to replace: Target Key
+    "turns": "responses",
+    "utterance": "user_utterance",
+    "ptkb_provenance": "relevant_ptkbs",
+    "response_provenance": "citations",
+}
+
+
+def norm_dict(entry: dict) -> dict:
+    """Convert keys in the entry to match the expected format."""
+    normalized = {}
+    for k, v in entry.items():
+        # Check for direct mapping, then try lowercase mapping
+        new_key = KEY_MAPPINGS.get(k) or KEY_MAPPINGS.get(k.lower()) or k.lower()
+        normalized[new_key] = v
+    return normalized
 
 
 @define(kw_only=True)
@@ -47,7 +66,7 @@ class IkatConversationEntry:
 
 
 @define(kw_only=True)
-class
+class IkatConversationTopic:
     """A query with past history"""
 
     number: str
@@ -60,14 +79,20 @@ class IkatDatasetEntry:
     """The personal knowledge base associated with the user"""
 
     responses: List[IkatConversationEntry] = field(
-        converter=lambda items: [
+        converter=lambda items: [
+            IkatConversationEntry(**item) if isinstance(item, dict) else item
+            for item in map(norm_dict, items)
+        ]
     )
     """The list of responses to the query"""
 
 
-class
+class IkatConversations(ConversationDataset, File):
+    """A dataset containing conversations from the IKAT project"""
 
-
+    """Keys to change in the dataset entries for compatibility across different years"""
+
+    def entries(self) -> Iterator[IkatConversationTopic]:
         """Reads all conversation entries from the dataset file."""
         with self.path.open("rt") as fp:
             raw_data = json.load(fp)
@@ -75,12 +100,13 @@ class IkatDataset(ConversationDataset, File):
         logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
         logging.debug(f"raw data has keys {raw_data[0].keys()}")
 
-        processed_data = []
         for entry in raw_data:
-
-
-
-
+            try:
+                normalized_entry = norm_dict(entry)
+                yield IkatConversationTopic(**normalized_entry)
+            except Exception as e:
+                logging.warning(f"Failed to parse entry: {e}")
+                raise e
 
     def __iter__(self) -> Iterator[ConversationTree]:
         for entry in self.entries():
@@ -88,7 +114,7 @@ class IkatDataset(ConversationDataset, File):
 
             for turn in entry.responses:
                 turn: IkatConversationEntry = turn  # Ensure type is correct
-                query_id = f"{entry.number}
+                query_id = f"{entry.number}_{turn.turn_id}"
 
                 # USER QUERY record
                 history.append(
@@ -117,4 +143,3 @@ class IkatDataset(ConversationDataset, File):
             # Ensure reverse if needed for compatibility (optional)
             history.reverse()
             yield SingleConversationTree(entry.number, history)
-
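The KEY_MAPPINGS table and norm_dict helper added above are what let a single IkatConversations reader consume topic files from different iKAT years, whose JSON keys drifted over time. A small illustration of the normalization (the field values are invented):

from datamaestro_text.data.conversation.ikat import norm_dict

raw_turn = {
    "turn_id": 1,
    "utterance": "What laptop would you recommend for travelling?",
    "response_provenance": ["clueweb22-en0000-00-00000:1"],
}

# Mapped keys are renamed, everything else is lower-cased and kept as is
print(norm_dict(raw_turn))
# {'turn_id': 1,
#  'user_utterance': 'What laptop would you recommend for travelling?',
#  'citations': ['clueweb22-en0000-00-00000:1']}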