datamaestro-text 2025.6.11__tar.gz → 2025.6.30__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. {datamaestro_text-2025.6.11/src/datamaestro_text.egg-info → datamaestro_text-2025.6.30}/PKG-INFO +1 -1
  2. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/conversation.rst +18 -2
  3. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +3 -3
  4. datamaestro_text-2025.6.30/src/datamaestro_text/config/com/github/ikat.py +38 -0
  5. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/base.py +4 -4
  6. datamaestro_text-2025.6.30/src/datamaestro_text/data/conversation/ikat.py +120 -0
  7. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/version.py +2 -2
  8. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30/src/datamaestro_text.egg-info}/PKG-INFO +1 -1
  9. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/SOURCES.txt +2 -0
  10. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.circleci/config.yml +0 -0
  11. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.flake8 +0 -0
  12. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.github/workflows/pytest.yml +0 -0
  13. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.github/workflows/python-publish.yml +0 -0
  14. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.gitignore +0 -0
  15. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.pre-commit-config.yaml +0 -0
  16. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/.readthedocs.yml +0 -0
  17. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/LICENSE +0 -0
  18. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/MANIFEST.in +0 -0
  19. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/Makefile +0 -0
  20. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/README.md +0 -0
  21. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/Makefile +0 -0
  22. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/make.bat +0 -0
  23. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/requirements.txt +0 -0
  24. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/embeddings.rst +0 -0
  25. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/index.rst +0 -0
  26. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/ir.rst +0 -0
  27. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/nlp.rst +0 -0
  28. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/recommendation.rst +0 -0
  29. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/api/text.rst +0 -0
  30. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/conf.py +0 -0
  31. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/conversation.rst +0 -0
  32. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/embeddings.rst +0 -0
  33. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/index.rst +0 -0
  34. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/ir.rst +0 -0
  35. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/irds.rst +0 -0
  36. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/recommendation.rst +0 -0
  37. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/datasets/text.rst +0 -0
  38. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/docs/source/index.rst +0 -0
  39. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/mkdocs.yml +0 -0
  40. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/pyproject.toml +0 -0
  41. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/requirements-dev.txt +0 -0
  42. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/requirements.txt +0 -0
  43. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/setup.cfg +0 -0
  44. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/__init__.py +0 -0
  45. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/__init__.py +0 -0
  46. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  47. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  48. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
  49. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
  50. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  51. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
  52. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  53. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
  54. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/sentiment140.py +0 -0
  55. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  56. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  57. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  58. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
  59. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
  60. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  61. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  62. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  63. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  64. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/__init__.py +0 -0
  65. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  66. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  67. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  68. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  69. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  70. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  71. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  72. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
  73. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  74. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  75. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
  76. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  77. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
  78. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  79. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  80. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  81. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
  82. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
  83. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  84. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/__init__.py +0 -0
  85. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  86. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/canard.py +0 -0
  87. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
  88. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
  89. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/embeddings.py +0 -0
  90. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/__init__.py +0 -0
  91. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/base.py +0 -0
  92. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/cord19.py +0 -0
  93. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/csv.py +0 -0
  94. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/data.py +0 -0
  95. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/formats.py +0 -0
  96. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  97. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/stores.py +0 -0
  98. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/trec.py +0 -0
  99. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/utils.py +0 -0
  100. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/recommendation.py +0 -0
  101. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/tagging.py +0 -0
  102. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/text.py +0 -0
  103. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  104. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/data.py +0 -0
  105. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
  106. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
  107. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  108. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/download/tmdb.py +0 -0
  109. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  110. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/interfaces/trec.py +0 -0
  111. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/test/__init__.py +0 -0
  112. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/test/test_datasets.py +0 -0
  113. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/test/test_documented.py +0 -0
  114. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/transforms/__init__.py +0 -0
  115. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
  116. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/__init__.py +0 -0
  117. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/files.py +0 -0
  118. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/iter.py +0 -0
  119. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/randomstream.py +0 -0
  120. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/shuffle.py +0 -0
  121. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
  122. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
  123. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/requires.txt +0 -0
  124. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/top_level.txt +0 -0
  125. {datamaestro_text-2025.6.11 → datamaestro_text-2025.6.30}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.6.11
3
+ Version: 2025.6.30
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -34,9 +34,13 @@ Contextual query reformulation
34
34
  .. autoclass:: ContextualizedRewrittenQuery
35
35
  :members:
36
36
 
37
+ CANARD Dataset
38
+
37
39
  .. autoxpmconfig:: datamaestro_text.data.conversation.canard.CanardDataset
38
40
  :members: iter
39
41
 
42
+ OrConvQA Dataset
43
+
40
44
  .. autoxpmconfig:: datamaestro_text.data.conversation.orconvqa.OrConvQADataset
41
45
  :members: iter
42
46
 
@@ -46,10 +50,22 @@ Contextual query reformulation
46
50
  .. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
47
51
  :members:
48
52
 
53
+ QReCC Dataset
49
54
 
50
-
51
- .. autoclass:: datamaestro_text.data.conversation.orconvqa.QReCCDatasetEntry
55
+ .. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
52
56
  :members:
53
57
 
54
58
  .. autoxpmconfig:: datamaestro_text.data.conversation.qrecc.QReCCDataset
55
59
  :members: iter
60
+
61
+
62
+ iKAT Dataset
63
+
64
+ .. autoclass:: datamaestro_text.data.conversation.ikat.IkatDatasetEntry
65
+ :members:
66
+
67
+ .. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationEntry
68
+ :members:
69
+
70
+ .. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatDataset
71
+ :members: iter
@@ -39,9 +39,9 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
39
39
  answering that includes the individual subtasks of question rewriting,
40
40
  passage retrieval and reading comprehension
41
41
  """
42
- return Supervised(
43
- train=QReCCDataset(path=data / "qrecc_train.json"),
44
- test=QReCCDataset(path=data / "qrecc_test.json"),
42
+ return Supervised.C(
43
+ train=QReCCDataset.C(path=data / "qrecc_train.json"),
44
+ test=QReCCDataset.C(path=data / "qrecc_test.json"),
45
45
  )
46
46
 
47
47
 
@@ -0,0 +1,38 @@
1
+ # See documentation on https://datamaestro.readthedocs.io
2
+
3
+ from datamaestro.definitions import datatasks, datatags, dataset
4
+ from datamaestro.data.ml import Supervised
5
+ from datamaestro.data import Base
6
+
7
+ from datamaestro.utils import HashCheck
8
+ from datamaestro.download.single import filedownloader
9
+ from datamaestro_text.data.conversation.ikat import IkatDatasetEntry, IkatDataset
10
+ from datamaestro_text.datasets.irds.data import (
11
+ SimpleJsonDocument,
12
+ LZ4JSONLDocumentStore,
13
+ )
14
+ import logging
15
+
16
+ @datatags("conversation", "context", "query")
17
+ @datatasks("query rewriting")
18
+ @filedownloader(
19
+ "test.json",
20
+ "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
21
+ checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
22
+ )
23
+
24
+ @dataset(
25
+ Base,
26
+ url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
27
+ )
28
+
29
+ def main(test) -> Supervised[IkatDataset, None, IkatDataset]:
30
+ """Question-in-context rewriting
31
+
32
+ iKAT is a test dataset for question-in-context rewriting that consists of
33
+ questions each given in a dialog context together with a context-independent
34
+ rewriting of the question.
35
+ One of the special features of iKAT is that it includes a Personal Text Knowledge Base (PTKB).
36
+ """
37
+ logging.info("Creating iKAT dataset from %s", test)
38
+ return IkatDataset.C(path=test)
@@ -214,8 +214,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
214
214
 
215
215
  def __init__(self, entry):
216
216
  self.entry = entry
217
- self.parent = None
218
- self.children = []
217
+ self._parent = None
218
+ self._children = []
219
219
 
220
220
  def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
221
221
  self._children.append(node)
@@ -224,10 +224,10 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
224
224
 
225
225
  def conversation(self, skip_self: bool) -> ConversationHistory:
226
226
  def iterator():
227
- current = self.parent if skip_self else self
227
+ current = self.parent() if skip_self else self
228
228
  while current is not None:
229
229
  yield current.entry
230
- current = current.parent
230
+ current = current.parent()
231
231
 
232
232
  return LazyList(FactoryIterable(iterator))
233
233
 
@@ -0,0 +1,120 @@
1
+ from typing import Iterator, List, Optional
2
+ from attr import define, field
3
+ import json
4
+ import logging
5
+ from datamaestro.data import File
6
+ from datamaestro.record import Record
7
+
8
+ from datamaestro_text.data.ir.base import (
9
+ IDItem,
10
+ SimpleTextItem,
11
+ )
12
+
13
+
14
+ from .base import (
15
+ AnswerDocumentURL,
16
+ AnswerEntry,
17
+ ConversationTree,
18
+ EntryType,
19
+ SimpleDecontextualizedItem,
20
+ SingleConversationTree,
21
+ )
22
+ from . import ConversationDataset
23
+
24
+
25
+
26
+ @define(kw_only=True)
27
+ class IkatConversationEntry:
28
+ """A query with past history"""
29
+
30
+ turn_id: int
31
+ """Turn number in the conversation"""
32
+
33
+ user_utterance: str
34
+ """The last issued query"""
35
+
36
+ resolved_utterance: str
37
+ """Manually rewritten query"""
38
+
39
+ response: str
40
+ """The system response to the query"""
41
+
42
+ relevant_ptkbs: List[str]
43
+ """The list of relevant personal knowledge bases for the query"""
44
+
45
+ citations: List[str]
46
+ """The list of citations for the response"""
47
+
48
+
49
+ @define(kw_only=True)
50
+ class IkatDatasetEntry:
51
+ """A query with past history"""
52
+
53
+ number: str
54
+ """Conversation ID"""
55
+
56
+ title: str
57
+ """Title of the conversation"""
58
+
59
+ ptkb: str
60
+ """The personal knowledge base associated with the user"""
61
+
62
+ responses: List[IkatConversationEntry] = field(
63
+ converter=lambda items: [IkatConversationEntry(**item) if isinstance(item, dict) else item for item in items]
64
+ )
65
+ """The list of responses to the query"""
66
+
67
+
68
+ class IkatDataset(ConversationDataset, File):
69
+
70
+ def entries(self) -> Iterator[IkatDatasetEntry]:
71
+ """Reads all conversation entries from the dataset file."""
72
+ with self.path.open("rt") as fp:
73
+ raw_data = json.load(fp)
74
+
75
+ logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
76
+ logging.debug(f"raw data has keys {raw_data[0].keys()}")
77
+
78
+ processed_data = []
79
+ for entry in raw_data:
80
+ processed_data.append(IkatDatasetEntry(**{key.lower(): value for key, value in entry.items()}))
81
+
82
+ logging.debug(f"First parsed data sample: {processed_data[0]}")
83
+ return iter(processed_data)
84
+
85
+ def __iter__(self) -> Iterator[ConversationTree]:
86
+ for entry in self.entries():
87
+ history: List[Record] = []
88
+
89
+ for turn in entry.responses:
90
+ turn: IkatConversationEntry = turn # Ensure type is correct
91
+ query_id = f"{entry.number}#{turn.turn_id}"
92
+
93
+ # USER QUERY record
94
+ history.append(
95
+ Record(
96
+ IDItem(query_id),
97
+ SimpleTextItem(turn.user_utterance),
98
+ SimpleDecontextualizedItem(turn.resolved_utterance),
99
+ EntryType.USER_QUERY,
100
+ )
101
+ )
102
+
103
+ # Build citation info (stubbed relevance to match format)
104
+ relevances = {}
105
+ if turn.relevant_ptkbs:
106
+ # Example: just use first as relevant (can be improved)
107
+ relevances[0] = (0, None) # No position info in this structure
108
+
109
+ # SYSTEM ANSWER record
110
+ history.append(
111
+ Record(
112
+ AnswerEntry(turn.response),
113
+ EntryType.SYSTEM_ANSWER,
114
+ )
115
+ )
116
+
117
+ # Ensure reverse if needed for compatibility (optional)
118
+ history.reverse()
119
+ yield SingleConversationTree(entry.number, history)
120
+
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '2025.6.11'
21
- __version_tuple__ = version_tuple = (2025, 6, 11)
20
+ __version__ = version = '2025.6.30'
21
+ __version_tuple__ = version_tuple = (2025, 6, 30)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.6.11
3
+ Version: 2025.6.30
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -46,6 +46,7 @@ src/datamaestro_text/config/ai/quac.yaml
46
46
  src/datamaestro_text/config/com/oscar-corpus.py
47
47
  src/datamaestro_text/config/com/sentiment140.py
48
48
  src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml
49
+ src/datamaestro_text/config/com/github/ikat.py
49
50
  src/datamaestro_text/config/com/github/aagohary/canard.py
50
51
  src/datamaestro_text/config/com/github/apple/ml-qrecc.py
51
52
  src/datamaestro_text/config/com/github/prdwb/orconvqa.py
@@ -89,6 +90,7 @@ src/datamaestro_text/data/text.py
89
90
  src/datamaestro_text/data/conversation/__init__.py
90
91
  src/datamaestro_text/data/conversation/base.py
91
92
  src/datamaestro_text/data/conversation/canard.py
93
+ src/datamaestro_text/data/conversation/ikat.py
92
94
  src/datamaestro_text/data/conversation/orconvqa.py
93
95
  src/datamaestro_text/data/conversation/qrecc.py
94
96
  src/datamaestro_text/data/ir/__init__.py