datamaestro-text 2025.5.13__tar.gz → 2025.6.30__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/.gitignore +1 -0
  2. {datamaestro_text-2025.5.13/src/datamaestro_text.egg-info → datamaestro_text-2025.6.30}/PKG-INFO +1 -1
  3. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/api/conversation.rst +18 -2
  4. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +3 -3
  5. datamaestro_text-2025.6.30/src/datamaestro_text/config/com/github/ikat.py +38 -0
  6. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +3 -3
  7. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/base.py +4 -4
  8. datamaestro_text-2025.6.30/src/datamaestro_text/data/conversation/ikat.py +120 -0
  9. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/datasets.py +4 -4
  10. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/version.py +2 -2
  11. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30/src/datamaestro_text.egg-info}/PKG-INFO +1 -1
  12. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/SOURCES.txt +2 -0
  13. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/.circleci/config.yml +0 -0
  14. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/.flake8 +0 -0
  15. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/.github/workflows/pytest.yml +0 -0
  16. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/.github/workflows/python-publish.yml +0 -0
  17. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/.pre-commit-config.yaml +0 -0
  18. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/.readthedocs.yml +0 -0
  19. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/LICENSE +0 -0
  20. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/MANIFEST.in +0 -0
  21. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/Makefile +0 -0
  22. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/README.md +0 -0
  23. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/Makefile +0 -0
  24. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/make.bat +0 -0
  25. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/requirements.txt +0 -0
  26. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/api/embeddings.rst +0 -0
  27. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/api/index.rst +0 -0
  28. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/api/ir.rst +0 -0
  29. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/api/nlp.rst +0 -0
  30. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/api/recommendation.rst +0 -0
  31. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/api/text.rst +0 -0
  32. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/conf.py +0 -0
  33. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/datasets/conversation.rst +0 -0
  34. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/datasets/embeddings.rst +0 -0
  35. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/datasets/index.rst +0 -0
  36. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/datasets/ir.rst +0 -0
  37. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/datasets/irds.rst +0 -0
  38. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/datasets/recommendation.rst +0 -0
  39. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/datasets/text.rst +0 -0
  40. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/docs/source/index.rst +0 -0
  41. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/mkdocs.yml +0 -0
  42. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/pyproject.toml +0 -0
  43. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/requirements-dev.txt +0 -0
  44. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/requirements.txt +0 -0
  45. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/setup.cfg +0 -0
  46. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/__init__.py +0 -0
  47. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/__init__.py +0 -0
  48. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  49. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  50. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
  51. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  52. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
  53. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  54. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
  55. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/sentiment140.py +0 -0
  56. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  57. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  58. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  59. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
  60. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
  61. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  62. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  63. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  64. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  65. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/__init__.py +0 -0
  66. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  67. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  68. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  69. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  70. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  71. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  72. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  73. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
  74. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  75. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  76. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
  77. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  78. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
  79. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  80. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  81. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  82. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
  83. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
  84. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  85. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/__init__.py +0 -0
  86. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  87. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/canard.py +0 -0
  88. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
  89. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
  90. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/embeddings.py +0 -0
  91. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/__init__.py +0 -0
  92. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/base.py +0 -0
  93. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/cord19.py +0 -0
  94. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/csv.py +0 -0
  95. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/data.py +0 -0
  96. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/formats.py +0 -0
  97. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  98. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/stores.py +0 -0
  99. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/trec.py +0 -0
  100. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/ir/utils.py +0 -0
  101. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/recommendation.py +0 -0
  102. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/tagging.py +0 -0
  103. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/data/text.py +0 -0
  104. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  105. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/data.py +0 -0
  106. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
  107. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  108. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/download/tmdb.py +0 -0
  109. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  110. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/interfaces/trec.py +0 -0
  111. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/test/__init__.py +0 -0
  112. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/test/test_datasets.py +0 -0
  113. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/test/test_documented.py +0 -0
  114. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/transforms/__init__.py +0 -0
  115. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
  116. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/__init__.py +0 -0
  117. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/files.py +0 -0
  118. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/iter.py +0 -0
  119. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/randomstream.py +0 -0
  120. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text/utils/shuffle.py +0 -0
  121. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
  122. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
  123. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/requires.txt +0 -0
  124. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/src/datamaestro_text.egg-info/top_level.txt +0 -0
  125. {datamaestro_text-2025.5.13 → datamaestro_text-2025.6.30}/tox.ini +0 -0
@@ -1,4 +1,5 @@
1
1
  *.egg-info
2
+ .DS_Store
2
3
  .vscode
3
4
  __pycache__
4
5
  site
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.5.13
3
+ Version: 2025.6.30
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -34,9 +34,13 @@ Contextual query reformulation
34
34
  .. autoclass:: ContextualizedRewrittenQuery
35
35
  :members:
36
36
 
37
+ CANARD Dataset
38
+
37
39
  .. autoxpmconfig:: datamaestro_text.data.conversation.canard.CanardDataset
38
40
  :members: iter
39
41
 
42
+ OrConvQA Dataset
43
+
40
44
  .. autoxpmconfig:: datamaestro_text.data.conversation.orconvqa.OrConvQADataset
41
45
  :members: iter
42
46
 
@@ -46,10 +50,22 @@ Contextual query reformulation
46
50
  .. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
47
51
  :members:
48
52
 
53
+ QReCC Dataset
49
54
 
50
-
51
- .. autoclass:: datamaestro_text.data.conversation.orconvqa.QReCCDatasetEntry
55
+ .. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
52
56
  :members:
53
57
 
54
58
  .. autoxpmconfig:: datamaestro_text.data.conversation.qrecc.QReCCDataset
55
59
  :members: iter
60
+
61
+
62
+ iKAT Dataset
63
+
64
+ .. autoclass:: datamaestro_text.data.conversation.ikat.IkatDatasetEntry
65
+ :members:
66
+
67
+ .. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationEntry
68
+ :members:
69
+
70
+ .. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatDataset
71
+ :members: iter
@@ -39,9 +39,9 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
39
39
  answering that includes the individual subtasks of question rewriting,
40
40
  passage retrieval and reading comprehension
41
41
  """
42
- return Supervised(
43
- train=QReCCDataset(path=data / "qrecc_train.json"),
44
- test=QReCCDataset(path=data / "qrecc_test.json"),
42
+ return Supervised.C(
43
+ train=QReCCDataset.C(path=data / "qrecc_train.json"),
44
+ test=QReCCDataset.C(path=data / "qrecc_test.json"),
45
45
  )
46
46
 
47
47
 
@@ -0,0 +1,38 @@
1
+ # See documentation on https://datamaestro.readthedocs.io
2
+
3
+ from datamaestro.definitions import datatasks, datatags, dataset
4
+ from datamaestro.data.ml import Supervised
5
+ from datamaestro.data import Base
6
+
7
+ from datamaestro.utils import HashCheck
8
+ from datamaestro.download.single import filedownloader
9
+ from datamaestro_text.data.conversation.ikat import IkatDatasetEntry, IkatDataset
10
+ from datamaestro_text.datasets.irds.data import (
11
+ SimpleJsonDocument,
12
+ LZ4JSONLDocumentStore,
13
+ )
14
+ import logging
15
+
16
+ @datatags("conversation", "context", "query")
17
+ @datatasks("query rewriting")
18
+ @filedownloader(
19
+ "test.json",
20
+ "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
21
+ checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
22
+ )
23
+
24
+ @dataset(
25
+ Base,
26
+ url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
27
+ )
28
+
29
+ def main(test) -> Supervised[IkatDataset, None, IkatDataset]:
30
+ """Question-in-context rewriting
31
+
32
+ iKAT is a test dataset for question-in-context rewriting that consists of
33
+ questions each given in a dialog context together with a context-independent
34
+ rewriting of the question.
35
+ One of the special features of iKAT is that it includes a personal knowledge base (PTKB) for each user.
36
+ """
37
+ logging.info("Creating iKAT dataset from %s", test)
38
+ return IkatDataset.C(path=test)
@@ -49,9 +49,9 @@ def preprocessed(train, dev, test):
49
49
  Each dataset is an instance of :class:`datamaestro_text.data.conversation.OrConvQADataset`
50
50
  """
51
51
  return {
52
- "train": OrConvQADataset(path=train),
53
- "validation": OrConvQADataset(path=dev),
54
- "test": OrConvQADataset(path=test),
52
+ "train": OrConvQADataset.C(path=train),
53
+ "validation": OrConvQADataset.C(path=dev),
54
+ "test": OrConvQADataset.C(path=test),
55
55
  }
56
56
 
57
57
 
@@ -214,8 +214,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
214
214
 
215
215
  def __init__(self, entry):
216
216
  self.entry = entry
217
- self.parent = None
218
- self.children = []
217
+ self._parent = None
218
+ self._children = []
219
219
 
220
220
  def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
221
221
  self._children.append(node)
@@ -224,10 +224,10 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
224
224
 
225
225
  def conversation(self, skip_self: bool) -> ConversationHistory:
226
226
  def iterator():
227
- current = self.parent if skip_self else self
227
+ current = self.parent() if skip_self else self
228
228
  while current is not None:
229
229
  yield current.entry
230
- current = current.parent
230
+ current = current.parent()
231
231
 
232
232
  return LazyList(FactoryIterable(iterator))
233
233
 
@@ -0,0 +1,120 @@
1
+ from typing import Iterator, List, Optional
2
+ from attr import define, field
3
+ import json
4
+ import logging
5
+ from datamaestro.data import File
6
+ from datamaestro.record import Record
7
+
8
+ from datamaestro_text.data.ir.base import (
9
+ IDItem,
10
+ SimpleTextItem,
11
+ )
12
+
13
+
14
+ from .base import (
15
+ AnswerDocumentURL,
16
+ AnswerEntry,
17
+ ConversationTree,
18
+ EntryType,
19
+ SimpleDecontextualizedItem,
20
+ SingleConversationTree,
21
+ )
22
+ from . import ConversationDataset
23
+
24
+
25
+
26
+ @define(kw_only=True)
27
+ class IkatConversationEntry:
28
+ """A query with past history"""
29
+
30
+ turn_id: int
31
+ """Turn number in the conversation"""
32
+
33
+ user_utterance: str
34
+ """The last issued query"""
35
+
36
+ resolved_utterance: str
37
+ """Manually rewritten query"""
38
+
39
+ response: str
40
+ """The system response to the query"""
41
+
42
+ relevant_ptkbs: List[str]
43
+ """The list of relevant personal knowledge bases for the query"""
44
+
45
+ citations: List[str]
46
+ """The list of citations for the response"""
47
+
48
+
49
+ @define(kw_only=True)
50
+ class IkatDatasetEntry:
51
+ """A query with past history"""
52
+
53
+ number: str
54
+ """Conversation ID"""
55
+
56
+ title: str
57
+ """Title of the conversation"""
58
+
59
+ ptkb: str
60
+ """The personal knowledge base associated with the user"""
61
+
62
+ responses: List[IkatConversationEntry] = field(
63
+ converter=lambda items: [IkatConversationEntry(**item) if isinstance(item, dict) else item for item in items]
64
+ )
65
+ """The list of responses to the query"""
66
+
67
+
68
+ class IkatDataset(ConversationDataset, File):
69
+
70
+ def entries(self) -> Iterator[IkatDatasetEntry]:
71
+ """Reads all conversation entries from the dataset file."""
72
+ with self.path.open("rt") as fp:
73
+ raw_data = json.load(fp)
74
+
75
+ logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
76
+ logging.debug(f"raw data has keys {raw_data[0].keys()}")
77
+
78
+ processed_data = []
79
+ for entry in raw_data:
80
+ processed_data.append(IkatDatasetEntry(**{key.lower(): value for key, value in entry.items()}))
81
+
82
+ logging.debug(f"First parsed data sample: {processed_data[0]}")
83
+ return iter(processed_data)
84
+
85
+ def __iter__(self) -> Iterator[ConversationTree]:
86
+ for entry in self.entries():
87
+ history: List[Record] = []
88
+
89
+ for turn in entry.responses:
90
+ turn: IkatConversationEntry = turn # Ensure type is correct
91
+ query_id = f"{entry.number}#{turn.turn_id}"
92
+
93
+ # USER QUERY record
94
+ history.append(
95
+ Record(
96
+ IDItem(query_id),
97
+ SimpleTextItem(turn.user_utterance),
98
+ SimpleDecontextualizedItem(turn.resolved_utterance),
99
+ EntryType.USER_QUERY,
100
+ )
101
+ )
102
+
103
+ # Build citation info (stubbed relevance to match format)
104
+ relevances = {}
105
+ if turn.relevant_ptkbs:
106
+ # Example: just use first as relevant (can be improved)
107
+ relevances[0] = (0, None) # No position info in this structure
108
+
109
+ # SYSTEM ANSWER record
110
+ history.append(
111
+ Record(
112
+ AnswerEntry(turn.response),
113
+ EntryType.SYSTEM_ANSWER,
114
+ )
115
+ )
116
+
117
+ # Ensure reverse if needed for compatibility (optional)
118
+ history.reverse()
119
+ yield SingleConversationTree(entry.number, history)
120
+
@@ -65,7 +65,7 @@ class QrelsDataset(Dataset):
65
65
  return True
66
66
 
67
67
  def _prepare(self, download=False) -> Documents:
68
- return AdhocAssessments(id=self.fullid)
68
+ return AdhocAssessments.C(id=self.fullid)
69
69
 
70
70
 
71
71
  class QueriesDataset(Dataset):
@@ -78,7 +78,7 @@ class QueriesDataset(Dataset):
78
78
  return True
79
79
 
80
80
  def _prepare(self, download=False) -> Documents:
81
- return Topics(id=self.fullid)
81
+ return Topics.C(id=self.fullid)
82
82
 
83
83
 
84
84
  # class ScoredDocuments(Dataset):
@@ -96,7 +96,7 @@ class DocumentsDataset(Dataset):
96
96
  return True
97
97
 
98
98
  def _prepare(self, download=False) -> Documents:
99
- return Documents(id=self.fullid)
99
+ return Documents.C(id=self.fullid)
100
100
 
101
101
 
102
102
  class TrainingTripletsDataset(Dataset):
@@ -123,7 +123,7 @@ class Collection(Dataset):
123
123
  topics: QueriesDataset
124
124
 
125
125
  def _prepare(self, download=False) -> Documents:
126
- return Adhoc(
126
+ return Adhoc.C(
127
127
  id=self.fullid,
128
128
  topics=self.topics.prepare(download),
129
129
  assessments=self.assessments.prepare(download),
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '2025.5.13'
21
- __version_tuple__ = version_tuple = (2025, 5, 13)
20
+ __version__ = version = '2025.6.30'
21
+ __version_tuple__ = version_tuple = (2025, 6, 30)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.5.13
3
+ Version: 2025.6.30
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -46,6 +46,7 @@ src/datamaestro_text/config/ai/quac.yaml
46
46
  src/datamaestro_text/config/com/oscar-corpus.py
47
47
  src/datamaestro_text/config/com/sentiment140.py
48
48
  src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml
49
+ src/datamaestro_text/config/com/github/ikat.py
49
50
  src/datamaestro_text/config/com/github/aagohary/canard.py
50
51
  src/datamaestro_text/config/com/github/apple/ml-qrecc.py
51
52
  src/datamaestro_text/config/com/github/prdwb/orconvqa.py
@@ -89,6 +90,7 @@ src/datamaestro_text/data/text.py
89
90
  src/datamaestro_text/data/conversation/__init__.py
90
91
  src/datamaestro_text/data/conversation/base.py
91
92
  src/datamaestro_text/data/conversation/canard.py
93
+ src/datamaestro_text/data/conversation/ikat.py
92
94
  src/datamaestro_text/data/conversation/orconvqa.py
93
95
  src/datamaestro_text/data/conversation/qrecc.py
94
96
  src/datamaestro_text/data/ir/__init__.py