datamaestro-text 2025.6.11__tar.gz → 2025.7.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.github/workflows/pytest.yml +1 -1
  2. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.readthedocs.yml +1 -1
  3. {datamaestro_text-2025.6.11/src/datamaestro_text.egg-info → datamaestro_text-2025.7.28}/PKG-INFO +3 -3
  4. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/conversation.rst +24 -2
  5. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/ir.rst +8 -2
  6. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/pyproject.toml +9 -1
  7. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/requirements.txt +1 -1
  8. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +3 -3
  9. datamaestro_text-2025.7.28/src/datamaestro_text/config/com/github/ikat.py +121 -0
  10. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/sentiment140.py +4 -4
  11. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -1
  12. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/base.py +38 -13
  13. datamaestro_text-2025.7.28/src/datamaestro_text/data/conversation/ikat.py +145 -0
  14. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/__init__.py +25 -2
  15. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/base.py +2 -1
  16. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/formats.py +8 -2
  17. datamaestro_text-2025.7.28/src/datamaestro_text/data/ir/stores.py +124 -0
  18. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/trec.py +7 -4
  19. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/data.py +34 -15
  20. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/interfaces/trec.py +28 -1
  21. datamaestro_text-2025.7.28/src/datamaestro_text/utils/files.py +111 -0
  22. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/utils/iter.py +5 -0
  23. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/version.py +2 -2
  24. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28/src/datamaestro_text.egg-info}/PKG-INFO +3 -3
  25. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/SOURCES.txt +2 -0
  26. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/requires.txt +1 -1
  27. datamaestro_text-2025.6.11/src/datamaestro_text/data/ir/stores.py +0 -29
  28. datamaestro_text-2025.6.11/src/datamaestro_text/utils/files.py +0 -8
  29. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.circleci/config.yml +0 -0
  30. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.flake8 +0 -0
  31. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.github/workflows/python-publish.yml +0 -0
  32. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.gitignore +0 -0
  33. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/.pre-commit-config.yaml +0 -0
  34. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/LICENSE +0 -0
  35. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/MANIFEST.in +0 -0
  36. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/Makefile +0 -0
  37. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/README.md +0 -0
  38. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/Makefile +0 -0
  39. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/make.bat +0 -0
  40. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/requirements.txt +0 -0
  41. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/embeddings.rst +0 -0
  42. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/index.rst +0 -0
  43. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/nlp.rst +0 -0
  44. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/recommendation.rst +0 -0
  45. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/api/text.rst +0 -0
  46. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/conf.py +0 -0
  47. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/conversation.rst +0 -0
  48. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/embeddings.rst +0 -0
  49. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/index.rst +0 -0
  50. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/ir.rst +0 -0
  51. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/irds.rst +0 -0
  52. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/recommendation.rst +0 -0
  53. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/datasets/text.rst +0 -0
  54. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/docs/source/index.rst +0 -0
  55. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/mkdocs.yml +0 -0
  56. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/requirements-dev.txt +0 -0
  57. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/setup.cfg +0 -0
  58. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/__init__.py +0 -0
  59. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/__init__.py +0 -0
  60. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  61. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  62. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
  63. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
  64. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  65. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
  66. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  67. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
  68. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  69. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  70. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  71. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
  72. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
  73. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  74. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  75. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  76. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  77. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/__init__.py +0 -0
  78. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  79. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  80. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  81. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  82. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  83. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  84. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  85. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  86. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  87. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
  88. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  89. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
  90. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  91. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  92. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  93. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
  94. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
  95. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  96. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/__init__.py +0 -0
  97. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  98. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/canard.py +0 -0
  99. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
  100. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
  101. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/embeddings.py +0 -0
  102. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/cord19.py +0 -0
  103. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/csv.py +0 -0
  104. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/data.py +0 -0
  105. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  106. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/utils.py +0 -0
  107. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/recommendation.py +0 -0
  108. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/tagging.py +0 -0
  109. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/text.py +0 -0
  110. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  111. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
  112. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
  113. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  114. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/download/tmdb.py +0 -0
  115. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  116. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/test/__init__.py +0 -0
  117. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/test/test_datasets.py +0 -0
  118. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/test/test_documented.py +0 -0
  119. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/transforms/__init__.py +0 -0
  120. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
  121. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/utils/__init__.py +0 -0
  122. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/utils/randomstream.py +0 -0
  123. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text/utils/shuffle.py +0 -0
  124. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
  125. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
  126. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/top_level.txt +0 -0
  127. {datamaestro_text-2025.6.11 → datamaestro_text-2025.7.28}/tox.ini +0 -0
@@ -25,7 +25,7 @@ jobs:
25
25
  python-version: ${{ matrix.python-version }}
26
26
  - name: Install dependencies
27
27
  run: |
28
- python -m pip install --upgrade pip
28
+ python -m pip install --upgrade pip setuptools
29
29
  SETUPTOOLS_SCM_PRETEND_VERSION="0.1-dev" pip install -e '.[dev]'
30
30
  - name: Lint with flake8
31
31
  run: |
@@ -11,7 +11,7 @@ sphinx:
11
11
  build:
12
12
  os: ubuntu-20.04
13
13
  tools:
14
- python: "3.9"
14
+ python: "3.10"
15
15
 
16
16
 
17
17
  # Install the package
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.6.11
3
+ Version: 2025.7.28
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -15,10 +15,10 @@ Classifier: Operating System :: OS Independent
15
15
  Classifier: Programming Language :: Python
16
16
  Classifier: Programming Language :: Python :: 3
17
17
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
- Requires-Python: >=3.8
18
+ Requires-Python: >=3.10
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: datamaestro>=1.4.2
21
+ Requires-Dist: datamaestro>=1.5.0
22
22
  Requires-Dist: ir_datasets>=0.5.8
23
23
  Requires-Dist: attrs
24
24
  Provides-Extra: dev
@@ -26,6 +26,12 @@ Data classes
26
26
 
27
27
  .. autoclass:: ConversationTopic
28
28
 
29
+ Conversational IR
30
+ -----------------
31
+
32
+ .. autoxpmconfig:: datamaestro_text.data.conversation.base.ConversationUserTopics
33
+
34
+
29
35
  Contextual query reformulation
30
36
  ------------------------------
31
37
 
@@ -34,9 +40,13 @@ Contextual query reformulation
34
40
  .. autoclass:: ContextualizedRewrittenQuery
35
41
  :members:
36
42
 
43
+ CANARD Dataset
44
+
37
45
  .. autoxpmconfig:: datamaestro_text.data.conversation.canard.CanardDataset
38
46
  :members: iter
39
47
 
48
+ OrConvQA Dataset
49
+
40
50
  .. autoxpmconfig:: datamaestro_text.data.conversation.orconvqa.OrConvQADataset
41
51
  :members: iter
42
52
 
@@ -46,10 +56,22 @@ Contextual query reformulation
46
56
  .. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
47
57
  :members:
48
58
 
59
+ QReCC Dataset
49
60
 
50
-
51
- .. autoclass:: datamaestro_text.data.conversation.orconvqa.QReCCDatasetEntry
61
+ .. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
52
62
  :members:
53
63
 
54
64
  .. autoxpmconfig:: datamaestro_text.data.conversation.qrecc.QReCCDataset
55
65
  :members: iter
66
+
67
+
68
+ iKAT Dataset
69
+
70
+ .. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationTopic
71
+ :members:
72
+
73
+ .. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationEntry
74
+ :members:
75
+
76
+ .. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatConversations
77
+ :members: iter
@@ -36,11 +36,17 @@ Documents
36
36
 
37
37
  .. autoxpmconfig:: datamaestro_text.data.ir.Documents
38
38
  :members: iter_documents, iter_ids, documentcount
39
+ .. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
40
+ .. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
41
+
42
+
43
+ Dataset-specific documents
44
+ **************************
45
+
39
46
  .. autoxpmconfig:: datamaestro_text.data.ir.cord19.Documents
40
47
  .. autoxpmconfig:: datamaestro_text.data.ir.trec.TipsterCollection
41
- .. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
42
48
  .. autoxpmconfig:: datamaestro_text.data.ir.stores.OrConvQADocumentStore
43
- .. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
49
+ .. autoxpmconfig:: datamaestro_text.data.ir.stores.IKatClueWeb22DocumentStore
44
50
 
45
51
  Assessments
46
52
  -----------
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datamaestro-text"
3
- requires-python = ">=3.8"
3
+ requires-python = ">=3.10"
4
4
  keywords = ["dataset manager", "information retrieval", "experiments"]
5
5
  description = "Datamaestro module for text-related datasets"
6
6
  dynamic = ["version", "readme", "dependencies"]
@@ -44,6 +44,14 @@ fallback_version = "0.0.0-dev"
44
44
  [build-system]
45
45
  requires = ["setuptools", "setuptools-scm", "wheel"] # PEP 508 specifications.
46
46
 
47
+ [dependency-groups]
48
+ dev = [
49
+ "docutils>=0.21.2",
50
+ "pytest>=8.4.1",
51
+ "sphinx>=8.1.3",
52
+ "sphobjinv>=2.3.1.3",
53
+ ]
54
+
47
55
  [project.entry-points."datamaestro.repositories"]
48
56
  text = "datamaestro_text:Repository"
49
57
  irds = "datamaestro_text.datasets.irds:Repository"
@@ -1,3 +1,3 @@
1
- datamaestro>=1.4.2
1
+ datamaestro>=1.5.0
2
2
  ir_datasets>=0.5.8
3
3
  attrs
@@ -39,9 +39,9 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
39
39
  answering that includes the individual subtasks of question rewriting,
40
40
  passage retrieval and reading comprehension
41
41
  """
42
- return Supervised(
43
- train=QReCCDataset(path=data / "qrecc_train.json"),
44
- test=QReCCDataset(path=data / "qrecc_test.json"),
42
+ return Supervised.C(
43
+ train=QReCCDataset.C(path=data / "qrecc_train.json"),
44
+ test=QReCCDataset.C(path=data / "qrecc_test.json"),
45
45
  )
46
46
 
47
47
 
@@ -0,0 +1,121 @@
1
+ # See documentation on https://datamaestro.readthedocs.io
2
+
3
+ import bz2
4
+ from datamaestro.download import reference
5
+ from datamaestro.definitions import datatasks, datatags, dataset
6
+ from datamaestro_text.data.conversation.base import ConversationUserTopics
7
+ from datamaestro_text.data.ir import Adhoc
8
+
9
+ from datamaestro.utils import HashCheck
10
+ from datamaestro.context import DatafolderPath
11
+ from datamaestro.download.single import filedownloader
12
+ from datamaestro_text.data.conversation.ikat import IkatConversations
13
+ from datamaestro.download.links import linkfolder
14
+
15
+ from datamaestro_text.data.ir.stores import IKatClueWeb22DocumentStore
16
+ from datamaestro_text.data.ir.trec import TrecAdhocAssessments
17
+ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
18
+
19
+
20
+ @dataset(as_prepare=True)
21
+ def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
22
+ # Number of documents in the dataset
23
+ count = 116_838_987
24
+
25
+ jsonl_folder = linkfolder(
26
+ "documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
27
+ ).setup(dataset, options)
28
+ store_path = lz4docstore_builder(
29
+ "store",
30
+ IKatClueWeb22DocumentStore.generator(
31
+ jsonl_folder,
32
+ jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
33
+ jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
34
+ ),
35
+ IKatClueWeb22DocumentStore.Document,
36
+ "id",
37
+ count_hint=count,
38
+ ).setup(dataset, options)
39
+
40
+ return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
41
+
42
+
43
+ @datatags("conversation", "context", "query")
44
+ @datatasks("conversational search", "query rewriting")
45
+ @reference("documents", clueweb22)
46
+ @filedownloader(
47
+ "topics.json",
48
+ "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
49
+ checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
50
+ )
51
+ @dataset(
52
+ id="2025",
53
+ url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
54
+ )
55
+ def test_2025(topics, documents) -> Adhoc.C:
56
+ """Question-in-context rewriting
57
+
58
+ iKAT is a test dataset for question-in-context rewriting that consists of
59
+ questions each given in a dialog context together with a context-independent
60
+ rewriting of the question.
61
+ """
62
+ return Adhoc.C(
63
+ topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
64
+ # TODO: add when available
65
+ assessments=TrecAdhocAssessments.C(path="/to/do"),
66
+ documents=documents,
67
+ )
68
+
69
+
70
+ @datatags("conversation", "context", "query")
71
+ @datatasks("conversational search", "query rewriting")
72
+ @reference("documents", clueweb22)
73
+ @filedownloader(
74
+ "qrels",
75
+ "https://trec.nist.gov/data/ikat/2024-qrels.txt",
76
+ checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
77
+ )
78
+ @filedownloader(
79
+ "topics.json",
80
+ "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
81
+ checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
82
+ )
83
+ @dataset(
84
+ Adhoc,
85
+ id="2024",
86
+ url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
87
+ )
88
+ def test_2024(topics, qrels, documents) -> Adhoc.C:
89
+ """iKAT 2024 dataset"""
90
+ return Adhoc.C(
91
+ topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
92
+ assessments=TrecAdhocAssessments.C(path=qrels),
93
+ documents=documents,
94
+ )
95
+
96
+
97
+ @datatags("conversation", "context", "query")
98
+ @datatasks("conversational search", "query rewriting")
99
+ @reference("documents", clueweb22)
100
+ @filedownloader(
101
+ "qrels",
102
+ "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
103
+ checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
104
+ )
105
+ @filedownloader(
106
+ "topics.json",
107
+ "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
108
+ checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
109
+ )
110
+ @dataset(
111
+ Adhoc,
112
+ id="2023",
113
+ url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
114
+ )
115
+ def test_2023(topics, qrels, documents) -> Adhoc.C:
116
+ """iKAT 2023 dataset"""
117
+ return Adhoc.C(
118
+ topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
119
+ assessments=TrecAdhocAssessments.C(path=qrels),
120
+ documents=documents,
121
+ )
@@ -26,7 +26,7 @@ def english(dir):
26
26
 
27
27
  If you use this data, please cite Sentiment140 as your source.
28
28
  """
29
- return {
30
- "train": Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
31
- "test": Generic(path=dir / "testdata.manual.2009.06.14.csv"),
32
- }
29
+ return Supervised.C(
30
+ train=Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
31
+ test=Generic(path=dir / "testdata.manual.2009.06.14.csv"),
32
+ )
@@ -17,7 +17,6 @@ of three CD-ROMs of SGML encoded documents distributed by LDC plus queries and a
17
17
  See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data/intro_eng.html
18
18
  """
19
19
 
20
- from datamaestro.data import Base
21
20
  from datamaestro_text.data.ir.trec import TipsterCollection
22
21
  from datamaestro.download.links import linkfolder
23
22
  from datamaestro.definitions import (
@@ -1,10 +1,13 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from enum import Enum
3
+ from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
4
+ from experimaestro import Param
3
5
  from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
4
6
  from attr import define
7
+ from datamaestro.record import record_type
5
8
  from datamaestro.data import Base
6
9
  from datamaestro.record import Record, Item
7
- from datamaestro_text.data.ir import TopicRecord
10
+ from datamaestro_text.data.ir import TopicRecord, Topics
8
11
  from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView
9
12
 
10
13
  # ---- Basic types
@@ -120,20 +123,17 @@ class ConversationNode:
120
123
  ...
121
124
 
122
125
  @abstractmethod
123
- def parent(self) -> Optional["ConversationNode"]:
124
- ...
126
+ def parent(self) -> Optional["ConversationNode"]: ...
125
127
 
126
128
  @abstractmethod
127
- def children(self) -> List["ConversationNode"]:
128
- ...
129
+ def children(self) -> List["ConversationNode"]: ...
129
130
 
130
131
 
131
132
  class ConversationTree(ABC):
132
133
  """Represents a conversation tree"""
133
134
 
134
135
  @abstractmethod
135
- def root(self) -> ConversationNode:
136
- ...
136
+ def root(self) -> ConversationNode: ...
137
137
 
138
138
  @abstractmethod
139
139
  def __iter__(self) -> Iterator[ConversationNode]:
@@ -214,8 +214,8 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
214
214
 
215
215
  def __init__(self, entry):
216
216
  self.entry = entry
217
- self.parent = None
218
- self.children = []
217
+ self._parent = None
218
+ self._children = []
219
219
 
220
220
  def add(self, node: "ConversationTreeNode") -> "ConversationTreeNode":
221
221
  self._children.append(node)
@@ -224,10 +224,10 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
224
224
 
225
225
  def conversation(self, skip_self: bool) -> ConversationHistory:
226
226
  def iterator():
227
- current = self.parent if skip_self else self
227
+ current = self.parent() if skip_self else self
228
228
  while current is not None:
229
229
  yield current.entry
230
- current = current.parent
230
+ current = current.parent()
231
231
 
232
232
  return LazyList(FactoryIterable(iterator))
233
233
 
@@ -253,5 +253,30 @@ class ConversationDataset(Base, ABC):
253
253
  @abstractmethod
254
254
  def __iter__(self) -> Iterator[ConversationTree]:
255
255
  """Return an iterator over conversations"""
256
- for i in range(len(self)):
257
- yield self.get(i)
256
+ ...
257
+
258
+
259
+ class ConversationUserTopics(Topics):
260
+ """Extract user topics from conversations"""
261
+
262
+ conversations: Param[ConversationDataset]
263
+
264
+ topic_recordtype = record_type(IDItem, SimpleTextItem)
265
+
266
+ def iter(self) -> Iterator[TopicRecord]:
267
+ """Returns an iterator over topics"""
268
+ # Extracts topics from conversations, Each user query is a topic (can perform retrieval on it)
269
+ # TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
270
+
271
+ records: List[TopicRecord] = []
272
+ for conversation in self.conversations.__iter__():
273
+ nodes = [
274
+ node
275
+ for node in conversation
276
+ if node.entry[EntryType] == EntryType.USER_QUERY
277
+ ]
278
+ for node in nodes:
279
+ records.append(
280
+ node.entry.update(ConversationHistoryItem(node.history()))
281
+ )
282
+ return iter(records)
@@ -0,0 +1,145 @@
1
+ from typing import Iterator, List
2
+ from attr import define, field
3
+ import json
4
+ import logging
5
+ from datamaestro.data import File
6
+ from datamaestro.record import Record
7
+
8
+ from datamaestro_text.data.ir import Topics
9
+ from datamaestro_text.data.ir.base import (
10
+ IDItem,
11
+ SimpleTextItem,
12
+ )
13
+
14
+
15
+ from .base import (
16
+ AnswerEntry,
17
+ ConversationTree,
18
+ EntryType,
19
+ SimpleDecontextualizedItem,
20
+ SingleConversationTree,
21
+ )
22
+ from . import ConversationDataset
23
+
24
+ # Keys to change in the dataset entries for compatibility across different years
25
+
26
+ KEY_MAPPINGS = {
27
+ # Keys to replace: Target Key
28
+ "turns": "responses",
29
+ "utterance": "user_utterance",
30
+ "ptkb_provenance": "relevant_ptkbs",
31
+ "response_provenance": "citations",
32
+ }
33
+
34
+
35
+ def norm_dict(entry: dict) -> dict:
36
+ """Convert keys in the entry to match the expected format."""
37
+ normalized = {}
38
+ for k, v in entry.items():
39
+ # Check for direct mapping, then try lowercase mapping
40
+ new_key = KEY_MAPPINGS.get(k) or KEY_MAPPINGS.get(k.lower()) or k.lower()
41
+ normalized[new_key] = v
42
+ return normalized
43
+
44
+
45
+ @define(kw_only=True)
46
+ class IkatConversationEntry:
47
+ """A query with past history"""
48
+
49
+ turn_id: int
50
+ """Turn number in the conversation"""
51
+
52
+ user_utterance: str
53
+ """The last issued query"""
54
+
55
+ resolved_utterance: str
56
+ """Manually rewritten query"""
57
+
58
+ response: str
59
+ """The system response to the query"""
60
+
61
+ relevant_ptkbs: List[str]
62
+ """The list of relevant personal knowledge bases for the query"""
63
+
64
+ citations: List[str]
65
+ """The list of citations for the response"""
66
+
67
+
68
+ @define(kw_only=True)
69
+ class IkatConversationTopic:
70
+ """A query with past history"""
71
+
72
+ number: str
73
+ """Conversation ID"""
74
+
75
+ title: str
76
+ """Title of the conversation"""
77
+
78
+ ptkb: str
79
+ """The personal knowledge base associated with the user"""
80
+
81
+ responses: List[IkatConversationEntry] = field(
82
+ converter=lambda items: [
83
+ IkatConversationEntry(**item) if isinstance(item, dict) else item
84
+ for item in map(norm_dict, items)
85
+ ]
86
+ )
87
+ """The list of responses to the query"""
88
+
89
+
90
+ class IkatConversations(ConversationDataset, File):
91
+ """A dataset containing conversations from the IKAT project"""
92
+
93
+ """Keys to change in the dataset entries for compatibility across different years"""
94
+
95
+ def entries(self) -> Iterator[IkatConversationTopic]:
96
+ """Reads all conversation entries from the dataset file."""
97
+ with self.path.open("rt") as fp:
98
+ raw_data = json.load(fp)
99
+
100
+ logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
101
+ logging.debug(f"raw data has keys {raw_data[0].keys()}")
102
+
103
+ for entry in raw_data:
104
+ try:
105
+ normalized_entry = norm_dict(entry)
106
+ yield IkatConversationTopic(**normalized_entry)
107
+ except Exception as e:
108
+ logging.warning(f"Failed to parse entry: {e}")
109
+ raise e
110
+
111
+ def __iter__(self) -> Iterator[ConversationTree]:
112
+ for entry in self.entries():
113
+ history: List[Record] = []
114
+
115
+ for turn in entry.responses:
116
+ turn: IkatConversationEntry = turn # Ensure type is correct
117
+ query_id = f"{entry.number}_{turn.turn_id}"
118
+
119
+ # USER QUERY record
120
+ history.append(
121
+ Record(
122
+ IDItem(query_id),
123
+ SimpleTextItem(turn.user_utterance),
124
+ SimpleDecontextualizedItem(turn.resolved_utterance),
125
+ EntryType.USER_QUERY,
126
+ )
127
+ )
128
+
129
+ # Build citation info (stubbed relevance to match format)
130
+ relevances = {}
131
+ if turn.relevant_ptkbs:
132
+ # Example: just use first as relevant (can be improved)
133
+ relevances[0] = (0, None) # No position info in this structure
134
+
135
+ # SYSTEM ANSWER record
136
+ history.append(
137
+ Record(
138
+ AnswerEntry(turn.response),
139
+ EntryType.SYSTEM_ANSWER,
140
+ )
141
+ )
142
+
143
+ # Ensure reverse if needed for compatibility (optional)
144
+ history.reverse()
145
+ yield SingleConversationTree(entry.number, history)
@@ -2,9 +2,10 @@
2
2
 
3
3
  from abc import ABC, abstractmethod
4
4
  from functools import cached_property
5
+ import logging
5
6
  from pathlib import Path
6
7
  from attrs import define
7
- from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type
8
+ from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING
8
9
  import random
9
10
  from experimaestro import Config
10
11
  from datamaestro.definitions import datatasks, Param, Meta
@@ -28,6 +29,9 @@ from .base import ( # noqa: F401
28
29
  AdhocAssessedTopic,
29
30
  )
30
31
 
32
+ #: A adhoc run dictionary (query id -> doc id -> score)
33
+ AdhocRunDict = dict[str, dict[str, float]]
34
+
31
35
 
32
36
  class Documents(Base):
33
37
  """A set of documents with identifiers
@@ -45,6 +49,22 @@ class Documents(Base):
45
49
  def iter_documents(self) -> Iterator[DocumentRecord]:
46
50
  return self.iter()
47
51
 
52
+ def iter_documents_from(self, start=0) -> Iterator[DocumentRecord]:
53
+ """Iterate over a range of documents
54
+
55
+ Can be specialized in a subclass for faster access
56
+
57
+ :param start: The starting document, defaults to 0
58
+ :return: An iterator
59
+ """
60
+ iter = self.iter()
61
+ if start > 0:
62
+ logging.info("skipping %d documents", start + 1)
63
+ for _ in range(start + 1):
64
+ next(iter)
65
+
66
+ return iter
67
+
48
68
  def iter_ids(self) -> Iterator[str]:
49
69
  """Iterates over document ids
50
70
 
@@ -168,7 +188,10 @@ class AdhocAssessments(Base, ABC):
168
188
  class AdhocRun(Base):
169
189
  """IR adhoc run"""
170
190
 
171
- pass
191
+ @abstractmethod
192
+ def get_dict(self) -> "AdhocRunDict":
193
+ """Get the run as a dictionary query ID -> doc ID -> score"""
194
+ ...
172
195
 
173
196
 
174
197
  class AdhocResults(Base):
@@ -43,6 +43,7 @@ class IDItem(Item, ABC):
43
43
 
44
44
  id: str
45
45
 
46
+
46
47
  @define
47
48
  class UrlItem(Item):
48
49
  """An url item"""
@@ -70,7 +71,7 @@ class AdhocAssessedTopic:
70
71
  """List of assessments for this topic"""
71
72
 
72
73
 
73
- def create_record(*items: Item, id: str = None, text: str = None):
74
+ def create_record(*items: Item, id: str = None, text: str = None) -> Record:
74
75
  """Easy creation of a text/id item"""
75
76
  extra_items = []
76
77
  if id is not None:
@@ -99,7 +99,7 @@ class WapoDocument(TextItem):
99
99
  body_media: Tuple[WapoDocMedia, ...]
100
100
 
101
101
  @cached_property
102
- def text(self):
102
+ def text(self):
103
103
  return f"{self.title} {self.body_paras_html}"
104
104
 
105
105
 
@@ -132,11 +132,15 @@ class DprW100Doc(TextItem):
132
132
  text: str
133
133
  title: str
134
134
 
135
- @define
135
+
136
+ @define
136
137
  class MsMarcoV2Passage(TextItem):
137
138
  text: str
138
139
  spans: Tuple[Tuple[int, int], ...]
139
140
  msmarco_document_id: str
141
+
142
+
143
+ @define
140
144
  class Touche2020(TextItem):
141
145
  text: str
142
146
  title: str
@@ -204,11 +208,13 @@ class TrecTopic(SimpleTextItem):
204
208
 
205
209
  TrecTopicRecord = record_type(IDItem, TrecTopic)
206
210
 
211
+
207
212
  @define
208
213
  class DprW100Query(TextItem):
209
214
  text: str
210
215
  answers: Tuple[str]
211
216
 
217
+
212
218
  @define
213
219
  class TrecBackgroundLinkingQuery(IDItem):
214
220
  query_id: str