datamaestro-text 2025.6.30__tar.gz → 2025.9.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.github/workflows/pytest.yml +1 -1
  2. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.readthedocs.yml +1 -1
  3. {datamaestro_text-2025.6.30/src/datamaestro_text.egg-info → datamaestro_text-2025.9.11}/PKG-INFO +3 -3
  4. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/conversation.rst +10 -4
  5. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/ir.rst +8 -2
  6. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/pyproject.toml +9 -1
  7. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/requirements.txt +1 -1
  8. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/aagohary/canard.py +3 -3
  9. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +2 -1
  10. datamaestro_text-2025.9.11/src/datamaestro_text/config/com/github/ikat.py +121 -0
  11. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +1 -1
  12. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/sentiment140.py +4 -4
  13. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/aclimdb.py +2 -2
  14. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -1
  15. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +1 -1
  16. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/metamind/research/wikitext.py +3 -3
  17. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/grouplens/movielens.py +8 -8
  18. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/universaldependencies/french.py +3 -3
  19. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/base.py +34 -9
  20. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/ikat.py +38 -13
  21. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/__init__.py +44 -4
  22. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/base.py +2 -1
  23. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/formats.py +8 -2
  24. datamaestro_text-2025.9.11/src/datamaestro_text/data/ir/stores.py +124 -0
  25. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/trec.py +7 -4
  26. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/data.py +47 -16
  27. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/interfaces/trec.py +28 -1
  28. datamaestro_text-2025.9.11/src/datamaestro_text/utils/files.py +111 -0
  29. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/iter.py +5 -0
  30. datamaestro_text-2025.9.11/src/datamaestro_text/version.py +34 -0
  31. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11/src/datamaestro_text.egg-info}/PKG-INFO +3 -3
  32. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/requires.txt +1 -1
  33. datamaestro_text-2025.6.30/src/datamaestro_text/config/com/github/ikat.py +0 -38
  34. datamaestro_text-2025.6.30/src/datamaestro_text/data/ir/stores.py +0 -29
  35. datamaestro_text-2025.6.30/src/datamaestro_text/utils/files.py +0 -8
  36. datamaestro_text-2025.6.30/src/datamaestro_text/version.py +0 -21
  37. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.circleci/config.yml +0 -0
  38. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.flake8 +0 -0
  39. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.github/workflows/python-publish.yml +0 -0
  40. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.gitignore +0 -0
  41. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/.pre-commit-config.yaml +0 -0
  42. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/LICENSE +0 -0
  43. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/MANIFEST.in +0 -0
  44. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/Makefile +0 -0
  45. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/README.md +0 -0
  46. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/Makefile +0 -0
  47. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/make.bat +0 -0
  48. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/requirements.txt +0 -0
  49. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/embeddings.rst +0 -0
  50. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/index.rst +0 -0
  51. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/nlp.rst +0 -0
  52. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/recommendation.rst +0 -0
  53. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/api/text.rst +0 -0
  54. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/conf.py +0 -0
  55. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/conversation.rst +0 -0
  56. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/embeddings.rst +0 -0
  57. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/index.rst +0 -0
  58. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/ir.rst +0 -0
  59. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/irds.rst +0 -0
  60. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/recommendation.rst +0 -0
  61. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/datasets/text.rst +0 -0
  62. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/docs/source/index.rst +0 -0
  63. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/mkdocs.yml +0 -0
  64. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/requirements-dev.txt +0 -0
  65. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/setup.cfg +0 -0
  66. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/__init__.py +0 -0
  67. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/__init__.py +0 -0
  68. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  69. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  70. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
  71. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  72. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  73. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
  74. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  75. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  76. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  77. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
  78. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  79. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  80. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  81. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  82. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/__init__.py +0 -0
  83. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  84. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  85. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  86. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  87. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  88. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  89. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  90. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  91. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  92. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  93. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  94. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  95. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  96. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  97. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/__init__.py +0 -0
  98. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  99. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/canard.py +0 -0
  100. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
  101. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
  102. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/embeddings.py +0 -0
  103. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/cord19.py +0 -0
  104. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/csv.py +0 -0
  105. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/data.py +0 -0
  106. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  107. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/ir/utils.py +0 -0
  108. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/recommendation.py +0 -0
  109. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/tagging.py +0 -0
  110. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/data/text.py +0 -0
  111. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  112. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
  113. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
  114. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  115. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/download/tmdb.py +0 -0
  116. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  117. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/test/__init__.py +0 -0
  118. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/test/test_datasets.py +0 -0
  119. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/test/test_documented.py +0 -0
  120. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/transforms/__init__.py +0 -0
  121. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
  122. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/__init__.py +0 -0
  123. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/randomstream.py +0 -0
  124. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text/utils/shuffle.py +0 -0
  125. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/SOURCES.txt +0 -0
  126. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
  127. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
  128. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/src/datamaestro_text.egg-info/top_level.txt +0 -0
  129. {datamaestro_text-2025.6.30 → datamaestro_text-2025.9.11}/tox.ini +0 -0
@@ -25,7 +25,7 @@ jobs:
25
25
  python-version: ${{ matrix.python-version }}
26
26
  - name: Install dependencies
27
27
  run: |
28
- python -m pip install --upgrade pip
28
+ python -m pip install --upgrade pip setuptools
29
29
  SETUPTOOLS_SCM_PRETEND_VERSION="0.1-dev" pip install -e '.[dev]'
30
30
  - name: Lint with flake8
31
31
  run: |
@@ -11,7 +11,7 @@ sphinx:
11
11
  build:
12
12
  os: ubuntu-20.04
13
13
  tools:
14
- python: "3.9"
14
+ python: "3.10"
15
15
 
16
16
 
17
17
  # Install the package
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.6.30
3
+ Version: 2025.9.11
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -15,10 +15,10 @@ Classifier: Operating System :: OS Independent
15
15
  Classifier: Programming Language :: Python
16
16
  Classifier: Programming Language :: Python :: 3
17
17
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
- Requires-Python: >=3.8
18
+ Requires-Python: >=3.10
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: datamaestro>=1.4.2
21
+ Requires-Dist: datamaestro>=1.5.0
22
22
  Requires-Dist: ir_datasets>=0.5.8
23
23
  Requires-Dist: attrs
24
24
  Provides-Extra: dev
@@ -26,6 +26,12 @@ Data classes
26
26
 
27
27
  .. autoclass:: ConversationTopic
28
28
 
29
+ Conversational IR
30
+ -----------------
31
+
32
+ .. autoxpmconfig:: datamaestro_text.data.conversation.base.ConversationUserTopics
33
+
34
+
29
35
  Contextual query reformulation
30
36
  ------------------------------
31
37
 
@@ -34,7 +40,7 @@ Contextual query reformulation
34
40
  .. autoclass:: ContextualizedRewrittenQuery
35
41
  :members:
36
42
 
37
- CANARD Dataset
43
+ CANARD Dataset
38
44
 
39
45
  .. autoxpmconfig:: datamaestro_text.data.conversation.canard.CanardDataset
40
46
  :members: iter
@@ -50,7 +56,7 @@ OrConvQA Dataset
50
56
  .. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
51
57
  :members:
52
58
 
53
- QReCC Dataset
59
+ QReCC Dataset
54
60
 
55
61
  .. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
56
62
  :members:
@@ -61,11 +67,11 @@ QReCC Dataset
61
67
 
62
68
  iKAT Dataset
63
69
 
64
- .. autoclass:: datamaestro_text.data.conversation.ikat.IkatDatasetEntry
70
+ .. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationTopic
65
71
  :members:
66
72
 
67
73
  .. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationEntry
68
74
  :members:
69
75
 
70
- .. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatDataset
76
+ .. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatConversations
71
77
  :members: iter
@@ -36,11 +36,17 @@ Documents
36
36
 
37
37
  .. autoxpmconfig:: datamaestro_text.data.ir.Documents
38
38
  :members: iter_documents, iter_ids, documentcount
39
+ .. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
40
+ .. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
41
+
42
+
43
+ Dataset-specific documents
44
+ **************************
45
+
39
46
  .. autoxpmconfig:: datamaestro_text.data.ir.cord19.Documents
40
47
  .. autoxpmconfig:: datamaestro_text.data.ir.trec.TipsterCollection
41
- .. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
42
48
  .. autoxpmconfig:: datamaestro_text.data.ir.stores.OrConvQADocumentStore
43
- .. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
49
+ .. autoxpmconfig:: datamaestro_text.data.ir.stores.IKatClueWeb22DocumentStore
44
50
 
45
51
  Assessments
46
52
  -----------
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datamaestro-text"
3
- requires-python = ">=3.8"
3
+ requires-python = ">=3.10"
4
4
  keywords = ["dataset manager", "information retrieval", "experiments"]
5
5
  description = "Datamaestro module for text-related datasets"
6
6
  dynamic = ["version", "readme", "dependencies"]
@@ -44,6 +44,14 @@ fallback_version = "0.0.0-dev"
44
44
  [build-system]
45
45
  requires = ["setuptools", "setuptools-scm", "wheel"] # PEP 508 specifications.
46
46
 
47
+ [dependency-groups]
48
+ dev = [
49
+ "docutils>=0.21.2",
50
+ "pytest>=8.4.1",
51
+ "sphinx>=8.1.3",
52
+ "sphobjinv>=2.3.1.3",
53
+ ]
54
+
47
55
  [project.entry-points."datamaestro.repositories"]
48
56
  text = "datamaestro_text:Repository"
49
57
  irds = "datamaestro_text.datasets.irds:Repository"
@@ -1,3 +1,3 @@
1
- datamaestro>=1.4.2
1
+ datamaestro>=1.5.0
2
2
  ir_datasets>=0.5.8
3
3
  attrs
@@ -37,7 +37,7 @@ def main(train, dev, test):
37
37
  Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
38
38
  """
39
39
  return {
40
- "train": CanardDataset(path=train),
41
- "validation": CanardDataset(path=dev),
42
- "test": CanardDataset(path=test),
40
+ "train": CanardDataset.C(path=train),
41
+ "validation": CanardDataset.C(path=dev),
42
+ "test": CanardDataset.C(path=test),
43
43
  }
@@ -51,6 +51,7 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
51
51
  )
52
52
  class Content(LZ4JSONLDocumentStore):
53
53
  """QReCC mentionned URLs content"""
54
+
54
55
  @staticmethod
55
56
  def __create_dataset__(dataset, options=None):
56
57
  ds = reference(reference=main).setup(dataset, options)
@@ -65,7 +66,7 @@ class Content(LZ4JSONLDocumentStore):
65
66
  "id",
66
67
  ).setup(dataset, options)
67
68
 
68
- return Content(jsonl_path=store_path)
69
+ return Content.C(jsonl_path=store_path)
69
70
 
70
71
  @staticmethod
71
72
  def _documents(path: Path):
@@ -0,0 +1,121 @@
1
+ # See documentation on https://datamaestro.readthedocs.io
2
+
3
+ import bz2
4
+ from datamaestro.download import reference
5
+ from datamaestro.definitions import datatasks, datatags, dataset
6
+ from datamaestro_text.data.conversation.base import ConversationUserTopics
7
+ from datamaestro_text.data.ir import Adhoc
8
+
9
+ from datamaestro.utils import HashCheck
10
+ from datamaestro.context import DatafolderPath
11
+ from datamaestro.download.single import filedownloader
12
+ from datamaestro_text.data.conversation.ikat import IkatConversations
13
+ from datamaestro.download.links import linkfolder
14
+
15
+ from datamaestro_text.data.ir.stores import IKatClueWeb22DocumentStore
16
+ from datamaestro_text.data.ir.trec import TrecAdhocAssessments
17
+ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
18
+
19
+
20
+ @dataset(as_prepare=True)
21
+ def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
22
+ # Number of documents in the dataset
23
+ count = 116_838_987
24
+
25
+ jsonl_folder = linkfolder(
26
+ "documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
27
+ ).setup(dataset, options)
28
+ store_path = lz4docstore_builder(
29
+ "store",
30
+ IKatClueWeb22DocumentStore.generator(
31
+ jsonl_folder,
32
+ jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
33
+ jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
34
+ ),
35
+ IKatClueWeb22DocumentStore.Document,
36
+ "id",
37
+ count_hint=count,
38
+ ).setup(dataset, options)
39
+
40
+ return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
41
+
42
+
43
+ @datatags("conversation", "context", "query")
44
+ @datatasks("conversational search", "query rewriting")
45
+ @reference("documents", clueweb22)
46
+ @filedownloader(
47
+ "topics.json",
48
+ "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
49
+ checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
50
+ )
51
+ @dataset(
52
+ id="2025",
53
+ url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
54
+ )
55
+ def test_2025(topics, documents) -> Adhoc.C:
56
+ """Question-in-context rewriting
57
+
58
+ iKAT is a test dataset for question-in-context rewriting that consists of
59
+ questions each given in a dialog context together with a context-independent
60
+ rewriting of the question.
61
+ """
62
+ return Adhoc.C(
63
+ topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
64
+ # TODO: add when available
65
+ assessments=TrecAdhocAssessments.C(path="/to/do"),
66
+ documents=documents,
67
+ )
68
+
69
+
70
+ @datatags("conversation", "context", "query")
71
+ @datatasks("conversational search", "query rewriting")
72
+ @reference("documents", clueweb22)
73
+ @filedownloader(
74
+ "qrels",
75
+ "https://trec.nist.gov/data/ikat/2024-qrels.txt",
76
+ checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
77
+ )
78
+ @filedownloader(
79
+ "topics.json",
80
+ "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
81
+ checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
82
+ )
83
+ @dataset(
84
+ Adhoc,
85
+ id="2024",
86
+ url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
87
+ )
88
+ def test_2024(topics, qrels, documents) -> Adhoc.C:
89
+ """iKAT 2024 dataset"""
90
+ return Adhoc.C(
91
+ topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
92
+ assessments=TrecAdhocAssessments.C(path=qrels),
93
+ documents=documents,
94
+ )
95
+
96
+
97
+ @datatags("conversation", "context", "query")
98
+ @datatasks("conversational search", "query rewriting")
99
+ @reference("documents", clueweb22)
100
+ @filedownloader(
101
+ "qrels",
102
+ "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
103
+ checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
104
+ )
105
+ @filedownloader(
106
+ "topics.json",
107
+ "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
108
+ checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
109
+ )
110
+ @dataset(
111
+ Adhoc,
112
+ id="2023",
113
+ url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
114
+ )
115
+ def test_2023(topics, qrels, documents) -> Adhoc.C:
116
+ """iKAT 2023 dataset"""
117
+ return Adhoc.C(
118
+ topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
119
+ assessments=TrecAdhocAssessments.C(path=qrels),
120
+ documents=documents,
121
+ )
@@ -47,7 +47,7 @@ http://www.msmarco.org/dataset.aspx""",
47
47
  @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
48
48
  def collection_etc(data) -> Folder:
49
49
  """Documents and some more files"""
50
- return Folder(path=data)
50
+ return Folder.C(path=data)
51
51
 
52
52
 
53
53
  @lua
@@ -26,7 +26,7 @@ def english(dir):
26
26
 
27
27
  If you use this data, please cite Sentiment140 as your source.
28
28
  """
29
- return {
30
- "train": Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
31
- "test": Generic(path=dir / "testdata.manual.2009.06.14.csv"),
32
- }
29
+ return Supervised.C(
30
+ train=Generic.C(path=dir / "training.1600000.processed.noemoticon.csv"),
31
+ test=Generic.C(path=dir / "testdata.manual.2009.06.14.csv"),
32
+ )
@@ -11,6 +11,6 @@ def aclimdb(data):
11
11
  Paper http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf
12
12
  """
13
13
  return {
14
- "train": FolderBased(path=data / "train", classes=["neg", "pos"]),
15
- "test": FolderBased(path=data / "test", classes=["neg", "pos"]),
14
+ "train": FolderBased.C(path=data / "train", classes=["neg", "pos"]),
15
+ "test": FolderBased.C(path=data / "test", classes=["neg", "pos"]),
16
16
  }
@@ -17,7 +17,6 @@ of three CD-ROMs of SGML encoded documents distributed by LDC plus queries and a
17
17
  See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data/intro_eng.html
18
18
  """
19
19
 
20
- from datamaestro.data import Base
21
20
  from datamaestro_text.data.ir.trec import TipsterCollection
22
21
  from datamaestro.download.links import linkfolder
23
22
  from datamaestro.definitions import (
@@ -32,4 +32,4 @@ def v1(train, validation):
32
32
  Only the train and validation dataset are available. The test set is hidden
33
33
  for the leaderboard.
34
34
  """
35
- return {"train": File(path=train), "validation": File(path=validation)}
35
+ return {"train": File.C(path=train), "validation": File.C(path=validation)}
@@ -30,9 +30,9 @@ def WikiText(data, type):
30
30
  https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
31
31
  """
32
32
  return {
33
- "train": File(path=data / ("wiki.train.%s" % type)),
34
- "validation": File(path=data / ("wiki.valid.%s" % type)),
35
- "test": File(path=data / ("wiki.test.%s" % type)),
33
+ "train": File.C(path=data / ("wiki.train.%s" % type)),
34
+ "validation": File.C(path=data / ("wiki.valid.%s" % type)),
35
+ "test": File.C(path=data / ("wiki.test.%s" % type)),
36
36
  }
37
37
 
38
38
 
@@ -31,10 +31,10 @@ def small(ds) -> Movielens:
31
31
  100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users (as of 9/2018)
32
32
  """
33
33
  return {
34
- "ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
35
- "links": csv.Generic(path=ds / "links.csv", names_row=0),
36
- "movies": csv.Generic(path=ds / "movies.csv", names_row=0),
37
- "tags": csv.Generic(path=ds / "tags.csv", names_row=0),
34
+ "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
35
+ "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
36
+ "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
37
+ "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
38
38
  }
39
39
 
40
40
 
@@ -46,8 +46,8 @@ def full(ds) -> Movielens:
46
46
  27,000,000 ratings and 1,100,000 tag applications applied to 58,000 movies by 280,000 users (as of 9/2018)
47
47
  """
48
48
  return {
49
- "ratings": csv.Generic(path=ds / "ratings.csv", names_row=0),
50
- "links": csv.Generic(path=ds / "links.csv", names_row=0),
51
- "movies": csv.Generic(path=ds / "movies.csv", names_row=0),
52
- "tags": csv.Generic(path=ds / "tags.csv", names_row=0),
49
+ "ratings": csv.Generic.C(path=ds / "ratings.csv", names_row=0),
50
+ "links": csv.Generic.C(path=ds / "links.csv", names_row=0),
51
+ "movies": csv.Generic.C(path=ds / "movies.csv", names_row=0),
52
+ "tags": csv.Generic.C(path=ds / "tags.csv", names_row=0),
53
53
  }
@@ -34,9 +34,9 @@ def gsd(ds) -> Supervised:
34
34
  is updated since 2015 independently from the previous source.
35
35
  """
36
36
  return {
37
- "train": CoNLL_U(path=ds / "fr_gsd-ud-train.conllu"),
38
- "test": CoNLL_U(path=ds / "fr_gsd-ud-dev.conllu"),
39
- "validation": CoNLL_U(path=ds / "fr_gsd-ud-test.conllu"),
37
+ "train": CoNLL_U.C(path=ds / "fr_gsd-ud-train.conllu"),
38
+ "test": CoNLL_U.C(path=ds / "fr_gsd-ud-dev.conllu"),
39
+ "validation": CoNLL_U.C(path=ds / "fr_gsd-ud-test.conllu"),
40
40
  }
41
41
 
42
42
 
@@ -1,10 +1,13 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from enum import Enum
3
+ from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
4
+ from experimaestro import Param
3
5
  from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
4
6
  from attr import define
7
+ from datamaestro.record import record_type
5
8
  from datamaestro.data import Base
6
9
  from datamaestro.record import Record, Item
7
- from datamaestro_text.data.ir import TopicRecord
10
+ from datamaestro_text.data.ir import TopicRecord, Topics
8
11
  from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView
9
12
 
10
13
  # ---- Basic types
@@ -120,20 +123,17 @@ class ConversationNode:
120
123
  ...
121
124
 
122
125
  @abstractmethod
123
- def parent(self) -> Optional["ConversationNode"]:
124
- ...
126
+ def parent(self) -> Optional["ConversationNode"]: ...
125
127
 
126
128
  @abstractmethod
127
- def children(self) -> List["ConversationNode"]:
128
- ...
129
+ def children(self) -> List["ConversationNode"]: ...
129
130
 
130
131
 
131
132
  class ConversationTree(ABC):
132
133
  """Represents a conversation tree"""
133
134
 
134
135
  @abstractmethod
135
- def root(self) -> ConversationNode:
136
- ...
136
+ def root(self) -> ConversationNode: ...
137
137
 
138
138
  @abstractmethod
139
139
  def __iter__(self) -> Iterator[ConversationNode]:
@@ -253,5 +253,30 @@ class ConversationDataset(Base, ABC):
253
253
  @abstractmethod
254
254
  def __iter__(self) -> Iterator[ConversationTree]:
255
255
  """Return an iterator over conversations"""
256
- for i in range(len(self)):
257
- yield self.get(i)
256
+ ...
257
+
258
+
259
+ class ConversationUserTopics(Topics):
260
+ """Extract user topics from conversations"""
261
+
262
+ conversations: Param[ConversationDataset]
263
+
264
+ topic_recordtype = record_type(IDItem, SimpleTextItem)
265
+
266
+ def iter(self) -> Iterator[TopicRecord]:
267
+ """Returns an iterator over topics"""
268
+ # Extracts topics from conversations, Each user query is a topic (can perform retrieval on it)
269
+ # TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
270
+
271
+ records: List[TopicRecord] = []
272
+ for conversation in self.conversations.__iter__():
273
+ nodes = [
274
+ node
275
+ for node in conversation
276
+ if node.entry[EntryType] == EntryType.USER_QUERY
277
+ ]
278
+ for node in nodes:
279
+ records.append(
280
+ node.entry.update(ConversationHistoryItem(node.history()))
281
+ )
282
+ return iter(records)
@@ -1,10 +1,11 @@
1
- from typing import Iterator, List, Optional
1
+ from typing import Iterator, List
2
2
  from attr import define, field
3
3
  import json
4
4
  import logging
5
5
  from datamaestro.data import File
6
6
  from datamaestro.record import Record
7
7
 
8
+ from datamaestro_text.data.ir import Topics
8
9
  from datamaestro_text.data.ir.base import (
9
10
  IDItem,
10
11
  SimpleTextItem,
@@ -12,7 +13,6 @@ from datamaestro_text.data.ir.base import (
12
13
 
13
14
 
14
15
  from .base import (
15
- AnswerDocumentURL,
16
16
  AnswerEntry,
17
17
  ConversationTree,
18
18
  EntryType,
@@ -21,6 +21,25 @@ from .base import (
21
21
  )
22
22
  from . import ConversationDataset
23
23
 
24
+ # Keys to change in the dataset entries for compatibility across different years
25
+
26
+ KEY_MAPPINGS = {
27
+ # Keys to replace: Target Key
28
+ "turns": "responses",
29
+ "utterance": "user_utterance",
30
+ "ptkb_provenance": "relevant_ptkbs",
31
+ "response_provenance": "citations",
32
+ }
33
+
34
+
35
+ def norm_dict(entry: dict) -> dict:
36
+ """Convert keys in the entry to match the expected format."""
37
+ normalized = {}
38
+ for k, v in entry.items():
39
+ # Check for direct mapping, then try lowercase mapping
40
+ new_key = KEY_MAPPINGS.get(k) or KEY_MAPPINGS.get(k.lower()) or k.lower()
41
+ normalized[new_key] = v
42
+ return normalized
24
43
 
25
44
 
26
45
  @define(kw_only=True)
@@ -47,7 +66,7 @@ class IkatConversationEntry:
47
66
 
48
67
 
49
68
  @define(kw_only=True)
50
- class IkatDatasetEntry:
69
+ class IkatConversationTopic:
51
70
  """A query with past history"""
52
71
 
53
72
  number: str
@@ -60,14 +79,20 @@ class IkatDatasetEntry:
60
79
  """The personal knowledge base associated with the user"""
61
80
 
62
81
  responses: List[IkatConversationEntry] = field(
63
- converter=lambda items: [IkatConversationEntry(**item) if isinstance(item, dict) else item for item in items]
82
+ converter=lambda items: [
83
+ IkatConversationEntry(**item) if isinstance(item, dict) else item
84
+ for item in map(norm_dict, items)
85
+ ]
64
86
  )
65
87
  """The list of responses to the query"""
66
88
 
67
89
 
68
- class IkatDataset(ConversationDataset, File):
90
+ class IkatConversations(ConversationDataset, File):
91
+ """A dataset containing conversations from the IKAT project"""
69
92
 
70
- def entries(self) -> Iterator[IkatDatasetEntry]:
93
+ """Keys to change in the dataset entries for compatibility across different years"""
94
+
95
+ def entries(self) -> Iterator[IkatConversationTopic]:
71
96
  """Reads all conversation entries from the dataset file."""
72
97
  with self.path.open("rt") as fp:
73
98
  raw_data = json.load(fp)
@@ -75,12 +100,13 @@ class IkatDataset(ConversationDataset, File):
75
100
  logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
76
101
  logging.debug(f"raw data has keys {raw_data[0].keys()}")
77
102
 
78
- processed_data = []
79
103
  for entry in raw_data:
80
- processed_data.append(IkatDatasetEntry(**{key.lower(): value for key, value in entry.items()}))
81
-
82
- logging.debug(f"First parsed data sample: {processed_data[0]}")
83
- return iter(processed_data)
104
+ try:
105
+ normalized_entry = norm_dict(entry)
106
+ yield IkatConversationTopic(**normalized_entry)
107
+ except Exception as e:
108
+ logging.warning(f"Failed to parse entry: {e}")
109
+ raise e
84
110
 
85
111
  def __iter__(self) -> Iterator[ConversationTree]:
86
112
  for entry in self.entries():
@@ -88,7 +114,7 @@ class IkatDataset(ConversationDataset, File):
88
114
 
89
115
  for turn in entry.responses:
90
116
  turn: IkatConversationEntry = turn # Ensure type is correct
91
- query_id = f"{entry.number}#{turn.turn_id}"
117
+ query_id = f"{entry.number}_{turn.turn_id}"
92
118
 
93
119
  # USER QUERY record
94
120
  history.append(
@@ -117,4 +143,3 @@ class IkatDataset(ConversationDataset, File):
117
143
  # Ensure reverse if needed for compatibility (optional)
118
144
  history.reverse()
119
145
  yield SingleConversationTree(entry.number, history)
120
-