datamaestro-text 2025.6.30__tar.gz → 2025.7.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/.github/workflows/pytest.yml +1 -1
  2. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/.readthedocs.yml +1 -1
  3. {datamaestro_text-2025.6.30/src/datamaestro_text.egg-info → datamaestro_text-2025.7.28}/PKG-INFO +3 -3
  4. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/api/conversation.rst +10 -4
  5. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/api/ir.rst +8 -2
  6. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/pyproject.toml +9 -1
  7. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/requirements.txt +1 -1
  8. datamaestro_text-2025.7.28/src/datamaestro_text/config/com/github/ikat.py +121 -0
  9. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/sentiment140.py +4 -4
  10. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -1
  11. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/base.py +34 -9
  12. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/ikat.py +38 -13
  13. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/__init__.py +25 -2
  14. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/base.py +2 -1
  15. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/formats.py +8 -2
  16. datamaestro_text-2025.7.28/src/datamaestro_text/data/ir/stores.py +124 -0
  17. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/trec.py +7 -4
  18. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/data.py +34 -15
  19. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/interfaces/trec.py +28 -1
  20. datamaestro_text-2025.7.28/src/datamaestro_text/utils/files.py +111 -0
  21. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/utils/iter.py +5 -0
  22. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/version.py +2 -2
  23. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28/src/datamaestro_text.egg-info}/PKG-INFO +3 -3
  24. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/requires.txt +1 -1
  25. datamaestro_text-2025.6.30/src/datamaestro_text/config/com/github/ikat.py +0 -38
  26. datamaestro_text-2025.6.30/src/datamaestro_text/data/ir/stores.py +0 -29
  27. datamaestro_text-2025.6.30/src/datamaestro_text/utils/files.py +0 -8
  28. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/.circleci/config.yml +0 -0
  29. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/.flake8 +0 -0
  30. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/.github/workflows/python-publish.yml +0 -0
  31. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/.gitignore +0 -0
  32. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/.pre-commit-config.yaml +0 -0
  33. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/LICENSE +0 -0
  34. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/MANIFEST.in +0 -0
  35. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/Makefile +0 -0
  36. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/README.md +0 -0
  37. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/Makefile +0 -0
  38. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/make.bat +0 -0
  39. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/requirements.txt +0 -0
  40. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/api/embeddings.rst +0 -0
  41. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/api/index.rst +0 -0
  42. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/api/nlp.rst +0 -0
  43. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/api/recommendation.rst +0 -0
  44. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/api/text.rst +0 -0
  45. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/conf.py +0 -0
  46. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/datasets/conversation.rst +0 -0
  47. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/datasets/embeddings.rst +0 -0
  48. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/datasets/index.rst +0 -0
  49. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/datasets/ir.rst +0 -0
  50. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/datasets/irds.rst +0 -0
  51. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/datasets/recommendation.rst +0 -0
  52. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/datasets/text.rst +0 -0
  53. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/docs/source/index.rst +0 -0
  54. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/mkdocs.yml +0 -0
  55. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/requirements-dev.txt +0 -0
  56. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/setup.cfg +0 -0
  57. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/__init__.py +0 -0
  58. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/__init__.py +0 -0
  59. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  60. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  61. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
  62. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +0 -0
  63. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
  64. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  65. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
  66. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  67. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
  68. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  69. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  70. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  71. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
  72. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
  73. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  74. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  75. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  76. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  77. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/__init__.py +0 -0
  78. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  79. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  80. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  81. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  82. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  83. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  84. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  85. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  86. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  87. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
  88. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  89. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
  90. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  91. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  92. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  93. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
  94. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
  95. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  96. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/__init__.py +0 -0
  97. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  98. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/canard.py +0 -0
  99. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
  100. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
  101. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/embeddings.py +0 -0
  102. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/cord19.py +0 -0
  103. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/csv.py +0 -0
  104. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/data.py +0 -0
  105. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  106. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/ir/utils.py +0 -0
  107. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/recommendation.py +0 -0
  108. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/tagging.py +0 -0
  109. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/data/text.py +0 -0
  110. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  111. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
  112. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
  113. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  114. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/download/tmdb.py +0 -0
  115. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  116. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/test/__init__.py +0 -0
  117. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/test/test_datasets.py +0 -0
  118. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/test/test_documented.py +0 -0
  119. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/transforms/__init__.py +0 -0
  120. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
  121. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/utils/__init__.py +0 -0
  122. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/utils/randomstream.py +0 -0
  123. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text/utils/shuffle.py +0 -0
  124. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/SOURCES.txt +0 -0
  125. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
  126. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
  127. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/src/datamaestro_text.egg-info/top_level.txt +0 -0
  128. {datamaestro_text-2025.6.30 → datamaestro_text-2025.7.28}/tox.ini +0 -0
@@ -25,7 +25,7 @@ jobs:
25
25
  python-version: ${{ matrix.python-version }}
26
26
  - name: Install dependencies
27
27
  run: |
28
- python -m pip install --upgrade pip
28
+ python -m pip install --upgrade pip setuptools
29
29
  SETUPTOOLS_SCM_PRETEND_VERSION="0.1-dev" pip install -e '.[dev]'
30
30
  - name: Lint with flake8
31
31
  run: |
@@ -11,7 +11,7 @@ sphinx:
11
11
  build:
12
12
  os: ubuntu-20.04
13
13
  tools:
14
- python: "3.9"
14
+ python: "3.10"
15
15
 
16
16
 
17
17
  # Install the package
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.6.30
3
+ Version: 2025.7.28
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -15,10 +15,10 @@ Classifier: Operating System :: OS Independent
15
15
  Classifier: Programming Language :: Python
16
16
  Classifier: Programming Language :: Python :: 3
17
17
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
- Requires-Python: >=3.8
18
+ Requires-Python: >=3.10
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: datamaestro>=1.4.2
21
+ Requires-Dist: datamaestro>=1.5.0
22
22
  Requires-Dist: ir_datasets>=0.5.8
23
23
  Requires-Dist: attrs
24
24
  Provides-Extra: dev
@@ -26,6 +26,12 @@ Data classes
26
26
 
27
27
  .. autoclass:: ConversationTopic
28
28
 
29
+ Conversational IR
30
+ -----------------
31
+
32
+ .. autoxpmconfig:: datamaestro_text.data.conversation.base.ConversationUserTopics
33
+
34
+
29
35
  Contextual query reformulation
30
36
  ------------------------------
31
37
 
@@ -34,7 +40,7 @@ Contextual query reformulation
34
40
  .. autoclass:: ContextualizedRewrittenQuery
35
41
  :members:
36
42
 
37
- CANARD Dataset
43
+ CANARD Dataset
38
44
 
39
45
  .. autoxpmconfig:: datamaestro_text.data.conversation.canard.CanardDataset
40
46
  :members: iter
@@ -50,7 +56,7 @@ OrConvQA Dataset
50
56
  .. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
51
57
  :members:
52
58
 
53
- QReCC Dataset
59
+ QReCC Dataset
54
60
 
55
61
  .. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
56
62
  :members:
@@ -61,11 +67,11 @@ QReCC Dataset
61
67
 
62
68
  iKAT Dataset
63
69
 
64
- .. autoclass:: datamaestro_text.data.conversation.ikat.IkatDatasetEntry
70
+ .. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationTopic
65
71
  :members:
66
72
 
67
73
  .. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationEntry
68
74
  :members:
69
75
 
70
- .. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatDataset
76
+ .. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatConversations
71
77
  :members: iter
@@ -36,11 +36,17 @@ Documents
36
36
 
37
37
  .. autoxpmconfig:: datamaestro_text.data.ir.Documents
38
38
  :members: iter_documents, iter_ids, documentcount
39
+ .. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
40
+ .. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
41
+
42
+
43
+ Dataset-specific documents
44
+ **************************
45
+
39
46
  .. autoxpmconfig:: datamaestro_text.data.ir.cord19.Documents
40
47
  .. autoxpmconfig:: datamaestro_text.data.ir.trec.TipsterCollection
41
- .. autoxpmconfig:: datamaestro_text.data.ir.csv.Documents
42
48
  .. autoxpmconfig:: datamaestro_text.data.ir.stores.OrConvQADocumentStore
43
- .. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
49
+ .. autoxpmconfig:: datamaestro_text.data.ir.stores.IKatClueWeb22DocumentStore
44
50
 
45
51
  Assessments
46
52
  -----------
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datamaestro-text"
3
- requires-python = ">=3.8"
3
+ requires-python = ">=3.10"
4
4
  keywords = ["dataset manager", "information retrieval", "experiments"]
5
5
  description = "Datamaestro module for text-related datasets"
6
6
  dynamic = ["version", "readme", "dependencies"]
@@ -44,6 +44,14 @@ fallback_version = "0.0.0-dev"
44
44
  [build-system]
45
45
  requires = ["setuptools", "setuptools-scm", "wheel"] # PEP 508 specifications.
46
46
 
47
+ [dependency-groups]
48
+ dev = [
49
+ "docutils>=0.21.2",
50
+ "pytest>=8.4.1",
51
+ "sphinx>=8.1.3",
52
+ "sphobjinv>=2.3.1.3",
53
+ ]
54
+
47
55
  [project.entry-points."datamaestro.repositories"]
48
56
  text = "datamaestro_text:Repository"
49
57
  irds = "datamaestro_text.datasets.irds:Repository"
@@ -1,3 +1,3 @@
1
- datamaestro>=1.4.2
1
+ datamaestro>=1.5.0
2
2
  ir_datasets>=0.5.8
3
3
  attrs
@@ -0,0 +1,121 @@
1
+ # See documentation on https://datamaestro.readthedocs.io
2
+
3
+ import bz2
4
+ from datamaestro.download import reference
5
+ from datamaestro.definitions import datatasks, datatags, dataset
6
+ from datamaestro_text.data.conversation.base import ConversationUserTopics
7
+ from datamaestro_text.data.ir import Adhoc
8
+
9
+ from datamaestro.utils import HashCheck
10
+ from datamaestro.context import DatafolderPath
11
+ from datamaestro.download.single import filedownloader
12
+ from datamaestro_text.data.conversation.ikat import IkatConversations
13
+ from datamaestro.download.links import linkfolder
14
+
15
+ from datamaestro_text.data.ir.stores import IKatClueWeb22DocumentStore
16
+ from datamaestro_text.data.ir.trec import TrecAdhocAssessments
17
+ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
18
+
19
+
20
+ @dataset(as_prepare=True)
21
+ def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
22
+ # Number of documents in the dataset
23
+ count = 116_838_987
24
+
25
+ jsonl_folder = linkfolder(
26
+ "documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
27
+ ).setup(dataset, options)
28
+ store_path = lz4docstore_builder(
29
+ "store",
30
+ IKatClueWeb22DocumentStore.generator(
31
+ jsonl_folder,
32
+ jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
33
+ jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
34
+ ),
35
+ IKatClueWeb22DocumentStore.Document,
36
+ "id",
37
+ count_hint=count,
38
+ ).setup(dataset, options)
39
+
40
+ return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
41
+
42
+
43
+ @datatags("conversation", "context", "query")
44
+ @datatasks("conversational search", "query rewriting")
45
+ @reference("documents", clueweb22)
46
+ @filedownloader(
47
+ "topics.json",
48
+ "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
49
+ checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
50
+ )
51
+ @dataset(
52
+ id="2025",
53
+ url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
54
+ )
55
+ def test_2025(topics, documents) -> Adhoc.C:
56
+ """Question-in-context rewriting
57
+
58
+ iKAT is a test dataset for question-in-context rewriting that consists of
59
+ questions each given in a dialog context together with a context-independent
60
+ rewriting of the question.
61
+ """
62
+ return Adhoc.C(
63
+ topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
64
+ # TODO: add when available
65
+ assessments=TrecAdhocAssessments.C(path="/to/do"),
66
+ documents=documents,
67
+ )
68
+
69
+
70
+ @datatags("conversation", "context", "query")
71
+ @datatasks("conversational search", "query rewriting")
72
+ @reference("documents", clueweb22)
73
+ @filedownloader(
74
+ "qrels",
75
+ "https://trec.nist.gov/data/ikat/2024-qrels.txt",
76
+ checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
77
+ )
78
+ @filedownloader(
79
+ "topics.json",
80
+ "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
81
+ checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
82
+ )
83
+ @dataset(
84
+ Adhoc,
85
+ id="2024",
86
+ url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
87
+ )
88
+ def test_2024(topics, qrels, documents) -> Adhoc.C:
89
+ """iKAT 2024 dataset"""
90
+ return Adhoc.C(
91
+ topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
92
+ assessments=TrecAdhocAssessments.C(path=qrels),
93
+ documents=documents,
94
+ )
95
+
96
+
97
+ @datatags("conversation", "context", "query")
98
+ @datatasks("conversational search", "query rewriting")
99
+ @reference("documents", clueweb22)
100
+ @filedownloader(
101
+ "qrels",
102
+ "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
103
+ checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
104
+ )
105
+ @filedownloader(
106
+ "topics.json",
107
+ "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
108
+ checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
109
+ )
110
+ @dataset(
111
+ Adhoc,
112
+ id="2023",
113
+ url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
114
+ )
115
+ def test_2023(topics, qrels, documents) -> Adhoc.C:
116
+ """iKAT 2023 dataset"""
117
+ return Adhoc.C(
118
+ topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
119
+ assessments=TrecAdhocAssessments.C(path=qrels),
120
+ documents=documents,
121
+ )
@@ -26,7 +26,7 @@ def english(dir):
26
26
 
27
27
  If you use this data, please cite Sentiment140 as your source.
28
28
  """
29
- return {
30
- "train": Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
31
- "test": Generic(path=dir / "testdata.manual.2009.06.14.csv"),
32
- }
29
+ return Supervised.C(
30
+ train=Generic(path=dir / "training.1600000.processed.noemoticon.csv"),
31
+ test=Generic(path=dir / "testdata.manual.2009.06.14.csv"),
32
+ )
@@ -17,7 +17,6 @@ of three CD-ROMs of SGML encoded documents distributed by LDC plus queries and a
17
17
  See also https://trec.nist.gov/data/docs_eng.html and https://trec.nist.gov/data/intro_eng.html
18
18
  """
19
19
 
20
- from datamaestro.data import Base
21
20
  from datamaestro_text.data.ir.trec import TipsterCollection
22
21
  from datamaestro.download.links import linkfolder
23
22
  from datamaestro.definitions import (
@@ -1,10 +1,13 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from enum import Enum
3
+ from datamaestro_text.data.ir.base import IDItem, SimpleTextItem
4
+ from experimaestro import Param
3
5
  from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
4
6
  from attr import define
7
+ from datamaestro.record import record_type
5
8
  from datamaestro.data import Base
6
9
  from datamaestro.record import Record, Item
7
- from datamaestro_text.data.ir import TopicRecord
10
+ from datamaestro_text.data.ir import TopicRecord, Topics
8
11
  from datamaestro_text.utils.iter import FactoryIterable, LazyList, RangeView
9
12
 
10
13
  # ---- Basic types
@@ -120,20 +123,17 @@ class ConversationNode:
120
123
  ...
121
124
 
122
125
  @abstractmethod
123
- def parent(self) -> Optional["ConversationNode"]:
124
- ...
126
+ def parent(self) -> Optional["ConversationNode"]: ...
125
127
 
126
128
  @abstractmethod
127
- def children(self) -> List["ConversationNode"]:
128
- ...
129
+ def children(self) -> List["ConversationNode"]: ...
129
130
 
130
131
 
131
132
  class ConversationTree(ABC):
132
133
  """Represents a conversation tree"""
133
134
 
134
135
  @abstractmethod
135
- def root(self) -> ConversationNode:
136
- ...
136
+ def root(self) -> ConversationNode: ...
137
137
 
138
138
  @abstractmethod
139
139
  def __iter__(self) -> Iterator[ConversationNode]:
@@ -253,5 +253,30 @@ class ConversationDataset(Base, ABC):
253
253
  @abstractmethod
254
254
  def __iter__(self) -> Iterator[ConversationTree]:
255
255
  """Return an iterator over conversations"""
256
- for i in range(len(self)):
257
- yield self.get(i)
256
+ ...
257
+
258
+
259
+ class ConversationUserTopics(Topics):
260
+ """Extract user topics from conversations"""
261
+
262
+ conversations: Param[ConversationDataset]
263
+
264
+ topic_recordtype = record_type(IDItem, SimpleTextItem)
265
+
266
+ def iter(self) -> Iterator[TopicRecord]:
267
+ """Returns an iterator over topics"""
268
+ # Extract topics from conversations: each user query is a topic (retrieval can be performed on it)
269
+ # TODO: merge with xpmir.learning.DatasetConversationBase -> same logic
270
+
271
+ records: List[TopicRecord] = []
272
+ for conversation in self.conversations.__iter__():
273
+ nodes = [
274
+ node
275
+ for node in conversation
276
+ if node.entry[EntryType] == EntryType.USER_QUERY
277
+ ]
278
+ for node in nodes:
279
+ records.append(
280
+ node.entry.update(ConversationHistoryItem(node.history()))
281
+ )
282
+ return iter(records)
@@ -1,10 +1,11 @@
1
- from typing import Iterator, List, Optional
1
+ from typing import Iterator, List
2
2
  from attr import define, field
3
3
  import json
4
4
  import logging
5
5
  from datamaestro.data import File
6
6
  from datamaestro.record import Record
7
7
 
8
+ from datamaestro_text.data.ir import Topics
8
9
  from datamaestro_text.data.ir.base import (
9
10
  IDItem,
10
11
  SimpleTextItem,
@@ -12,7 +13,6 @@ from datamaestro_text.data.ir.base import (
12
13
 
13
14
 
14
15
  from .base import (
15
- AnswerDocumentURL,
16
16
  AnswerEntry,
17
17
  ConversationTree,
18
18
  EntryType,
@@ -21,6 +21,25 @@ from .base import (
21
21
  )
22
22
  from . import ConversationDataset
23
23
 
24
+ # Keys to change in the dataset entries for compatibility across different years
25
+
26
+ KEY_MAPPINGS = {
27
+ # Keys to replace: Target Key
28
+ "turns": "responses",
29
+ "utterance": "user_utterance",
30
+ "ptkb_provenance": "relevant_ptkbs",
31
+ "response_provenance": "citations",
32
+ }
33
+
34
+
35
+ def norm_dict(entry: dict) -> dict:
36
+ """Convert keys in the entry to match the expected format."""
37
+ normalized = {}
38
+ for k, v in entry.items():
39
+ # Check for direct mapping, then try lowercase mapping
40
+ new_key = KEY_MAPPINGS.get(k) or KEY_MAPPINGS.get(k.lower()) or k.lower()
41
+ normalized[new_key] = v
42
+ return normalized
24
43
 
25
44
 
26
45
  @define(kw_only=True)
@@ -47,7 +66,7 @@ class IkatConversationEntry:
47
66
 
48
67
 
49
68
  @define(kw_only=True)
50
- class IkatDatasetEntry:
69
+ class IkatConversationTopic:
51
70
  """A query with past history"""
52
71
 
53
72
  number: str
@@ -60,14 +79,20 @@ class IkatDatasetEntry:
60
79
  """The personal knowledge base associated with the user"""
61
80
 
62
81
  responses: List[IkatConversationEntry] = field(
63
- converter=lambda items: [IkatConversationEntry(**item) if isinstance(item, dict) else item for item in items]
82
+ converter=lambda items: [
83
+ IkatConversationEntry(**item) if isinstance(item, dict) else item
84
+ for item in map(norm_dict, items)
85
+ ]
64
86
  )
65
87
  """The list of responses to the query"""
66
88
 
67
89
 
68
- class IkatDataset(ConversationDataset, File):
90
+ class IkatConversations(ConversationDataset, File):
91
+ """A dataset containing conversations from the IKAT project"""
69
92
 
70
- def entries(self) -> Iterator[IkatDatasetEntry]:
93
+ """Keys to change in the dataset entries for compatibility across different years"""
94
+
95
+ def entries(self) -> Iterator[IkatConversationTopic]:
71
96
  """Reads all conversation entries from the dataset file."""
72
97
  with self.path.open("rt") as fp:
73
98
  raw_data = json.load(fp)
@@ -75,12 +100,13 @@ class IkatDataset(ConversationDataset, File):
75
100
  logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
76
101
  logging.debug(f"raw data has keys {raw_data[0].keys()}")
77
102
 
78
- processed_data = []
79
103
  for entry in raw_data:
80
- processed_data.append(IkatDatasetEntry(**{key.lower(): value for key, value in entry.items()}))
81
-
82
- logging.debug(f"First parsed data sample: {processed_data[0]}")
83
- return iter(processed_data)
104
+ try:
105
+ normalized_entry = norm_dict(entry)
106
+ yield IkatConversationTopic(**normalized_entry)
107
+ except Exception as e:
108
+ logging.warning(f"Failed to parse entry: {e}")
109
+ raise e
84
110
 
85
111
  def __iter__(self) -> Iterator[ConversationTree]:
86
112
  for entry in self.entries():
@@ -88,7 +114,7 @@ class IkatDataset(ConversationDataset, File):
88
114
 
89
115
  for turn in entry.responses:
90
116
  turn: IkatConversationEntry = turn # Ensure type is correct
91
- query_id = f"{entry.number}#{turn.turn_id}"
117
+ query_id = f"{entry.number}_{turn.turn_id}"
92
118
 
93
119
  # USER QUERY record
94
120
  history.append(
@@ -117,4 +143,3 @@ class IkatDataset(ConversationDataset, File):
117
143
  # Ensure reverse if needed for compatibility (optional)
118
144
  history.reverse()
119
145
  yield SingleConversationTree(entry.number, history)
120
-
@@ -2,9 +2,10 @@
2
2
 
3
3
  from abc import ABC, abstractmethod
4
4
  from functools import cached_property
5
+ import logging
5
6
  from pathlib import Path
6
7
  from attrs import define
7
- from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type
8
+ from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, TYPE_CHECKING
8
9
  import random
9
10
  from experimaestro import Config
10
11
  from datamaestro.definitions import datatasks, Param, Meta
@@ -28,6 +29,9 @@ from .base import ( # noqa: F401
28
29
  AdhocAssessedTopic,
29
30
  )
30
31
 
32
+ #: An adhoc run dictionary (query id -> doc id -> score)
33
+ AdhocRunDict = dict[str, dict[str, float]]
34
+
31
35
 
32
36
  class Documents(Base):
33
37
  """A set of documents with identifiers
@@ -45,6 +49,22 @@ class Documents(Base):
45
49
  def iter_documents(self) -> Iterator[DocumentRecord]:
46
50
  return self.iter()
47
51
 
52
+ def iter_documents_from(self, start=0) -> Iterator[DocumentRecord]:
53
+ """Iterate over a range of documents
54
+
55
+ Can be specialized in a subclass for faster access
56
+
57
+ :param start: The starting document, defaults to 0
58
+ :return: An iterator
59
+ """
60
+ iter = self.iter()
61
+ if start > 0:
62
+ logging.info("skipping %d documents", start + 1)
63
+ for _ in range(start + 1):
64
+ next(iter)
65
+
66
+ return iter
67
+
48
68
  def iter_ids(self) -> Iterator[str]:
49
69
  """Iterates over document ids
50
70
 
@@ -168,7 +188,10 @@ class AdhocAssessments(Base, ABC):
168
188
  class AdhocRun(Base):
169
189
  """IR adhoc run"""
170
190
 
171
- pass
191
+ @abstractmethod
192
+ def get_dict(self) -> "AdhocRunDict":
193
+ """Get the run as a dictionary query ID -> doc ID -> score"""
194
+ ...
172
195
 
173
196
 
174
197
  class AdhocResults(Base):
@@ -43,6 +43,7 @@ class IDItem(Item, ABC):
43
43
 
44
44
  id: str
45
45
 
46
+
46
47
  @define
47
48
  class UrlItem(Item):
48
49
  """An url item"""
@@ -70,7 +71,7 @@ class AdhocAssessedTopic:
70
71
  """List of assessments for this topic"""
71
72
 
72
73
 
73
- def create_record(*items: Item, id: str = None, text: str = None):
74
+ def create_record(*items: Item, id: str = None, text: str = None) -> Record:
74
75
  """Easy creation of a text/id item"""
75
76
  extra_items = []
76
77
  if id is not None:
@@ -99,7 +99,7 @@ class WapoDocument(TextItem):
99
99
  body_media: Tuple[WapoDocMedia, ...]
100
100
 
101
101
  @cached_property
102
- def text(self):
102
+ def text(self):
103
103
  return f"{self.title} {self.body_paras_html}"
104
104
 
105
105
 
@@ -132,11 +132,15 @@ class DprW100Doc(TextItem):
132
132
  text: str
133
133
  title: str
134
134
 
135
- @define
135
+
136
+ @define
136
137
  class MsMarcoV2Passage(TextItem):
137
138
  text: str
138
139
  spans: Tuple[Tuple[int, int], ...]
139
140
  msmarco_document_id: str
141
+
142
+
143
+ @define
140
144
  class Touche2020(TextItem):
141
145
  text: str
142
146
  title: str
@@ -204,11 +208,13 @@ class TrecTopic(SimpleTextItem):
204
208
 
205
209
  TrecTopicRecord = record_type(IDItem, TrecTopic)
206
210
 
211
+
207
212
  @define
208
213
  class DprW100Query(TextItem):
209
214
  text: str
210
215
  answers: Tuple[str]
211
216
 
217
+
212
218
  @define
213
219
  class TrecBackgroundLinkingQuery(IDItem):
214
220
  query_id: str