datamaestro-text 2024.3.10__tar.gz → 2025.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. datamaestro_text-2025.1.7/.pre-commit-config.yaml +19 -0
  2. {datamaestro-text-2024.3.10/src/datamaestro_text.egg-info → datamaestro_text-2025.1.7}/PKG-INFO +3 -3
  3. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/conversation.rst +8 -0
  4. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/conversation.rst +2 -0
  5. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/irds.rst +4 -3
  6. datamaestro_text-2025.1.7/requirements.txt +3 -0
  7. datamaestro_text-2025.1.7/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +87 -0
  8. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +4 -7
  9. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/base.py +76 -10
  10. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/orconvqa.py +12 -2
  11. datamaestro_text-2025.1.7/src/datamaestro_text/data/conversation/qrecc.py +99 -0
  12. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/__init__.py +3 -2
  13. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/formats.py +20 -5
  14. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/stores.py +13 -6
  15. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/data.py +237 -124
  16. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/helpers.py +58 -2
  17. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/version.py +2 -2
  18. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7/src/datamaestro_text.egg-info}/PKG-INFO +3 -3
  19. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/SOURCES.txt +2 -0
  20. datamaestro_text-2025.1.7/src/datamaestro_text.egg-info/requires.txt +3 -0
  21. datamaestro-text-2024.3.10/.pre-commit-config.yaml +0 -11
  22. datamaestro-text-2024.3.10/requirements.txt +0 -3
  23. datamaestro-text-2024.3.10/src/datamaestro_text.egg-info/requires.txt +0 -3
  24. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/.circleci/config.yml +0 -0
  25. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/.flake8 +0 -0
  26. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/.github/workflows/pytest.yml +0 -0
  27. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/.github/workflows/python-publish.yml +0 -0
  28. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/.gitignore +0 -0
  29. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/.readthedocs.yml +0 -0
  30. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/LICENSE +0 -0
  31. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/MANIFEST.in +0 -0
  32. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/Makefile +0 -0
  33. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/README.md +0 -0
  34. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/Makefile +0 -0
  35. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/make.bat +0 -0
  36. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/requirements.txt +0 -0
  37. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/embeddings.rst +0 -0
  38. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/index.rst +0 -0
  39. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/ir.rst +0 -0
  40. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/nlp.rst +0 -0
  41. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/recommendation.rst +0 -0
  42. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/text.rst +0 -0
  43. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/conf.py +0 -0
  44. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/embeddings.rst +0 -0
  45. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/index.rst +0 -0
  46. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/ir.rst +0 -0
  47. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/recommendation.rst +0 -0
  48. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/text.rst +0 -0
  49. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/index.rst +0 -0
  50. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/mkdocs.yml +0 -0
  51. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/pyproject.toml +0 -0
  52. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/requirements-dev.txt +0 -0
  53. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/setup.cfg +0 -0
  54. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/__init__.py +0 -0
  55. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/__init__.py +0 -0
  56. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  57. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  58. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
  59. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  60. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
  61. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  62. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
  63. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/sentiment140.py +0 -0
  64. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  65. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  66. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  67. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
  68. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
  69. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  70. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  71. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  72. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  73. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/__init__.py +0 -0
  74. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  75. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  76. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  77. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  78. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  79. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  80. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  81. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
  82. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  83. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  84. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
  85. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  86. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
  87. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  88. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  89. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  90. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
  91. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
  92. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  93. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/__init__.py +0 -0
  94. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  95. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/canard.py +0 -0
  96. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/embeddings.py +0 -0
  97. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/base.py +0 -0
  98. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/cord19.py +0 -0
  99. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/csv.py +0 -0
  100. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/data.py +0 -0
  101. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  102. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/trec.py +0 -0
  103. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/utils.py +0 -0
  104. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/recommendation.py +0 -0
  105. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/tagging.py +0 -0
  106. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/text.py +0 -0
  107. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  108. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
  109. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  110. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/download/tmdb.py +0 -0
  111. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  112. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/interfaces/trec.py +0 -0
  113. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/test/__init__.py +0 -0
  114. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/test/test_datasets.py +0 -0
  115. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/test/test_documented.py +0 -0
  116. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/transforms/__init__.py +0 -0
  117. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
  118. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/__init__.py +0 -0
  119. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/files.py +0 -0
  120. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/iter.py +0 -0
  121. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/randomstream.py +0 -0
  122. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/shuffle.py +0 -0
  123. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
  124. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
  125. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/top_level.txt +0 -0
  126. {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/tox.ini +0 -0
@@ -0,0 +1,19 @@
1
+ repos:
2
+ - hooks:
3
+ - id: check-yaml
4
+ - id: end-of-file-fixer
5
+ - id: trailing-whitespace
6
+ repo: https://github.com/pre-commit/pre-commit-hooks
7
+ rev: v4.4.0
8
+ - hooks:
9
+ - exclude: ^src/experimaestro/server/data
10
+ id: black
11
+ repo: https://github.com/psf/black
12
+ rev: 23.1.0
13
+ - hooks:
14
+ - additional_dependencies:
15
+ - flake8-print
16
+ - flake8-fixme
17
+ id: flake8
18
+ repo: https://github.com/pycqa/flake8
19
+ rev: 6.0.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2024.3.10
3
+ Version: 2025.1.7
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -18,8 +18,8 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
18
  Requires-Python: >=3.8
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: datamaestro>=1.1.0
22
- Requires-Dist: ir_datasets
21
+ Requires-Dist: datamaestro>=1.2.1
22
+ Requires-Dist: ir_datasets>=0.5.8
23
23
  Requires-Dist: attrs
24
24
 
25
25
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
@@ -45,3 +45,11 @@ Contextual query reformulation
45
45
 
46
46
  .. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
47
47
  :members:
48
+
49
+
50
+
51
+ .. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
52
+ :members:
53
+
54
+ .. autoxpmconfig:: datamaestro_text.data.conversation.qrecc.QReCCDataset
55
+ :members: iter
@@ -8,3 +8,5 @@ Contextualized Query Rewriting
8
8
  .. dm:datasets:: com.github.aagohary.canard text
9
9
 
10
10
  .. dm:datasets:: com.github.prdwb.orconvqa text
11
+
12
+ .. dm:datasets:: com.github.apple.ml-qrecc text
@@ -8,9 +8,10 @@ version of `ir-datasets` is more ancient or newer than the one used at generatio
8
8
  Data types
9
9
  ----------
10
10
 
11
- .. autoxpmconfig:: xpmir.datasets.irds.data.Topics
12
- .. autoxpmconfig:: xpmir.datasets.irds.data.Documents
13
- .. autoxpmconfig:: xpmir.datasets.irds.data.AdhocAssessments
11
+ .. autoxpmconfig:: datamaestro_text.datasets.irds.data.Topics
12
+ .. autoxpmconfig:: datamaestro_text.datasets.irds.data.Documents
13
+ .. autoxpmconfig:: datamaestro_text.datasets.irds.data.AdhocAssessments
14
+ .. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
14
15
 
15
16
 
16
17
  List of datasets
@@ -0,0 +1,3 @@
1
+ datamaestro>=1.2.1
2
+ ir_datasets>=0.5.8
3
+ attrs
@@ -0,0 +1,87 @@
1
+ # See documentation on https://datamaestro.readthedocs.io
2
+
3
+ import re
4
+ import json
5
+ from pathlib import Path
6
+ from datamaestro.definitions import datatasks, datatags, dataset
7
+ from datamaestro.data.ml import Supervised
8
+ from datamaestro.download import reference
9
+ from datamaestro.download.archive import zipdownloader
10
+ from datamaestro.download.wayback import wayback_documents
11
+ from datamaestro.utils import HashCheck
12
+ from datamaestro_text.data.conversation.qrecc import QReCCDataset
13
+ from datamaestro_text.datasets.irds.data import (
14
+ LZ4JSONLDocumentStore,
15
+ SimpleJsonDocument,
16
+ )
17
+ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
18
+
19
+
20
+ @datatags("conversation", "context", "query")
21
+ @datatasks("query rewriting")
22
+ @zipdownloader(
23
+ "data",
24
+ "https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
25
+ checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
26
+ )
27
+ @dataset(
28
+ Supervised[QReCCDataset, None, QReCCDataset],
29
+ url="https://github.com/apple/ml-qrecc",
30
+ doi="https://doi.org/10.48550/arXiv.2010.04898",
31
+ id="",
32
+ )
33
+ def main(data: Path):
34
+ """Open-Domain Question Answering Goes Conversational via Question Rewriting
35
+
36
+ We introduce QReCC (Question Rewriting in Conversational Context), an
37
+ end-to-end open-domain question answering dataset comprising of 14K
38
+ conversations with 81K question-answer pairs. The goal of this dataset is to
39
+ provide a challenging benchmark for end-to-end conversational question
40
+ answering that includes the individual subtasks of question rewriting,
41
+ passage retrieval and reading comprehension
42
+ """
43
+ return {
44
+ "train": QReCCDataset(path=data / "qrecc_train.json"),
45
+ "test": QReCCDataset(path=data / "qrecc_test.json"),
46
+ }
47
+
48
+
49
+ @dataset(
50
+ url="https://github.com/apple/ml-qrecc",
51
+ doi="https://doi.org/10.48550/arXiv.2010.04898",
52
+ )
53
+ class Content(LZ4JSONLDocumentStore):
54
+ """Content of the URLs mentioned in QReCC"""
55
+
56
+ @staticmethod
57
+ def __create_dataset__(dataset, options=None):
58
+ ds = reference(reference=main).setup(dataset, options)
59
+ documents_path = wayback_documents(
60
+ "20191127", lambda: Content._urls(ds), name="wayback.jsonl"
61
+ ).setup(dataset, options)
62
+
63
+ store_path = lz4docstore_builder(
64
+ "store",
65
+ lambda: Content._documents(documents_path),
66
+ SimpleJsonDocument,
67
+ "id",
68
+ ).setup(dataset, options)
69
+
70
+ return LZ4JSONLDocumentStore(jsonl_path=store_path)
71
+
72
+ @staticmethod
73
+ def _documents(path: Path):
74
+ """Iterates over documents from wayback"""
75
+ with path.open("rt") as fp:
76
+ for line in fp:
77
+ yield SimpleJsonDocument(**json.loads(line))
78
+
79
+ @staticmethod
80
+ def _urls(supervised: Supervised[QReCCDataset, None, QReCCDataset]):
81
+ urls = set()
82
+ for ds in [supervised.train, supervised.test]:
83
+ for entry in ds.entries():
84
+ if entry.answer_url:
85
+ url = re.sub("#.*$", "", entry.answer_url)
86
+ urls.add(url)
87
+ return urls
@@ -1,11 +1,9 @@
1
1
  # See documentation on https://datamaestro.readthedocs.io
2
2
 
3
- from collections import namedtuple
4
3
  import gzip
5
4
  import json
6
5
  from pathlib import Path
7
- from typing import Iterator, NamedTuple
8
- import attrs
6
+ from typing import Iterator
9
7
  from datamaestro.definitions import datatasks, datatags, dataset
10
8
  from datamaestro.download.single import filedownloader
11
9
  from datamaestro.utils import HashCheck
@@ -14,10 +12,7 @@ from datamaestro.utils import HashCheck
14
12
  from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
15
13
  from datamaestro.data.ml import Supervised
16
14
 
17
- from datamaestro_text.data.ir import DocumentStore
18
- from datamaestro_text.data.ir.formats import OrConvQADocument
19
15
  from datamaestro_text.data.ir.stores import OrConvQADocumentStore
20
- from datamaestro_text.datasets.irds.data import LZ4DocumentStore
21
16
  from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader
22
17
 
23
18
 
@@ -63,7 +58,9 @@ def preprocessed(train, dev, test):
63
58
  def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
64
59
  with gzip.open(source, "rt") as fp:
65
60
  for line in fp:
66
- yield OrConvQADocumentStore.NAMED_TUPLE(**json.loads(line))
61
+ data = json.loads(line)
62
+ data["body"] = data.pop("text")
63
+ yield OrConvQADocumentStore.NAMED_TUPLE(**data)
67
64
 
68
65
 
69
66
  @lz4docstore_downloader(
@@ -1,6 +1,6 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from enum import Enum
3
- from typing import Dict, Generic, Iterator, List, Optional, Sequence
3
+ from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
4
4
  from attr import define
5
5
  from datamaestro.data import Base
6
6
  from datamaestro.record import Record, Item
@@ -61,6 +61,20 @@ class AnswerEntry(Item):
61
61
  """The system answer"""
62
62
 
63
63
 
64
+ @define
65
+ class AnswerDocumentID(Item):
66
+ """An answer as a document ID"""
67
+
68
+ document_id: str
69
+
70
+
71
+ @define
72
+ class AnswerDocumentURL(Item):
73
+ """An answer as a document URL"""
74
+
75
+ url: str
76
+
77
+
64
78
  @define
65
79
  class RetrievedEntry(Item):
66
80
  """List of system-retrieved documents and their relevance"""
@@ -68,8 +82,8 @@ class RetrievedEntry(Item):
68
82
  documents: List[str]
69
83
  """List of retrieved documents"""
70
84
 
71
- document_relevances: Optional[List[str]] = None
72
- """List of retrieved documents and their relevance status"""
85
+ relevant_documents: Optional[Dict[int, Tuple[Optional[int], Optional[int]]]] = None
86
+ """Relevant documents (optional): maps a document rank to its (start, stop) position, either of which may be None"""
73
87
 
74
88
 
75
89
  @define
@@ -95,56 +109,99 @@ class ConversationHistoryItem(Item):
95
109
 
96
110
 
97
111
  class ConversationNode:
112
+ @abstractmethod
98
113
  def entry(self) -> Record:
99
114
  """The current conversation entry"""
100
115
  ...
101
116
 
117
+ @abstractmethod
102
118
  def history(self) -> ConversationHistory:
103
119
  """Preceding conversation entries, from most recent to more ancient"""
104
120
  ...
105
121
 
122
+ @abstractmethod
123
+ def parent(self) -> Optional["ConversationNode"]:
124
+ ...
125
+
126
+ @abstractmethod
127
+ def children(self) -> List["ConversationNode"]:
128
+ ...
129
+
106
130
 
107
- class ConversationTree:
131
+ class ConversationTree(ABC):
132
+ @abstractmethod
133
+ def root(self) -> ConversationNode:
134
+ ...
135
+
136
+ @abstractmethod
108
137
  def __iter__(self) -> Iterator[ConversationNode]:
109
138
  """Iterates over conversation nodes"""
110
- pass
139
+ ...
111
140
 
112
141
 
113
142
  # ---- A conversation tree
114
143
 
115
144
 
116
- class SingleConversationTree(ConversationTree):
145
+ class SingleConversationTree(ConversationTree, ABC):
117
146
  """Simple conversations, based on a sequence of entries"""
118
147
 
119
148
  id: str
120
- history: Sequence[Record]
149
+ history: List[Record]
121
150
 
122
151
  def __init__(self, id: Optional[str], history: List[Record]):
123
152
  """Create a simple conversation
124
153
 
125
- :param history: The entries, in reverse order (i.e. more ancient first)
154
+ :param history: The entries, in **reverse** chronological order (i.e. most recent first)
126
155
  """
127
156
  self.history = history or []
157
+ self.id = id
128
158
 
129
159
  def add(self, entry: Record):
130
160
  self.history.insert(0, entry)
131
161
 
132
162
  def __iter__(self) -> Iterator[ConversationNode]:
133
- for ix in range(len(self.history)):
163
+ """Iterates over the conversation (starting with the beginning)"""
164
+ for ix in reversed(range(len(self.history))):
134
165
  yield SingleConversationTreeNode(self, ix)
135
166
 
167
+ def root(self):
168
+ return SingleConversationTreeNode(self, len(self.history) - 1)
169
+
136
170
 
137
171
  @define
138
172
  class SingleConversationTreeNode(ConversationNode):
139
173
  tree: SingleConversationTree
140
174
  index: int
141
175
 
176
+ @property
142
177
  def entry(self) -> Record:
143
178
  return self.tree.history[self.index]
144
179
 
180
+ @entry.setter
181
+ def entry(self, record: Record):
182
+ try:
183
+ self.tree.history[self.index] = record
184
+ except Exception as e:
185
+ print(e)
186
+ raise
187
+
145
188
  def history(self) -> Sequence[Record]:
146
189
  return self.tree.history[self.index + 1 :]
147
190
 
191
+ def parent(self) -> Optional[ConversationNode]:
192
+ return (
193
+ SingleConversationTreeNode(self.tree, self.index + 1)
194
+ if self.index < len(self.tree.history) - 1
195
+ else []
196
+ )
197
+
198
+ def children(self) -> List[ConversationNode]:
199
+ return (
200
+ [SingleConversationTreeNode(self.tree, self.index - 1)]
201
+ if self.index > 0
202
+ else []
203
+ )
204
+
148
205
 
149
206
  class ConversationTreeNode(ConversationNode, ConversationTree):
150
207
  """A conversation tree node"""
@@ -178,6 +235,15 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
178
235
  for child in self.children:
179
236
  yield from child
180
237
 
238
+ def parent(self) -> Optional[ConversationNode]:
239
+ return self.parent
240
+
241
+ def children(self) -> List[ConversationNode]:
242
+ return self.children
243
+
244
+ def root(self):
245
+ return self
246
+
181
247
 
182
248
  class ConversationDataset(Base, ABC):
183
249
  """A dataset made of conversations"""
@@ -186,4 +252,4 @@ class ConversationDataset(Base, ABC):
186
252
  def __iter__(self) -> Iterator[ConversationTree]:
187
253
  """Return an iterator over conversations"""
188
254
  for i in range(len(self)):
189
- return self.get(i)
255
+ yield self.get(i)
@@ -102,16 +102,26 @@ class OrConvQADataset(ConversationDataset, File):
102
102
  # Add to current
103
103
  history.append(
104
104
  Record(
105
- IDItem(query_no),
105
+ IDItem(entry.query_id),
106
106
  SimpleTextItem(entry.query),
107
107
  SimpleDecontextualizedItem(entry.rewrite),
108
108
  EntryType.USER_QUERY,
109
109
  )
110
110
  )
111
+
112
+ relevances = {}
113
+ for rank, relevance in enumerate(entry.retrieval_labels):
114
+ if relevance > 0:
115
+ relevances[rank] = (entry.answer.answer_start, None)
116
+
117
+ assert (
118
+ len(relevances) <= 1
119
+ ), f"Too many relevance labels ({len(relevances)}) for {entry.query_id}"
120
+
111
121
  history.append(
112
122
  Record(
113
123
  AnswerEntry(entry.answer.text),
114
- RetrievedEntry(entry.evidences, entry.retrieval_labels),
124
+ RetrievedEntry(entry.evidences, relevances),
115
125
  EntryType.SYSTEM_ANSWER,
116
126
  )
117
127
  )
@@ -0,0 +1,99 @@
1
+ from typing import Iterator, List, Optional
2
+ from attr import define
3
+ import json
4
+ from datamaestro.data import File
5
+ from datamaestro.record import Record
6
+
7
+ from datamaestro_text.data.ir.base import (
8
+ IDItem,
9
+ SimpleTextItem,
10
+ )
11
+
12
+
13
+ from .base import (
14
+ AnswerDocumentURL,
15
+ AnswerEntry,
16
+ ConversationTree,
17
+ EntryType,
18
+ SimpleDecontextualizedItem,
19
+ SingleConversationTree,
20
+ )
21
+ from . import ConversationDataset
22
+
23
+
24
+ @define(kw_only=True)
25
+ class QReCCDatasetEntry:
26
+ """A query with past history"""
27
+
28
+ conversation_no: int
29
+ """Conversation ID"""
30
+
31
+ turn_no: int
32
+ """The turn in the conversation"""
33
+
34
+ conversation_source: str
35
+ """Conversation source"""
36
+
37
+ question: str
38
+ """The last issued query"""
39
+
40
+ rewrite: str
41
+ """Manually rewritten query"""
42
+
43
+ context: List[str]
44
+ """The list of queries asked by the user"""
45
+
46
+ answer: str
47
+ """The answer"""
48
+
49
+ answer_url: str
50
+ """The URL containing the answer"""
51
+
52
+
53
+ class QReCCDataset(ConversationDataset, File):
54
+ def entries(self) -> Iterator[QReCCDatasetEntry]:
55
+ """Iterates over re-written query with their context"""
56
+ with self.path.open("rt") as fp:
57
+ data = json.load(fp)
58
+
59
+ data = [
60
+ QReCCDatasetEntry(**{key.lower(): value for key, value in entry.items()})
61
+ for entry in data
62
+ ]
63
+ return iter(data)
64
+
65
+ def __iter__(self) -> Iterator[ConversationTree]:
66
+ history: List[Record] = []
67
+ current_id: Optional[str] = None
68
+
69
+ for entry in self.entries():
70
+ # Creates a new conversation if needed
71
+ if entry.conversation_no != current_id:
72
+ if current_id is not None:
73
+ history.reverse()
74
+ yield SingleConversationTree(current_id, history)
75
+
76
+ current_id = entry.conversation_no
77
+ history = []
78
+
79
+ # Add to current
80
+ history.append(
81
+ Record(
82
+ IDItem(f"{entry.conversation_no}#{entry.turn_no}"),
83
+ SimpleTextItem(entry.question),
84
+ AnswerDocumentURL(entry.answer_url),
85
+ SimpleDecontextualizedItem(entry.rewrite),
86
+ EntryType.USER_QUERY,
87
+ )
88
+ )
89
+
90
+ history.append(
91
+ Record(
92
+ AnswerEntry(entry.answer),
93
+ EntryType.SYSTEM_ANSWER,
94
+ )
95
+ )
96
+
97
+ # Yields the last one
98
+ history.reverse()
99
+ yield SingleConversationTree(current_id, history)
@@ -25,6 +25,7 @@ from .base import ( # noqa: F401
25
25
  create_record,
26
26
  # Other things
27
27
  AdhocAssessment,
28
+ AdhocAssessedTopic,
28
29
  )
29
30
 
30
31
 
@@ -83,7 +84,7 @@ class DocumentStore(Documents):
83
84
  def document_int(self, internal_docid: int) -> DocumentRecord:
84
85
  """Returns a document given its internal ID"""
85
86
  docid = self.docid_internal2external(internal_docid)
86
- return self.document(docid)
87
+ return self.document_ext(docid)
87
88
 
88
89
  def document_ext(self, docid: str) -> DocumentRecord:
89
90
  """Returns a document given its external ID"""
@@ -159,7 +160,7 @@ class TopicsStore(Topics):
159
160
  class AdhocAssessments(Base, ABC):
160
161
  """Ad-hoc assessments (qrels)"""
161
162
 
162
- def iter(self) -> Iterator[AdhocAssessment]:
163
+ def iter(self) -> Iterator[AdhocAssessedTopic]:
163
164
  """Returns an iterator over assessments"""
164
165
  raise NotImplementedError(f"For class {self.__class__}")
165
166
 
@@ -1,5 +1,5 @@
1
1
  from functools import cached_property
2
- from typing import ClassVar, Tuple
2
+ from typing import ClassVar, Tuple, List
3
3
  from attrs import define
4
4
  from datamaestro.record import record_type
5
5
  from ir_datasets.datasets.wapo import WapoDocMedia
@@ -117,7 +117,6 @@ class TweetDoc(TextItem):
117
117
 
118
118
  @define
119
119
  class OrConvQADocument(TextItem):
120
- id: str
121
120
  title: str
122
121
  body: str
123
122
  aid: str
@@ -127,12 +126,21 @@ class OrConvQADocument(TextItem):
127
126
  def text(self):
128
127
  return f"{self.title} {self.body}"
129
128
 
129
+ @define
130
+ class Touche2020(TextItem):
131
+ text: str
132
+ title: str
133
+ stance: str
134
+ url: str
130
135
 
131
136
  @define
132
- class TrecTopic(TextItem):
137
+ class SciDocs(TextItem):
133
138
  text: str
134
- query: str
135
- narrative: str
139
+ title: str
140
+ authors: List[str]
141
+ year: int
142
+ cited_by: List[str]
143
+ references: List[str]
136
144
 
137
145
 
138
146
  @define
@@ -167,6 +175,13 @@ class TrecMb14Query(TextItem):
167
175
  def get_text(self):
168
176
  return f"{self.query}"
169
177
 
178
+ @define
179
+ class SciDocsTopic(TextItem):
180
+ text: str
181
+ authors: List[str]
182
+ year: int
183
+ cited_by: List[str]
184
+ references: List[str]
170
185
 
171
186
  @define()
172
187
  class TrecTopic(SimpleTextItem):
@@ -1,16 +1,21 @@
1
1
  from collections import namedtuple
2
- from typing import List
2
+ from typing import List, NamedTuple
3
3
  from experimaestro import Constant
4
4
  import attrs
5
5
 
6
+ from datamaestro.record import Record
7
+ from datamaestro_text.data.ir.base import IDItem
6
8
  from datamaestro_text.datasets.irds.data import LZ4DocumentStore
7
9
  from datamaestro_text.data.ir.formats import OrConvQADocument
8
10
 
9
11
 
10
12
  class OrConvQADocumentStore(LZ4DocumentStore):
11
- NAMED_TUPLE = namedtuple(
12
- "OrConvQADocument", [a.name for a in attrs.fields(OrConvQADocument)]
13
- )
13
+ class NAMED_TUPLE(NamedTuple):
14
+ id: str
15
+ title: str
16
+ body: str
17
+ aid: str
18
+ bid: int
14
19
 
15
20
  lookup_field: Constant[str] = "id"
16
21
  fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
@@ -18,5 +23,7 @@ class OrConvQADocumentStore(LZ4DocumentStore):
18
23
 
19
24
  data_cls = NAMED_TUPLE
20
25
 
21
- def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
22
- return OrConvQADocument(**data._asdict())
26
+ def converter(self, data: NAMED_TUPLE) -> Record:
27
+ fields = data._asdict()
28
+ del fields["id"]
29
+ return Record(OrConvQADocument(**fields), IDItem(data.id))