datamaestro-text 2026.1.1__tar.gz → 2026.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/PKG-INFO +2 -8
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/pyproject.toml +29 -11
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/__init__.py +1 -1
- datamaestro_text-2026.2.3/src/datamaestro_text/config/com/github/aagohary/canard.py +46 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +30 -25
- datamaestro_text-2026.2.3/src/datamaestro_text/config/com/github/ikat.py +135 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +41 -37
- datamaestro_text-2026.2.3/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +462 -0
- datamaestro_text-2026.2.3/src/datamaestro_text/config/com/oscar-corpus.py +23 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/com/sentiment140.py +17 -12
- datamaestro_text-2026.2.3/src/datamaestro_text/config/com/smashwords/bookcorpus.py +26 -0
- datamaestro_text-2026.2.3/src/datamaestro_text/config/edu/stanford/aclimdb.py +21 -0
- datamaestro_text-2026.2.3/src/datamaestro_text/config/edu/stanford/glove.py +115 -0
- datamaestro_text-2026.2.3/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +55 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/fr/granddebat.py +57 -48
- datamaestro_text-2026.2.3/src/datamaestro_text/config/gov/nist/ir/covid.py +106 -0
- datamaestro_text-2026.2.3/src/datamaestro_text/config/gov/nist/trec/adhoc.py +526 -0
- datamaestro_text-2026.2.3/src/datamaestro_text/config/gov/nist/trec/tipster.py +290 -0
- datamaestro_text-2026.2.3/src/datamaestro_text/config/io/github/thunlp/fewrel.py +40 -0
- datamaestro_text-2026.2.3/src/datamaestro_text/config/io/metamind/research/wikitext.py +90 -0
- datamaestro_text-2026.2.3/src/datamaestro_text/config/org/grouplens/movielens.py +44 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/org/universaldependencies/french.py +16 -11
- datamaestro_text-2026.2.3/src/datamaestro_text/data/conversation/__init__.py +8 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/conversation/base.py +2 -2
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/conversation/canard.py +3 -4
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/conversation/ikat.py +0 -1
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/conversation/orconvqa.py +3 -3
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/embeddings.py +1 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/ir/__init__.py +1 -1
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/ir/base.py +1 -1
- datamaestro_text-2026.2.3/src/datamaestro_text/data/ir/data.py +1 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/ir/formats.py +2 -1
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/ir/stores.py +1 -1
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/text.py +1 -0
- datamaestro_text-2026.2.3/src/datamaestro_text/datasets/__init__.py +1 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/datasets/irds/data.py +1 -6
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/download/tmdb.py +0 -1
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/test/test_documented.py +2 -2
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/transforms/ir/__init__.py +12 -13
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/utils/shuffle.py +1 -1
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/version.py +2 -2
- datamaestro_text-2026.1.1/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -43
- datamaestro_text-2026.1.1/src/datamaestro_text/config/com/github/ikat.py +0 -121
- datamaestro_text-2026.1.1/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -391
- datamaestro_text-2026.1.1/src/datamaestro_text/config/com/oscar-corpus.py +0 -20
- datamaestro_text-2026.1.1/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -23
- datamaestro_text-2026.1.1/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -16
- datamaestro_text-2026.1.1/src/datamaestro_text/config/edu/stanford/glove.py +0 -81
- datamaestro_text-2026.1.1/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -37
- datamaestro_text-2026.1.1/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -96
- datamaestro_text-2026.1.1/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -386
- datamaestro_text-2026.1.1/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -184
- datamaestro_text-2026.1.1/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -35
- datamaestro_text-2026.1.1/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -72
- datamaestro_text-2026.1.1/src/datamaestro_text/config/org/grouplens/movielens.py +0 -53
- datamaestro_text-2026.1.1/src/datamaestro_text/data/conversation/__init__.py +0 -8
- datamaestro_text-2026.1.1/src/datamaestro_text/data/ir/data.py +0 -1
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/.gitignore +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/LICENSE +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/README.md +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/ai/quac.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/gov/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/debate/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/debate/granddebat.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/ir/cord19.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/ir/csv.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/ir/huggingface.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/ir/trec.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/ir/utils.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/recommendation.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/data/tagging.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/datasets/irds/utils.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/interfaces/plaintext.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/interfaces/trec.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/test/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/test/test_datasets.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/transforms/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/utils/__init__.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/utils/files.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/utils/iter.py +0 -0
- {datamaestro_text-2026.1.1 → datamaestro_text-2026.2.3}/src/datamaestro_text/utils/randomstream.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version: 2026.1.1
|
|
3
|
+
Version: 2026.2.3
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
5
|
Project-URL: Homepage, https://github.com/experimaestro/datamaestro_text
|
|
6
6
|
Project-URL: Documentation, https://datamaestro-text.readthedocs.io/en/latest/
|
|
@@ -22,15 +22,9 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
22
22
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
23
|
Requires-Python: >=3.10
|
|
24
24
|
Requires-Dist: attrs
|
|
25
|
-
Requires-Dist: datamaestro>=1.
|
|
25
|
+
Requires-Dist: datamaestro>=1.8.0
|
|
26
26
|
Requires-Dist: experimaestro
|
|
27
27
|
Requires-Dist: ir-datasets>=0.5.8
|
|
28
|
-
Provides-Extra: dev
|
|
29
|
-
Requires-Dist: docutils; extra == 'dev'
|
|
30
|
-
Requires-Dist: flake8; extra == 'dev'
|
|
31
|
-
Requires-Dist: pytest; extra == 'dev'
|
|
32
|
-
Requires-Dist: sphinx<8; extra == 'dev'
|
|
33
|
-
Requires-Dist: sphobjinv; extra == 'dev'
|
|
34
28
|
Description-Content-Type: text/markdown
|
|
35
29
|
|
|
36
30
|
[](https://github.com/pre-commit/pre-commit) [](https://badge.fury.io/py/datamaestro-text)
|
|
@@ -26,21 +26,12 @@ classifiers = [
|
|
|
26
26
|
requires-python = ">=3.10"
|
|
27
27
|
dynamic = ["version"]
|
|
28
28
|
dependencies = [
|
|
29
|
-
"datamaestro>=1.
|
|
29
|
+
"datamaestro>=1.8.0",
|
|
30
30
|
"ir_datasets>=0.5.8",
|
|
31
31
|
"attrs",
|
|
32
32
|
"experimaestro",
|
|
33
33
|
]
|
|
34
34
|
|
|
35
|
-
[project.optional-dependencies]
|
|
36
|
-
dev = [
|
|
37
|
-
"pytest",
|
|
38
|
-
"docutils",
|
|
39
|
-
"sphobjinv",
|
|
40
|
-
"flake8",
|
|
41
|
-
"sphinx<8",
|
|
42
|
-
]
|
|
43
|
-
|
|
44
35
|
[project.urls]
|
|
45
36
|
Homepage = "https://github.com/experimaestro/datamaestro_text"
|
|
46
37
|
Documentation = "https://datamaestro-text.readthedocs.io/en/latest/"
|
|
@@ -79,9 +70,36 @@ norecursedirs = ["node_modules"]
|
|
|
79
70
|
[dependency-groups]
|
|
80
71
|
dev = [
|
|
81
72
|
"docutils>=0.21.2",
|
|
82
|
-
"flake8>=7.3.0",
|
|
83
73
|
"git-cliff>=2.11.0",
|
|
84
74
|
"pytest>=8.4.1",
|
|
75
|
+
"ruff>=0.8",
|
|
85
76
|
"sphinx>=7,<8",
|
|
86
77
|
"sphobjinv>=2.3.1.3",
|
|
87
78
|
]
|
|
79
|
+
docs = [
|
|
80
|
+
"datamaestro>=0.8.5",
|
|
81
|
+
"experimaestro>=2.0.0b29", # Pre-release needed for EnumType.name() fix
|
|
82
|
+
"myst-parser>=0.18.0",
|
|
83
|
+
"sphinx>=6,<8", # experimaestro.sphinx uses restify removed in Sphinx 8
|
|
84
|
+
"sphinx-codeautolink>=0.15.0",
|
|
85
|
+
"sphinx-rtd-theme>=3.1.0",
|
|
86
|
+
"sphinx-toolbox>=4.1.2",
|
|
87
|
+
]
|
|
88
|
+
|
|
89
|
+
[tool.ruff]
|
|
90
|
+
line-length = 88
|
|
91
|
+
target-version = "py310"
|
|
92
|
+
|
|
93
|
+
[tool.ruff.lint]
|
|
94
|
+
select = [
|
|
95
|
+
"E", # pycodestyle errors
|
|
96
|
+
"F", # pyflakes
|
|
97
|
+
"W", # pycodestyle warnings
|
|
98
|
+
"C90", # mccabe complexity
|
|
99
|
+
]
|
|
100
|
+
ignore = [
|
|
101
|
+
"E501", # line too long (handled by formatter)
|
|
102
|
+
]
|
|
103
|
+
|
|
104
|
+
[tool.ruff.lint.mccabe]
|
|
105
|
+
max-complexity = 20
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
|
|
2
|
+
from datamaestro.download.single import FileDownloader
|
|
3
|
+
from datamaestro.utils import HashCheck
|
|
4
|
+
|
|
5
|
+
from datamaestro.data.ml import Supervised
|
|
6
|
+
from datamaestro_text.data.conversation.canard import CanardDataset
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@datatags("conversation", "context", "query")
|
|
10
|
+
@datatasks("query rewriting")
|
|
11
|
+
@dataset(url="https://sites.google.com/view/qanta/projects/canard", id="")
|
|
12
|
+
class Main(Dataset):
|
|
13
|
+
"""Question-in-context rewriting
|
|
14
|
+
|
|
15
|
+
CANARD is a dataset for question-in-context rewriting that consists of
|
|
16
|
+
questions each given in a dialog context together with a context-independent
|
|
17
|
+
rewriting of the question. The context of each question is the dialog
|
|
18
|
+
utterances that precede the question. CANARD can be used to evaluate
|
|
19
|
+
question rewriting models that handle important linguistic phenomena such as
|
|
20
|
+
co-reference and ellipsis resolution.
|
|
21
|
+
|
|
22
|
+
Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
TRAIN = FileDownloader(
|
|
26
|
+
"train.json",
|
|
27
|
+
"https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/train.json",
|
|
28
|
+
checker=HashCheck("73624ac646fb81e09b0fd7f01370ada3"),
|
|
29
|
+
)
|
|
30
|
+
DEV = FileDownloader(
|
|
31
|
+
"dev.json",
|
|
32
|
+
"https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/dev.json",
|
|
33
|
+
checker=HashCheck("c84525631a83bc771c58ff31f4a9b601"),
|
|
34
|
+
)
|
|
35
|
+
TEST = FileDownloader(
|
|
36
|
+
"test.json",
|
|
37
|
+
"https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/test.json",
|
|
38
|
+
checker=HashCheck("3fc14d0078e7a5056f5da571728f024e"),
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
def config(self) -> Supervised:
|
|
42
|
+
return Supervised.C(
|
|
43
|
+
train=CanardDataset.C(path=self.TRAIN.path),
|
|
44
|
+
validation=CanardDataset.C(path=self.DEV.path),
|
|
45
|
+
test=CanardDataset.C(path=self.TEST.path),
|
|
46
|
+
)
|
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
import re
|
|
4
4
|
import json
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from datamaestro.definitions import datatasks, datatags, dataset
|
|
6
|
+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
|
|
7
7
|
from datamaestro.data.ml import Supervised
|
|
8
8
|
from datamaestro.download import reference
|
|
9
|
-
from datamaestro.download.archive import zipdownloader
|
|
9
|
+
from datamaestro.download.archive import ZipDownloader
|
|
10
10
|
from datamaestro.download.wayback import wayback_documents
|
|
11
11
|
from datamaestro.utils import HashCheck
|
|
12
12
|
from datamaestro_text.data.conversation.qrecc import QReCCDataset
|
|
@@ -19,17 +19,12 @@ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
|
|
|
19
19
|
|
|
20
20
|
@datatags("conversation", "context", "query")
|
|
21
21
|
@datatasks("query rewriting")
|
|
22
|
-
@zipdownloader(
|
|
23
|
-
"data",
|
|
24
|
-
"https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
|
|
25
|
-
checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
|
|
26
|
-
)
|
|
27
22
|
@dataset(
|
|
28
23
|
url="https://github.com/apple/ml-qrecc",
|
|
29
24
|
doi="https://doi.org/10.48550/arXiv.2010.04898",
|
|
30
25
|
id="",
|
|
31
26
|
)
|
|
32
|
-
|
|
27
|
+
class Main(Dataset):
|
|
33
28
|
"""Open-Domain Question Answering Goes Conversational via Question Rewriting
|
|
34
29
|
|
|
35
30
|
We introduce QReCC (Question Rewriting in Conversational Context), an
|
|
@@ -39,34 +34,44 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
|
|
|
39
34
|
answering that includes the individual subtasks of question rewriting,
|
|
40
35
|
passage retrieval and reading comprehension
|
|
41
36
|
"""
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
37
|
+
|
|
38
|
+
DATA = ZipDownloader(
|
|
39
|
+
"data",
|
|
40
|
+
"https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
|
|
41
|
+
checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
|
|
45
42
|
)
|
|
46
43
|
|
|
44
|
+
def config(self) -> Supervised:
|
|
45
|
+
return Supervised.C(
|
|
46
|
+
train=QReCCDataset.C(path=self.DATA.path / "qrecc_train.json"),
|
|
47
|
+
test=QReCCDataset.C(path=self.DATA.path / "qrecc_test.json"),
|
|
48
|
+
)
|
|
49
|
+
|
|
47
50
|
|
|
48
51
|
@dataset(
|
|
49
52
|
url="https://github.com/apple/ml-qrecc",
|
|
50
53
|
doi="https://doi.org/10.48550/arXiv.2010.04898",
|
|
51
54
|
)
|
|
52
|
-
class Content(
|
|
55
|
+
class Content(Dataset):
|
|
53
56
|
"""QReCC mentionned URLs content"""
|
|
54
57
|
|
|
55
|
-
|
|
56
|
-
def __create_dataset__(dataset, options=None):
|
|
57
|
-
ds = reference(reference=main).setup(dataset, options)
|
|
58
|
-
documents_path = wayback_documents(
|
|
59
|
-
"20191127", lambda: Content._urls(ds), name="wayback.jsonl"
|
|
60
|
-
).setup(dataset, options)
|
|
58
|
+
MAIN = reference(reference=Main)
|
|
61
59
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
60
|
+
WAYBACK_DOCS = wayback_documents(
|
|
61
|
+
"20191127",
|
|
62
|
+
lambda: Content._urls(Content.MAIN.prepare()),
|
|
63
|
+
name="wayback.jsonl",
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
STORE = lz4docstore_builder(
|
|
67
|
+
"store",
|
|
68
|
+
lambda: Content._documents(Content.WAYBACK_DOCS.path),
|
|
69
|
+
SimpleJsonDocument,
|
|
70
|
+
"id",
|
|
71
|
+
)
|
|
68
72
|
|
|
69
|
-
|
|
73
|
+
def config(self) -> LZ4JSONLDocumentStore:
|
|
74
|
+
return LZ4JSONLDocumentStore.C(jsonl_path=self.STORE.path)
|
|
70
75
|
|
|
71
76
|
@staticmethod
|
|
72
77
|
def _documents(path: Path):
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
+
|
|
3
|
+
from datamaestro.download import reference
|
|
4
|
+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
|
|
5
|
+
from datamaestro_text.data.conversation.base import ConversationUserTopics
|
|
6
|
+
from datamaestro_text.data.ir import Adhoc
|
|
7
|
+
|
|
8
|
+
from datamaestro.utils import HashCheck
|
|
9
|
+
from datamaestro.context import DatafolderPath
|
|
10
|
+
from datamaestro.download.single import FileDownloader
|
|
11
|
+
from datamaestro_text.data.conversation.ikat import IkatConversations
|
|
12
|
+
from datamaestro.download.links import linkfolder
|
|
13
|
+
|
|
14
|
+
from datamaestro_text.data.ir.stores import IKatClueWeb22DocumentStore
|
|
15
|
+
from datamaestro_text.data.ir.trec import TrecAdhocAssessments
|
|
16
|
+
from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataset()
|
|
20
|
+
class Clueweb22(Dataset):
|
|
21
|
+
# Number of documents in the dataset
|
|
22
|
+
count = 116_838_987
|
|
23
|
+
|
|
24
|
+
JSONL_FOLDER = linkfolder(
|
|
25
|
+
"documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
STORE_PATH = lz4docstore_builder(
|
|
29
|
+
"store",
|
|
30
|
+
IKatClueWeb22DocumentStore.generator(
|
|
31
|
+
JSONL_FOLDER,
|
|
32
|
+
"ikat_2023_passages_jsonl.sha256sums",
|
|
33
|
+
"ikat_2023_passages_hashes.tsv.bz2",
|
|
34
|
+
),
|
|
35
|
+
IKatClueWeb22DocumentStore.Document,
|
|
36
|
+
"id",
|
|
37
|
+
count_hint=count,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
def config(self) -> IKatClueWeb22DocumentStore:
|
|
41
|
+
return IKatClueWeb22DocumentStore.C(path=self.STORE_PATH.path, count=self.count)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@datatags("conversation", "context", "query")
|
|
45
|
+
@datatasks("conversational search", "query rewriting")
|
|
46
|
+
@dataset(
|
|
47
|
+
id=".2025",
|
|
48
|
+
url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
|
|
49
|
+
)
|
|
50
|
+
class Test2025(Dataset):
|
|
51
|
+
"""Question-in-context rewriting
|
|
52
|
+
|
|
53
|
+
iKAT is a test dataset for question-in-context rewriting that consists of
|
|
54
|
+
questions each given in a dialog context together with a context-independent
|
|
55
|
+
rewriting of the question.
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
DOCUMENTS = reference(varname="documents", reference=Clueweb22)
|
|
59
|
+
TOPICS = FileDownloader(
|
|
60
|
+
"topics.json",
|
|
61
|
+
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
|
|
62
|
+
checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
def config(self) -> Adhoc:
|
|
66
|
+
return Adhoc.C(
|
|
67
|
+
topics=ConversationUserTopics.C(
|
|
68
|
+
conversations=IkatConversations.C(path=self.TOPICS.path)
|
|
69
|
+
),
|
|
70
|
+
# TODO: add when available
|
|
71
|
+
assessments=TrecAdhocAssessments.C(path="/to/do"),
|
|
72
|
+
documents=self.DOCUMENTS.prepare(),
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@datatags("conversation", "context", "query")
|
|
77
|
+
@datatasks("conversational search", "query rewriting")
|
|
78
|
+
@dataset(
|
|
79
|
+
id=".2024",
|
|
80
|
+
url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
|
|
81
|
+
)
|
|
82
|
+
class Test2024(Dataset):
|
|
83
|
+
"""iKAT 2024 dataset"""
|
|
84
|
+
|
|
85
|
+
DOCUMENTS = reference(varname="documents", reference=Clueweb22)
|
|
86
|
+
QRELS = FileDownloader(
|
|
87
|
+
"qrels",
|
|
88
|
+
"https://trec.nist.gov/data/ikat/2024-qrels.txt",
|
|
89
|
+
checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
|
|
90
|
+
)
|
|
91
|
+
TOPICS = FileDownloader(
|
|
92
|
+
"topics.json",
|
|
93
|
+
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
|
|
94
|
+
checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
def config(self) -> Adhoc:
|
|
98
|
+
return Adhoc.C(
|
|
99
|
+
topics=ConversationUserTopics.C(
|
|
100
|
+
conversations=IkatConversations.C(path=self.TOPICS.path)
|
|
101
|
+
),
|
|
102
|
+
assessments=TrecAdhocAssessments.C(path=self.QRELS.path),
|
|
103
|
+
documents=self.DOCUMENTS.prepare(),
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@datatags("conversation", "context", "query")
|
|
108
|
+
@datatasks("conversational search", "query rewriting")
|
|
109
|
+
@dataset(
|
|
110
|
+
id=".2023",
|
|
111
|
+
url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
|
|
112
|
+
)
|
|
113
|
+
class Test2023(Dataset):
|
|
114
|
+
"""iKAT 2023 dataset"""
|
|
115
|
+
|
|
116
|
+
DOCUMENTS = reference(varname="documents", reference=Clueweb22)
|
|
117
|
+
QRELS = FileDownloader(
|
|
118
|
+
"qrels",
|
|
119
|
+
"https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
|
|
120
|
+
checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
|
|
121
|
+
)
|
|
122
|
+
TOPICS = FileDownloader(
|
|
123
|
+
"topics.json",
|
|
124
|
+
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
|
|
125
|
+
checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
def config(self) -> Adhoc:
|
|
129
|
+
return Adhoc.C(
|
|
130
|
+
topics=ConversationUserTopics.C(
|
|
131
|
+
conversations=IkatConversations.C(path=self.TOPICS.path)
|
|
132
|
+
),
|
|
133
|
+
assessments=TrecAdhocAssessments.C(path=self.QRELS.path),
|
|
134
|
+
documents=self.DOCUMENTS.prepare(),
|
|
135
|
+
)
|
|
@@ -4,8 +4,8 @@ import gzip
|
|
|
4
4
|
import json
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import Iterator
|
|
7
|
-
from datamaestro.definitions import datatasks, datatags, dataset
|
|
8
|
-
from datamaestro.download.single import filedownloader
|
|
7
|
+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
|
|
8
|
+
from datamaestro.download.single import FileDownloader
|
|
9
9
|
from datamaestro.utils import HashCheck
|
|
10
10
|
|
|
11
11
|
|
|
@@ -18,26 +18,10 @@ from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader
|
|
|
18
18
|
|
|
19
19
|
@datatags("conversation", "context", "query")
|
|
20
20
|
@datatasks("query rewriting")
|
|
21
|
-
@filedownloader(
|
|
22
|
-
"train.jsonl",
|
|
23
|
-
"https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/train.txt",
|
|
24
|
-
checker=HashCheck("7513a9ef12d8b7a4471166dc4fef77b7"),
|
|
25
|
-
)
|
|
26
|
-
@filedownloader(
|
|
27
|
-
"dev.jsonl",
|
|
28
|
-
"https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/dev.txt",
|
|
29
|
-
checker=HashCheck("7765658995cc9ffd5eb39a400d814b20"),
|
|
30
|
-
)
|
|
31
|
-
@filedownloader(
|
|
32
|
-
"test.jsonl",
|
|
33
|
-
"https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/test.txt",
|
|
34
|
-
checker=HashCheck("0cf3a755f06297b9c02e7db45f8dc8be"),
|
|
35
|
-
)
|
|
36
21
|
@dataset(
|
|
37
|
-
Supervised,
|
|
38
22
|
url="https://github.com/prdwb/orconvqa-release",
|
|
39
23
|
)
|
|
40
|
-
|
|
24
|
+
class Preprocessed(Dataset):
|
|
41
25
|
"""Open-Retrieval Conversational Question Answering datasets
|
|
42
26
|
|
|
43
27
|
OrConvQA is an aggregation of three existing datasets:
|
|
@@ -48,11 +32,29 @@ def preprocessed(train, dev, test):
|
|
|
48
32
|
|
|
49
33
|
Each dataset is an instance of :class:`datamaestro_text.data.conversation.OrConvQADataset`
|
|
50
34
|
"""
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
|
|
35
|
+
|
|
36
|
+
TRAIN = FileDownloader(
|
|
37
|
+
"train.jsonl",
|
|
38
|
+
"https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/train.txt",
|
|
39
|
+
checker=HashCheck("7513a9ef12d8b7a4471166dc4fef77b7"),
|
|
40
|
+
)
|
|
41
|
+
DEV = FileDownloader(
|
|
42
|
+
"dev.jsonl",
|
|
43
|
+
"https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/dev.txt",
|
|
44
|
+
checker=HashCheck("7765658995cc9ffd5eb39a400d814b20"),
|
|
45
|
+
)
|
|
46
|
+
TEST = FileDownloader(
|
|
47
|
+
"test.jsonl",
|
|
48
|
+
"https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/test.txt",
|
|
49
|
+
checker=HashCheck("0cf3a755f06297b9c02e7db45f8dc8be"),
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
def config(self) -> Supervised:
|
|
53
|
+
return Supervised.C(
|
|
54
|
+
train=OrConvQADataset.C(path=self.TRAIN.path),
|
|
55
|
+
validation=OrConvQADataset.C(path=self.DEV.path),
|
|
56
|
+
test=OrConvQADataset.C(path=self.TEST.path),
|
|
57
|
+
)
|
|
56
58
|
|
|
57
59
|
|
|
58
60
|
def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
|
|
@@ -63,21 +65,10 @@ def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED
|
|
|
63
65
|
yield OrConvQADocumentStore.NAMED_TUPLE(**data)
|
|
64
66
|
|
|
65
67
|
|
|
66
|
-
@lz4docstore_downloader(
|
|
67
|
-
"all_blocks",
|
|
68
|
-
"https://ciir.cs.umass.edu/downloads/ORConvQA/all_blocks.txt.gz",
|
|
69
|
-
orConvQADocumentReader,
|
|
70
|
-
OrConvQADocumentStore.NAMED_TUPLE,
|
|
71
|
-
"id",
|
|
72
|
-
checker=HashCheck("1095a3408690e7bbe4d8a87a2bae6356"),
|
|
73
|
-
size=5_086_902_800,
|
|
74
|
-
count_hint=11_377_951,
|
|
75
|
-
)
|
|
76
68
|
@dataset(
|
|
77
|
-
OrConvQADocumentStore,
|
|
78
69
|
url="https://github.com/prdwb/orconvqa-release",
|
|
79
70
|
)
|
|
80
|
-
|
|
71
|
+
class Passages(Dataset):
|
|
81
72
|
"""orConvQA wikipedia files
|
|
82
73
|
|
|
83
74
|
OrConvQA is an aggregation of three existing datasets:
|
|
@@ -86,4 +77,17 @@ def passages(all_blocks):
|
|
|
86
77
|
1. the CANARD dataset that consists of context-independent rewrites of QuAC questions, and
|
|
87
78
|
3. the Wikipedia corpus that serves as the knowledge source of answering questions.
|
|
88
79
|
"""
|
|
89
|
-
|
|
80
|
+
|
|
81
|
+
ALL_BLOCKS = lz4docstore_downloader(
|
|
82
|
+
"all_blocks",
|
|
83
|
+
"https://ciir.cs.umass.edu/downloads/ORConvQA/all_blocks.txt.gz",
|
|
84
|
+
orConvQADocumentReader,
|
|
85
|
+
OrConvQADocumentStore.NAMED_TUPLE,
|
|
86
|
+
"id",
|
|
87
|
+
checker=HashCheck("1095a3408690e7bbe4d8a87a2bae6356"),
|
|
88
|
+
size=5_086_902_800,
|
|
89
|
+
count_hint=11_377_951,
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
def config(self) -> OrConvQADocumentStore:
|
|
93
|
+
return OrConvQADocumentStore.C(path=self.ALL_BLOCKS.path, count=11_377_951)
|