datamaestro-text 2026.1.1__py3-none-any.whl → 2026.2.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (44)
  1. datamaestro_text/__init__.py +1 -1
  2. datamaestro_text/config/com/github/aagohary/canard.py +27 -24
  3. datamaestro_text/config/com/github/apple/ml-qrecc.py +30 -25
  4. datamaestro_text/config/com/github/ikat.py +76 -62
  5. datamaestro_text/config/com/github/prdwb/orconvqa.py +41 -37
  6. datamaestro_text/config/com/microsoft/msmarco/passage.py +278 -207
  7. datamaestro_text/config/com/oscar-corpus.py +13 -10
  8. datamaestro_text/config/com/sentiment140.py +17 -12
  9. datamaestro_text/config/com/smashwords/bookcorpus.py +13 -10
  10. datamaestro_text/config/edu/stanford/aclimdb.py +14 -9
  11. datamaestro_text/config/edu/stanford/glove.py +66 -32
  12. datamaestro_text/config/edu/upenn/ldc/aquaint.py +35 -17
  13. datamaestro_text/config/fr/granddebat.py +57 -48
  14. datamaestro_text/config/gov/nist/ir/covid.py +62 -52
  15. datamaestro_text/config/gov/nist/trec/adhoc.py +395 -255
  16. datamaestro_text/config/gov/nist/trec/tipster.py +170 -64
  17. datamaestro_text/config/io/github/thunlp/fewrel.py +20 -15
  18. datamaestro_text/config/io/metamind/research/wikitext.py +51 -33
  19. datamaestro_text/config/org/grouplens/movielens.py +28 -37
  20. datamaestro_text/config/org/universaldependencies/french.py +16 -11
  21. datamaestro_text/data/conversation/__init__.py +6 -6
  22. datamaestro_text/data/conversation/base.py +2 -2
  23. datamaestro_text/data/conversation/canard.py +3 -4
  24. datamaestro_text/data/conversation/ikat.py +0 -1
  25. datamaestro_text/data/conversation/orconvqa.py +3 -3
  26. datamaestro_text/data/embeddings.py +1 -0
  27. datamaestro_text/data/ir/__init__.py +1 -1
  28. datamaestro_text/data/ir/base.py +1 -1
  29. datamaestro_text/data/ir/data.py +1 -1
  30. datamaestro_text/data/ir/formats.py +2 -1
  31. datamaestro_text/data/ir/stores.py +1 -1
  32. datamaestro_text/data/text.py +1 -0
  33. datamaestro_text/datasets/__init__.py +1 -0
  34. datamaestro_text/datasets/irds/data.py +1 -6
  35. datamaestro_text/download/tmdb.py +0 -1
  36. datamaestro_text/test/test_documented.py +2 -2
  37. datamaestro_text/transforms/ir/__init__.py +12 -13
  38. datamaestro_text/utils/shuffle.py +1 -1
  39. datamaestro_text/version.py +2 -2
  40. {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/METADATA +2 -8
  41. {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/RECORD +44 -43
  42. {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/WHEEL +0 -0
  43. {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/entry_points.txt +0 -0
  44. {datamaestro_text-2026.1.1.dist-info → datamaestro_text-2026.2.3.dist-info}/licenses/LICENSE +0 -0
datamaestro_text/__init__.py

@@ -1,6 +1,6 @@
 import datamaestro
 
-from .version import version, version_tuple
+from .version import version as version, version_tuple as version_tuple
 
 
 class Repository(datamaestro.Repository):
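
The aliased re-exports are the standard explicit re-export idiom: under a strict type checker (for example mypy with --no-implicit-reexport, which --strict enables), a plain "from .version import version" is treated as an internal import, while the "as version" form marks the name as part of the package's public API. A minimal sketch of the idiom in a hypothetical package:

# pkg/__init__.py (hypothetical package layout)
# The redundant alias tells strict type checkers that `version` is
# re-exported deliberately rather than imported for internal use.
from .version import version as version

# Listing the name in __all__ achieves the same effect:
__all__ = ["version"]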
datamaestro_text/config/com/github/aagohary/canard.py

@@ -1,5 +1,5 @@
-from datamaestro.definitions import datatasks, datatags, dataset
-from datamaestro.download.single import filedownloader
+from datamaestro.definitions import Dataset, datatasks, datatags, dataset
+from datamaestro.download.single import FileDownloader
 from datamaestro.utils import HashCheck
 
 from datamaestro.data.ml import Supervised
@@ -8,23 +8,8 @@ from datamaestro_text.data.conversation.canard import CanardDataset
 
 @datatags("conversation", "context", "query")
 @datatasks("query rewriting")
-@filedownloader(
-    "train.json",
-    "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/train.json",
-    checker=HashCheck("73624ac646fb81e09b0fd7f01370ada3"),
-)
-@filedownloader(
-    "dev.json",
-    "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/dev.json",
-    checker=HashCheck("c84525631a83bc771c58ff31f4a9b601"),
-)
-@filedownloader(
-    "test.json",
-    "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/test.json",
-    checker=HashCheck("3fc14d0078e7a5056f5da571728f024e"),
-)
-@dataset(Supervised, url="https://sites.google.com/view/qanta/projects/canard", id="")
-def main(train, dev, test):
+@dataset(url="https://sites.google.com/view/qanta/projects/canard", id="")
+class Main(Dataset):
     """Question-in-context rewriting
 
     CANARD is a dataset for question-in-context rewriting that consists of
@@ -36,8 +21,26 @@ def main(train, dev, test):
 
     Each dataset is an instance of :class:`datamaestro_text.data.conversation.CanardDataset`
     """
-    return {
-        "train": CanardDataset.C(path=train),
-        "validation": CanardDataset.C(path=dev),
-        "test": CanardDataset.C(path=test),
-    }
+
+    TRAIN = FileDownloader(
+        "train.json",
+        "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/train.json",
+        checker=HashCheck("73624ac646fb81e09b0fd7f01370ada3"),
+    )
+    DEV = FileDownloader(
+        "dev.json",
+        "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/dev.json",
+        checker=HashCheck("c84525631a83bc771c58ff31f4a9b601"),
+    )
+    TEST = FileDownloader(
+        "test.json",
+        "https://raw.githubusercontent.com/aagohary/canard/refs/heads/master/data/release/test.json",
+        checker=HashCheck("3fc14d0078e7a5056f5da571728f024e"),
+    )
+
+    def config(self) -> Supervised:
+        return Supervised.C(
+            train=CanardDataset.C(path=self.TRAIN.path),
+            validation=CanardDataset.C(path=self.DEV.path),
+            test=CanardDataset.C(path=self.TEST.path),
+        )
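
This file shows the pattern repeated across the whole release: a dataset is no longer a function wrapped in stacked @filedownloader decorators, but a Dataset subclass whose downloads are class attributes and whose final configuration is assembled in config(). A sketch of how such a dataset would be consumed, assuming datamaestro's prepare_dataset helper and an id derived from the config module path (both assumptions, not shown in this diff):

# Hypothetical consumer; `prepare_dataset` and the dataset id follow
# datamaestro's usual conventions but are not confirmed by this diff.
from datamaestro import prepare_dataset

# First use downloads train/dev/test (verifying each MD5 via HashCheck),
# then returns the Supervised configuration built by Main.config().
canard = prepare_dataset("com.github.aagohary.canard")
print(canard.train.path)  # local path of the downloaded train.json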
datamaestro_text/config/com/github/apple/ml-qrecc.py

@@ -3,10 +3,10 @@
 import re
 import json
 from pathlib import Path
-from datamaestro.definitions import datatasks, datatags, dataset
+from datamaestro.definitions import Dataset, datatasks, datatags, dataset
 from datamaestro.data.ml import Supervised
 from datamaestro.download import reference
-from datamaestro.download.archive import zipdownloader
+from datamaestro.download.archive import ZipDownloader
 from datamaestro.download.wayback import wayback_documents
 from datamaestro.utils import HashCheck
 from datamaestro_text.data.conversation.qrecc import QReCCDataset
@@ -19,17 +19,12 @@ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
 
 @datatags("conversation", "context", "query")
 @datatasks("query rewriting")
-@zipdownloader(
-    "data",
-    "https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
-    checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
-)
 @dataset(
     url="https://github.com/apple/ml-qrecc",
     doi="https://doi.org/10.48550/arXiv.2010.04898",
     id="",
 )
-def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
+class Main(Dataset):
     """Open-Domain Question Answering Goes Conversational via Question Rewriting
 
     We introduce QReCC (Question Rewriting in Conversational Context), an
@@ -39,34 +34,44 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
     answering that includes the individual subtasks of question rewriting,
     passage retrieval and reading comprehension
     """
-    return Supervised.C(
-        train=QReCCDataset.C(path=data / "qrecc_train.json"),
-        test=QReCCDataset.C(path=data / "qrecc_test.json"),
+
+    DATA = ZipDownloader(
+        "data",
+        "https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
+        checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
     )
 
+    def config(self) -> Supervised:
+        return Supervised.C(
+            train=QReCCDataset.C(path=self.DATA.path / "qrecc_train.json"),
+            test=QReCCDataset.C(path=self.DATA.path / "qrecc_test.json"),
+        )
+
 
 @dataset(
     url="https://github.com/apple/ml-qrecc",
     doi="https://doi.org/10.48550/arXiv.2010.04898",
 )
-class Content(LZ4JSONLDocumentStore):
+class Content(Dataset):
     """QReCC mentionned URLs content"""
 
-    @staticmethod
-    def __create_dataset__(dataset, options=None):
-        ds = reference(reference=main).setup(dataset, options)
-        documents_path = wayback_documents(
-            "20191127", lambda: Content._urls(ds), name="wayback.jsonl"
-        ).setup(dataset, options)
+    MAIN = reference(reference=Main)
 
-        store_path = lz4docstore_builder(
-            "store",
-            lambda: Content._documents(documents_path),
-            SimpleJsonDocument,
-            "id",
-        ).setup(dataset, options)
+    WAYBACK_DOCS = wayback_documents(
+        "20191127",
+        lambda: Content._urls(Content.MAIN.prepare()),
+        name="wayback.jsonl",
+    )
+
+    STORE = lz4docstore_builder(
+        "store",
+        lambda: Content._documents(Content.WAYBACK_DOCS.path),
+        SimpleJsonDocument,
+        "id",
+    )
 
-        return Content.C(jsonl_path=store_path)
+    def config(self) -> LZ4JSONLDocumentStore:
+        return LZ4JSONLDocumentStore.C(jsonl_path=self.STORE.path)
 
     @staticmethod
     def _documents(path: Path):
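
Note the lambdas in Content: class attributes are evaluated when the module is imported, so the wayback crawl and the LZ4 docstore build wrap their inputs in zero-argument callables that only run when the artifact is actually built. A dependency-free sketch of that deferred-producer pattern (hypothetical names, not datamaestro's API):

# Why the producers above are lambdas: passing a value instead of a
# callable would trigger the expensive work at import time.
class LazyBuilder:
    def __init__(self, produce):
        self.produce = produce  # zero-argument callable, not a value

    def build(self):
        return self.produce()  # work happens only on demand


store = LazyBuilder(lambda: sum(range(1_000_000)))  # nothing computed yet
print(store.build())  # the sum is computed here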
datamaestro_text/config/com/github/ikat.py

@@ -1,14 +1,13 @@
 # See documentation on https://datamaestro.readthedocs.io
 
-import bz2
 from datamaestro.download import reference
-from datamaestro.definitions import datatasks, datatags, dataset
+from datamaestro.definitions import Dataset, datatasks, datatags, dataset
 from datamaestro_text.data.conversation.base import ConversationUserTopics
 from datamaestro_text.data.ir import Adhoc
 
 from datamaestro.utils import HashCheck
 from datamaestro.context import DatafolderPath
-from datamaestro.download.single import filedownloader
+from datamaestro.download.single import FileDownloader
 from datamaestro_text.data.conversation.ikat import IkatConversations
 from datamaestro.download.links import linkfolder
 
@@ -17,105 +16,120 @@ from datamaestro_text.data.ir.trec import TrecAdhocAssessments
 from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
 
 
-@dataset(as_prepare=True)
-def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
+@dataset()
+class Clueweb22(Dataset):
     # Number of documents in the dataset
     count = 116_838_987
 
-    jsonl_folder = linkfolder(
+    JSONL_FOLDER = linkfolder(
         "documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
-    ).setup(dataset, options)
-    store_path = lz4docstore_builder(
+    )
+
+    STORE_PATH = lz4docstore_builder(
         "store",
         IKatClueWeb22DocumentStore.generator(
-            jsonl_folder,
-            jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
-            jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
+            JSONL_FOLDER,
+            "ikat_2023_passages_jsonl.sha256sums",
+            "ikat_2023_passages_hashes.tsv.bz2",
         ),
         IKatClueWeb22DocumentStore.Document,
        "id",
         count_hint=count,
-    ).setup(dataset, options)
+    )
 
-    return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
+    def config(self) -> IKatClueWeb22DocumentStore:
+        return IKatClueWeb22DocumentStore.C(path=self.STORE_PATH.path, count=self.count)
 
 
 @datatags("conversation", "context", "query")
 @datatasks("conversational search", "query rewriting")
-@reference("documents", clueweb22)
-@filedownloader(
-    "topics.json",
-    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
-    checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
-)
 @dataset(
-    id="2025",
+    id=".2025",
     url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
 )
-def test_2025(topics, documents) -> Adhoc.C:
+class Test2025(Dataset):
     """Question-in-context rewriting
 
     iKAT is a test dataset for question-in-context rewriting that consists of
     questions each given in a dialog context together with a context-independent
     rewriting of the question.
     """
-    return Adhoc.C(
-        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
-        # TODO: add when available
-        assessments=TrecAdhocAssessments.C(path="/to/do"),
-        documents=documents,
+
+    DOCUMENTS = reference(varname="documents", reference=Clueweb22)
+    TOPICS = FileDownloader(
+        "topics.json",
+        "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
+        checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
     )
 
+    def config(self) -> Adhoc:
+        return Adhoc.C(
+            topics=ConversationUserTopics.C(
+                conversations=IkatConversations.C(path=self.TOPICS.path)
+            ),
+            # TODO: add when available
+            assessments=TrecAdhocAssessments.C(path="/to/do"),
+            documents=self.DOCUMENTS.prepare(),
+        )
+
 
 @datatags("conversation", "context", "query")
 @datatasks("conversational search", "query rewriting")
-@reference("documents", clueweb22)
-@filedownloader(
-    "qrels",
-    "https://trec.nist.gov/data/ikat/2024-qrels.txt",
-    checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
-)
-@filedownloader(
-    "topics.json",
-    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
-    checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
-)
 @dataset(
-    Adhoc,
-    id="2024",
+    id=".2024",
     url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
 )
-def test_2024(topics, qrels, documents) -> Adhoc.C:
+class Test2024(Dataset):
     """iKAT 2024 dataset"""
-    return Adhoc.C(
-        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
-        assessments=TrecAdhocAssessments.C(path=qrels),
-        documents=documents,
+
+    DOCUMENTS = reference(varname="documents", reference=Clueweb22)
+    QRELS = FileDownloader(
+        "qrels",
+        "https://trec.nist.gov/data/ikat/2024-qrels.txt",
+        checker=HashCheck("57f958903ed1c12bbac207f62800814f"),
+    )
+    TOPICS = FileDownloader(
+        "topics.json",
+        "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2024/data/2024_test_topics.json",
+        checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
     )
 
+    def config(self) -> Adhoc:
+        return Adhoc.C(
+            topics=ConversationUserTopics.C(
+                conversations=IkatConversations.C(path=self.TOPICS.path)
+            ),
+            assessments=TrecAdhocAssessments.C(path=self.QRELS.path),
+            documents=self.DOCUMENTS.prepare(),
+        )
+
 
 @datatags("conversation", "context", "query")
 @datatasks("conversational search", "query rewriting")
-@reference("documents", clueweb22)
-@filedownloader(
-    "qrels",
-    "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
-    checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
-)
-@filedownloader(
-    "topics.json",
-    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
-    checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
-)
 @dataset(
-    Adhoc,
-    id="2023",
+    id=".2023",
     url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
 )
-def test_2023(topics, qrels, documents) -> Adhoc.C:
+class Test2023(Dataset):
     """iKAT 2023 dataset"""
-    return Adhoc.C(
-        topics=ConversationUserTopics.C(conversations=IkatConversations.C(path=topics)),
-        assessments=TrecAdhocAssessments.C(path=qrels),
-        documents=documents,
+
+    DOCUMENTS = reference(varname="documents", reference=Clueweb22)
+    QRELS = FileDownloader(
+        "qrels",
+        "https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
+        checker=HashCheck("79dc121bab25b2245e52a53263e5ad1f"),
     )
+    TOPICS = FileDownloader(
+        "topics.json",
+        "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2023/data/2023_test_topics.json",
+        checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
+    )
+
+    def config(self) -> Adhoc:
+        return Adhoc.C(
+            topics=ConversationUserTopics.C(
+                conversations=IkatConversations.C(path=self.TOPICS.path)
+            ),
+            assessments=TrecAdhocAssessments.C(path=self.QRELS.path),
+            documents=self.DOCUMENTS.prepare(),
+        )
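
All three yearly iKAT test sets now declare DOCUMENTS = reference(varname="documents", reference=Clueweb22), so the 116,838,987-passage ClueWeb22 store is defined once and shared rather than rebuilt per year; each config() materializes it with self.DOCUMENTS.prepare(). A hypothetical, datamaestro-free sketch of that shared-reference shape:

# Several datasets point at one expensive resource; the reference builds
# it at most once and hands every referrer the same instance.
class Reference:
    _cache = {}

    def __init__(self, target):
        self.target = target  # callable that builds the resource

    def prepare(self):
        if self.target not in Reference._cache:
            Reference._cache[self.target] = self.target()
        return Reference._cache[self.target]


def clueweb22_store():  # stands in for the real docstore builder
    print("building document store")  # printed once, however many referrers
    return object()


docs_2023 = Reference(clueweb22_store)
docs_2024 = Reference(clueweb22_store)
assert docs_2023.prepare() is docs_2024.prepare()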
datamaestro_text/config/com/github/prdwb/orconvqa.py

@@ -4,8 +4,8 @@ import gzip
 import json
 from pathlib import Path
 from typing import Iterator
-from datamaestro.definitions import datatasks, datatags, dataset
-from datamaestro.download.single import filedownloader
+from datamaestro.definitions import Dataset, datatasks, datatags, dataset
+from datamaestro.download.single import FileDownloader
 from datamaestro.utils import HashCheck
 
 
@@ -18,26 +18,10 @@ from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader
 
 @datatags("conversation", "context", "query")
 @datatasks("query rewriting")
-@filedownloader(
-    "train.jsonl",
-    "https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/train.txt",
-    checker=HashCheck("7513a9ef12d8b7a4471166dc4fef77b7"),
-)
-@filedownloader(
-    "dev.jsonl",
-    "https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/dev.txt",
-    checker=HashCheck("7765658995cc9ffd5eb39a400d814b20"),
-)
-@filedownloader(
-    "test.jsonl",
-    "https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/test.txt",
-    checker=HashCheck("0cf3a755f06297b9c02e7db45f8dc8be"),
-)
 @dataset(
-    Supervised,
     url="https://github.com/prdwb/orconvqa-release",
 )
-def preprocessed(train, dev, test):
+class Preprocessed(Dataset):
     """Open-Retrieval Conversational Question Answering datasets
 
     OrConvQA is an aggregation of three existing datasets:
@@ -48,11 +32,29 @@ def preprocessed(train, dev, test):
 
     Each dataset is an instance of :class:`datamaestro_text.data.conversation.OrConvQADataset`
     """
-    return {
-        "train": OrConvQADataset.C(path=train),
-        "validation": OrConvQADataset.C(path=dev),
-        "test": OrConvQADataset.C(path=test),
-    }
+
+    TRAIN = FileDownloader(
+        "train.jsonl",
+        "https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/train.txt",
+        checker=HashCheck("7513a9ef12d8b7a4471166dc4fef77b7"),
+    )
+    DEV = FileDownloader(
+        "dev.jsonl",
+        "https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/dev.txt",
+        checker=HashCheck("7765658995cc9ffd5eb39a400d814b20"),
+    )
+    TEST = FileDownloader(
+        "test.jsonl",
+        "https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/test.txt",
+        checker=HashCheck("0cf3a755f06297b9c02e7db45f8dc8be"),
+    )
+
+    def config(self) -> Supervised:
+        return Supervised.C(
+            train=OrConvQADataset.C(path=self.TRAIN.path),
+            validation=OrConvQADataset.C(path=self.DEV.path),
+            test=OrConvQADataset.C(path=self.TEST.path),
+        )
 
 
 def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
@@ -63,21 +65,10 @@ def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED
        yield OrConvQADocumentStore.NAMED_TUPLE(**data)
 
 
-@lz4docstore_downloader(
-    "all_blocks",
-    "https://ciir.cs.umass.edu/downloads/ORConvQA/all_blocks.txt.gz",
-    orConvQADocumentReader,
-    OrConvQADocumentStore.NAMED_TUPLE,
-    "id",
-    checker=HashCheck("1095a3408690e7bbe4d8a87a2bae6356"),
-    size=5_086_902_800,
-    count_hint=11_377_951,
-)
 @dataset(
-    OrConvQADocumentStore,
     url="https://github.com/prdwb/orconvqa-release",
 )
-def passages(all_blocks):
+class Passages(Dataset):
     """orConvQA wikipedia files
 
     OrConvQA is an aggregation of three existing datasets:
@@ -86,4 +77,17 @@ def passages(all_blocks):
     1. the CANARD dataset that consists of context-independent rewrites of QuAC questions, and
     3. the Wikipedia corpus that serves as the knowledge source of answering questions.
     """
-    return {"path": all_blocks, "count": 11_377_951}
+
+    ALL_BLOCKS = lz4docstore_downloader(
+        "all_blocks",
+        "https://ciir.cs.umass.edu/downloads/ORConvQA/all_blocks.txt.gz",
+        orConvQADocumentReader,
+        OrConvQADocumentStore.NAMED_TUPLE,
+        "id",
+        checker=HashCheck("1095a3408690e7bbe4d8a87a2bae6356"),
+        size=5_086_902_800,
+        count_hint=11_377_951,
+    )
+
+    def config(self) -> OrConvQADocumentStore:
+        return OrConvQADocumentStore.C(path=self.ALL_BLOCKS.path, count=11_377_951)
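
The diff only shows the yield line of orConvQADocumentReader, but together with the gzip/json imports and the all_blocks.txt.gz source it implies a reader that streams one JSON object per line into named tuples. A plausible reconstruction under those assumptions (field names are hypothetical):

# Sketch of a line-oriented gzip JSON reader like orConvQADocumentReader;
# the Block fields are invented for illustration.
import gzip
import json
from collections import namedtuple
from pathlib import Path
from typing import Iterator

Block = namedtuple("Block", ["id", "title", "text"])  # hypothetical schema


def read_blocks(source: Path) -> Iterator[Block]:
    with gzip.open(source, "rt", encoding="utf-8") as fp:
        for line in fp:  # one JSON object per line
            data = json.loads(line)
            yield Block(**data)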