datamaestro-text 2023.12.5__tar.gz → 2023.12.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.readthedocs.yml +1 -0
- {datamaestro-text-2023.12.5/src/datamaestro_text.egg-info → datamaestro-text-2023.12.12}/PKG-INFO +1 -1
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/conversation.rst +2 -0
- datamaestro-text-2023.12.12/docs/source/datasets/irds.rst +19 -0
- datamaestro-text-2023.12.12/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +92 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/__init__.py +1 -1
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/formats.py +14 -0
- datamaestro-text-2023.12.12/src/datamaestro_text/data/ir/stores.py +22 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/datasets/irds/data.py +65 -2
- datamaestro-text-2023.12.12/src/datamaestro_text/datasets/irds/helpers.py +71 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/version.py +2 -2
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12/src/datamaestro_text.egg-info}/PKG-INFO +1 -1
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text.egg-info/SOURCES.txt +3 -0
- datamaestro-text-2023.12.5/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -49
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.circleci/config.yml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.flake8 +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.github/workflows/pytest.yml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.github/workflows/python-publish.yml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.gitignore +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.pre-commit-config.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/LICENSE +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/MANIFEST.in +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/Makefile +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/README.md +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/Makefile +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/make.bat +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/requirements.txt +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/api/conversation.rst +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/api/index.rst +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/api/ir.rst +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/api/text.rst +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/conf.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/embeddings.rst +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/index.rst +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/ir.rst +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/recommendation.rst +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/text.rst +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/index.rst +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/mkdocs.yml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/pyproject.toml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/requirements-dev.txt +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/requirements.txt +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/setup.cfg +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/ai/quac.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/sentiment140.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/conversation/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/conversation/base.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/conversation/canard.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/embeddings.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/base.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/cord19.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/csv.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/data.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/huggingface.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/trec.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/utils.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/recommendation.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/tagging.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/text.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/datasets/irds/utils.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/download/tmdb.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/interfaces/plaintext.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/interfaces/trec.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/test/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/test/test_datasets.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/transforms/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/__init__.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/files.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/iter.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/randomstream.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/shuffle.py +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text.egg-info/requires.txt +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text.egg-info/top_level.txt +0 -0
- {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/tox.ini +0 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
IR Datasets
|
|
2
|
+
===========
|
|
3
|
+
|
|
4
|
+
XPMIR provides an interface to the `IR Datasets <https://ir-datasets.com/>`_ library.
|
|
5
|
+
The list below is provided as a reference, but might not be up-to-date if your
|
|
6
|
+
version of `ir-datasets` is older or newer than the one used at generation time.
|
|
7
|
+
|
|
8
|
+
Data types
|
|
9
|
+
----------
|
|
10
|
+
|
|
11
|
+
.. autoxpmconfig:: xpmir.datasets.irds.data.Topics
|
|
12
|
+
.. autoxpmconfig:: xpmir.datasets.irds.data.Documents
|
|
13
|
+
.. autoxpmconfig:: xpmir.datasets.irds.data.AdhocAssessments
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
List of datasets
|
|
17
|
+
----------------
|
|
18
|
+
|
|
19
|
+
.. dm:repository:: irds
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
+
|
|
3
|
+
from collections import namedtuple
|
|
4
|
+
import gzip
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Iterator, NamedTuple
|
|
8
|
+
import attrs
|
|
9
|
+
from datamaestro.definitions import datatasks, datatags, dataset
|
|
10
|
+
from datamaestro.download.single import filedownloader
|
|
11
|
+
from datamaestro.utils import HashCheck
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
|
|
15
|
+
from datamaestro.data.ml import Supervised
|
|
16
|
+
|
|
17
|
+
from datamaestro_text.data.ir import DocumentStore
|
|
18
|
+
from datamaestro_text.data.ir.formats import OrConvQADocument
|
|
19
|
+
from datamaestro_text.data.ir.stores import OrConvQADocumentStore
|
|
20
|
+
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
|
|
21
|
+
from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@datatags("conversation", "context", "query")
|
|
25
|
+
@datatasks("query rewriting")
|
|
26
|
+
@filedownloader(
|
|
27
|
+
"train.jsonl",
|
|
28
|
+
"https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/train.txt",
|
|
29
|
+
checker=HashCheck("7513a9ef12d8b7a4471166dc4fef77b7"),
|
|
30
|
+
)
|
|
31
|
+
@filedownloader(
|
|
32
|
+
"dev.jsonl",
|
|
33
|
+
"https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/dev.txt",
|
|
34
|
+
checker=HashCheck("7765658995cc9ffd5eb39a400d814b20"),
|
|
35
|
+
)
|
|
36
|
+
@filedownloader(
|
|
37
|
+
"test.jsonl",
|
|
38
|
+
"https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/test.txt",
|
|
39
|
+
checker=HashCheck("0cf3a755f06297b9c02e7db45f8dc8be"),
|
|
40
|
+
)
|
|
41
|
+
@dataset(
|
|
42
|
+
Supervised,
|
|
43
|
+
url="https://github.com/prdwb/orconvqa-release",
|
|
44
|
+
)
|
|
45
|
+
def preprocessed(train, dev, test):
|
|
46
|
+
"""Open-Retrieval Conversational Question Answering datasets
|
|
47
|
+
|
|
48
|
+
OrConvQA is an aggregation of three existing datasets:
|
|
49
|
+
|
|
50
|
+
1. the QuAC dataset that offers information-seeking conversations,
|
|
51
|
+
2. the CANARD dataset that consists of context-independent rewrites of QuAC questions, and
|
|
52
|
+
3. the Wikipedia corpus that serves as the knowledge source of answering questions.
|
|
53
|
+
|
|
54
|
+
Each dataset is an instance of :class:`datamaestro_text.data.conversation.OrConvQADataset`
|
|
55
|
+
"""
|
|
56
|
+
return {
|
|
57
|
+
"train": OrConvQADataset(path=train),
|
|
58
|
+
"validation": OrConvQADataset(path=dev),
|
|
59
|
+
"test": OrConvQADataset(path=test),
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
|
|
64
|
+
with gzip.open(source, "rt") as fp:
|
|
65
|
+
for line in fp:
|
|
66
|
+
yield OrConvQADocumentStore.NAMED_TUPLE(**json.loads(line))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@lz4docstore_downloader(
|
|
70
|
+
"all_blocks",
|
|
71
|
+
"https://ciir.cs.umass.edu/downloads/ORConvQA/all_blocks.txt.gz",
|
|
72
|
+
orConvQADocumentReader,
|
|
73
|
+
OrConvQADocumentStore.NAMED_TUPLE,
|
|
74
|
+
"id",
|
|
75
|
+
checker=HashCheck("1095a3408690e7bbe4d8a87a2bae6356"),
|
|
76
|
+
size=5_086_902_800,
|
|
77
|
+
count_hint=11_377_951,
|
|
78
|
+
)
|
|
79
|
+
@dataset(
|
|
80
|
+
OrConvQADocumentStore,
|
|
81
|
+
url="https://github.com/prdwb/orconvqa-release",
|
|
82
|
+
)
|
|
83
|
+
def passages(all_blocks):
|
|
84
|
+
"""orConvQA wikipedia files
|
|
85
|
+
|
|
86
|
+
OrConvQA is an aggregation of three existing datasets:
|
|
87
|
+
|
|
88
|
+
1. the QuAC dataset that offers information-seeking conversations,
|
|
89
|
+
2. the CANARD dataset that consists of context-independent rewrites of QuAC questions, and
|
|
90
|
+
3. the Wikipedia corpus that serves as the knowledge source of answering questions.
|
|
91
|
+
"""
|
|
92
|
+
return {"path": all_blocks, "count": 11_377_951}
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/__init__.py
RENAMED
|
@@ -95,7 +95,7 @@ class DocumentStore(Documents):
|
|
|
95
95
|
self, randint: Optional[Callable[[int], int]]
|
|
96
96
|
) -> Iterator[Document]:
|
|
97
97
|
"""Sample documents from the dataset"""
|
|
98
|
-
length = self.documentcount
|
|
98
|
+
length = self.documentcount
|
|
99
99
|
randint = randint or (lambda max: random.randint(0, max - 1))
|
|
100
100
|
while True:
|
|
101
101
|
yield self.document_int(randint(length))
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/formats.py
RENAMED
|
@@ -130,6 +130,20 @@ class TweetDoc(IDHolder, Document):
|
|
|
130
130
|
return f"{self.text}"
|
|
131
131
|
|
|
132
132
|
|
|
133
|
+
@define
|
|
134
|
+
class OrConvQADocument(IDHolder, Document):
|
|
135
|
+
id: str
|
|
136
|
+
title: str
|
|
137
|
+
text: str
|
|
138
|
+
aid: str
|
|
139
|
+
bid: int
|
|
140
|
+
|
|
141
|
+
has_text: ClassVar[bool] = True
|
|
142
|
+
|
|
143
|
+
def get_text(self):
|
|
144
|
+
return f"{self.title} {self.text}"
|
|
145
|
+
|
|
146
|
+
|
|
133
147
|
@define
|
|
134
148
|
class TrecTopic(GenericTopic):
|
|
135
149
|
text: str
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from collections import namedtuple
|
|
2
|
+
from typing import List
|
|
3
|
+
from experimaestro import Constant
|
|
4
|
+
import attrs
|
|
5
|
+
|
|
6
|
+
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
|
|
7
|
+
from datamaestro_text.data.ir.formats import OrConvQADocument
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class OrConvQADocumentStore(LZ4DocumentStore):
|
|
11
|
+
NAMED_TUPLE = namedtuple(
|
|
12
|
+
"OrConvQADocument", [a.name for a in attrs.fields(OrConvQADocument)]
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
lookup_field: Constant[str] = "id"
|
|
16
|
+
fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
|
|
17
|
+
index_fields: Constant[List[str]] = ["id"]
|
|
18
|
+
|
|
19
|
+
data_cls = NAMED_TUPLE
|
|
20
|
+
|
|
21
|
+
def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
|
|
22
|
+
return OrConvQADocument(**data._asdict())
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Iterator, NamedTuple, Tuple, Type, List
|
|
3
4
|
import attrs
|
|
4
5
|
import ir_datasets
|
|
6
|
+
from ir_datasets.indices import PickleLz4FullStore
|
|
5
7
|
from ir_datasets.formats import (
|
|
6
8
|
GenericDoc,
|
|
7
9
|
GenericQuery,
|
|
@@ -10,7 +12,7 @@ from ir_datasets.formats import (
|
|
|
10
12
|
TrecQuery,
|
|
11
13
|
)
|
|
12
14
|
import ir_datasets.datasets as _irds
|
|
13
|
-
from experimaestro import Config
|
|
15
|
+
from experimaestro import Config, Param
|
|
14
16
|
from experimaestro.compat import cached_property
|
|
15
17
|
from experimaestro import Option
|
|
16
18
|
import datamaestro_text.data.ir as ir
|
|
@@ -208,6 +210,67 @@ if hasattr(_irds, "miracl"):
|
|
|
208
210
|
)
|
|
209
211
|
|
|
210
212
|
|
|
213
|
+
# Fix while PR https://github.com/allenai/ir_datasets/pull/252
|
|
214
|
+
# has not been merged.
|
|
215
|
+
class DMPickleLz4FullStore(PickleLz4FullStore):
|
|
216
|
+
def get_many(self, doc_ids, field=None):
|
|
217
|
+
result = {}
|
|
218
|
+
field_idx = self._doc_cls._fields.index(field) if field is not None else None
|
|
219
|
+
for doc in self.get_many_iter(doc_ids):
|
|
220
|
+
if field is not None:
|
|
221
|
+
result[getattr(doc, self._id_field)] = doc[field_idx]
|
|
222
|
+
else:
|
|
223
|
+
result[getattr(doc, self._id_field)] = doc
|
|
224
|
+
return result
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
class LZ4DocumentStore(ir.DocumentStore):
|
|
228
|
+
"""A LZ4-based document store"""
|
|
229
|
+
|
|
230
|
+
path: Param[Path]
|
|
231
|
+
|
|
232
|
+
#: Lookup field
|
|
233
|
+
lookup_field: Param[str]
|
|
234
|
+
|
|
235
|
+
# Extra indexed fields (e.g. URLs)
|
|
236
|
+
index_fields: List[str]
|
|
237
|
+
|
|
238
|
+
@cached_property
|
|
239
|
+
def store(self):
|
|
240
|
+
return DMPickleLz4FullStore(
|
|
241
|
+
self.path, None, self.data_cls, self.lookup_field, self.index_fields
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
@cached_property
|
|
245
|
+
def _docs(self):
|
|
246
|
+
return self.store.__iter__()
|
|
247
|
+
|
|
248
|
+
def docid_internal2external(self, ix: int):
|
|
249
|
+
return getattr(self._docs[ix], self.store._id_field)
|
|
250
|
+
|
|
251
|
+
def document_ext(self, docid: str) -> Document:
|
|
252
|
+
return self.converter(self.store.get(docid))
|
|
253
|
+
|
|
254
|
+
def documents_ext(self, docids: List[str]) -> Document:
|
|
255
|
+
"""Returns documents given their external IDs (optimized for batch)"""
|
|
256
|
+
retrieved = self.store.get_many(docids)
|
|
257
|
+
return [self.converter(retrieved[docid]) for docid in docids]
|
|
258
|
+
|
|
259
|
+
def converter(self, data):
|
|
260
|
+
"""Converts a document from LZ4 tuples to any other format"""
|
|
261
|
+
# By default, use identity
|
|
262
|
+
return data
|
|
263
|
+
|
|
264
|
+
def iter(self) -> Iterator[Document]:
|
|
265
|
+
"""Returns an iterator over documents"""
|
|
266
|
+
return map(self.converter, self.store.__iter__())
|
|
267
|
+
|
|
268
|
+
def documentcount(self):
|
|
269
|
+
if self.count:
|
|
270
|
+
return self.count
|
|
271
|
+
return self.store.count()
|
|
272
|
+
|
|
273
|
+
|
|
211
274
|
@attrs.define()
|
|
212
275
|
class IRDSQueryWrapper(ir.Topic):
|
|
213
276
|
query: Any
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Optional, Type, Callable, Iterator
|
|
3
|
+
from ir_datasets.indices import PickleLz4FullStore
|
|
4
|
+
from datamaestro.download import Download
|
|
5
|
+
from datamaestro.utils import FileChecker
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import urllib3
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class lz4docstore_downloader(Download):
|
|
11
|
+
"""Uses ir_datasets Lz4FullStore to build a document store for a stream of documents"""
|
|
12
|
+
|
|
13
|
+
def __init__(
|
|
14
|
+
self,
|
|
15
|
+
varname: str,
|
|
16
|
+
url: str,
|
|
17
|
+
iter_factory: Callable[[Path], Iterator],
|
|
18
|
+
doc_cls: Type,
|
|
19
|
+
lookup_field: str,
|
|
20
|
+
*,
|
|
21
|
+
count_hint: Optional[int] = None,
|
|
22
|
+
size: Optional[int] = None,
|
|
23
|
+
checker: FileChecker = None,
|
|
24
|
+
):
|
|
25
|
+
super().__init__(varname)
|
|
26
|
+
self.iter_factory = iter_factory
|
|
27
|
+
self.url = url
|
|
28
|
+
self.doc_cls = doc_cls
|
|
29
|
+
self.size = size
|
|
30
|
+
self.lookup_field = lookup_field
|
|
31
|
+
self.count_hint = count_hint
|
|
32
|
+
self.checker = checker
|
|
33
|
+
|
|
34
|
+
p = urllib3.util.parse_url(self.url)
|
|
35
|
+
assert p is not None
|
|
36
|
+
self.name = Path(p.path).with_suffix("").name
|
|
37
|
+
|
|
38
|
+
def prepare(self):
|
|
39
|
+
return self.definition.datapath / self.name
|
|
40
|
+
|
|
41
|
+
def download(self, force=False):
|
|
42
|
+
# Creates directory if needed
|
|
43
|
+
destination = self.definition.datapath / self.name
|
|
44
|
+
destination.mkdir(exist_ok=True)
|
|
45
|
+
|
|
46
|
+
# Early exit
|
|
47
|
+
if (destination / "done").is_file() and not force:
|
|
48
|
+
return True
|
|
49
|
+
|
|
50
|
+
# Download (cache)
|
|
51
|
+
logging.info("Building the document index")
|
|
52
|
+
with self.context.downloadURL(self.url, size=self.size) as file:
|
|
53
|
+
# Checks the file
|
|
54
|
+
if self.checker:
|
|
55
|
+
self.checker.check(file.path)
|
|
56
|
+
|
|
57
|
+
# Builds the LZ4 store
|
|
58
|
+
store = PickleLz4FullStore(
|
|
59
|
+
destination,
|
|
60
|
+
lambda: self.iter_factory(Path(file.path)),
|
|
61
|
+
self.doc_cls,
|
|
62
|
+
lookup_field=self.lookup_field,
|
|
63
|
+
index_fields=[self.lookup_field],
|
|
64
|
+
key_field_prefix=None,
|
|
65
|
+
size_hint=None,
|
|
66
|
+
count_hint=self.count_hint,
|
|
67
|
+
)
|
|
68
|
+
store.build()
|
|
69
|
+
|
|
70
|
+
# All good!
|
|
71
|
+
(destination / "done").touch()
|
|
@@ -12,5 +12,5 @@ __version__: str
|
|
|
12
12
|
__version_tuple__: VERSION_TUPLE
|
|
13
13
|
version_tuple: VERSION_TUPLE
|
|
14
14
|
|
|
15
|
-
__version__ = version = '2023.12.5'
|
|
16
|
-
__version_tuple__ = version_tuple = (2023, 12, 5)
|
|
15
|
+
__version__ = version = '2023.12.12'
|
|
16
|
+
__version_tuple__ = version_tuple = (2023, 12, 12)
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text.egg-info/SOURCES.txt
RENAMED
|
@@ -27,6 +27,7 @@ docs/source/datasets/conversation.rst
|
|
|
27
27
|
docs/source/datasets/embeddings.rst
|
|
28
28
|
docs/source/datasets/index.rst
|
|
29
29
|
docs/source/datasets/ir.rst
|
|
30
|
+
docs/source/datasets/irds.rst
|
|
30
31
|
docs/source/datasets/recommendation.rst
|
|
31
32
|
docs/source/datasets/text.rst
|
|
32
33
|
src/datamaestro_text/__init__.py
|
|
@@ -91,11 +92,13 @@ src/datamaestro_text/data/ir/csv.py
|
|
|
91
92
|
src/datamaestro_text/data/ir/data.py
|
|
92
93
|
src/datamaestro_text/data/ir/formats.py
|
|
93
94
|
src/datamaestro_text/data/ir/huggingface.py
|
|
95
|
+
src/datamaestro_text/data/ir/stores.py
|
|
94
96
|
src/datamaestro_text/data/ir/trec.py
|
|
95
97
|
src/datamaestro_text/data/ir/utils.py
|
|
96
98
|
src/datamaestro_text/datasets/irds/__init__.py
|
|
97
99
|
src/datamaestro_text/datasets/irds/data.py
|
|
98
100
|
src/datamaestro_text/datasets/irds/datasets.py
|
|
101
|
+
src/datamaestro_text/datasets/irds/helpers.py
|
|
99
102
|
src/datamaestro_text/datasets/irds/utils.py
|
|
100
103
|
src/datamaestro_text/download/tmdb.py
|
|
101
104
|
src/datamaestro_text/interfaces/plaintext.py
|
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
-
|
|
3
|
-
from datamaestro.definitions import datatasks, datatags, dataset
|
|
4
|
-
from datamaestro.download.single import filedownloader
|
|
5
|
-
from datamaestro.utils import HashCheck
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
|
|
9
|
-
from datamaestro.data.ml import Supervised
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@datatags("conversation", "context", "query")
|
|
13
|
-
@datatasks("query rewriting")
|
|
14
|
-
@filedownloader(
|
|
15
|
-
"train.jsonl",
|
|
16
|
-
"https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/train.txt",
|
|
17
|
-
checker=HashCheck("7513a9ef12d8b7a4471166dc4fef77b7"),
|
|
18
|
-
)
|
|
19
|
-
@filedownloader(
|
|
20
|
-
"dev.jsonl",
|
|
21
|
-
"https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/dev.txt",
|
|
22
|
-
checker=HashCheck("7765658995cc9ffd5eb39a400d814b20"),
|
|
23
|
-
)
|
|
24
|
-
@filedownloader(
|
|
25
|
-
"test.jsonl",
|
|
26
|
-
"https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/test.txt",
|
|
27
|
-
checker=HashCheck("0cf3a755f06297b9c02e7db45f8dc8be"),
|
|
28
|
-
)
|
|
29
|
-
@dataset(
|
|
30
|
-
Supervised,
|
|
31
|
-
url="https://github.com/prdwb/orconvqa-release",
|
|
32
|
-
)
|
|
33
|
-
def preprocessed(train, dev, test):
|
|
34
|
-
"""Question-in-context rewriting
|
|
35
|
-
|
|
36
|
-
CANARD is a dataset for question-in-context rewriting that consists of
|
|
37
|
-
questions each given in a dialog context together with a context-independent
|
|
38
|
-
rewriting of the question. The context of each question is the dialog
|
|
39
|
-
utterances that precede the question. CANARD can be used to evaluate
|
|
40
|
-
question rewriting models that handle important linguistic phenomena such as
|
|
41
|
-
co-reference and ellipsis resolution.
|
|
42
|
-
|
|
43
|
-
Each dataset is an instance of :class:`datamaestro_text.data.conversation.OrConvQADataset`
|
|
44
|
-
"""
|
|
45
|
-
return {
|
|
46
|
-
"train": OrConvQADataset(path=train),
|
|
47
|
-
"validation": OrConvQADataset(path=dev),
|
|
48
|
-
"test": OrConvQADataset(path=test),
|
|
49
|
-
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.github/workflows/python-publish.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/embeddings.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/recommendation.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/__init__.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/ai/quac.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/embeddings.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/base.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/cord19.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/csv.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/data.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/trec.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/tagging.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/text.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/download/tmdb.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/interfaces/trec.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/test/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/__init__.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/files.py
RENAMED
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/iter.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/shuffle.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|