datamaestro-text 2024.3.10__tar.gz → 2024.5.31__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text-2024.5.31/.pre-commit-config.yaml +19 -0
- {datamaestro-text-2024.3.10/src/datamaestro_text.egg-info → datamaestro_text-2024.5.31}/PKG-INFO +2 -2
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/api/conversation.rst +8 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/datasets/conversation.rst +2 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/datasets/irds.rst +4 -3
- datamaestro_text-2024.5.31/requirements.txt +3 -0
- datamaestro_text-2024.5.31/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +37 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +4 -7
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/conversation/base.py +76 -10
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/conversation/orconvqa.py +12 -2
- datamaestro_text-2024.5.31/src/datamaestro_text/data/conversation/qrecc.py +99 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/__init__.py +3 -2
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/stores.py +2 -1
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/datasets/irds/data.py +118 -23
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/version.py +2 -2
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31/src/datamaestro_text.egg-info}/PKG-INFO +2 -2
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text.egg-info/SOURCES.txt +2 -0
- datamaestro_text-2024.5.31/src/datamaestro_text.egg-info/requires.txt +3 -0
- datamaestro-text-2024.3.10/.pre-commit-config.yaml +0 -11
- datamaestro-text-2024.3.10/requirements.txt +0 -3
- datamaestro-text-2024.3.10/src/datamaestro_text.egg-info/requires.txt +0 -3
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/.circleci/config.yml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/.flake8 +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/.github/workflows/pytest.yml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/.github/workflows/python-publish.yml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/.gitignore +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/.readthedocs.yml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/LICENSE +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/MANIFEST.in +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/Makefile +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/README.md +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/Makefile +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/make.bat +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/requirements.txt +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/api/embeddings.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/api/index.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/api/ir.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/api/nlp.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/api/recommendation.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/api/text.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/conf.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/datasets/embeddings.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/datasets/index.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/datasets/ir.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/datasets/recommendation.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/datasets/text.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/index.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/mkdocs.yml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/pyproject.toml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/requirements-dev.txt +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/setup.cfg +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/ai/quac.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/com/sentiment140.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/gov/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/conversation/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/conversation/canard.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/embeddings.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/base.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/cord19.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/csv.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/data.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/formats.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/huggingface.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/trec.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/utils.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/recommendation.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/tagging.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/text.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/datasets/irds/helpers.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/datasets/irds/utils.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/download/tmdb.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/interfaces/plaintext.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/interfaces/trec.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/test/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/test/test_datasets.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/test/test_documented.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/transforms/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/utils/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/utils/files.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/utils/iter.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/utils/randomstream.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/utils/shuffle.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text.egg-info/top_level.txt +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/tox.ini +0 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- hooks:
|
|
3
|
+
- id: check-yaml
|
|
4
|
+
- id: end-of-file-fixer
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
repo: https://github.com/pre-commit/pre-commit-hooks
|
|
7
|
+
rev: v4.4.0
|
|
8
|
+
- hooks:
|
|
9
|
+
- exclude: ^src/experimaestro/server/data
|
|
10
|
+
id: black
|
|
11
|
+
repo: https://github.com/psf/black
|
|
12
|
+
rev: 23.1.0
|
|
13
|
+
- hooks:
|
|
14
|
+
- additional_dependencies:
|
|
15
|
+
- flake8-print
|
|
16
|
+
- flake8-fixme
|
|
17
|
+
id: flake8
|
|
18
|
+
repo: https://github.com/pycqa/flake8
|
|
19
|
+
rev: 6.0.0
|
{datamaestro-text-2024.3.10/src/datamaestro_text.egg-info → datamaestro_text-2024.5.31}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version: 2024.3.10
|
|
3
|
+
Version: 2024.5.31
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
6
|
License: GPL-3
|
|
@@ -18,7 +18,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
18
18
|
Requires-Python: >=3.8
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
|
-
Requires-Dist: datamaestro>=1.1.
|
|
21
|
+
Requires-Dist: datamaestro>=1.1.1
|
|
22
22
|
Requires-Dist: ir_datasets
|
|
23
23
|
Requires-Dist: attrs
|
|
24
24
|
|
|
@@ -45,3 +45,11 @@ Contextual query reformulation
|
|
|
45
45
|
|
|
46
46
|
.. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
|
|
47
47
|
:members:
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
.. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
|
|
52
|
+
:members:
|
|
53
|
+
|
|
54
|
+
.. autoxpmconfig:: datamaestro_text.data.conversation.qrecc.QReCCDataset
|
|
55
|
+
:members: iter
|
|
@@ -8,9 +8,10 @@ version of `ir-datasets` is more ancient or newer than the one used at generatio
|
|
|
8
8
|
Data types
|
|
9
9
|
----------
|
|
10
10
|
|
|
11
|
-
.. autoxpmconfig::
|
|
12
|
-
.. autoxpmconfig::
|
|
13
|
-
.. autoxpmconfig::
|
|
11
|
+
.. autoxpmconfig:: datamaestro_text.datasets.irds.data.Topics
|
|
12
|
+
.. autoxpmconfig:: datamaestro_text.datasets.irds.data.Documents
|
|
13
|
+
.. autoxpmconfig:: datamaestro_text.datasets.irds.data.AdhocAssessments
|
|
14
|
+
.. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
List of datasets
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from datamaestro.definitions import datatasks, datatags, dataset
|
|
5
|
+
from datamaestro.data.ml import Supervised
|
|
6
|
+
from datamaestro.download.archive import zipdownloader
|
|
7
|
+
from datamaestro.utils import HashCheck
|
|
8
|
+
from datamaestro_text.data.conversation.qrecc import QReCCDataset
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@datatags("conversation", "context", "query")
|
|
12
|
+
@datatasks("query rewriting")
|
|
13
|
+
@zipdownloader(
|
|
14
|
+
"data",
|
|
15
|
+
"https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
|
|
16
|
+
checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
|
|
17
|
+
)
|
|
18
|
+
@dataset(
|
|
19
|
+
Supervised[QReCCDataset, None, QReCCDataset],
|
|
20
|
+
url="https://github.com/apple/ml-qrecc",
|
|
21
|
+
doi="https://doi.org/10.48550/arXiv.2010.04898",
|
|
22
|
+
id="",
|
|
23
|
+
)
|
|
24
|
+
def main(data: Path):
|
|
25
|
+
"""Open-Domain Question Answering Goes Conversational via Question Rewriting
|
|
26
|
+
|
|
27
|
+
We introduce QReCC (Question Rewriting in Conversational Context), an
|
|
28
|
+
end-to-end open-domain question answering dataset comprising of 14K
|
|
29
|
+
conversations with 81K question-answer pairs. The goal of this dataset is to
|
|
30
|
+
provide a challenging benchmark for end-to-end conversational question
|
|
31
|
+
answering that includes the individual subtasks of question rewriting,
|
|
32
|
+
passage retrieval and reading comprehension
|
|
33
|
+
"""
|
|
34
|
+
return {
|
|
35
|
+
"train": QReCCDataset(path=data / "qrecc_train.json"),
|
|
36
|
+
"test": QReCCDataset(path=data / "qrecc_test.json"),
|
|
37
|
+
}
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
# See documentation on https://datamaestro.readthedocs.io
|
|
2
2
|
|
|
3
|
-
from collections import namedtuple
|
|
4
3
|
import gzip
|
|
5
4
|
import json
|
|
6
5
|
from pathlib import Path
|
|
7
|
-
from typing import Iterator
|
|
8
|
-
import attrs
|
|
6
|
+
from typing import Iterator
|
|
9
7
|
from datamaestro.definitions import datatasks, datatags, dataset
|
|
10
8
|
from datamaestro.download.single import filedownloader
|
|
11
9
|
from datamaestro.utils import HashCheck
|
|
@@ -14,10 +12,7 @@ from datamaestro.utils import HashCheck
|
|
|
14
12
|
from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
|
|
15
13
|
from datamaestro.data.ml import Supervised
|
|
16
14
|
|
|
17
|
-
from datamaestro_text.data.ir import DocumentStore
|
|
18
|
-
from datamaestro_text.data.ir.formats import OrConvQADocument
|
|
19
15
|
from datamaestro_text.data.ir.stores import OrConvQADocumentStore
|
|
20
|
-
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
|
|
21
16
|
from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader
|
|
22
17
|
|
|
23
18
|
|
|
@@ -63,7 +58,9 @@ def preprocessed(train, dev, test):
|
|
|
63
58
|
def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
|
|
64
59
|
with gzip.open(source, "rt") as fp:
|
|
65
60
|
for line in fp:
|
|
66
|
-
|
|
61
|
+
data = json.loads(line)
|
|
62
|
+
data["body"] = data.pop("text")
|
|
63
|
+
yield OrConvQADocumentStore.NAMED_TUPLE(**data)
|
|
67
64
|
|
|
68
65
|
|
|
69
66
|
@lz4docstore_downloader(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from enum import Enum
|
|
3
|
-
from typing import Dict, Generic, Iterator, List, Optional, Sequence
|
|
3
|
+
from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
|
|
4
4
|
from attr import define
|
|
5
5
|
from datamaestro.data import Base
|
|
6
6
|
from datamaestro.record import Record, Item
|
|
@@ -61,6 +61,20 @@ class AnswerEntry(Item):
|
|
|
61
61
|
"""The system answer"""
|
|
62
62
|
|
|
63
63
|
|
|
64
|
+
@define
|
|
65
|
+
class AnswerDocumentID(Item):
|
|
66
|
+
"""An answer as a document ID"""
|
|
67
|
+
|
|
68
|
+
document_id: str
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@define
|
|
72
|
+
class AnswerDocumentURL(Item):
|
|
73
|
+
"""An answer as a document ID"""
|
|
74
|
+
|
|
75
|
+
url: str
|
|
76
|
+
|
|
77
|
+
|
|
64
78
|
@define
|
|
65
79
|
class RetrievedEntry(Item):
|
|
66
80
|
"""List of system-retrieved documents and their relevance"""
|
|
@@ -68,8 +82,8 @@ class RetrievedEntry(Item):
|
|
|
68
82
|
documents: List[str]
|
|
69
83
|
"""List of retrieved documents"""
|
|
70
84
|
|
|
71
|
-
|
|
72
|
-
"""List of
|
|
85
|
+
relevant_documents: Optional[Dict[int, Tuple[Optional[int], Optional[int]]]] = None
|
|
86
|
+
"""List of relevance status (optional), with start/stop position"""
|
|
73
87
|
|
|
74
88
|
|
|
75
89
|
@define
|
|
@@ -95,56 +109,99 @@ class ConversationHistoryItem(Item):
|
|
|
95
109
|
|
|
96
110
|
|
|
97
111
|
class ConversationNode:
|
|
112
|
+
@abstractmethod
|
|
98
113
|
def entry(self) -> Record:
|
|
99
114
|
"""The current conversation entry"""
|
|
100
115
|
...
|
|
101
116
|
|
|
117
|
+
@abstractmethod
|
|
102
118
|
def history(self) -> ConversationHistory:
|
|
103
119
|
"""Preceding conversation entries, from most recent to more ancient"""
|
|
104
120
|
...
|
|
105
121
|
|
|
122
|
+
@abstractmethod
|
|
123
|
+
def parent(self) -> Optional["ConversationNode"]:
|
|
124
|
+
...
|
|
125
|
+
|
|
126
|
+
@abstractmethod
|
|
127
|
+
def children(self) -> List["ConversationNode"]:
|
|
128
|
+
...
|
|
129
|
+
|
|
106
130
|
|
|
107
|
-
class ConversationTree:
|
|
131
|
+
class ConversationTree(ABC):
|
|
132
|
+
@abstractmethod
|
|
133
|
+
def root(self) -> ConversationNode:
|
|
134
|
+
...
|
|
135
|
+
|
|
136
|
+
@abstractmethod
|
|
108
137
|
def __iter__(self) -> Iterator[ConversationNode]:
|
|
109
138
|
"""Iterates over conversation nodes"""
|
|
110
|
-
|
|
139
|
+
...
|
|
111
140
|
|
|
112
141
|
|
|
113
142
|
# ---- A conversation tree
|
|
114
143
|
|
|
115
144
|
|
|
116
|
-
class SingleConversationTree(ConversationTree):
|
|
145
|
+
class SingleConversationTree(ConversationTree, ABC):
|
|
117
146
|
"""Simple conversations, based on a sequence of entries"""
|
|
118
147
|
|
|
119
148
|
id: str
|
|
120
|
-
history:
|
|
149
|
+
history: List[Record]
|
|
121
150
|
|
|
122
151
|
def __init__(self, id: Optional[str], history: List[Record]):
|
|
123
152
|
"""Create a simple conversation
|
|
124
153
|
|
|
125
|
-
:param history: The entries, in reverse order (i.e. more ancient first)
|
|
154
|
+
:param history: The entries, in **reverse** order (i.e. more ancient first)
|
|
126
155
|
"""
|
|
127
156
|
self.history = history or []
|
|
157
|
+
self.id = id
|
|
128
158
|
|
|
129
159
|
def add(self, entry: Record):
|
|
130
160
|
self.history.insert(0, entry)
|
|
131
161
|
|
|
132
162
|
def __iter__(self) -> Iterator[ConversationNode]:
|
|
133
|
-
|
|
163
|
+
"""Iterates over the conversation (starting with the beginning)"""
|
|
164
|
+
for ix in reversed(range(len(self.history))):
|
|
134
165
|
yield SingleConversationTreeNode(self, ix)
|
|
135
166
|
|
|
167
|
+
def root(self):
|
|
168
|
+
return SingleConversationTreeNode(self, len(self.history) - 1)
|
|
169
|
+
|
|
136
170
|
|
|
137
171
|
@define
|
|
138
172
|
class SingleConversationTreeNode(ConversationNode):
|
|
139
173
|
tree: SingleConversationTree
|
|
140
174
|
index: int
|
|
141
175
|
|
|
176
|
+
@property
|
|
142
177
|
def entry(self) -> Record:
|
|
143
178
|
return self.tree.history[self.index]
|
|
144
179
|
|
|
180
|
+
@entry.setter
|
|
181
|
+
def entry(self, record: Record):
|
|
182
|
+
try:
|
|
183
|
+
self.tree.history[self.index] = record
|
|
184
|
+
except Exception as e:
|
|
185
|
+
print(e)
|
|
186
|
+
raise
|
|
187
|
+
|
|
145
188
|
def history(self) -> Sequence[Record]:
|
|
146
189
|
return self.tree.history[self.index + 1 :]
|
|
147
190
|
|
|
191
|
+
def parent(self) -> ConversationNode | None:
|
|
192
|
+
return (
|
|
193
|
+
SingleConversationTreeNode(self.tree, self.index + 1)
|
|
194
|
+
if self.index < len(self.tree.history) - 1
|
|
195
|
+
else []
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
def children(self) -> List[ConversationNode]:
|
|
199
|
+
return (
|
|
200
|
+
[SingleConversationTreeNode(self.tree, self.index - 1)]
|
|
201
|
+
if self.index > 0
|
|
202
|
+
else []
|
|
203
|
+
)
|
|
204
|
+
|
|
148
205
|
|
|
149
206
|
class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
150
207
|
"""A conversation tree node"""
|
|
@@ -178,6 +235,15 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
178
235
|
for child in self.children:
|
|
179
236
|
yield from child
|
|
180
237
|
|
|
238
|
+
def parent(self) -> ConversationNode | None:
|
|
239
|
+
return self.parent
|
|
240
|
+
|
|
241
|
+
def children(self) -> List[ConversationNode]:
|
|
242
|
+
return self.children
|
|
243
|
+
|
|
244
|
+
def root(self):
|
|
245
|
+
return self
|
|
246
|
+
|
|
181
247
|
|
|
182
248
|
class ConversationDataset(Base, ABC):
|
|
183
249
|
"""A dataset made of conversations"""
|
|
@@ -186,4 +252,4 @@ class ConversationDataset(Base, ABC):
|
|
|
186
252
|
def __iter__(self) -> Iterator[ConversationTree]:
|
|
187
253
|
"""Return an iterator over conversations"""
|
|
188
254
|
for i in range(len(self)):
|
|
189
|
-
|
|
255
|
+
yield self.get(i)
|
|
@@ -102,16 +102,26 @@ class OrConvQADataset(ConversationDataset, File):
|
|
|
102
102
|
# Add to current
|
|
103
103
|
history.append(
|
|
104
104
|
Record(
|
|
105
|
-
IDItem(
|
|
105
|
+
IDItem(entry.query_id),
|
|
106
106
|
SimpleTextItem(entry.query),
|
|
107
107
|
SimpleDecontextualizedItem(entry.rewrite),
|
|
108
108
|
EntryType.USER_QUERY,
|
|
109
109
|
)
|
|
110
110
|
)
|
|
111
|
+
|
|
112
|
+
relevances = {}
|
|
113
|
+
for rank, relevance in enumerate(entry.retrieval_labels):
|
|
114
|
+
if relevance > 0:
|
|
115
|
+
relevances[rank] = (entry.answer.answer_start, None)
|
|
116
|
+
|
|
117
|
+
assert (
|
|
118
|
+
len(relevances) <= 1
|
|
119
|
+
), f"Too many relevance labels ({len(relevances)}) for {entry.query_id}"
|
|
120
|
+
|
|
111
121
|
history.append(
|
|
112
122
|
Record(
|
|
113
123
|
AnswerEntry(entry.answer.text),
|
|
114
|
-
RetrievedEntry(entry.evidences,
|
|
124
|
+
RetrievedEntry(entry.evidences, relevances),
|
|
115
125
|
EntryType.SYSTEM_ANSWER,
|
|
116
126
|
)
|
|
117
127
|
)
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from typing import Iterator, List, Optional
|
|
2
|
+
from attr import define
|
|
3
|
+
import json
|
|
4
|
+
from datamaestro.data import File
|
|
5
|
+
from datamaestro.record import Record
|
|
6
|
+
|
|
7
|
+
from datamaestro_text.data.ir.base import (
|
|
8
|
+
IDItem,
|
|
9
|
+
SimpleTextItem,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
from .base import (
|
|
14
|
+
AnswerDocumentURL,
|
|
15
|
+
AnswerEntry,
|
|
16
|
+
ConversationTree,
|
|
17
|
+
EntryType,
|
|
18
|
+
SimpleDecontextualizedItem,
|
|
19
|
+
SingleConversationTree,
|
|
20
|
+
)
|
|
21
|
+
from . import ConversationDataset
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@define(kw_only=True)
|
|
25
|
+
class QReCCDatasetEntry:
|
|
26
|
+
"""A query with past history"""
|
|
27
|
+
|
|
28
|
+
conversation_no: int
|
|
29
|
+
"""Conversation ID"""
|
|
30
|
+
|
|
31
|
+
turn_no: int
|
|
32
|
+
"""The turn in the conversation"""
|
|
33
|
+
|
|
34
|
+
conversation_source: str
|
|
35
|
+
"""Conversation source"""
|
|
36
|
+
|
|
37
|
+
question: str
|
|
38
|
+
"""The last issued query"""
|
|
39
|
+
|
|
40
|
+
rewrite: str
|
|
41
|
+
"""Manually rewritten query"""
|
|
42
|
+
|
|
43
|
+
context: List[str]
|
|
44
|
+
"""The list of queries asked by the user"""
|
|
45
|
+
|
|
46
|
+
answer: str
|
|
47
|
+
"""The answer"""
|
|
48
|
+
|
|
49
|
+
answer_url: str
|
|
50
|
+
"""The URL containing the answer"""
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class QReCCDataset(ConversationDataset, File):
|
|
54
|
+
def entries(self) -> Iterator[QReCCDatasetEntry]:
|
|
55
|
+
"""Iterates over re-written query with their context"""
|
|
56
|
+
with self.path.open("rt") as fp:
|
|
57
|
+
data = json.load(fp)
|
|
58
|
+
|
|
59
|
+
data = [
|
|
60
|
+
QReCCDatasetEntry(**{key.lower(): value for key, value in entry.items()})
|
|
61
|
+
for entry in data
|
|
62
|
+
]
|
|
63
|
+
return iter(data)
|
|
64
|
+
|
|
65
|
+
def __iter__(self) -> Iterator[ConversationTree]:
|
|
66
|
+
history: List[Record] = []
|
|
67
|
+
current_id: Optional[str] = None
|
|
68
|
+
|
|
69
|
+
for entry in self.entries():
|
|
70
|
+
# Creates a new conversation if needed
|
|
71
|
+
if entry.conversation_no != current_id:
|
|
72
|
+
if current_id is not None:
|
|
73
|
+
history.reverse()
|
|
74
|
+
yield SingleConversationTree(current_id, history)
|
|
75
|
+
|
|
76
|
+
current_id = entry.conversation_no
|
|
77
|
+
history = []
|
|
78
|
+
|
|
79
|
+
# Add to current
|
|
80
|
+
history.append(
|
|
81
|
+
Record(
|
|
82
|
+
IDItem(f"{entry.conversation_no}#{entry.turn_no}"),
|
|
83
|
+
SimpleTextItem(entry.question),
|
|
84
|
+
AnswerDocumentURL(entry.answer_url),
|
|
85
|
+
SimpleDecontextualizedItem(entry.rewrite),
|
|
86
|
+
EntryType.USER_QUERY,
|
|
87
|
+
)
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
history.append(
|
|
91
|
+
Record(
|
|
92
|
+
AnswerEntry(entry.answer),
|
|
93
|
+
EntryType.SYSTEM_ANSWER,
|
|
94
|
+
)
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# Yields the last one
|
|
98
|
+
history.reverse()
|
|
99
|
+
yield SingleConversationTree(current_id, history)
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/__init__.py
RENAMED
|
@@ -25,6 +25,7 @@ from .base import ( # noqa: F401
|
|
|
25
25
|
create_record,
|
|
26
26
|
# Other things
|
|
27
27
|
AdhocAssessment,
|
|
28
|
+
AdhocAssessedTopic,
|
|
28
29
|
)
|
|
29
30
|
|
|
30
31
|
|
|
@@ -83,7 +84,7 @@ class DocumentStore(Documents):
|
|
|
83
84
|
def document_int(self, internal_docid: int) -> DocumentRecord:
|
|
84
85
|
"""Returns a document given its internal ID"""
|
|
85
86
|
docid = self.docid_internal2external(internal_docid)
|
|
86
|
-
return self.
|
|
87
|
+
return self.document_ext(docid)
|
|
87
88
|
|
|
88
89
|
def document_ext(self, docid: str) -> DocumentRecord:
|
|
89
90
|
"""Returns a document given its external ID"""
|
|
@@ -159,7 +160,7 @@ class TopicsStore(Topics):
|
|
|
159
160
|
class AdhocAssessments(Base, ABC):
|
|
160
161
|
"""Ad-hoc assessments (qrels)"""
|
|
161
162
|
|
|
162
|
-
def iter(self) -> Iterator[
|
|
163
|
+
def iter(self) -> Iterator[AdhocAssessedTopic]:
|
|
163
164
|
"""Returns an iterator over assessments"""
|
|
164
165
|
raise NotImplementedError(f"For class {self.__class__}")
|
|
165
166
|
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/stores.py
RENAMED
|
@@ -3,6 +3,7 @@ from typing import List
|
|
|
3
3
|
from experimaestro import Constant
|
|
4
4
|
import attrs
|
|
5
5
|
|
|
6
|
+
from datamaestro.record import Record
|
|
6
7
|
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
|
|
7
8
|
from datamaestro_text.data.ir.formats import OrConvQADocument
|
|
8
9
|
|
|
@@ -19,4 +20,4 @@ class OrConvQADocumentStore(LZ4DocumentStore):
|
|
|
19
20
|
data_cls = NAMED_TUPLE
|
|
20
21
|
|
|
21
22
|
def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
|
|
22
|
-
return OrConvQADocument(**data._asdict())
|
|
23
|
+
return Record(OrConvQADocument(**data._asdict()))
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/datasets/irds/data.py
RENAMED
|
@@ -2,7 +2,7 @@ from abc import ABC, abstractmethod
|
|
|
2
2
|
from functools import partial
|
|
3
3
|
import logging
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Iterator, Tuple, Type, List
|
|
5
|
+
from typing import Dict, Iterator, Tuple, Type, List
|
|
6
6
|
import ir_datasets
|
|
7
7
|
from ir_datasets.indices import PickleLz4FullStore
|
|
8
8
|
from ir_datasets.formats import (
|
|
@@ -17,6 +17,7 @@ from experimaestro import Config, Param
|
|
|
17
17
|
from experimaestro.compat import cached_property
|
|
18
18
|
from experimaestro import Option
|
|
19
19
|
from datamaestro.record import RecordType, record_type
|
|
20
|
+
from datamaestro_text.data.conversation.base import AnswerEntry
|
|
20
21
|
import datamaestro_text.data.ir as ir
|
|
21
22
|
from datamaestro_text.data.ir.base import (
|
|
22
23
|
Record,
|
|
@@ -254,15 +255,12 @@ class LZ4DocumentStore(ir.DocumentStore):
|
|
|
254
255
|
return getattr(self._docs[ix], self.store._id_field)
|
|
255
256
|
|
|
256
257
|
def document_ext(self, docid: str) -> DocumentRecord:
|
|
257
|
-
return self.converter(self.
|
|
258
|
+
return self.converter(self.store.get(docid))
|
|
258
259
|
|
|
259
260
|
def documents_ext(self, docids: List[str]) -> DocumentRecord:
|
|
260
261
|
"""Returns documents given their external IDs (optimized for batch)"""
|
|
261
262
|
retrieved = self.store.get_many(docids)
|
|
262
|
-
return [
|
|
263
|
-
self.converter(self.document_recordtype, retrieved[docid])
|
|
264
|
-
for docid in docids
|
|
265
|
-
]
|
|
263
|
+
return [self.converter(retrieved[docid]) for docid in docids]
|
|
266
264
|
|
|
267
265
|
def converter(self, data):
|
|
268
266
|
"""Converts a document from LZ4 tuples to any other format"""
|
|
@@ -271,10 +269,9 @@ class LZ4DocumentStore(ir.DocumentStore):
|
|
|
271
269
|
|
|
272
270
|
def iter(self) -> Iterator[DocumentRecord]:
|
|
273
271
|
"""Returns an iterator over documents"""
|
|
274
|
-
return map(
|
|
275
|
-
partial(self.converter, self.document_recordtype), self.store.__iter__()
|
|
276
|
-
)
|
|
272
|
+
return map(self.converter, self.store.__iter__())
|
|
277
273
|
|
|
274
|
+
@cached_property
|
|
278
275
|
def documentcount(self):
|
|
279
276
|
if self.count:
|
|
280
277
|
return self.count
|
|
@@ -399,8 +396,9 @@ if hasattr(_irds.trec_cast, "Cast2022Query"):
|
|
|
399
396
|
from datamaestro_text.data.conversation.base import (
|
|
400
397
|
ConversationTreeNode,
|
|
401
398
|
DecontextualizedDictItem,
|
|
402
|
-
|
|
399
|
+
AnswerDocumentID,
|
|
403
400
|
ConversationHistoryItem,
|
|
401
|
+
EntryType,
|
|
404
402
|
)
|
|
405
403
|
|
|
406
404
|
class CastTopicsHandler(TopicsHandler):
|
|
@@ -428,7 +426,6 @@ if hasattr(_irds.trec_cast, "Cast2022Query"):
|
|
|
428
426
|
"""Returns an iterator over topics"""
|
|
429
427
|
return iter(self.records)
|
|
430
428
|
|
|
431
|
-
class Cast2020TopicsHandler(CastTopicsHandler):
|
|
432
429
|
@cached_property
|
|
433
430
|
def records(self):
|
|
434
431
|
try:
|
|
@@ -437,11 +434,7 @@ if hasattr(_irds.trec_cast, "Cast2022Query"):
|
|
|
437
434
|
conversation = []
|
|
438
435
|
records = []
|
|
439
436
|
|
|
440
|
-
for (
|
|
441
|
-
query
|
|
442
|
-
) in (
|
|
443
|
-
self.dataset.dataset.queries_iter()
|
|
444
|
-
): # type: _irds.trec_cast.Cast2020Query
|
|
437
|
+
for query in self.dataset.dataset.queries_iter():
|
|
445
438
|
decontextualized = DecontextualizedDictItem(
|
|
446
439
|
"manual",
|
|
447
440
|
{
|
|
@@ -449,28 +442,35 @@ if hasattr(_irds.trec_cast, "Cast2022Query"):
|
|
|
449
442
|
"auto": query.automatic_rewritten_utterance,
|
|
450
443
|
},
|
|
451
444
|
)
|
|
445
|
+
|
|
446
|
+
is_new_conversation = topic_number != query.topic_number
|
|
447
|
+
|
|
452
448
|
topic = Record(
|
|
453
449
|
IDItem(query.query_id),
|
|
454
450
|
SimpleTextItem(query.raw_utterance),
|
|
455
451
|
decontextualized,
|
|
456
452
|
ConversationHistoryItem(
|
|
457
|
-
node.conversation(False)
|
|
453
|
+
[] if is_new_conversation else node.conversation(False)
|
|
458
454
|
),
|
|
455
|
+
EntryType.USER_QUERY,
|
|
459
456
|
)
|
|
460
457
|
|
|
461
|
-
if
|
|
462
|
-
node = node.add(ConversationTreeNode(topic))
|
|
463
|
-
else:
|
|
458
|
+
if is_new_conversation:
|
|
464
459
|
conversation = []
|
|
465
460
|
node = ConversationTreeNode(topic)
|
|
466
461
|
topic_number = query.topic_number
|
|
462
|
+
else:
|
|
463
|
+
node = node.add(ConversationTreeNode(topic))
|
|
467
464
|
|
|
468
465
|
records.append(topic)
|
|
469
466
|
|
|
470
467
|
conversation.append(node)
|
|
471
468
|
node = node.add(
|
|
472
469
|
ConversationTreeNode(
|
|
473
|
-
Record(
|
|
470
|
+
Record(
|
|
471
|
+
AnswerDocumentID(self.get_canonical_result_id(query)),
|
|
472
|
+
EntryType.SYSTEM_ANSWER,
|
|
473
|
+
)
|
|
474
474
|
)
|
|
475
475
|
)
|
|
476
476
|
conversation.append(node)
|
|
@@ -480,15 +480,110 @@ if hasattr(_irds.trec_cast, "Cast2022Query"):
|
|
|
480
480
|
|
|
481
481
|
return records
|
|
482
482
|
|
|
483
|
+
@staticmethod
|
|
484
|
+
def get_canonical_result_id():
|
|
485
|
+
return None
|
|
486
|
+
|
|
487
|
+
class Cast2020TopicsHandler(CastTopicsHandler):
|
|
488
|
+
@staticmethod
|
|
489
|
+
def get_canonical_result_id(query: _irds.trec_cast.Cast2020Query):
|
|
490
|
+
return query.manual_canonical_result_id
|
|
491
|
+
|
|
492
|
+
class Cast2021TopicsHandler(CastTopicsHandler):
|
|
493
|
+
@staticmethod
|
|
494
|
+
def get_canonical_result_id(query: _irds.trec_cast.Cast2021Query):
|
|
495
|
+
return query.canonical_result_id
|
|
496
|
+
|
|
497
|
+
class Cast2022TopicsHandler(CastTopicsHandler):
|
|
498
|
+
def __init__(self, dataset):
|
|
499
|
+
self.dataset = dataset
|
|
500
|
+
|
|
501
|
+
@cached_property
|
|
502
|
+
def records(self):
|
|
503
|
+
try:
|
|
504
|
+
records = []
|
|
505
|
+
nodes: Dict[str, ConversationTreeNode] = {}
|
|
506
|
+
|
|
507
|
+
for (
|
|
508
|
+
query
|
|
509
|
+
) in (
|
|
510
|
+
self.dataset.dataset.queries_iter()
|
|
511
|
+
): # type: _irds.trec_cast.Cast2022Query
|
|
512
|
+
parent = nodes[query.parent_id] if query.parent_id else None
|
|
513
|
+
|
|
514
|
+
if query.participant == "User":
|
|
515
|
+
topic = Record(
|
|
516
|
+
IDItem(query.query_id),
|
|
517
|
+
SimpleTextItem(query.raw_utterance),
|
|
518
|
+
DecontextualizedDictItem(
|
|
519
|
+
"manual",
|
|
520
|
+
{
|
|
521
|
+
"manual": query.manual_rewritten_utterance,
|
|
522
|
+
},
|
|
523
|
+
),
|
|
524
|
+
ConversationHistoryItem(
|
|
525
|
+
parent.conversation(False) if parent else []
|
|
526
|
+
),
|
|
527
|
+
EntryType.USER_QUERY,
|
|
528
|
+
)
|
|
529
|
+
node = ConversationTreeNode(topic)
|
|
530
|
+
records.append(topic)
|
|
531
|
+
else:
|
|
532
|
+
node = ConversationTreeNode(
|
|
533
|
+
Record(
|
|
534
|
+
AnswerEntry(query.response),
|
|
535
|
+
EntryType.SYSTEM_ANSWER,
|
|
536
|
+
)
|
|
537
|
+
)
|
|
538
|
+
|
|
539
|
+
nodes[query.query_id] = node
|
|
540
|
+
if parent:
|
|
541
|
+
parent.add(node)
|
|
542
|
+
except Exception:
|
|
543
|
+
logging.exception("Error while computing topic records")
|
|
544
|
+
raise
|
|
545
|
+
|
|
546
|
+
return records
|
|
547
|
+
|
|
483
548
|
Topics.HANDLERS.update(
|
|
484
549
|
{
|
|
485
550
|
# _irds.trec_cast.Cast2019Query: Cast2019TopicsHandler,
|
|
486
551
|
_irds.trec_cast.Cast2020Query: Cast2020TopicsHandler,
|
|
487
|
-
|
|
488
|
-
|
|
552
|
+
_irds.trec_cast.Cast2021Query: Cast2021TopicsHandler,
|
|
553
|
+
_irds.trec_cast.Cast2022Query: Cast2022TopicsHandler,
|
|
489
554
|
}
|
|
490
555
|
)
|
|
491
556
|
|
|
557
|
+
class CastDocHandler:
|
|
558
|
+
def check(self, cls):
|
|
559
|
+
assert issubclass(cls, _irds.trec_cast.CastDoc)
|
|
560
|
+
|
|
561
|
+
@cached_property
|
|
562
|
+
def target_cls(self):
|
|
563
|
+
return formats.TitleUrlDocument
|
|
564
|
+
|
|
565
|
+
def __call__(self, _, doc: _irds.trec_cast.CastDoc):
|
|
566
|
+
return Record(
|
|
567
|
+
IDItem(doc.doc_id), formats.SimpleTextItem(" ".join(doc.passages))
|
|
568
|
+
)
|
|
569
|
+
|
|
570
|
+
class CastPassageDocHandler:
|
|
571
|
+
def check(self, cls):
|
|
572
|
+
assert issubclass(cls, _irds.trec_cast.CastPassageDoc)
|
|
573
|
+
|
|
574
|
+
@cached_property
|
|
575
|
+
def target_cls(self):
|
|
576
|
+
return formats.TitleUrlDocument
|
|
577
|
+
|
|
578
|
+
def __call__(self, _, doc: _irds.trec_cast.CastPassageDoc):
|
|
579
|
+
return Record(
|
|
580
|
+
IDItem(doc.doc_id),
|
|
581
|
+
formats.TitleUrlDocument(doc.text, doc.title, doc.url),
|
|
582
|
+
)
|
|
583
|
+
|
|
584
|
+
Documents.CONVERTERS[_irds.trec_cast.CastDoc] = CastDocHandler()
|
|
585
|
+
Documents.CONVERTERS[_irds.trec_cast.CastPassageDoc] = CastPassageDocHandler()
|
|
586
|
+
|
|
492
587
|
|
|
493
588
|
class Adhoc(ir.Adhoc, IRDSId):
|
|
494
589
|
pass
|
|
@@ -12,5 +12,5 @@ __version__: str
|
|
|
12
12
|
__version_tuple__: VERSION_TUPLE
|
|
13
13
|
version_tuple: VERSION_TUPLE
|
|
14
14
|
|
|
15
|
-
__version__ = version = '2024.
|
|
16
|
-
__version_tuple__ = version_tuple = (2024,
|
|
15
|
+
__version__ = version = '2024.5.31'
|
|
16
|
+
__version_tuple__ = version_tuple = (2024, 5, 31)
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31/src/datamaestro_text.egg-info}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version: 2024.
|
|
3
|
+
Version: 2024.5.31
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
6
|
License: GPL-3
|
|
@@ -18,7 +18,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
18
18
|
Requires-Python: >=3.8
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
|
-
Requires-Dist: datamaestro>=1.1.
|
|
21
|
+
Requires-Dist: datamaestro>=1.1.1
|
|
22
22
|
Requires-Dist: ir_datasets
|
|
23
23
|
Requires-Dist: attrs
|
|
24
24
|
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text.egg-info/SOURCES.txt
RENAMED
|
@@ -47,6 +47,7 @@ src/datamaestro_text/config/com/oscar-corpus.py
|
|
|
47
47
|
src/datamaestro_text/config/com/sentiment140.py
|
|
48
48
|
src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml
|
|
49
49
|
src/datamaestro_text/config/com/github/aagohary/canard.py
|
|
50
|
+
src/datamaestro_text/config/com/github/apple/ml-qrecc.py
|
|
50
51
|
src/datamaestro_text/config/com/github/prdwb/orconvqa.py
|
|
51
52
|
src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml
|
|
52
53
|
src/datamaestro_text/config/com/microsoft/wikiqa.yaml
|
|
@@ -89,6 +90,7 @@ src/datamaestro_text/data/conversation/__init__.py
|
|
|
89
90
|
src/datamaestro_text/data/conversation/base.py
|
|
90
91
|
src/datamaestro_text/data/conversation/canard.py
|
|
91
92
|
src/datamaestro_text/data/conversation/orconvqa.py
|
|
93
|
+
src/datamaestro_text/data/conversation/qrecc.py
|
|
92
94
|
src/datamaestro_text/data/ir/__init__.py
|
|
93
95
|
src/datamaestro_text/data/ir/base.py
|
|
94
96
|
src/datamaestro_text/data/ir/cord19.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/.github/workflows/python-publish.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/api/recommendation.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/datasets/embeddings.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/docs/source/datasets/recommendation.rst
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/__init__.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/config/ai/quac.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/embeddings.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/base.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/cord19.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/csv.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/data.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/formats.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/trec.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/ir/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/data/tagging.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/download/tmdb.py
RENAMED
|
File without changes
|
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/interfaces/trec.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/test/__init__.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/test/test_datasets.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/utils/__init__.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/utils/files.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/utils/iter.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/utils/randomstream.py
RENAMED
|
File without changes
|
{datamaestro-text-2024.3.10 → datamaestro_text-2024.5.31}/src/datamaestro_text/utils/shuffle.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|