datamaestro-text 2024.3.10__tar.gz → 2025.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text-2025.1.7/.pre-commit-config.yaml +19 -0
- {datamaestro-text-2024.3.10/src/datamaestro_text.egg-info → datamaestro_text-2025.1.7}/PKG-INFO +3 -3
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/conversation.rst +8 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/conversation.rst +2 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/irds.rst +4 -3
- datamaestro_text-2025.1.7/requirements.txt +3 -0
- datamaestro_text-2025.1.7/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +87 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +4 -7
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/base.py +76 -10
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/orconvqa.py +12 -2
- datamaestro_text-2025.1.7/src/datamaestro_text/data/conversation/qrecc.py +99 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/__init__.py +3 -2
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/formats.py +20 -5
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/stores.py +13 -6
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/data.py +237 -124
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/helpers.py +58 -2
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/version.py +2 -2
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7/src/datamaestro_text.egg-info}/PKG-INFO +3 -3
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/SOURCES.txt +2 -0
- datamaestro_text-2025.1.7/src/datamaestro_text.egg-info/requires.txt +3 -0
- datamaestro-text-2024.3.10/.pre-commit-config.yaml +0 -11
- datamaestro-text-2024.3.10/requirements.txt +0 -3
- datamaestro-text-2024.3.10/src/datamaestro_text.egg-info/requires.txt +0 -3
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/.circleci/config.yml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/.flake8 +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/.github/workflows/pytest.yml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/.github/workflows/python-publish.yml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/.gitignore +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/.readthedocs.yml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/LICENSE +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/MANIFEST.in +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/Makefile +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/README.md +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/Makefile +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/make.bat +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/requirements.txt +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/embeddings.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/index.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/ir.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/nlp.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/recommendation.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/api/text.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/conf.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/embeddings.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/index.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/ir.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/recommendation.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/datasets/text.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/docs/source/index.rst +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/mkdocs.yml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/pyproject.toml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/requirements-dev.txt +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/setup.cfg +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/ai/quac.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/sentiment140.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/canard.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/embeddings.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/base.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/cord19.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/csv.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/data.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/huggingface.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/trec.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/utils.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/recommendation.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/tagging.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/text.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/utils.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/download/tmdb.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/interfaces/plaintext.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/interfaces/trec.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/test/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/test/test_datasets.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/test/test_documented.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/transforms/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/__init__.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/files.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/iter.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/randomstream.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/shuffle.py +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/top_level.txt +0 -0
- {datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/tox.ini +0 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- hooks:
|
|
3
|
+
- id: check-yaml
|
|
4
|
+
- id: end-of-file-fixer
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
repo: https://github.com/pre-commit/pre-commit-hooks
|
|
7
|
+
rev: v4.4.0
|
|
8
|
+
- hooks:
|
|
9
|
+
- exclude: ^src/experimaestro/server/data
|
|
10
|
+
id: black
|
|
11
|
+
repo: https://github.com/psf/black
|
|
12
|
+
rev: 23.1.0
|
|
13
|
+
- hooks:
|
|
14
|
+
- additional_dependencies:
|
|
15
|
+
- flake8-print
|
|
16
|
+
- flake8-fixme
|
|
17
|
+
id: flake8
|
|
18
|
+
repo: https://github.com/pycqa/flake8
|
|
19
|
+
rev: 6.0.0
|
{datamaestro-text-2024.3.10/src/datamaestro_text.egg-info → datamaestro_text-2025.1.7}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2025.1.7
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
6
|
License: GPL-3
|
|
@@ -18,8 +18,8 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
18
18
|
Requires-Python: >=3.8
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
|
-
Requires-Dist: datamaestro>=1.1
|
|
22
|
-
Requires-Dist: ir_datasets
|
|
21
|
+
Requires-Dist: datamaestro>=1.2.1
|
|
22
|
+
Requires-Dist: ir_datasets>=0.5.8
|
|
23
23
|
Requires-Dist: attrs
|
|
24
24
|
|
|
25
25
|
[](https://github.com/pre-commit/pre-commit) [](https://badge.fury.io/py/datamaestro-text)
|
|
@@ -45,3 +45,11 @@ Contextual query reformulation
|
|
|
45
45
|
|
|
46
46
|
.. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
|
|
47
47
|
:members:
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
.. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
|
|
52
|
+
:members:
|
|
53
|
+
|
|
54
|
+
.. autoxpmconfig:: datamaestro_text.data.conversation.qrecc.QReCCDataset
|
|
55
|
+
:members: iter
|
|
@@ -8,9 +8,10 @@ version of `ir-datasets` is more ancient or newer than the one used at generatio
|
|
|
8
8
|
Data types
|
|
9
9
|
----------
|
|
10
10
|
|
|
11
|
-
.. autoxpmconfig::
|
|
12
|
-
.. autoxpmconfig::
|
|
13
|
-
.. autoxpmconfig::
|
|
11
|
+
.. autoxpmconfig:: datamaestro_text.datasets.irds.data.Topics
|
|
12
|
+
.. autoxpmconfig:: datamaestro_text.datasets.irds.data.Documents
|
|
13
|
+
.. autoxpmconfig:: datamaestro_text.datasets.irds.data.AdhocAssessments
|
|
14
|
+
.. autoxpmconfig:: datamaestro_text.datasets.irds.data.LZ4DocumentStore
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
List of datasets
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import json
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from datamaestro.definitions import datatasks, datatags, dataset
|
|
7
|
+
from datamaestro.data.ml import Supervised
|
|
8
|
+
from datamaestro.download import reference
|
|
9
|
+
from datamaestro.download.archive import zipdownloader
|
|
10
|
+
from datamaestro.download.wayback import wayback_documents
|
|
11
|
+
from datamaestro.utils import HashCheck
|
|
12
|
+
from datamaestro_text.data.conversation.qrecc import QReCCDataset
|
|
13
|
+
from datamaestro_text.datasets.irds.data import (
|
|
14
|
+
LZ4JSONLDocumentStore,
|
|
15
|
+
SimpleJsonDocument,
|
|
16
|
+
)
|
|
17
|
+
from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@datatags("conversation", "context", "query")
|
|
21
|
+
@datatasks("query rewriting")
|
|
22
|
+
@zipdownloader(
|
|
23
|
+
"data",
|
|
24
|
+
"https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
|
|
25
|
+
checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
|
|
26
|
+
)
|
|
27
|
+
@dataset(
|
|
28
|
+
Supervised[QReCCDataset, None, QReCCDataset],
|
|
29
|
+
url="https://github.com/apple/ml-qrecc",
|
|
30
|
+
doi="https://doi.org/10.48550/arXiv.2010.04898",
|
|
31
|
+
id="",
|
|
32
|
+
)
|
|
33
|
+
def main(data: Path):
|
|
34
|
+
"""Open-Domain Question Answering Goes Conversational via Question Rewriting
|
|
35
|
+
|
|
36
|
+
We introduce QReCC (Question Rewriting in Conversational Context), an
|
|
37
|
+
end-to-end open-domain question answering dataset comprising of 14K
|
|
38
|
+
conversations with 81K question-answer pairs. The goal of this dataset is to
|
|
39
|
+
provide a challenging benchmark for end-to-end conversational question
|
|
40
|
+
answering that includes the individual subtasks of question rewriting,
|
|
41
|
+
passage retrieval and reading comprehension
|
|
42
|
+
"""
|
|
43
|
+
return {
|
|
44
|
+
"train": QReCCDataset(path=data / "qrecc_train.json"),
|
|
45
|
+
"test": QReCCDataset(path=data / "qrecc_test.json"),
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataset(
|
|
50
|
+
url="https://github.com/apple/ml-qrecc",
|
|
51
|
+
doi="https://doi.org/10.48550/arXiv.2010.04898",
|
|
52
|
+
)
|
|
53
|
+
class Content(LZ4JSONLDocumentStore):
|
|
54
|
+
"""Content of the URLs mentioned in QReCC"""
|
|
55
|
+
|
|
56
|
+
@staticmethod
|
|
57
|
+
def __create_dataset__(dataset, options=None):
|
|
58
|
+
ds = reference(reference=main).setup(dataset, options)
|
|
59
|
+
documents_path = wayback_documents(
|
|
60
|
+
"20191127", lambda: Content._urls(ds), name="wayback.jsonl"
|
|
61
|
+
).setup(dataset, options)
|
|
62
|
+
|
|
63
|
+
store_path = lz4docstore_builder(
|
|
64
|
+
"store",
|
|
65
|
+
lambda: Content._documents(documents_path),
|
|
66
|
+
SimpleJsonDocument,
|
|
67
|
+
"id",
|
|
68
|
+
).setup(dataset, options)
|
|
69
|
+
|
|
70
|
+
return LZ4JSONLDocumentStore(jsonl_path=store_path)
|
|
71
|
+
|
|
72
|
+
@staticmethod
|
|
73
|
+
def _documents(path: Path):
|
|
74
|
+
"""Iterates over documents from wayback"""
|
|
75
|
+
with path.open("rt") as fp:
|
|
76
|
+
for line in fp:
|
|
77
|
+
yield SimpleJsonDocument(**json.loads(line))
|
|
78
|
+
|
|
79
|
+
@staticmethod
|
|
80
|
+
def _urls(supervised: Supervised[QReCCDataset, None, QReCCDataset]):
|
|
81
|
+
urls = set()
|
|
82
|
+
for ds in [supervised.train, supervised.test]:
|
|
83
|
+
for entry in ds.entries():
|
|
84
|
+
if entry.answer_url:
|
|
85
|
+
url = re.sub("#.*$", "", entry.answer_url)
|
|
86
|
+
urls.add(url)
|
|
87
|
+
return urls
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
# See documentation on https://datamaestro.readthedocs.io
|
|
2
2
|
|
|
3
|
-
from collections import namedtuple
|
|
4
3
|
import gzip
|
|
5
4
|
import json
|
|
6
5
|
from pathlib import Path
|
|
7
|
-
from typing import Iterator
|
|
8
|
-
import attrs
|
|
6
|
+
from typing import Iterator
|
|
9
7
|
from datamaestro.definitions import datatasks, datatags, dataset
|
|
10
8
|
from datamaestro.download.single import filedownloader
|
|
11
9
|
from datamaestro.utils import HashCheck
|
|
@@ -14,10 +12,7 @@ from datamaestro.utils import HashCheck
|
|
|
14
12
|
from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
|
|
15
13
|
from datamaestro.data.ml import Supervised
|
|
16
14
|
|
|
17
|
-
from datamaestro_text.data.ir import DocumentStore
|
|
18
|
-
from datamaestro_text.data.ir.formats import OrConvQADocument
|
|
19
15
|
from datamaestro_text.data.ir.stores import OrConvQADocumentStore
|
|
20
|
-
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
|
|
21
16
|
from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader
|
|
22
17
|
|
|
23
18
|
|
|
@@ -63,7 +58,9 @@ def preprocessed(train, dev, test):
|
|
|
63
58
|
def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
|
|
64
59
|
with gzip.open(source, "rt") as fp:
|
|
65
60
|
for line in fp:
|
|
66
|
-
|
|
61
|
+
data = json.loads(line)
|
|
62
|
+
data["body"] = data.pop("text")
|
|
63
|
+
yield OrConvQADocumentStore.NAMED_TUPLE(**data)
|
|
67
64
|
|
|
68
65
|
|
|
69
66
|
@lz4docstore_downloader(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from enum import Enum
|
|
3
|
-
from typing import Dict, Generic, Iterator, List, Optional, Sequence
|
|
3
|
+
from typing import Dict, Generic, Iterator, List, Optional, Sequence, Tuple
|
|
4
4
|
from attr import define
|
|
5
5
|
from datamaestro.data import Base
|
|
6
6
|
from datamaestro.record import Record, Item
|
|
@@ -61,6 +61,20 @@ class AnswerEntry(Item):
|
|
|
61
61
|
"""The system answer"""
|
|
62
62
|
|
|
63
63
|
|
|
64
|
+
@define
|
|
65
|
+
class AnswerDocumentID(Item):
|
|
66
|
+
"""An answer as a document ID"""
|
|
67
|
+
|
|
68
|
+
document_id: str
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@define
|
|
72
|
+
class AnswerDocumentURL(Item):
|
|
73
|
+
"""An answer as a document URL"""
|
|
74
|
+
|
|
75
|
+
url: str
|
|
76
|
+
|
|
77
|
+
|
|
64
78
|
@define
|
|
65
79
|
class RetrievedEntry(Item):
|
|
66
80
|
"""List of system-retrieved documents and their relevance"""
|
|
@@ -68,8 +82,8 @@ class RetrievedEntry(Item):
|
|
|
68
82
|
documents: List[str]
|
|
69
83
|
"""List of retrieved documents"""
|
|
70
84
|
|
|
71
|
-
|
|
72
|
-
"""List of
|
|
85
|
+
relevant_documents: Optional[Dict[int, Tuple[Optional[int], Optional[int]]]] = None
|
|
86
|
+
"""Relevance status (optional): maps the rank of a relevant retrieved document to its (start, stop) positions"""
|
|
73
87
|
|
|
74
88
|
|
|
75
89
|
@define
|
|
@@ -95,56 +109,99 @@ class ConversationHistoryItem(Item):
|
|
|
95
109
|
|
|
96
110
|
|
|
97
111
|
class ConversationNode:
|
|
112
|
+
@abstractmethod
|
|
98
113
|
def entry(self) -> Record:
|
|
99
114
|
"""The current conversation entry"""
|
|
100
115
|
...
|
|
101
116
|
|
|
117
|
+
@abstractmethod
|
|
102
118
|
def history(self) -> ConversationHistory:
|
|
103
119
|
"""Preceding conversation entries, from most recent to more ancient"""
|
|
104
120
|
...
|
|
105
121
|
|
|
122
|
+
@abstractmethod
|
|
123
|
+
def parent(self) -> Optional["ConversationNode"]:
|
|
124
|
+
...
|
|
125
|
+
|
|
126
|
+
@abstractmethod
|
|
127
|
+
def children(self) -> List["ConversationNode"]:
|
|
128
|
+
...
|
|
129
|
+
|
|
106
130
|
|
|
107
|
-
class ConversationTree:
|
|
131
|
+
class ConversationTree(ABC):
|
|
132
|
+
@abstractmethod
|
|
133
|
+
def root(self) -> ConversationNode:
|
|
134
|
+
...
|
|
135
|
+
|
|
136
|
+
@abstractmethod
|
|
108
137
|
def __iter__(self) -> Iterator[ConversationNode]:
|
|
109
138
|
"""Iterates over conversation nodes"""
|
|
110
|
-
|
|
139
|
+
...
|
|
111
140
|
|
|
112
141
|
|
|
113
142
|
# ---- A conversation tree
|
|
114
143
|
|
|
115
144
|
|
|
116
|
-
class SingleConversationTree(ConversationTree):
|
|
145
|
+
class SingleConversationTree(ConversationTree, ABC):
|
|
117
146
|
"""Simple conversations, based on a sequence of entries"""
|
|
118
147
|
|
|
119
148
|
id: str
|
|
120
|
-
history:
|
|
149
|
+
history: List[Record]
|
|
121
150
|
|
|
122
151
|
def __init__(self, id: Optional[str], history: List[Record]):
|
|
123
152
|
"""Create a simple conversation
|
|
124
153
|
|
|
125
|
-
:param history: The entries, in reverse order (i.e. more ancient first)
|
|
154
|
+
:param history: The entries, in **reverse** order (i.e. most recent first)
|
|
126
155
|
"""
|
|
127
156
|
self.history = history or []
|
|
157
|
+
self.id = id
|
|
128
158
|
|
|
129
159
|
def add(self, entry: Record):
|
|
130
160
|
self.history.insert(0, entry)
|
|
131
161
|
|
|
132
162
|
def __iter__(self) -> Iterator[ConversationNode]:
|
|
133
|
-
|
|
163
|
+
"""Iterates over the conversation (starting with the beginning)"""
|
|
164
|
+
for ix in reversed(range(len(self.history))):
|
|
134
165
|
yield SingleConversationTreeNode(self, ix)
|
|
135
166
|
|
|
167
|
+
def root(self):
|
|
168
|
+
return SingleConversationTreeNode(self, len(self.history) - 1)
|
|
169
|
+
|
|
136
170
|
|
|
137
171
|
@define
|
|
138
172
|
class SingleConversationTreeNode(ConversationNode):
|
|
139
173
|
tree: SingleConversationTree
|
|
140
174
|
index: int
|
|
141
175
|
|
|
176
|
+
@property
|
|
142
177
|
def entry(self) -> Record:
|
|
143
178
|
return self.tree.history[self.index]
|
|
144
179
|
|
|
180
|
+
@entry.setter
|
|
181
|
+
def entry(self, record: Record):
|
|
182
|
+
try:
|
|
183
|
+
self.tree.history[self.index] = record
|
|
184
|
+
except Exception as e:
|
|
185
|
+
print(e)
|
|
186
|
+
raise
|
|
187
|
+
|
|
145
188
|
def history(self) -> Sequence[Record]:
|
|
146
189
|
return self.tree.history[self.index + 1 :]
|
|
147
190
|
|
|
191
|
+
def parent(self) -> Optional[ConversationNode]:
|
|
192
|
+
return (
|
|
193
|
+
SingleConversationTreeNode(self.tree, self.index + 1)
|
|
194
|
+
if self.index < len(self.tree.history) - 1
|
|
195
|
+
else []
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
def children(self) -> List[ConversationNode]:
|
|
199
|
+
return (
|
|
200
|
+
[SingleConversationTreeNode(self.tree, self.index - 1)]
|
|
201
|
+
if self.index > 0
|
|
202
|
+
else []
|
|
203
|
+
)
|
|
204
|
+
|
|
148
205
|
|
|
149
206
|
class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
150
207
|
"""A conversation tree node"""
|
|
@@ -178,6 +235,15 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
178
235
|
for child in self.children:
|
|
179
236
|
yield from child
|
|
180
237
|
|
|
238
|
+
def parent(self) -> Optional[ConversationNode]:
|
|
239
|
+
return self.parent
|
|
240
|
+
|
|
241
|
+
def children(self) -> List[ConversationNode]:
|
|
242
|
+
return self.children
|
|
243
|
+
|
|
244
|
+
def root(self):
|
|
245
|
+
return self
|
|
246
|
+
|
|
181
247
|
|
|
182
248
|
class ConversationDataset(Base, ABC):
|
|
183
249
|
"""A dataset made of conversations"""
|
|
@@ -186,4 +252,4 @@ class ConversationDataset(Base, ABC):
|
|
|
186
252
|
def __iter__(self) -> Iterator[ConversationTree]:
|
|
187
253
|
"""Return an iterator over conversations"""
|
|
188
254
|
for i in range(len(self)):
|
|
189
|
-
|
|
255
|
+
yield self.get(i)
|
|
@@ -102,16 +102,26 @@ class OrConvQADataset(ConversationDataset, File):
|
|
|
102
102
|
# Add to current
|
|
103
103
|
history.append(
|
|
104
104
|
Record(
|
|
105
|
-
IDItem(
|
|
105
|
+
IDItem(entry.query_id),
|
|
106
106
|
SimpleTextItem(entry.query),
|
|
107
107
|
SimpleDecontextualizedItem(entry.rewrite),
|
|
108
108
|
EntryType.USER_QUERY,
|
|
109
109
|
)
|
|
110
110
|
)
|
|
111
|
+
|
|
112
|
+
relevances = {}
|
|
113
|
+
for rank, relevance in enumerate(entry.retrieval_labels):
|
|
114
|
+
if relevance > 0:
|
|
115
|
+
relevances[rank] = (entry.answer.answer_start, None)
|
|
116
|
+
|
|
117
|
+
assert (
|
|
118
|
+
len(relevances) <= 1
|
|
119
|
+
), f"Too many relevance labels ({len(relevances)}) for {entry.query_id}"
|
|
120
|
+
|
|
111
121
|
history.append(
|
|
112
122
|
Record(
|
|
113
123
|
AnswerEntry(entry.answer.text),
|
|
114
|
-
RetrievedEntry(entry.evidences,
|
|
124
|
+
RetrievedEntry(entry.evidences, relevances),
|
|
115
125
|
EntryType.SYSTEM_ANSWER,
|
|
116
126
|
)
|
|
117
127
|
)
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from typing import Iterator, List, Optional
|
|
2
|
+
from attr import define
|
|
3
|
+
import json
|
|
4
|
+
from datamaestro.data import File
|
|
5
|
+
from datamaestro.record import Record
|
|
6
|
+
|
|
7
|
+
from datamaestro_text.data.ir.base import (
|
|
8
|
+
IDItem,
|
|
9
|
+
SimpleTextItem,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
from .base import (
|
|
14
|
+
AnswerDocumentURL,
|
|
15
|
+
AnswerEntry,
|
|
16
|
+
ConversationTree,
|
|
17
|
+
EntryType,
|
|
18
|
+
SimpleDecontextualizedItem,
|
|
19
|
+
SingleConversationTree,
|
|
20
|
+
)
|
|
21
|
+
from . import ConversationDataset
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@define(kw_only=True)
|
|
25
|
+
class QReCCDatasetEntry:
|
|
26
|
+
"""A query with past history"""
|
|
27
|
+
|
|
28
|
+
conversation_no: int
|
|
29
|
+
"""Conversation ID"""
|
|
30
|
+
|
|
31
|
+
turn_no: int
|
|
32
|
+
"""The turn in the conversation"""
|
|
33
|
+
|
|
34
|
+
conversation_source: str
|
|
35
|
+
"""Conversation source"""
|
|
36
|
+
|
|
37
|
+
question: str
|
|
38
|
+
"""The last issued query"""
|
|
39
|
+
|
|
40
|
+
rewrite: str
|
|
41
|
+
"""Manually rewritten query"""
|
|
42
|
+
|
|
43
|
+
context: List[str]
|
|
44
|
+
"""The list of queries asked by the user"""
|
|
45
|
+
|
|
46
|
+
answer: str
|
|
47
|
+
"""The answer"""
|
|
48
|
+
|
|
49
|
+
answer_url: str
|
|
50
|
+
"""The URL containing the answer"""
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class QReCCDataset(ConversationDataset, File):
|
|
54
|
+
def entries(self) -> Iterator[QReCCDatasetEntry]:
|
|
55
|
+
"""Iterates over rewritten queries with their context"""
|
|
56
|
+
with self.path.open("rt") as fp:
|
|
57
|
+
data = json.load(fp)
|
|
58
|
+
|
|
59
|
+
data = [
|
|
60
|
+
QReCCDatasetEntry(**{key.lower(): value for key, value in entry.items()})
|
|
61
|
+
for entry in data
|
|
62
|
+
]
|
|
63
|
+
return iter(data)
|
|
64
|
+
|
|
65
|
+
def __iter__(self) -> Iterator[ConversationTree]:
|
|
66
|
+
history: List[Record] = []
|
|
67
|
+
current_id: Optional[str] = None
|
|
68
|
+
|
|
69
|
+
for entry in self.entries():
|
|
70
|
+
# Creates a new conversation if needed
|
|
71
|
+
if entry.conversation_no != current_id:
|
|
72
|
+
if current_id is not None:
|
|
73
|
+
history.reverse()
|
|
74
|
+
yield SingleConversationTree(current_id, history)
|
|
75
|
+
|
|
76
|
+
current_id = entry.conversation_no
|
|
77
|
+
history = []
|
|
78
|
+
|
|
79
|
+
# Add to current
|
|
80
|
+
history.append(
|
|
81
|
+
Record(
|
|
82
|
+
IDItem(f"{entry.conversation_no}#{entry.turn_no}"),
|
|
83
|
+
SimpleTextItem(entry.question),
|
|
84
|
+
AnswerDocumentURL(entry.answer_url),
|
|
85
|
+
SimpleDecontextualizedItem(entry.rewrite),
|
|
86
|
+
EntryType.USER_QUERY,
|
|
87
|
+
)
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
history.append(
|
|
91
|
+
Record(
|
|
92
|
+
AnswerEntry(entry.answer),
|
|
93
|
+
EntryType.SYSTEM_ANSWER,
|
|
94
|
+
)
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# Yields the last one
|
|
98
|
+
history.reverse()
|
|
99
|
+
yield SingleConversationTree(current_id, history)
|
{datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/__init__.py
RENAMED
|
@@ -25,6 +25,7 @@ from .base import ( # noqa: F401
|
|
|
25
25
|
create_record,
|
|
26
26
|
# Other things
|
|
27
27
|
AdhocAssessment,
|
|
28
|
+
AdhocAssessedTopic,
|
|
28
29
|
)
|
|
29
30
|
|
|
30
31
|
|
|
@@ -83,7 +84,7 @@ class DocumentStore(Documents):
|
|
|
83
84
|
def document_int(self, internal_docid: int) -> DocumentRecord:
|
|
84
85
|
"""Returns a document given its internal ID"""
|
|
85
86
|
docid = self.docid_internal2external(internal_docid)
|
|
86
|
-
return self.
|
|
87
|
+
return self.document_ext(docid)
|
|
87
88
|
|
|
88
89
|
def document_ext(self, docid: str) -> DocumentRecord:
|
|
89
90
|
"""Returns a document given its external ID"""
|
|
@@ -159,7 +160,7 @@ class TopicsStore(Topics):
|
|
|
159
160
|
class AdhocAssessments(Base, ABC):
|
|
160
161
|
"""Ad-hoc assessments (qrels)"""
|
|
161
162
|
|
|
162
|
-
def iter(self) -> Iterator[
|
|
163
|
+
def iter(self) -> Iterator[AdhocAssessedTopic]:
|
|
163
164
|
"""Returns an iterator over assessments"""
|
|
164
165
|
raise NotImplementedError(f"For class {self.__class__}")
|
|
165
166
|
|
{datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/formats.py
RENAMED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from functools import cached_property
|
|
2
|
-
from typing import ClassVar, Tuple
|
|
2
|
+
from typing import ClassVar, Tuple, List
|
|
3
3
|
from attrs import define
|
|
4
4
|
from datamaestro.record import record_type
|
|
5
5
|
from ir_datasets.datasets.wapo import WapoDocMedia
|
|
@@ -117,7 +117,6 @@ class TweetDoc(TextItem):
|
|
|
117
117
|
|
|
118
118
|
@define
|
|
119
119
|
class OrConvQADocument(TextItem):
|
|
120
|
-
id: str
|
|
121
120
|
title: str
|
|
122
121
|
body: str
|
|
123
122
|
aid: str
|
|
@@ -127,12 +126,21 @@ class OrConvQADocument(TextItem):
|
|
|
127
126
|
def text(self):
|
|
128
127
|
return f"{self.title} {self.body}"
|
|
129
128
|
|
|
129
|
+
@define
|
|
130
|
+
class Touche2020(TextItem):
|
|
131
|
+
text: str
|
|
132
|
+
title: str
|
|
133
|
+
stance: str
|
|
134
|
+
url: str
|
|
130
135
|
|
|
131
136
|
@define
|
|
132
|
-
class
|
|
137
|
+
class SciDocs(TextItem):
|
|
133
138
|
text: str
|
|
134
|
-
|
|
135
|
-
|
|
139
|
+
title: str
|
|
140
|
+
authors: List[str]
|
|
141
|
+
year: int
|
|
142
|
+
cited_by: List[str]
|
|
143
|
+
references: List[str]
|
|
136
144
|
|
|
137
145
|
|
|
138
146
|
@define
|
|
@@ -167,6 +175,13 @@ class TrecMb14Query(TextItem):
|
|
|
167
175
|
def get_text(self):
|
|
168
176
|
return f"{self.query}"
|
|
169
177
|
|
|
178
|
+
@define
|
|
179
|
+
class SciDocsTopic(TextItem):
|
|
180
|
+
text: str
|
|
181
|
+
authors: List[str]
|
|
182
|
+
year: int
|
|
183
|
+
cited_by: List[str]
|
|
184
|
+
references: List[str]
|
|
170
185
|
|
|
171
186
|
@define()
|
|
172
187
|
class TrecTopic(SimpleTextItem):
|
{datamaestro-text-2024.3.10 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/stores.py
RENAMED
|
@@ -1,16 +1,21 @@
|
|
|
1
1
|
from collections import namedtuple
|
|
2
|
-
from typing import List
|
|
2
|
+
from typing import List, NamedTuple
|
|
3
3
|
from experimaestro import Constant
|
|
4
4
|
import attrs
|
|
5
5
|
|
|
6
|
+
from datamaestro.record import Record
|
|
7
|
+
from datamaestro_text.data.ir.base import IDItem
|
|
6
8
|
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
|
|
7
9
|
from datamaestro_text.data.ir.formats import OrConvQADocument
|
|
8
10
|
|
|
9
11
|
|
|
10
12
|
class OrConvQADocumentStore(LZ4DocumentStore):
|
|
11
|
-
NAMED_TUPLE
|
|
12
|
-
|
|
13
|
-
|
|
13
|
+
class NAMED_TUPLE(NamedTuple):
|
|
14
|
+
id: str
|
|
15
|
+
title: str
|
|
16
|
+
body: str
|
|
17
|
+
aid: str
|
|
18
|
+
bid: int
|
|
14
19
|
|
|
15
20
|
lookup_field: Constant[str] = "id"
|
|
16
21
|
fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
|
|
@@ -18,5 +23,7 @@ class OrConvQADocumentStore(LZ4DocumentStore):
|
|
|
18
23
|
|
|
19
24
|
data_cls = NAMED_TUPLE
|
|
20
25
|
|
|
21
|
-
def converter(self, data: NAMED_TUPLE) ->
|
|
22
|
-
|
|
26
|
+
def converter(self, data: NAMED_TUPLE) -> Record:
|
|
27
|
+
fields = data._asdict()
|
|
28
|
+
del fields["id"]
|
|
29
|
+
return Record(OrConvQADocument(**fields), IDItem(data.id))
|