datamaestro-text 2024.5.31__tar.gz → 2025.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro_text-2024.5.31/src/datamaestro_text.egg-info → datamaestro_text-2025.1.7}/PKG-INFO +3 -3
- datamaestro_text-2025.1.7/requirements.txt +3 -0
- datamaestro_text-2025.1.7/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +87 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/base.py +2 -2
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/formats.py +20 -5
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/stores.py +12 -6
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/data.py +222 -204
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/helpers.py +58 -2
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/version.py +2 -2
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7/src/datamaestro_text.egg-info}/PKG-INFO +3 -3
- datamaestro_text-2025.1.7/src/datamaestro_text.egg-info/requires.txt +3 -0
- datamaestro_text-2024.5.31/requirements.txt +0 -3
- datamaestro_text-2024.5.31/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +0 -37
- datamaestro_text-2024.5.31/src/datamaestro_text.egg-info/requires.txt +0 -3
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.circleci/config.yml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.flake8 +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.github/workflows/pytest.yml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.github/workflows/python-publish.yml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.gitignore +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.pre-commit-config.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.readthedocs.yml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/LICENSE +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/MANIFEST.in +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/Makefile +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/README.md +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/Makefile +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/make.bat +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/requirements.txt +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/conversation.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/embeddings.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/index.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/ir.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/nlp.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/recommendation.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/text.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/conf.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/conversation.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/embeddings.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/index.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/ir.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/irds.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/recommendation.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/text.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/index.rst +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/mkdocs.yml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/pyproject.toml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/requirements-dev.txt +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/setup.cfg +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/ai/quac.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/sentiment140.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/canard.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/embeddings.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/base.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/cord19.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/csv.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/data.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/huggingface.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/trec.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/utils.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/recommendation.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/tagging.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/text.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/utils.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/download/tmdb.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/interfaces/plaintext.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/interfaces/trec.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/test/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/test/test_datasets.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/test/test_documented.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/transforms/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/__init__.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/files.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/iter.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/randomstream.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/shuffle.py +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/SOURCES.txt +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/top_level.txt +0 -0
- {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/tox.ini +0 -0
{datamaestro_text-2024.5.31/src/datamaestro_text.egg-info → datamaestro_text-2025.1.7}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version: 2024.5.31
|
|
3
|
+
Version: 2025.1.7
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
6
|
License: GPL-3
|
|
@@ -18,8 +18,8 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
18
18
|
Requires-Python: >=3.8
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
|
-
Requires-Dist: datamaestro>=1.
|
|
22
|
-
Requires-Dist: ir_datasets
|
|
21
|
+
Requires-Dist: datamaestro>=1.2.1
|
|
22
|
+
Requires-Dist: ir_datasets>=0.5.8
|
|
23
23
|
Requires-Dist: attrs
|
|
24
24
|
|
|
25
25
|
[pre-commit](https://github.com/pre-commit/pre-commit) [PyPI version](https://badge.fury.io/py/datamaestro-text)
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# See documentation on https://datamaestro.readthedocs.io
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import json
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from datamaestro.definitions import datatasks, datatags, dataset
|
|
7
|
+
from datamaestro.data.ml import Supervised
|
|
8
|
+
from datamaestro.download import reference
|
|
9
|
+
from datamaestro.download.archive import zipdownloader
|
|
10
|
+
from datamaestro.download.wayback import wayback_documents
|
|
11
|
+
from datamaestro.utils import HashCheck
|
|
12
|
+
from datamaestro_text.data.conversation.qrecc import QReCCDataset
|
|
13
|
+
from datamaestro_text.datasets.irds.data import (
|
|
14
|
+
LZ4JSONLDocumentStore,
|
|
15
|
+
SimpleJsonDocument,
|
|
16
|
+
)
|
|
17
|
+
from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@datatags("conversation", "context", "query")
|
|
21
|
+
@datatasks("query rewriting")
|
|
22
|
+
@zipdownloader(
|
|
23
|
+
"data",
|
|
24
|
+
"https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
|
|
25
|
+
checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
|
|
26
|
+
)
|
|
27
|
+
@dataset(
|
|
28
|
+
Supervised[QReCCDataset, None, QReCCDataset],
|
|
29
|
+
url="https://github.com/apple/ml-qrecc",
|
|
30
|
+
doi="https://doi.org/10.48550/arXiv.2010.04898",
|
|
31
|
+
id="",
|
|
32
|
+
)
|
|
33
|
+
def main(data: Path):
|
|
34
|
+
"""Open-Domain Question Answering Goes Conversational via Question Rewriting
|
|
35
|
+
|
|
36
|
+
We introduce QReCC (Question Rewriting in Conversational Context), an
|
|
37
|
+
end-to-end open-domain question answering dataset comprising of 14K
|
|
38
|
+
conversations with 81K question-answer pairs. The goal of this dataset is to
|
|
39
|
+
provide a challenging benchmark for end-to-end conversational question
|
|
40
|
+
answering that includes the individual subtasks of question rewriting,
|
|
41
|
+
passage retrieval and reading comprehension
|
|
42
|
+
"""
|
|
43
|
+
return {
|
|
44
|
+
"train": QReCCDataset(path=data / "qrecc_train.json"),
|
|
45
|
+
"test": QReCCDataset(path=data / "qrecc_test.json"),
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataset(
|
|
50
|
+
url="https://github.com/apple/ml-qrecc",
|
|
51
|
+
doi="https://doi.org/10.48550/arXiv.2010.04898",
|
|
52
|
+
)
|
|
53
|
+
class Content(LZ4JSONLDocumentStore):
|
|
54
|
+
"""QReCC mentionned URLs content"""
|
|
55
|
+
|
|
56
|
+
@staticmethod
|
|
57
|
+
def __create_dataset__(dataset, options=None):
|
|
58
|
+
ds = reference(reference=main).setup(dataset, options)
|
|
59
|
+
documents_path = wayback_documents(
|
|
60
|
+
"20191127", lambda: Content._urls(ds), name="wayback.jsonl"
|
|
61
|
+
).setup(dataset, options)
|
|
62
|
+
|
|
63
|
+
store_path = lz4docstore_builder(
|
|
64
|
+
"store",
|
|
65
|
+
lambda: Content._documents(documents_path),
|
|
66
|
+
SimpleJsonDocument,
|
|
67
|
+
"id",
|
|
68
|
+
).setup(dataset, options)
|
|
69
|
+
|
|
70
|
+
return LZ4JSONLDocumentStore(jsonl_path=store_path)
|
|
71
|
+
|
|
72
|
+
@staticmethod
|
|
73
|
+
def _documents(path: Path):
|
|
74
|
+
"""Iterates over documents from wayback"""
|
|
75
|
+
with path.open("rt") as fp:
|
|
76
|
+
for line in fp:
|
|
77
|
+
yield SimpleJsonDocument(**json.loads(line))
|
|
78
|
+
|
|
79
|
+
@staticmethod
|
|
80
|
+
def _urls(supervised: Supervised[QReCCDataset, None, QReCCDataset]):
|
|
81
|
+
urls = set()
|
|
82
|
+
for ds in [supervised.train, supervised.test]:
|
|
83
|
+
for entry in ds.entries():
|
|
84
|
+
if entry.answer_url:
|
|
85
|
+
url = re.sub("#.*$", "", entry.answer_url)
|
|
86
|
+
urls.add(url)
|
|
87
|
+
return urls
|
|
@@ -188,7 +188,7 @@ class SingleConversationTreeNode(ConversationNode):
|
|
|
188
188
|
def history(self) -> Sequence[Record]:
|
|
189
189
|
return self.tree.history[self.index + 1 :]
|
|
190
190
|
|
|
191
|
-
def parent(self) -> ConversationNode
|
|
191
|
+
def parent(self) -> Optional[ConversationNode]:
|
|
192
192
|
return (
|
|
193
193
|
SingleConversationTreeNode(self.tree, self.index + 1)
|
|
194
194
|
if self.index < len(self.tree.history) - 1
|
|
@@ -235,7 +235,7 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
|
|
|
235
235
|
for child in self.children:
|
|
236
236
|
yield from child
|
|
237
237
|
|
|
238
|
-
def parent(self) -> ConversationNode
|
|
238
|
+
def parent(self) -> Optional[ConversationNode]:
|
|
239
239
|
return self.parent
|
|
240
240
|
|
|
241
241
|
def children(self) -> List[ConversationNode]:
|
{datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/formats.py
RENAMED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from functools import cached_property
|
|
2
|
-
from typing import ClassVar, Tuple
|
|
2
|
+
from typing import ClassVar, Tuple, List
|
|
3
3
|
from attrs import define
|
|
4
4
|
from datamaestro.record import record_type
|
|
5
5
|
from ir_datasets.datasets.wapo import WapoDocMedia
|
|
@@ -117,7 +117,6 @@ class TweetDoc(TextItem):
|
|
|
117
117
|
|
|
118
118
|
@define
|
|
119
119
|
class OrConvQADocument(TextItem):
|
|
120
|
-
id: str
|
|
121
120
|
title: str
|
|
122
121
|
body: str
|
|
123
122
|
aid: str
|
|
@@ -127,12 +126,21 @@ class OrConvQADocument(TextItem):
|
|
|
127
126
|
def text(self):
|
|
128
127
|
return f"{self.title} {self.body}"
|
|
129
128
|
|
|
129
|
+
@define
|
|
130
|
+
class Touche2020(TextItem):
|
|
131
|
+
text: str
|
|
132
|
+
title: str
|
|
133
|
+
stance: str
|
|
134
|
+
url: str
|
|
130
135
|
|
|
131
136
|
@define
|
|
132
|
-
class
|
|
137
|
+
class SciDocs(TextItem):
|
|
133
138
|
text: str
|
|
134
|
-
|
|
135
|
-
|
|
139
|
+
title: str
|
|
140
|
+
authors: List[str]
|
|
141
|
+
year: int
|
|
142
|
+
cited_by: List[str]
|
|
143
|
+
references: List[str]
|
|
136
144
|
|
|
137
145
|
|
|
138
146
|
@define
|
|
@@ -167,6 +175,13 @@ class TrecMb14Query(TextItem):
|
|
|
167
175
|
def get_text(self):
|
|
168
176
|
return f"{self.query}"
|
|
169
177
|
|
|
178
|
+
@define
|
|
179
|
+
class SciDocsTopic(TextItem):
|
|
180
|
+
text: str
|
|
181
|
+
authors: List[str]
|
|
182
|
+
year: int
|
|
183
|
+
cited_by: List[str]
|
|
184
|
+
references: List[str]
|
|
170
185
|
|
|
171
186
|
@define()
|
|
172
187
|
class TrecTopic(SimpleTextItem):
|
{datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/stores.py
RENAMED
|
@@ -1,17 +1,21 @@
|
|
|
1
1
|
from collections import namedtuple
|
|
2
|
-
from typing import List
|
|
2
|
+
from typing import List, NamedTuple
|
|
3
3
|
from experimaestro import Constant
|
|
4
4
|
import attrs
|
|
5
5
|
|
|
6
6
|
from datamaestro.record import Record
|
|
7
|
+
from datamaestro_text.data.ir.base import IDItem
|
|
7
8
|
from datamaestro_text.datasets.irds.data import LZ4DocumentStore
|
|
8
9
|
from datamaestro_text.data.ir.formats import OrConvQADocument
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class OrConvQADocumentStore(LZ4DocumentStore):
|
|
12
|
-
NAMED_TUPLE
|
|
13
|
-
|
|
14
|
-
|
|
13
|
+
class NAMED_TUPLE(NamedTuple):
|
|
14
|
+
id: str
|
|
15
|
+
title: str
|
|
16
|
+
body: str
|
|
17
|
+
aid: str
|
|
18
|
+
bid: int
|
|
15
19
|
|
|
16
20
|
lookup_field: Constant[str] = "id"
|
|
17
21
|
fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
|
|
@@ -19,5 +23,7 @@ class OrConvQADocumentStore(LZ4DocumentStore):
|
|
|
19
23
|
|
|
20
24
|
data_cls = NAMED_TUPLE
|
|
21
25
|
|
|
22
|
-
def converter(self, data: NAMED_TUPLE) ->
|
|
23
|
-
|
|
26
|
+
def converter(self, data: NAMED_TUPLE) -> Record:
|
|
27
|
+
fields = data._asdict()
|
|
28
|
+
del fields["id"]
|
|
29
|
+
return Record(OrConvQADocument(**fields), IDItem(data.id))
|