datamaestro-text 2024.5.31__tar.gz → 2025.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. {datamaestro_text-2024.5.31/src/datamaestro_text.egg-info → datamaestro_text-2025.1.7}/PKG-INFO +3 -3
  2. datamaestro_text-2025.1.7/requirements.txt +3 -0
  3. datamaestro_text-2025.1.7/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +87 -0
  4. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/base.py +2 -2
  5. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/formats.py +20 -5
  6. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/stores.py +12 -6
  7. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/data.py +222 -204
  8. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/helpers.py +58 -2
  9. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/version.py +2 -2
  10. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7/src/datamaestro_text.egg-info}/PKG-INFO +3 -3
  11. datamaestro_text-2025.1.7/src/datamaestro_text.egg-info/requires.txt +3 -0
  12. datamaestro_text-2024.5.31/requirements.txt +0 -3
  13. datamaestro_text-2024.5.31/src/datamaestro_text/config/com/github/apple/ml-qrecc.py +0 -37
  14. datamaestro_text-2024.5.31/src/datamaestro_text.egg-info/requires.txt +0 -3
  15. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.circleci/config.yml +0 -0
  16. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.flake8 +0 -0
  17. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.github/workflows/pytest.yml +0 -0
  18. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.github/workflows/python-publish.yml +0 -0
  19. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.gitignore +0 -0
  20. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.pre-commit-config.yaml +0 -0
  21. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/.readthedocs.yml +0 -0
  22. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/LICENSE +0 -0
  23. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/MANIFEST.in +0 -0
  24. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/Makefile +0 -0
  25. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/README.md +0 -0
  26. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/Makefile +0 -0
  27. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/make.bat +0 -0
  28. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/requirements.txt +0 -0
  29. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/conversation.rst +0 -0
  30. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/embeddings.rst +0 -0
  31. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/index.rst +0 -0
  32. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/ir.rst +0 -0
  33. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/nlp.rst +0 -0
  34. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/recommendation.rst +0 -0
  35. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/api/text.rst +0 -0
  36. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/conf.py +0 -0
  37. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/conversation.rst +0 -0
  38. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/embeddings.rst +0 -0
  39. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/index.rst +0 -0
  40. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/ir.rst +0 -0
  41. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/irds.rst +0 -0
  42. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/recommendation.rst +0 -0
  43. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/datasets/text.rst +0 -0
  44. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/docs/source/index.rst +0 -0
  45. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/mkdocs.yml +0 -0
  46. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/pyproject.toml +0 -0
  47. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/requirements-dev.txt +0 -0
  48. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/setup.cfg +0 -0
  49. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/__init__.py +0 -0
  50. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/__init__.py +0 -0
  51. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  52. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  53. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
  54. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
  55. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  56. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
  57. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  58. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
  59. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/sentiment140.py +0 -0
  60. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  61. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  62. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  63. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
  64. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
  65. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  66. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  67. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  68. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  69. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/__init__.py +0 -0
  70. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  71. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  72. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  73. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  74. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  75. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  76. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  77. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
  78. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  79. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  80. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
  81. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  82. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
  83. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  84. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  85. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  86. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
  87. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
  88. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  89. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/__init__.py +0 -0
  90. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  91. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/canard.py +0 -0
  92. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
  93. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/conversation/qrecc.py +0 -0
  94. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/embeddings.py +0 -0
  95. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/__init__.py +0 -0
  96. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/base.py +0 -0
  97. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/cord19.py +0 -0
  98. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/csv.py +0 -0
  99. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/data.py +0 -0
  100. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  101. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/trec.py +0 -0
  102. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/ir/utils.py +0 -0
  103. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/recommendation.py +0 -0
  104. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/tagging.py +0 -0
  105. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/data/text.py +0 -0
  106. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  107. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
  108. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  109. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/download/tmdb.py +0 -0
  110. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  111. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/interfaces/trec.py +0 -0
  112. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/test/__init__.py +0 -0
  113. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/test/test_datasets.py +0 -0
  114. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/test/test_documented.py +0 -0
  115. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/transforms/__init__.py +0 -0
  116. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
  117. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/__init__.py +0 -0
  118. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/files.py +0 -0
  119. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/iter.py +0 -0
  120. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/randomstream.py +0 -0
  121. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text/utils/shuffle.py +0 -0
  122. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/SOURCES.txt +0 -0
  123. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
  124. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
  125. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/src/datamaestro_text.egg-info/top_level.txt +0 -0
  126. {datamaestro_text-2024.5.31 → datamaestro_text-2025.1.7}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2024.5.31
3
+ Version: 2025.1.7
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -18,8 +18,8 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
18
  Requires-Python: >=3.8
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: datamaestro>=1.1.1
22
- Requires-Dist: ir_datasets
21
+ Requires-Dist: datamaestro>=1.2.1
22
+ Requires-Dist: ir_datasets>=0.5.8
23
23
  Requires-Dist: attrs
24
24
 
25
25
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
@@ -0,0 +1,3 @@
1
+ datamaestro>=1.2.1
2
+ ir_datasets>=0.5.8
3
+ attrs
@@ -0,0 +1,87 @@
1
+ # See documentation on https://datamaestro.readthedocs.io
2
+
3
+ import re
4
+ import json
5
+ from pathlib import Path
6
+ from datamaestro.definitions import datatasks, datatags, dataset
7
+ from datamaestro.data.ml import Supervised
8
+ from datamaestro.download import reference
9
+ from datamaestro.download.archive import zipdownloader
10
+ from datamaestro.download.wayback import wayback_documents
11
+ from datamaestro.utils import HashCheck
12
+ from datamaestro_text.data.conversation.qrecc import QReCCDataset
13
+ from datamaestro_text.datasets.irds.data import (
14
+ LZ4JSONLDocumentStore,
15
+ SimpleJsonDocument,
16
+ )
17
+ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
18
+
19
+
20
+ @datatags("conversation", "context", "query")
21
+ @datatasks("query rewriting")
22
+ @zipdownloader(
23
+ "data",
24
+ "https://github.com/apple/ml-qrecc/raw/main/dataset/qrecc_data.zip",
25
+ checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
26
+ )
27
+ @dataset(
28
+ Supervised[QReCCDataset, None, QReCCDataset],
29
+ url="https://github.com/apple/ml-qrecc",
30
+ doi="https://doi.org/10.48550/arXiv.2010.04898",
31
+ id="",
32
+ )
33
+ def main(data: Path):
34
+ """Open-Domain Question Answering Goes Conversational via Question Rewriting
35
+
36
+ We introduce QReCC (Question Rewriting in Conversational Context), an
37
+ end-to-end open-domain question answering dataset comprising of 14K
38
+ conversations with 81K question-answer pairs. The goal of this dataset is to
39
+ provide a challenging benchmark for end-to-end conversational question
40
+ answering that includes the individual subtasks of question rewriting,
41
+ passage retrieval and reading comprehension
42
+ """
43
+ return {
44
+ "train": QReCCDataset(path=data / "qrecc_train.json"),
45
+ "test": QReCCDataset(path=data / "qrecc_test.json"),
46
+ }
47
+
48
+
49
+ @dataset(
50
+ url="https://github.com/apple/ml-qrecc",
51
+ doi="https://doi.org/10.48550/arXiv.2010.04898",
52
+ )
53
+ class Content(LZ4JSONLDocumentStore):
54
+ """QReCC mentionned URLs content"""
55
+
56
+ @staticmethod
57
+ def __create_dataset__(dataset, options=None):
58
+ ds = reference(reference=main).setup(dataset, options)
59
+ documents_path = wayback_documents(
60
+ "20191127", lambda: Content._urls(ds), name="wayback.jsonl"
61
+ ).setup(dataset, options)
62
+
63
+ store_path = lz4docstore_builder(
64
+ "store",
65
+ lambda: Content._documents(documents_path),
66
+ SimpleJsonDocument,
67
+ "id",
68
+ ).setup(dataset, options)
69
+
70
+ return LZ4JSONLDocumentStore(jsonl_path=store_path)
71
+
72
+ @staticmethod
73
+ def _documents(path: Path):
74
+ """Iterates over documents from wayback"""
75
+ with path.open("rt") as fp:
76
+ for line in fp:
77
+ yield SimpleJsonDocument(**json.loads(line))
78
+
79
+ @staticmethod
80
+ def _urls(supervised: Supervised[QReCCDataset, None, QReCCDataset]):
81
+ urls = set()
82
+ for ds in [supervised.train, supervised.test]:
83
+ for entry in ds.entries():
84
+ if entry.answer_url:
85
+ url = re.sub("#.*$", "", entry.answer_url)
86
+ urls.add(url)
87
+ return urls
@@ -188,7 +188,7 @@ class SingleConversationTreeNode(ConversationNode):
188
188
  def history(self) -> Sequence[Record]:
189
189
  return self.tree.history[self.index + 1 :]
190
190
 
191
- def parent(self) -> ConversationNode | None:
191
+ def parent(self) -> Optional[ConversationNode]:
192
192
  return (
193
193
  SingleConversationTreeNode(self.tree, self.index + 1)
194
194
  if self.index < len(self.tree.history) - 1
@@ -235,7 +235,7 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
235
235
  for child in self.children:
236
236
  yield from child
237
237
 
238
- def parent(self) -> ConversationNode | None:
238
+ def parent(self) -> Optional[ConversationNode]:
239
239
  return self.parent
240
240
 
241
241
  def children(self) -> List[ConversationNode]:
@@ -1,5 +1,5 @@
1
1
  from functools import cached_property
2
- from typing import ClassVar, Tuple
2
+ from typing import ClassVar, Tuple, List
3
3
  from attrs import define
4
4
  from datamaestro.record import record_type
5
5
  from ir_datasets.datasets.wapo import WapoDocMedia
@@ -117,7 +117,6 @@ class TweetDoc(TextItem):
117
117
 
118
118
  @define
119
119
  class OrConvQADocument(TextItem):
120
- id: str
121
120
  title: str
122
121
  body: str
123
122
  aid: str
@@ -127,12 +126,21 @@ class OrConvQADocument(TextItem):
127
126
  def text(self):
128
127
  return f"{self.title} {self.body}"
129
128
 
129
+ @define
130
+ class Touche2020(TextItem):
131
+ text: str
132
+ title: str
133
+ stance: str
134
+ url: str
130
135
 
131
136
  @define
132
- class TrecTopic(TextItem):
137
+ class SciDocs(TextItem):
133
138
  text: str
134
- query: str
135
- narrative: str
139
+ title: str
140
+ authors: List[str]
141
+ year: int
142
+ cited_by: List[str]
143
+ references: List[str]
136
144
 
137
145
 
138
146
  @define
@@ -167,6 +175,13 @@ class TrecMb14Query(TextItem):
167
175
  def get_text(self):
168
176
  return f"{self.query}"
169
177
 
178
+ @define
179
+ class SciDocsTopic(TextItem):
180
+ text: str
181
+ authors: List[str]
182
+ year: int
183
+ cited_by: List[str]
184
+ references: List[str]
170
185
 
171
186
  @define()
172
187
  class TrecTopic(SimpleTextItem):
@@ -1,17 +1,21 @@
1
1
  from collections import namedtuple
2
- from typing import List
2
+ from typing import List, NamedTuple
3
3
  from experimaestro import Constant
4
4
  import attrs
5
5
 
6
6
  from datamaestro.record import Record
7
+ from datamaestro_text.data.ir.base import IDItem
7
8
  from datamaestro_text.datasets.irds.data import LZ4DocumentStore
8
9
  from datamaestro_text.data.ir.formats import OrConvQADocument
9
10
 
10
11
 
11
12
  class OrConvQADocumentStore(LZ4DocumentStore):
12
- NAMED_TUPLE = namedtuple(
13
- "OrConvQADocument", [a.name for a in attrs.fields(OrConvQADocument)]
14
- )
13
+ class NAMED_TUPLE(NamedTuple):
14
+ id: str
15
+ title: str
16
+ body: str
17
+ aid: str
18
+ bid: int
15
19
 
16
20
  lookup_field: Constant[str] = "id"
17
21
  fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
@@ -19,5 +23,7 @@ class OrConvQADocumentStore(LZ4DocumentStore):
19
23
 
20
24
  data_cls = NAMED_TUPLE
21
25
 
22
- def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
23
- return Record(OrConvQADocument(**data._asdict()))
26
+ def converter(self, data: NAMED_TUPLE) -> Record:
27
+ fields = data._asdict()
28
+ del fields["id"]
29
+ return Record(OrConvQADocument(**fields), IDItem(data.id))