datamaestro-text 2023.12.5__tar.gz → 2023.12.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.readthedocs.yml +1 -0
  2. {datamaestro-text-2023.12.5/src/datamaestro_text.egg-info → datamaestro-text-2023.12.12}/PKG-INFO +1 -1
  3. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/conversation.rst +2 -0
  4. datamaestro-text-2023.12.12/docs/source/datasets/irds.rst +19 -0
  5. datamaestro-text-2023.12.12/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +92 -0
  6. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/__init__.py +1 -1
  7. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/formats.py +14 -0
  8. datamaestro-text-2023.12.12/src/datamaestro_text/data/ir/stores.py +22 -0
  9. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/datasets/irds/data.py +65 -2
  10. datamaestro-text-2023.12.12/src/datamaestro_text/datasets/irds/helpers.py +71 -0
  11. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/version.py +2 -2
  12. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12/src/datamaestro_text.egg-info}/PKG-INFO +1 -1
  13. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text.egg-info/SOURCES.txt +3 -0
  14. datamaestro-text-2023.12.5/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -49
  15. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.circleci/config.yml +0 -0
  16. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.flake8 +0 -0
  17. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.github/workflows/pytest.yml +0 -0
  18. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.github/workflows/python-publish.yml +0 -0
  19. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.gitignore +0 -0
  20. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/.pre-commit-config.yaml +0 -0
  21. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/LICENSE +0 -0
  22. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/MANIFEST.in +0 -0
  23. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/Makefile +0 -0
  24. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/README.md +0 -0
  25. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/Makefile +0 -0
  26. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/make.bat +0 -0
  27. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/requirements.txt +0 -0
  28. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/api/conversation.rst +0 -0
  29. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/api/index.rst +0 -0
  30. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/api/ir.rst +0 -0
  31. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/api/text.rst +0 -0
  32. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/conf.py +0 -0
  33. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/embeddings.rst +0 -0
  34. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/index.rst +0 -0
  35. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/ir.rst +0 -0
  36. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/recommendation.rst +0 -0
  37. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/datasets/text.rst +0 -0
  38. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/docs/source/index.rst +0 -0
  39. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/mkdocs.yml +0 -0
  40. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/pyproject.toml +0 -0
  41. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/requirements-dev.txt +0 -0
  42. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/requirements.txt +0 -0
  43. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/setup.cfg +0 -0
  44. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/__init__.py +0 -0
  45. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/__init__.py +0 -0
  46. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  47. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  48. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
  49. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  50. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
  51. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  52. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
  53. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/sentiment140.py +0 -0
  54. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  55. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  56. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  57. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
  58. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
  59. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  60. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  61. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  62. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  63. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/__init__.py +0 -0
  64. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  65. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  66. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  67. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  68. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  69. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  70. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  71. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
  72. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  73. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  74. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
  75. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  76. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
  77. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  78. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  79. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  80. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
  81. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
  82. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  83. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  84. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/conversation/base.py +0 -0
  85. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/conversation/canard.py +0 -0
  86. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
  87. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/embeddings.py +0 -0
  88. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/base.py +0 -0
  89. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/cord19.py +0 -0
  90. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/csv.py +0 -0
  91. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/data.py +0 -0
  92. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  93. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/trec.py +0 -0
  94. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/ir/utils.py +0 -0
  95. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/recommendation.py +0 -0
  96. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/tagging.py +0 -0
  97. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/data/text.py +0 -0
  98. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  99. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
  100. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  101. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/download/tmdb.py +0 -0
  102. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  103. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/interfaces/trec.py +0 -0
  104. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/test/__init__.py +0 -0
  105. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/test/test_datasets.py +0 -0
  106. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/transforms/__init__.py +0 -0
  107. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/transforms/ir/__init__.py +0 -0
  108. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/__init__.py +0 -0
  109. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/files.py +0 -0
  110. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/iter.py +0 -0
  111. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/randomstream.py +0 -0
  112. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text/utils/shuffle.py +0 -0
  113. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
  114. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
  115. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text.egg-info/requires.txt +0 -0
  116. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/src/datamaestro_text.egg-info/top_level.txt +0 -0
  117. {datamaestro-text-2023.12.5 → datamaestro-text-2023.12.12}/tox.ini +0 -0
@@ -20,3 +20,4 @@ python:
20
20
  - method: pip
21
21
  path: .
22
22
  - requirements: docs/requirements.txt
23
+ - requirements: requirements.txt
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2023.12.5
3
+ Version: 2023.12.12
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -6,3 +6,5 @@ Contextualized Query Rewriting
6
6
  ==============================
7
7
 
8
8
  .. dm:datasets:: com.github.aagohary.canard text
9
+
10
+ .. dm:datasets:: com.github.prdwb.orconvqa text
@@ -0,0 +1,19 @@
1
+ IR Datasets
2
+ ===========
3
+
4
+ XPMIR provides an interface to the `IR Datasets <https://ir-datasets.com/>`_ library.
5
+ The list below is provided as a reference, but might not be up-to-date if your
6
+ version of `ir-datasets` is older or newer than the one used at generation time.
7
+
8
+ Data types
9
+ ----------
10
+
11
+ .. autoxpmconfig:: xpmir.datasets.irds.data.Topics
12
+ .. autoxpmconfig:: xpmir.datasets.irds.data.Documents
13
+ .. autoxpmconfig:: xpmir.datasets.irds.data.AdhocAssessments
14
+
15
+
16
+ List of datasets
17
+ ----------------
18
+
19
+ .. dm:repository:: irds
@@ -0,0 +1,92 @@
1
+ # See documentation on https://datamaestro.readthedocs.io
2
+
3
+ from collections import namedtuple
4
+ import gzip
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Iterator, NamedTuple
8
+ import attrs
9
+ from datamaestro.definitions import datatasks, datatags, dataset
10
+ from datamaestro.download.single import filedownloader
11
+ from datamaestro.utils import HashCheck
12
+
13
+
14
+ from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
15
+ from datamaestro.data.ml import Supervised
16
+
17
+ from datamaestro_text.data.ir import DocumentStore
18
+ from datamaestro_text.data.ir.formats import OrConvQADocument
19
+ from datamaestro_text.data.ir.stores import OrConvQADocumentStore
20
+ from datamaestro_text.datasets.irds.data import LZ4DocumentStore
21
+ from datamaestro_text.datasets.irds.helpers import lz4docstore_downloader
22
+
23
+
24
+ @datatags("conversation", "context", "query")
25
+ @datatasks("query rewriting")
26
+ @filedownloader(
27
+ "train.jsonl",
28
+ "https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/train.txt",
29
+ checker=HashCheck("7513a9ef12d8b7a4471166dc4fef77b7"),
30
+ )
31
+ @filedownloader(
32
+ "dev.jsonl",
33
+ "https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/dev.txt",
34
+ checker=HashCheck("7765658995cc9ffd5eb39a400d814b20"),
35
+ )
36
+ @filedownloader(
37
+ "test.jsonl",
38
+ "https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/test.txt",
39
+ checker=HashCheck("0cf3a755f06297b9c02e7db45f8dc8be"),
40
+ )
41
+ @dataset(
42
+ Supervised,
43
+ url="https://github.com/prdwb/orconvqa-release",
44
+ )
45
+ def preprocessed(train, dev, test):
46
+ """Open-Retrieval Conversational Question Answering datasets
47
+
48
+ OrConvQA is an aggregation of three existing datasets:
49
+
50
+ 1. the QuAC dataset that offers information-seeking conversations,
51
+ 2. the CANARD dataset that consists of context-independent rewrites of QuAC questions, and
52
+ 3. the Wikipedia corpus that serves as the knowledge source for answering questions.
53
+
54
+ Each dataset is an instance of :class:`datamaestro_text.data.conversation.OrConvQADataset`
55
+ """
56
+ return {
57
+ "train": OrConvQADataset(path=train),
58
+ "validation": OrConvQADataset(path=dev),
59
+ "test": OrConvQADataset(path=test),
60
+ }
61
+
62
+
63
+ def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED_TUPLE]:
64
+ with gzip.open(source, "rt") as fp:
65
+ for line in fp:
66
+ yield OrConvQADocumentStore.NAMED_TUPLE(**json.loads(line))
67
+
68
+
69
+ @lz4docstore_downloader(
70
+ "all_blocks",
71
+ "https://ciir.cs.umass.edu/downloads/ORConvQA/all_blocks.txt.gz",
72
+ orConvQADocumentReader,
73
+ OrConvQADocumentStore.NAMED_TUPLE,
74
+ "id",
75
+ checker=HashCheck("1095a3408690e7bbe4d8a87a2bae6356"),
76
+ size=5_086_902_800,
77
+ count_hint=11_377_951,
78
+ )
79
+ @dataset(
80
+ OrConvQADocumentStore,
81
+ url="https://github.com/prdwb/orconvqa-release",
82
+ )
83
+ def passages(all_blocks):
84
+ """OrConvQA Wikipedia files
85
+
86
+ OrConvQA is an aggregation of three existing datasets:
87
+
88
+ 1. the QuAC dataset that offers information-seeking conversations,
89
+ 2. the CANARD dataset that consists of context-independent rewrites of QuAC questions, and
90
+ 3. the Wikipedia corpus that serves as the knowledge source for answering questions.
91
+ """
92
+ return {"path": all_blocks, "count": 11_377_951}
@@ -95,7 +95,7 @@ class DocumentStore(Documents):
95
95
  self, randint: Optional[Callable[[int], int]]
96
96
  ) -> Iterator[Document]:
97
97
  """Sample documents from the dataset"""
98
- length = self.documentcount()
98
+ length = self.documentcount
99
99
  randint = randint or (lambda max: random.randint(0, max - 1))
100
100
  while True:
101
101
  yield self.document_int(randint(length))
@@ -130,6 +130,20 @@ class TweetDoc(IDHolder, Document):
130
130
  return f"{self.text}"
131
131
 
132
132
 
133
+ @define
134
+ class OrConvQADocument(IDHolder, Document):
135
+ id: str
136
+ title: str
137
+ text: str
138
+ aid: str
139
+ bid: int
140
+
141
+ has_text: ClassVar[bool] = True
142
+
143
+ def get_text(self):
144
+ return f"{self.title} {self.text}"
145
+
146
+
133
147
  @define
134
148
  class TrecTopic(GenericTopic):
135
149
  text: str
@@ -0,0 +1,22 @@
1
+ from collections import namedtuple
2
+ from typing import List
3
+ from experimaestro import Constant
4
+ import attrs
5
+
6
+ from datamaestro_text.datasets.irds.data import LZ4DocumentStore
7
+ from datamaestro_text.data.ir.formats import OrConvQADocument
8
+
9
+
10
+ class OrConvQADocumentStore(LZ4DocumentStore):
11
+ NAMED_TUPLE = namedtuple(
12
+ "OrConvQADocument", [a.name for a in attrs.fields(OrConvQADocument)]
13
+ )
14
+
15
+ lookup_field: Constant[str] = "id"
16
+ fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
17
+ index_fields: Constant[List[str]] = ["id"]
18
+
19
+ data_cls = NAMED_TUPLE
20
+
21
+ def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
22
+ return OrConvQADocument(**data._asdict())
@@ -1,7 +1,9 @@
1
1
  import logging
2
- from typing import Any, Iterator, Tuple, Type, List
2
+ from pathlib import Path
3
+ from typing import Any, Iterator, NamedTuple, Tuple, Type, List
3
4
  import attrs
4
5
  import ir_datasets
6
+ from ir_datasets.indices import PickleLz4FullStore
5
7
  from ir_datasets.formats import (
6
8
  GenericDoc,
7
9
  GenericQuery,
@@ -10,7 +12,7 @@ from ir_datasets.formats import (
10
12
  TrecQuery,
11
13
  )
12
14
  import ir_datasets.datasets as _irds
13
- from experimaestro import Config
15
+ from experimaestro import Config, Param
14
16
  from experimaestro.compat import cached_property
15
17
  from experimaestro import Option
16
18
  import datamaestro_text.data.ir as ir
@@ -208,6 +210,67 @@ if hasattr(_irds, "miracl"):
208
210
  )
209
211
 
210
212
 
213
+ # Workaround until PR https://github.com/allenai/ir_datasets/pull/252
214
+ # is merged.
215
+ class DMPickleLz4FullStore(PickleLz4FullStore):
216
+ def get_many(self, doc_ids, field=None):
217
+ result = {}
218
+ field_idx = self._doc_cls._fields.index(field) if field is not None else None
219
+ for doc in self.get_many_iter(doc_ids):
220
+ if field is not None:
221
+ result[getattr(doc, self._id_field)] = doc[field_idx]
222
+ else:
223
+ result[getattr(doc, self._id_field)] = doc
224
+ return result
225
+
226
+
227
+ class LZ4DocumentStore(ir.DocumentStore):
228
+ """A LZ4-based document store"""
229
+
230
+ path: Param[Path]
231
+
232
+ #: Lookup field
233
+ lookup_field: Param[str]
234
+
235
+ # Extra indexed fields (e.g. URLs)
236
+ index_fields: List[str]
237
+
238
+ @cached_property
239
+ def store(self):
240
+ return DMPickleLz4FullStore(
241
+ self.path, None, self.data_cls, self.lookup_field, self.index_fields
242
+ )
243
+
244
+ @cached_property
245
+ def _docs(self):
246
+ return self.store.__iter__()
247
+
248
+ def docid_internal2external(self, ix: int):
249
+ return getattr(self._docs[ix], self.store._id_field)
250
+
251
+ def document_ext(self, docid: str) -> Document:
252
+ return self.converter(self.store.get(docid))
253
+
254
+ def documents_ext(self, docids: List[str]) -> Document:
255
+ """Returns documents given their external IDs (optimized for batch)"""
256
+ retrieved = self.store.get_many(docids)
257
+ return [self.converter(retrieved[docid]) for docid in docids]
258
+
259
+ def converter(self, data):
260
+ """Converts a document from LZ4 tuples to any other format"""
261
+ # By default, use identity
262
+ return data
263
+
264
+ def iter(self) -> Iterator[Document]:
265
+ """Returns an iterator over documents"""
266
+ return map(self.converter, self.store.__iter__())
267
+
268
+ def documentcount(self):
269
+ if self.count:
270
+ return self.count
271
+ return self.store.count()
272
+
273
+
211
274
  @attrs.define()
212
275
  class IRDSQueryWrapper(ir.Topic):
213
276
  query: Any
@@ -0,0 +1,71 @@
1
+ import logging
2
+ from typing import Optional, Type, Callable, Iterator
3
+ from ir_datasets.indices import PickleLz4FullStore
4
+ from datamaestro.download import Download
5
+ from datamaestro.utils import FileChecker
6
+ from pathlib import Path
7
+ import urllib3
8
+
9
+
10
+ class lz4docstore_downloader(Download):
11
+ """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents"""
12
+
13
+ def __init__(
14
+ self,
15
+ varname: str,
16
+ url: str,
17
+ iter_factory: Callable[[Path], Iterator],
18
+ doc_cls: Type,
19
+ lookup_field: str,
20
+ *,
21
+ count_hint: Optional[int] = None,
22
+ size: Optional[int] = None,
23
+ checker: FileChecker = None,
24
+ ):
25
+ super().__init__(varname)
26
+ self.iter_factory = iter_factory
27
+ self.url = url
28
+ self.doc_cls = doc_cls
29
+ self.size = size
30
+ self.lookup_field = lookup_field
31
+ self.count_hint = count_hint
32
+ self.checker = checker
33
+
34
+ p = urllib3.util.parse_url(self.url)
35
+ assert p is not None
36
+ self.name = Path(p.path).with_suffix("").name
37
+
38
+ def prepare(self):
39
+ return self.definition.datapath / self.name
40
+
41
+ def download(self, force=False):
42
+ # Creates directory if needed
43
+ destination = self.definition.datapath / self.name
44
+ destination.mkdir(exist_ok=True)
45
+
46
+ # Early exit
47
+ if (destination / "done").is_file() and not force:
48
+ return True
49
+
50
+ # Download (cache)
51
+ logging.info("Building the document index")
52
+ with self.context.downloadURL(self.url, size=self.size) as file:
53
+ # Checks the file
54
+ if self.checker:
55
+ self.checker.check(file.path)
56
+
57
+ # Builds the LZ4 store
58
+ store = PickleLz4FullStore(
59
+ destination,
60
+ lambda: self.iter_factory(Path(file.path)),
61
+ self.doc_cls,
62
+ lookup_field=self.lookup_field,
63
+ index_fields=[self.lookup_field],
64
+ key_field_prefix=None,
65
+ size_hint=None,
66
+ count_hint=self.count_hint,
67
+ )
68
+ store.build()
69
+
70
+ # All good!
71
+ (destination / "done").touch()
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '2023.12.5'
16
- __version_tuple__ = version_tuple = (2023, 12, 5)
15
+ __version__ = version = '2023.12.12'
16
+ __version_tuple__ = version_tuple = (2023, 12, 12)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2023.12.5
3
+ Version: 2023.12.12
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -27,6 +27,7 @@ docs/source/datasets/conversation.rst
27
27
  docs/source/datasets/embeddings.rst
28
28
  docs/source/datasets/index.rst
29
29
  docs/source/datasets/ir.rst
30
+ docs/source/datasets/irds.rst
30
31
  docs/source/datasets/recommendation.rst
31
32
  docs/source/datasets/text.rst
32
33
  src/datamaestro_text/__init__.py
@@ -91,11 +92,13 @@ src/datamaestro_text/data/ir/csv.py
91
92
  src/datamaestro_text/data/ir/data.py
92
93
  src/datamaestro_text/data/ir/formats.py
93
94
  src/datamaestro_text/data/ir/huggingface.py
95
+ src/datamaestro_text/data/ir/stores.py
94
96
  src/datamaestro_text/data/ir/trec.py
95
97
  src/datamaestro_text/data/ir/utils.py
96
98
  src/datamaestro_text/datasets/irds/__init__.py
97
99
  src/datamaestro_text/datasets/irds/data.py
98
100
  src/datamaestro_text/datasets/irds/datasets.py
101
+ src/datamaestro_text/datasets/irds/helpers.py
99
102
  src/datamaestro_text/datasets/irds/utils.py
100
103
  src/datamaestro_text/download/tmdb.py
101
104
  src/datamaestro_text/interfaces/plaintext.py
@@ -1,49 +0,0 @@
1
- # See documentation on https://datamaestro.readthedocs.io
2
-
3
- from datamaestro.definitions import datatasks, datatags, dataset
4
- from datamaestro.download.single import filedownloader
5
- from datamaestro.utils import HashCheck
6
-
7
-
8
- from datamaestro_text.data.conversation.orconvqa import OrConvQADataset
9
- from datamaestro.data.ml import Supervised
10
-
11
-
12
- @datatags("conversation", "context", "query")
13
- @datatasks("query rewriting")
14
- @filedownloader(
15
- "train.jsonl",
16
- "https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/train.txt",
17
- checker=HashCheck("7513a9ef12d8b7a4471166dc4fef77b7"),
18
- )
19
- @filedownloader(
20
- "dev.jsonl",
21
- "https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/dev.txt",
22
- checker=HashCheck("7765658995cc9ffd5eb39a400d814b20"),
23
- )
24
- @filedownloader(
25
- "test.jsonl",
26
- "https://ciir.cs.umass.edu/downloads/ORConvQA/preprocessed/test.txt",
27
- checker=HashCheck("0cf3a755f06297b9c02e7db45f8dc8be"),
28
- )
29
- @dataset(
30
- Supervised,
31
- url="https://github.com/prdwb/orconvqa-release",
32
- )
33
- def preprocessed(train, dev, test):
34
- """Question-in-context rewriting
35
-
36
- CANARD is a dataset for question-in-context rewriting that consists of
37
- questions each given in a dialog context together with a context-independent
38
- rewriting of the question. The context of each question is the dialog
39
- utterances that precede the question. CANARD can be used to evaluate
40
- question rewriting models that handle important linguistic phenomena such as
41
- co-reference and ellipsis resolution.
42
-
43
- Each dataset is an instance of :class:`datamaestro_text.data.conversation.OrConvQADataset`
44
- """
45
- return {
46
- "train": OrConvQADataset(path=train),
47
- "validation": OrConvQADataset(path=dev),
48
- "test": OrConvQADataset(path=test),
49
- }