datamaestro-text 2023.7.4__tar.gz → 2023.7.6.1__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. {datamaestro-text-2023.7.4/src/datamaestro_text.egg-info → datamaestro-text-2023.7.6.1}/PKG-INFO +1 -1
  2. datamaestro-text-2023.7.6.1/src/datamaestro_text/data/ir/formats.py +23 -0
  3. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/datasets/irds/data.py +36 -15
  4. datamaestro-text-2023.7.6.1/src/datamaestro_text/version.py +4 -0
  5. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1/src/datamaestro_text.egg-info}/PKG-INFO +1 -1
  6. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text.egg-info/SOURCES.txt +1 -0
  7. datamaestro-text-2023.7.4/src/datamaestro_text/version.py +0 -4
  8. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/.circleci/config.yml +0 -0
  9. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/.flake8 +0 -0
  10. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/.github/workflows/pytest.yml +0 -0
  11. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/.github/workflows/python-publish.yml +0 -0
  12. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/.gitignore +0 -0
  13. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/.pre-commit-config.yaml +0 -0
  14. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/.readthedocs.yml +0 -0
  15. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/LICENSE +0 -0
  16. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/MANIFEST.in +0 -0
  17. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/Makefile +0 -0
  18. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/README.md +0 -0
  19. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/Makefile +0 -0
  20. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/make.bat +0 -0
  21. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/requirements.txt +0 -0
  22. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/source/api/conversation.rst +0 -0
  23. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/source/api/index.rst +0 -0
  24. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/source/api/ir.rst +0 -0
  25. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/source/api/text.rst +0 -0
  26. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/source/conf.py +0 -0
  27. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/source/datasets/conversation.rst +0 -0
  28. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/source/datasets/embeddings.rst +0 -0
  29. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/source/datasets/index.rst +0 -0
  30. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/source/datasets/ir.rst +0 -0
  31. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/source/datasets/recommendation.rst +0 -0
  32. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/source/datasets/text.rst +0 -0
  33. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/docs/source/index.rst +0 -0
  34. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/mkdocs.yml +0 -0
  35. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/pyproject.toml +0 -0
  36. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/requirements-dev.txt +0 -0
  37. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/requirements.txt +0 -0
  38. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/setup.cfg +0 -0
  39. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/setup.py +0 -0
  40. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/__init__.py +0 -0
  41. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/__init__.py +0 -0
  42. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/ai/quac.yaml +0 -0
  43. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/com/fastml/goodbooks-10k.yaml +0 -0
  44. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/com/github/aagohary/canard.py +0 -0
  45. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/com/github/prdwb/orconvqa.py +0 -0
  46. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/com/github/soskek/bookcorpus.yaml +0 -0
  47. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/com/microsoft/msmarco/passage.py +0 -0
  48. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/com/microsoft/wikiqa.yaml +0 -0
  49. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/com/oscar-corpus.py +0 -0
  50. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/com/sentiment140.py +0 -0
  51. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/com/smashwords/bookcorpus.py +0 -0
  52. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/edu/cornell/nlvr.yaml +0 -0
  53. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/edu/stanford/__init__.py +0 -0
  54. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/edu/stanford/aclimdb.py +0 -0
  55. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/edu/stanford/glove.py +0 -0
  56. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/edu/stanford/im2p.yaml +0 -0
  57. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/edu/upenn/__init__.py +0 -0
  58. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/edu/upenn/ldc/__init__.py +0 -0
  59. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py +0 -0
  60. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/gov/__init__.py +0 -0
  61. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/gov/nist/__init__.py +0 -0
  62. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/gov/nist/ir/covid.py +0 -0
  63. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/gov/nist/trec/__init__.py +0 -0
  64. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/gov/nist/trec/adhoc.py +0 -0
  65. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/gov/nist/trec/clueweb.yaml +0 -0
  66. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/gov/nist/trec/deeplearning.yaml +0 -0
  67. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/gov/nist/trec/index.yaml +0 -0
  68. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/gov/nist/trec/tipster.py +0 -0
  69. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/gov/nist/trec/web.yaml +0 -0
  70. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/io/github/rajpurkar/squad.yaml +0 -0
  71. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/io/github/thunlp/fewrel.py +0 -0
  72. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/io/metamind/research/__init__.py +0 -0
  73. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/io/metamind/research/wikitext.py +0 -0
  74. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/net/mattmahoney/enwiki.yaml +0 -0
  75. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/org/acm/recsys/cb2014.yaml +0 -0
  76. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/org/cocodataset/index.yaml +0 -0
  77. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/org/grouplens/movielens.py +0 -0
  78. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/org/universaldependencies/french.py +0 -0
  79. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml +0 -0
  80. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/conversation/__init__.py +0 -0
  81. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/conversation/base.py +0 -0
  82. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/conversation/canard.py +0 -0
  83. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/conversation/orconvqa.py +0 -0
  84. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/embeddings.py +0 -0
  85. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/ir/__init__.py +0 -0
  86. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/ir/base.py +0 -0
  87. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/ir/cord19.py +0 -0
  88. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/ir/csv.py +0 -0
  89. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/ir/data.py +0 -0
  90. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/ir/huggingface.py +0 -0
  91. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/ir/trec.py +0 -0
  92. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/ir/utils.py +0 -0
  93. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/recommendation.py +0 -0
  94. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/tagging.py +0 -0
  95. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/data/text.py +0 -0
  96. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/datasets/irds/__init__.py +0 -0
  97. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/datasets/irds/datasets.py +0 -0
  98. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/datasets/irds/utils.py +0 -0
  99. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/download/tmdb.py +0 -0
  100. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/interfaces/plaintext.py +0 -0
  101. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/interfaces/trec.py +0 -0
  102. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/test/__init__.py +0 -0
  103. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/test/test_datasets.py +0 -0
  104. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/transforms/__init__.py +0 -0
  105. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/transforms/ir.py +0 -0
  106. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/utils/__init__.py +0 -0
  107. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/utils/files.py +0 -0
  108. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/utils/randomstream.py +0 -0
  109. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text/utils/shuffle.py +0 -0
  110. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text.egg-info/dependency_links.txt +0 -0
  111. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text.egg-info/entry_points.txt +0 -0
  112. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text.egg-info/requires.txt +0 -0
  113. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text.egg-info/top_level.txt +0 -0
  114. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/src/datamaestro_text.egg-info/zip-safe +0 -0
  115. {datamaestro-text-2023.7.4 → datamaestro-text-2023.7.6.1}/tox.ini +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2023.7.4
3
+ Version: 2023.7.6.1
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Home-page: https://github.com/experimaestro/datamaestro_text
6
6
  Author: Benjamin Piwowarski
@@ -0,0 +1,23 @@
1
+ from typing import ClassVar
2
+ from attrs import define
3
+ from .base import IDHolder, Document, Topic
4
+
5
+
6
+ @define
7
+ class CordDocument(IDHolder, Document):
8
+ text: str
9
+ title: str
10
+ url: str
11
+ pubmed_id: str
12
+
13
+ has_text: ClassVar[bool] = True
14
+
15
+ def get_text(self):
16
+ return f"{self.title} {self.text}"
17
+
18
+
19
+ @define
20
+ class TrecTopic(IDHolder, Topic):
21
+ text: str
22
+ query: str
23
+ narrative: str
@@ -1,10 +1,11 @@
1
- import ir_datasets
2
- from ir_datasets.formats import GenericDoc, GenericQuery, GenericDocPair
3
1
  import logging
2
+ from typing import Any, Iterator, Tuple, Type
4
3
  import attrs
4
+ import ir_datasets
5
+ from ir_datasets.formats import GenericDoc, GenericQuery, GenericDocPair
6
+ import ir_datasets.datasets as _irds
5
7
  from experimaestro import Config
6
8
  from experimaestro.compat import cached_property
7
- from typing import Any, Iterator, Tuple
8
9
  from experimaestro import Option
9
10
  import datamaestro_text.data.ir as ir
10
11
  from datamaestro_text.data.ir.base import (
@@ -17,6 +18,7 @@ from datamaestro_text.data.ir.base import (
17
18
  IDDocument,
18
19
  IDTopic,
19
20
  )
21
+ import datamaestro_text.data.ir.formats as formats
20
22
 
21
23
 
22
24
  # Interface between ir_datasets and datamaestro:
@@ -64,11 +66,18 @@ class AdhocAssessments(ir.AdhocAssessments, IRDSId):
64
66
  return qrels.values()
65
67
 
66
68
 
67
- def tuple_constructor(cls):
68
- def constructor(entry):
69
- return cls(*tuple(entry))
69
+ class tuple_constructor:
70
+ def __init__(self, target_cls: Type, *fields: str):
71
+ self.target_cls = target_cls
72
+ self.fields = fields
73
+
74
+ def check(self, source_cls: Type):
75
+ assert (
76
+ source_cls._fields == self.fields
77
+ ), f"Internal error: Fields do not match ({source_cls._fields} and {self.fields})"
70
78
 
71
- return constructor
79
+ def __call__(self, entry):
80
+ return self.target_cls(*tuple(entry))
72
81
 
73
82
 
74
83
  @attrs.define()
@@ -77,7 +86,12 @@ class IRDSDocumentWrapper(ir.Document):
77
86
 
78
87
 
79
88
  class Documents(ir.DocumentStore, IRDSId):
80
- CONVERTERS = {GenericDoc: (GenericDocument, tuple_constructor)}
89
+ CONVERTERS = {
90
+ GenericDoc: tuple_constructor(GenericDocument, "doc_id", "text"),
91
+ _irds.beir.BeirCordDoc: tuple_constructor(
92
+ formats.CordDocument, "doc_id", "text", "title", "url", "pubmed_id"
93
+ ),
94
+ }
81
95
 
82
96
  """Wraps an ir datasets collection -- and provide a default text
83
97
  value depending on the collection itself"""
@@ -115,12 +129,13 @@ class Documents(ir.DocumentStore, IRDSId):
115
129
 
116
130
  @cached_property
117
131
  def document_cls(self):
118
- return Documents.CONVERTERS[self.dataset.docs_cls()][0]
132
+ return self.converter.target_cls
119
133
 
120
134
  @cached_property
121
135
  def converter(self):
122
- document_cls, constructor = Documents.CONVERTERS[self.dataset.docs_cls()]
123
- return constructor(document_cls)
136
+ converter = Documents.CONVERTERS[self.dataset.docs_cls()]
137
+ converter.check(self.dataset.docs_cls())
138
+ return converter
124
139
 
125
140
 
126
141
  @attrs.define()
@@ -129,7 +144,12 @@ class IRDSQueryWrapper(ir.Topic):
129
144
 
130
145
 
131
146
  class Topics(ir.TopicsStore, IRDSId):
132
- CONVERTERS = {GenericQuery: (GenericTopic, tuple_constructor)}
147
+ CONVERTERS = {
148
+ GenericQuery: tuple_constructor(GenericTopic, "query_id", "text"),
149
+ _irds.beir.BeirCovidQuery: tuple_constructor(
150
+ formats.TrecTopic, "query_id", "text", "query", "narrative"
151
+ ),
152
+ }
133
153
 
134
154
  def iter(self) -> Iterator[ir.Topic]:
135
155
  """Returns an iterator over topics"""
@@ -167,12 +187,13 @@ class Topics(ir.TopicsStore, IRDSId):
167
187
 
168
188
  @cached_property
169
189
  def topic_cls(self):
170
- return Topics.CONVERTERS[self.dataset.queries_cls()][0]
190
+ return self.converter.target_cls
171
191
 
172
192
  @cached_property
173
193
  def converter(self):
174
- topic_cls, constructor = Topics.CONVERTERS[self.dataset.queries_cls()]
175
- return constructor(topic_cls)
194
+ converter = Topics.CONVERTERS[self.dataset.queries_cls()]
195
+ converter.check(self.dataset.queries_cls())
196
+ return converter
176
197
 
177
198
 
178
199
  class Adhoc(ir.Adhoc, IRDSId):
@@ -0,0 +1,4 @@
1
+ # file generated by setuptools_scm
2
+ # don't change, don't track in version control
3
+ __version__ = version = '2023.7.6.1'
4
+ __version_tuple__ = version_tuple = (2023, 7, 6, 1)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2023.7.4
3
+ Version: 2023.7.6.1
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Home-page: https://github.com/experimaestro/datamaestro_text
6
6
  Author: Benjamin Piwowarski
@@ -92,6 +92,7 @@ src/datamaestro_text/data/ir/base.py
92
92
  src/datamaestro_text/data/ir/cord19.py
93
93
  src/datamaestro_text/data/ir/csv.py
94
94
  src/datamaestro_text/data/ir/data.py
95
+ src/datamaestro_text/data/ir/formats.py
95
96
  src/datamaestro_text/data/ir/huggingface.py
96
97
  src/datamaestro_text/data/ir/trec.py
97
98
  src/datamaestro_text/data/ir/utils.py
@@ -1,4 +0,0 @@
1
- # file generated by setuptools_scm
2
- # don't change, don't track in version control
3
- __version__ = version = '2023.7.4'
4
- __version_tuple__ = version_tuple = (2023, 7, 4)