datamaestro-text 2023.7.4__py3-none-any.whl → 2023.7.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ from typing import ClassVar
2
+ from attrs import define
3
+ from .base import IDHolder, Document, Topic
4
+
5
+
6
+ @define
7
+ class CordDocument(IDHolder, Document):
8
+ text: str
9
+ title: str
10
+ url: str
11
+ pubmed_id: str
12
+
13
+ has_text: ClassVar[bool] = True
14
+
15
+ def get_text(self):
16
+ return f"{self.title} {self.text}"
17
+
18
+
19
+ @define
20
+ class TrecTopic(IDHolder, Topic):
21
+ text: str
22
+ query: str
23
+ narrative: str
@@ -1,10 +1,11 @@
1
- import ir_datasets
2
- from ir_datasets.formats import GenericDoc, GenericQuery, GenericDocPair
3
1
  import logging
2
+ from typing import Any, Iterator, Tuple, Type
4
3
  import attrs
4
+ import ir_datasets
5
+ from ir_datasets.formats import GenericDoc, GenericQuery, GenericDocPair
6
+ import ir_datasets.datasets as _irds
5
7
  from experimaestro import Config
6
8
  from experimaestro.compat import cached_property
7
- from typing import Any, Iterator, Tuple
8
9
  from experimaestro import Option
9
10
  import datamaestro_text.data.ir as ir
10
11
  from datamaestro_text.data.ir.base import (
@@ -17,6 +18,7 @@ from datamaestro_text.data.ir.base import (
17
18
  IDDocument,
18
19
  IDTopic,
19
20
  )
21
+ import datamaestro_text.data.ir.formats as formats
20
22
 
21
23
 
22
24
  # Interface between ir_datasets and datamaestro:
@@ -64,11 +66,18 @@ class AdhocAssessments(ir.AdhocAssessments, IRDSId):
64
66
  return qrels.values()
65
67
 
66
68
 
67
- def tuple_constructor(cls):
68
- def constructor(entry):
69
- return cls(*tuple(entry))
69
+ class tuple_constructor:
70
+ def __init__(self, target_cls: Type, *fields: str):
71
+ self.target_cls = target_cls
72
+ self.fields = fields
73
+
74
+ def check(self, source_cls: Type):
75
+ assert (
76
+ source_cls._fields == self.fields
77
+ ), f"Internal error: Fields do not match ({source_cls._fields} and {self.fields})"
70
78
 
71
- return constructor
79
+ def __call__(self, entry):
80
+ return self.target_cls(*tuple(entry))
72
81
 
73
82
 
74
83
  @attrs.define()
@@ -77,7 +86,12 @@ class IRDSDocumentWrapper(ir.Document):
77
86
 
78
87
 
79
88
  class Documents(ir.DocumentStore, IRDSId):
80
- CONVERTERS = {GenericDoc: (GenericDocument, tuple_constructor)}
89
+ CONVERTERS = {
90
+ GenericDoc: tuple_constructor(GenericDocument, "doc_id", "text"),
91
+ _irds.beir.BeirCordDoc: tuple_constructor(
92
+ formats.CordDocument, "doc_id", "text", "title", "url", "pubmed_id"
93
+ ),
94
+ }
81
95
 
82
96
  """Wraps an ir datasets collection -- and provide a default text
83
97
  value depending on the collection itself"""
@@ -115,12 +129,13 @@ class Documents(ir.DocumentStore, IRDSId):
115
129
 
116
130
  @cached_property
117
131
  def document_cls(self):
118
- return Documents.CONVERTERS[self.dataset.docs_cls()][0]
132
+ return self.converter.target_cls
119
133
 
120
134
  @cached_property
121
135
  def converter(self):
122
- document_cls, constructor = Documents.CONVERTERS[self.dataset.docs_cls()]
123
- return constructor(document_cls)
136
+ converter = Documents.CONVERTERS[self.dataset.docs_cls()]
137
+ converter.check(self.dataset.docs_cls())
138
+ return converter
124
139
 
125
140
 
126
141
  @attrs.define()
@@ -129,7 +144,12 @@ class IRDSQueryWrapper(ir.Topic):
129
144
 
130
145
 
131
146
  class Topics(ir.TopicsStore, IRDSId):
132
- CONVERTERS = {GenericQuery: (GenericTopic, tuple_constructor)}
147
+ CONVERTERS = {
148
+ GenericQuery: tuple_constructor(GenericTopic, "query_id", "text"),
149
+ _irds.beir.BeirCovidQuery: tuple_constructor(
150
+ formats.TrecTopic, "query_id", "text", "query", "narrative"
151
+ ),
152
+ }
133
153
 
134
154
  def iter(self) -> Iterator[ir.Topic]:
135
155
  """Returns an iterator over topics"""
@@ -167,12 +187,13 @@ class Topics(ir.TopicsStore, IRDSId):
167
187
 
168
188
  @cached_property
169
189
  def topic_cls(self):
170
- return Topics.CONVERTERS[self.dataset.queries_cls()][0]
190
+ return self.converter.target_cls
171
191
 
172
192
  @cached_property
173
193
  def converter(self):
174
- topic_cls, constructor = Topics.CONVERTERS[self.dataset.queries_cls()]
175
- return constructor(topic_cls)
194
+ converter = Topics.CONVERTERS[self.dataset.queries_cls()]
195
+ converter.check(self.dataset.queries_cls())
196
+ return converter
176
197
 
177
198
 
178
199
  class Adhoc(ir.Adhoc, IRDSId):
@@ -1,4 +1,4 @@
1
1
  # file generated by setuptools_scm
2
2
  # don't change, don't track in version control
3
- __version__ = version = '2023.7.4'
4
- __version_tuple__ = version_tuple = (2023, 7, 4)
3
+ __version__ = version = '2023.7.6.1'
4
+ __version_tuple__ = version_tuple = (2023, 7, 6, 1)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2023.7.4
3
+ Version: 2023.7.6.1
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Home-page: https://github.com/experimaestro/datamaestro_text
6
6
  Author: Benjamin Piwowarski
@@ -1,5 +1,5 @@
1
1
  datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
2
- datamaestro_text/version.py,sha256=ql5h48RqChgNgO-L9vkoUI2FaqbfAWiap3G8hwYBCw0,166
2
+ datamaestro_text/version.py,sha256=Je_wDQltf6IZOe5Cu_KgI0UT7_A2LPGuxgfuW4Mz3dY,171
3
3
  datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
5
5
  datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
@@ -52,11 +52,12 @@ datamaestro_text/data/ir/base.py,sha256=FOT0fk7_Kw-LRAPJnxNvuBHAhW6hhRCqWepqdBsp
52
52
  datamaestro_text/data/ir/cord19.py,sha256=JN31EQeg0UFAJlIkg0Ie0_pq-f-oS1OstZGJLJBeKyY,1130
53
53
  datamaestro_text/data/ir/csv.py,sha256=vgBNOeayEALwO01LmrzVOEVbs_MWJn3eIm-o0KiXjiE,1836
54
54
  datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
55
+ datamaestro_text/data/ir/formats.py,sha256=Nz8BJ1Lb6MygeYNHtUwwqX-flZ1QfrxaDkAJlvcv1fE,405
55
56
  datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
56
57
  datamaestro_text/data/ir/trec.py,sha256=n98_O_sPPdU2i037fAboD4lB_I7C-RJrOLmmkg3osL8,1741
57
58
  datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
58
59
  datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
59
- datamaestro_text/datasets/irds/data.py,sha256=ja3HmwF-BoaWCXkP1jU0tmEvY2hfdLh07DFknBu29z8,5709
60
+ datamaestro_text/datasets/irds/data.py,sha256=L2zjWXzr3uvmJNJaw1ES47sHMo5uMKc9JHhGhkcICJo,6448
60
61
  datamaestro_text/datasets/irds/datasets.py,sha256=tI7_lK1e0LW52zaEdoGHFXHSQeoU6eMtA5_6wQSrRkE,5540
61
62
  datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
62
63
  datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
@@ -70,10 +71,10 @@ datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_
70
71
  datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1vRQ,170
71
72
  datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
72
73
  datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
73
- datamaestro_text-2023.7.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
74
- datamaestro_text-2023.7.4.dist-info/METADATA,sha256=CYB52WlcCdVQK-0ATQ517WDS8a3bmkZfBK5q4wPXeiE,1740
75
- datamaestro_text-2023.7.4.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
76
- datamaestro_text-2023.7.4.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
77
- datamaestro_text-2023.7.4.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
78
- datamaestro_text-2023.7.4.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
79
- datamaestro_text-2023.7.4.dist-info/RECORD,,
74
+ datamaestro_text-2023.7.6.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
75
+ datamaestro_text-2023.7.6.1.dist-info/METADATA,sha256=24yduxTyKWsLuDl4heKsRrNobJI5xUHkgNSb42G51tU,1742
76
+ datamaestro_text-2023.7.6.1.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
77
+ datamaestro_text-2023.7.6.1.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
78
+ datamaestro_text-2023.7.6.1.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
79
+ datamaestro_text-2023.7.6.1.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
80
+ datamaestro_text-2023.7.6.1.dist-info/RECORD,,