datamaestro-text 2023.7.4__py3-none-any.whl → 2023.7.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/data/ir/formats.py +23 -0
- datamaestro_text/datasets/irds/data.py +36 -15
- datamaestro_text/version.py +2 -2
- {datamaestro_text-2023.7.4.dist-info → datamaestro_text-2023.7.6.1.dist-info}/METADATA +1 -1
- {datamaestro_text-2023.7.4.dist-info → datamaestro_text-2023.7.6.1.dist-info}/RECORD +10 -9
- {datamaestro_text-2023.7.4.dist-info → datamaestro_text-2023.7.6.1.dist-info}/LICENSE +0 -0
- {datamaestro_text-2023.7.4.dist-info → datamaestro_text-2023.7.6.1.dist-info}/WHEEL +0 -0
- {datamaestro_text-2023.7.4.dist-info → datamaestro_text-2023.7.6.1.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2023.7.4.dist-info → datamaestro_text-2023.7.6.1.dist-info}/top_level.txt +0 -0
- {datamaestro_text-2023.7.4.dist-info → datamaestro_text-2023.7.6.1.dist-info}/zip-safe +0 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from typing import ClassVar
|
|
2
|
+
from attrs import define
|
|
3
|
+
from .base import IDHolder, Document, Topic
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@define
class CordDocument(IDHolder, Document):
    """Document record exposing ``text``, ``title``, ``url`` and ``pubmed_id``.

    attrs generates the constructor from the annotated fields below, so their
    declaration order is part of the positional-argument contract.

    NOTE(review): an identifier field is presumably contributed by
    ``IDHolder`` -- confirm against ``.base``.
    """

    # Keep this order: the generated ``__init__`` takes the fields positionally.
    text: str
    title: str
    url: str
    pubmed_id: str

    # Class-level flag; annotated as ``ClassVar`` so it is not an instance field.
    has_text: ClassVar[bool] = True

    def get_text(self) -> str:
        """Return the title followed by the text, separated by one space."""
        return f"{self.title} {self.text}"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@define
class TrecTopic(IDHolder, Topic):
    """Topic record exposing ``text``, ``query`` and ``narrative``.

    attrs generates the constructor from the annotated fields below, so their
    declaration order is part of the positional-argument contract.

    NOTE(review): an identifier field is presumably contributed by
    ``IDHolder`` -- confirm against ``.base``.
    """

    # Keep this order: the generated ``__init__`` takes the fields positionally.
    text: str
    query: str
    narrative: str
|
|
@@ -1,10 +1,11 @@
|
|
|
1
|
-
import ir_datasets
|
|
2
|
-
from ir_datasets.formats import GenericDoc, GenericQuery, GenericDocPair
|
|
3
1
|
import logging
|
|
2
|
+
from typing import Any, Iterator, Tuple, Type
|
|
4
3
|
import attrs
|
|
4
|
+
import ir_datasets
|
|
5
|
+
from ir_datasets.formats import GenericDoc, GenericQuery, GenericDocPair
|
|
6
|
+
import ir_datasets.datasets as _irds
|
|
5
7
|
from experimaestro import Config
|
|
6
8
|
from experimaestro.compat import cached_property
|
|
7
|
-
from typing import Any, Iterator, Tuple
|
|
8
9
|
from experimaestro import Option
|
|
9
10
|
import datamaestro_text.data.ir as ir
|
|
10
11
|
from datamaestro_text.data.ir.base import (
|
|
@@ -17,6 +18,7 @@ from datamaestro_text.data.ir.base import (
|
|
|
17
18
|
IDDocument,
|
|
18
19
|
IDTopic,
|
|
19
20
|
)
|
|
21
|
+
import datamaestro_text.data.ir.formats as formats
|
|
20
22
|
|
|
21
23
|
|
|
22
24
|
# Interface between ir_datasets and datamaestro:
|
|
@@ -64,11 +66,18 @@ class AdhocAssessments(ir.AdhocAssessments, IRDSId):
|
|
|
64
66
|
return qrels.values()
|
|
65
67
|
|
|
66
68
|
|
|
67
|
-
|
|
68
|
-
def
|
|
69
|
-
|
|
69
|
+
class tuple_constructor:
    """Callable that converts tuple-like entries into ``target_cls`` instances.

    The converter is declared with the field names it expects from the source
    record type, in order. ``check`` verifies a source type against that
    declaration; calling the converter unpacks an entry positionally into
    ``target_cls``.
    """

    def __init__(self, target_cls: Type, *fields: str):
        """Store the target class and the expected source field names.

        :param target_cls: class instantiated for every converted entry
        :param fields: expected source field names, in positional order
        """
        self.target_cls = target_cls
        self.fields = fields

    def check(self, source_cls: Type):
        """Verify that ``source_cls._fields`` matches the declared fields.

        :param source_cls: record type exposing a ``_fields`` tuple
            (namedtuple-style)
        :raises AssertionError: if the field names or their order differ
        """
        # Raise explicitly instead of using ``assert``: assert statements are
        # removed under ``python -O``, which would silently disable this
        # check and let values land in the wrong target fields.
        if source_cls._fields != self.fields:
            raise AssertionError(
                f"Internal error: Fields do not match ({source_cls._fields} and {self.fields})"
            )

    def __call__(self, entry):
        """Build a ``target_cls`` instance from the values of ``entry``.

        :param entry: iterable of values, in the declared field order
        :return: ``target_cls(*entry)``
        """
        return self.target_cls(*entry)
|
|
72
81
|
|
|
73
82
|
|
|
74
83
|
@attrs.define()
|
|
@@ -77,7 +86,12 @@ class IRDSDocumentWrapper(ir.Document):
|
|
|
77
86
|
|
|
78
87
|
|
|
79
88
|
class Documents(ir.DocumentStore, IRDSId):
|
|
80
|
-
CONVERTERS = {
|
|
89
|
+
CONVERTERS = {
|
|
90
|
+
GenericDoc: tuple_constructor(GenericDocument, "doc_id", "text"),
|
|
91
|
+
_irds.beir.BeirCordDoc: tuple_constructor(
|
|
92
|
+
formats.CordDocument, "doc_id", "text", "title", "url", "pubmed_id"
|
|
93
|
+
),
|
|
94
|
+
}
|
|
81
95
|
|
|
82
96
|
"""Wraps an ir datasets collection -- and provide a default text
|
|
83
97
|
value depending on the collection itself"""
|
|
@@ -115,12 +129,13 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
115
129
|
|
|
116
130
|
@cached_property
|
|
117
131
|
def document_cls(self):
|
|
118
|
-
return
|
|
132
|
+
return self.converter.target_cls
|
|
119
133
|
|
|
120
134
|
@cached_property
|
|
121
135
|
def converter(self):
|
|
122
|
-
|
|
123
|
-
|
|
136
|
+
converter = Documents.CONVERTERS[self.dataset.docs_cls()]
|
|
137
|
+
converter.check(self.dataset.docs_cls())
|
|
138
|
+
return converter
|
|
124
139
|
|
|
125
140
|
|
|
126
141
|
@attrs.define()
|
|
@@ -129,7 +144,12 @@ class IRDSQueryWrapper(ir.Topic):
|
|
|
129
144
|
|
|
130
145
|
|
|
131
146
|
class Topics(ir.TopicsStore, IRDSId):
|
|
132
|
-
CONVERTERS = {
|
|
147
|
+
CONVERTERS = {
|
|
148
|
+
GenericQuery: tuple_constructor(GenericTopic, "query_id", "text"),
|
|
149
|
+
_irds.beir.BeirCovidQuery: tuple_constructor(
|
|
150
|
+
formats.TrecTopic, "query_id", "text", "query", "narrative"
|
|
151
|
+
),
|
|
152
|
+
}
|
|
133
153
|
|
|
134
154
|
def iter(self) -> Iterator[ir.Topic]:
|
|
135
155
|
"""Returns an iterator over topics"""
|
|
@@ -167,12 +187,13 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
167
187
|
|
|
168
188
|
@cached_property
|
|
169
189
|
def topic_cls(self):
|
|
170
|
-
return
|
|
190
|
+
return self.converter.target_cls
|
|
171
191
|
|
|
172
192
|
@cached_property
|
|
173
193
|
def converter(self):
|
|
174
|
-
|
|
175
|
-
|
|
194
|
+
converter = Topics.CONVERTERS[self.dataset.queries_cls()]
|
|
195
|
+
converter.check(self.dataset.queries_cls())
|
|
196
|
+
return converter
|
|
176
197
|
|
|
177
198
|
|
|
178
199
|
class Adhoc(ir.Adhoc, IRDSId):
|
datamaestro_text/version.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
|
|
2
|
-
datamaestro_text/version.py,sha256=
|
|
2
|
+
datamaestro_text/version.py,sha256=Je_wDQltf6IZOe5Cu_KgI0UT7_A2LPGuxgfuW4Mz3dY,171
|
|
3
3
|
datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
|
|
5
5
|
datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
|
|
@@ -52,11 +52,12 @@ datamaestro_text/data/ir/base.py,sha256=FOT0fk7_Kw-LRAPJnxNvuBHAhW6hhRCqWepqdBsp
|
|
|
52
52
|
datamaestro_text/data/ir/cord19.py,sha256=JN31EQeg0UFAJlIkg0Ie0_pq-f-oS1OstZGJLJBeKyY,1130
|
|
53
53
|
datamaestro_text/data/ir/csv.py,sha256=vgBNOeayEALwO01LmrzVOEVbs_MWJn3eIm-o0KiXjiE,1836
|
|
54
54
|
datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
|
|
55
|
+
datamaestro_text/data/ir/formats.py,sha256=Nz8BJ1Lb6MygeYNHtUwwqX-flZ1QfrxaDkAJlvcv1fE,405
|
|
55
56
|
datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
|
|
56
57
|
datamaestro_text/data/ir/trec.py,sha256=n98_O_sPPdU2i037fAboD4lB_I7C-RJrOLmmkg3osL8,1741
|
|
57
58
|
datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
|
|
58
59
|
datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
|
|
59
|
-
datamaestro_text/datasets/irds/data.py,sha256=
|
|
60
|
+
datamaestro_text/datasets/irds/data.py,sha256=L2zjWXzr3uvmJNJaw1ES47sHMo5uMKc9JHhGhkcICJo,6448
|
|
60
61
|
datamaestro_text/datasets/irds/datasets.py,sha256=tI7_lK1e0LW52zaEdoGHFXHSQeoU6eMtA5_6wQSrRkE,5540
|
|
61
62
|
datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
|
|
62
63
|
datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
|
|
@@ -70,10 +71,10 @@ datamaestro_text/utils/__init__.py,sha256=2449YLTAtKJzkmt84Mu8sBRCCveNs5fiaqTCK_
|
|
|
70
71
|
datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1vRQ,170
|
|
71
72
|
datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
|
|
72
73
|
datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
|
|
73
|
-
datamaestro_text-2023.7.
|
|
74
|
-
datamaestro_text-2023.7.
|
|
75
|
-
datamaestro_text-2023.7.
|
|
76
|
-
datamaestro_text-2023.7.
|
|
77
|
-
datamaestro_text-2023.7.
|
|
78
|
-
datamaestro_text-2023.7.
|
|
79
|
-
datamaestro_text-2023.7.
|
|
74
|
+
datamaestro_text-2023.7.6.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
75
|
+
datamaestro_text-2023.7.6.1.dist-info/METADATA,sha256=24yduxTyKWsLuDl4heKsRrNobJI5xUHkgNSb42G51tU,1742
|
|
76
|
+
datamaestro_text-2023.7.6.1.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
|
|
77
|
+
datamaestro_text-2023.7.6.1.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
|
|
78
|
+
datamaestro_text-2023.7.6.1.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
|
|
79
|
+
datamaestro_text-2023.7.6.1.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
80
|
+
datamaestro_text-2023.7.6.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{datamaestro_text-2023.7.4.dist-info → datamaestro_text-2023.7.6.1.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|