datamaestro-text 2023.10.10__py3-none-any.whl → 2023.11.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro_text/data/ir/formats.py +165 -2
- datamaestro_text/datasets/irds/data.py +85 -4
- datamaestro_text/version.py +2 -2
- {datamaestro_text-2023.10.10.dist-info → datamaestro_text-2023.11.22.dist-info}/METADATA +1 -6
- {datamaestro_text-2023.10.10.dist-info → datamaestro_text-2023.11.22.dist-info}/RECORD +9 -10
- {datamaestro_text-2023.10.10.dist-info → datamaestro_text-2023.11.22.dist-info}/WHEEL +1 -1
- datamaestro_text-2023.10.10.dist-info/zip-safe +0 -1
- {datamaestro_text-2023.10.10.dist-info → datamaestro_text-2023.11.22.dist-info}/LICENSE +0 -0
- {datamaestro_text-2023.10.10.dist-info → datamaestro_text-2023.11.22.dist-info}/entry_points.txt +0 -0
- {datamaestro_text-2023.10.10.dist-info → datamaestro_text-2023.11.22.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,8 @@
|
|
|
1
|
-
from typing import ClassVar
|
|
1
|
+
from typing import ClassVar, Tuple
|
|
2
2
|
from attrs import define
|
|
3
|
-
from .
|
|
3
|
+
from ir_datasets.datasets.wapo import WapoDocMedia
|
|
4
|
+
from .base import IDHolder, Document, GenericTopic, IDTopic
|
|
5
|
+
from ir_datasets.datasets.cord19 import Cord19FullTextSection
|
|
4
6
|
|
|
5
7
|
|
|
6
8
|
@define
|
|
@@ -16,11 +18,172 @@ class CordDocument(IDHolder, Document):
|
|
|
16
18
|
return f"{self.title} {self.text}"
|
|
17
19
|
|
|
18
20
|
|
|
21
|
+
@define
|
|
22
|
+
class DocumentWithTitle(IDHolder, Document):
|
|
23
|
+
"""Web document with title and URL"""
|
|
24
|
+
|
|
25
|
+
title: str
|
|
26
|
+
|
|
27
|
+
text: str
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@define
|
|
31
|
+
class CordFullTextDocument(IDHolder, Document):
|
|
32
|
+
title: str
|
|
33
|
+
doi: str
|
|
34
|
+
date: str
|
|
35
|
+
abstract: str
|
|
36
|
+
body: Tuple[Cord19FullTextSection, ...]
|
|
37
|
+
|
|
38
|
+
has_text: ClassVar[bool] = True
|
|
39
|
+
|
|
40
|
+
def get_text(self):
|
|
41
|
+
return f"{self.abstract}"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@define
|
|
45
|
+
class MsMarcoDocument(IDHolder, Document):
|
|
46
|
+
url: str
|
|
47
|
+
title: str
|
|
48
|
+
body: str
|
|
49
|
+
|
|
50
|
+
has_text: ClassVar[bool] = True
|
|
51
|
+
|
|
52
|
+
def get_text(self):
|
|
53
|
+
return f"{self.body}"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@define
|
|
57
|
+
class NFCorpusDocument(IDHolder, Document):
|
|
58
|
+
url: str
|
|
59
|
+
title: str
|
|
60
|
+
abstract: str
|
|
61
|
+
|
|
62
|
+
has_text: ClassVar[bool] = True
|
|
63
|
+
|
|
64
|
+
def get_text(self):
|
|
65
|
+
return f"{self.abstract}"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@define
|
|
69
|
+
class TitleDocument(IDHolder, Document):
|
|
70
|
+
text: str
|
|
71
|
+
title: str
|
|
72
|
+
has_text: ClassVar[bool] = True
|
|
73
|
+
|
|
74
|
+
def get_text(self):
|
|
75
|
+
return f"{self.title} {self.text}"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@define
|
|
79
|
+
class TitleUrlDocument(IDHolder, Document):
|
|
80
|
+
text: str
|
|
81
|
+
title: str
|
|
82
|
+
url: str
|
|
83
|
+
has_text: ClassVar[bool] = True
|
|
84
|
+
|
|
85
|
+
def get_text(self):
|
|
86
|
+
return f"{self.title} {self.text}"
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@define
|
|
90
|
+
class TrecParsedDocument(IDHolder, Document):
|
|
91
|
+
title: str
|
|
92
|
+
body: str
|
|
93
|
+
marked_up_doc: bytes
|
|
94
|
+
|
|
95
|
+
has_text: ClassVar[bool] = True
|
|
96
|
+
|
|
97
|
+
def get_text(self):
|
|
98
|
+
return f"{self.title} {self.body}"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@define
|
|
102
|
+
class WapoDocument(IDHolder, Document):
|
|
103
|
+
url: str
|
|
104
|
+
title: str
|
|
105
|
+
author: str
|
|
106
|
+
published_date: int
|
|
107
|
+
kicker: str
|
|
108
|
+
body: str
|
|
109
|
+
body_paras_html: Tuple[str, ...]
|
|
110
|
+
body_media: Tuple[WapoDocMedia, ...]
|
|
111
|
+
|
|
112
|
+
has_text: ClassVar[bool] = True
|
|
113
|
+
|
|
114
|
+
def get_text(self):
|
|
115
|
+
return f"{self.body}"
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@define
|
|
119
|
+
class TweetDoc(IDHolder, Document):
|
|
120
|
+
text: str
|
|
121
|
+
user_id: str
|
|
122
|
+
created_at: str
|
|
123
|
+
lang: str
|
|
124
|
+
reply_doc_id: str
|
|
125
|
+
retweet_doc_id: str
|
|
126
|
+
source: bytes
|
|
127
|
+
source_content_type: str
|
|
128
|
+
|
|
129
|
+
def get_text(self):
|
|
130
|
+
return f"{self.text}"
|
|
131
|
+
|
|
132
|
+
|
|
19
133
|
@define
|
|
20
134
|
class TrecTopic(GenericTopic):
|
|
21
135
|
text: str
|
|
22
136
|
query: str
|
|
23
137
|
narrative: str
|
|
24
138
|
|
|
139
|
+
def get_text(self):
|
|
140
|
+
return f"{self.text}"
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@define
|
|
144
|
+
class UrlTopic(GenericTopic):
|
|
145
|
+
text: str
|
|
146
|
+
url: str
|
|
147
|
+
|
|
148
|
+
def get_text(self):
|
|
149
|
+
return f"{self.text}"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@define
|
|
153
|
+
class NFCorpusTopic(IDTopic):
|
|
154
|
+
title: str
|
|
155
|
+
all: str
|
|
156
|
+
|
|
157
|
+
def get_text(self):
|
|
158
|
+
return f"{self.title}"
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@define
|
|
162
|
+
class TrecQuery(IDTopic):
|
|
163
|
+
title: str
|
|
164
|
+
description: str
|
|
165
|
+
narrative: str
|
|
166
|
+
|
|
167
|
+
def get_text(self):
|
|
168
|
+
return f"{self.description}"
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@define
|
|
172
|
+
class TrecMb13Query(IDTopic):
|
|
173
|
+
query: str
|
|
174
|
+
time: str
|
|
175
|
+
tweet_time: str
|
|
176
|
+
|
|
177
|
+
def get_text(self):
|
|
178
|
+
return f"{self.query}"
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
@define
|
|
182
|
+
class TrecMb14Query(IDTopic):
|
|
183
|
+
query: str
|
|
184
|
+
time: str
|
|
185
|
+
tweet_time: str
|
|
186
|
+
description: str
|
|
187
|
+
|
|
25
188
|
def get_text(self):
|
|
26
189
|
return f"{self.query}"
|
|
@@ -2,7 +2,13 @@ import logging
|
|
|
2
2
|
from typing import Any, Iterator, Tuple, Type, List
|
|
3
3
|
import attrs
|
|
4
4
|
import ir_datasets
|
|
5
|
-
from ir_datasets.formats import
|
|
5
|
+
from ir_datasets.formats import (
|
|
6
|
+
GenericDoc,
|
|
7
|
+
GenericQuery,
|
|
8
|
+
GenericDocPair,
|
|
9
|
+
TrecParsedDoc,
|
|
10
|
+
TrecQuery,
|
|
11
|
+
)
|
|
6
12
|
import ir_datasets.datasets as _irds
|
|
7
13
|
from experimaestro import Config
|
|
8
14
|
from experimaestro.compat import cached_property
|
|
@@ -72,9 +78,10 @@ class tuple_constructor:
|
|
|
72
78
|
self.fields = fields
|
|
73
79
|
|
|
74
80
|
def check(self, source_cls: Type):
|
|
75
|
-
assert (
|
|
76
|
-
|
|
77
|
-
|
|
81
|
+
assert source_cls._fields == self.fields, (
|
|
82
|
+
"Internal error: Fields do not match, "
|
|
83
|
+
f"source({source_cls.__qualname__})={source_cls._fields} [vs] target={self.fields}"
|
|
84
|
+
)
|
|
78
85
|
|
|
79
86
|
def __call__(self, entry):
|
|
80
87
|
return self.target_cls(*tuple(entry))
|
|
@@ -91,6 +98,54 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
91
98
|
_irds.beir.BeirCordDoc: tuple_constructor(
|
|
92
99
|
formats.CordDocument, "doc_id", "text", "title", "url", "pubmed_id"
|
|
93
100
|
),
|
|
101
|
+
_irds.beir.BeirTitleDoc: tuple_constructor(
|
|
102
|
+
formats.TitleDocument, "doc_id", "text", "title"
|
|
103
|
+
),
|
|
104
|
+
_irds.beir.BeirTitleUrlDoc: tuple_constructor(
|
|
105
|
+
formats.TitleUrlDocument, "doc_id", "text", "title", "url"
|
|
106
|
+
),
|
|
107
|
+
_irds.msmarco_document.MsMarcoDocument: tuple_constructor(
|
|
108
|
+
formats.MsMarcoDocument, "doc_id", "url", "title", "body"
|
|
109
|
+
),
|
|
110
|
+
_irds.cord19.Cord19FullTextDoc: tuple_constructor(
|
|
111
|
+
formats.CordFullTextDocument,
|
|
112
|
+
"doc_id",
|
|
113
|
+
"title",
|
|
114
|
+
"doi",
|
|
115
|
+
"date",
|
|
116
|
+
"abstract",
|
|
117
|
+
"body",
|
|
118
|
+
),
|
|
119
|
+
_irds.nfcorpus.NfCorpusDoc: tuple_constructor(
|
|
120
|
+
formats.NFCorpusDocument, "doc_id", "url", "title", "abstract"
|
|
121
|
+
),
|
|
122
|
+
TrecParsedDoc: tuple_constructor(
|
|
123
|
+
formats.TrecParsedDocument, "doc_id", "title", "body", "marked_up_doc"
|
|
124
|
+
),
|
|
125
|
+
_irds.wapo.WapoDoc: tuple_constructor(
|
|
126
|
+
formats.WapoDocument,
|
|
127
|
+
"doc_id",
|
|
128
|
+
"url",
|
|
129
|
+
"title",
|
|
130
|
+
"author",
|
|
131
|
+
"published_date",
|
|
132
|
+
"kicker",
|
|
133
|
+
"body",
|
|
134
|
+
"body_paras_html",
|
|
135
|
+
"body_media",
|
|
136
|
+
),
|
|
137
|
+
_irds.tweets2013_ia.TweetDoc: tuple_constructor(
|
|
138
|
+
formats.TweetDoc,
|
|
139
|
+
"doc_id",
|
|
140
|
+
"text",
|
|
141
|
+
"user_id",
|
|
142
|
+
"created_at",
|
|
143
|
+
"lang",
|
|
144
|
+
"reply_doc_id",
|
|
145
|
+
"retweet_doc_id",
|
|
146
|
+
"source",
|
|
147
|
+
"source_content_type",
|
|
148
|
+
),
|
|
94
149
|
}
|
|
95
150
|
|
|
96
151
|
"""Wraps an ir datasets collection -- and provide a default text
|
|
@@ -147,6 +202,12 @@ class Documents(ir.DocumentStore, IRDSId):
|
|
|
147
202
|
return converter
|
|
148
203
|
|
|
149
204
|
|
|
205
|
+
if hasattr(_irds, "miracl"):
|
|
206
|
+
Documents.CONVERTERS[_irds.miracl.MiraclDoc] = tuple_constructor(
|
|
207
|
+
formats.DocumentWithTitle, "doc_id", "title", "text"
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
|
|
150
211
|
@attrs.define()
|
|
151
212
|
class IRDSQueryWrapper(ir.Topic):
|
|
152
213
|
query: Any
|
|
@@ -158,6 +219,26 @@ class Topics(ir.TopicsStore, IRDSId):
|
|
|
158
219
|
_irds.beir.BeirCovidQuery: tuple_constructor(
|
|
159
220
|
formats.TrecTopic, "query_id", "text", "query", "narrative"
|
|
160
221
|
),
|
|
222
|
+
_irds.beir.BeirUrlQuery: tuple_constructor(
|
|
223
|
+
formats.UrlTopic, "query_id", "text", "url"
|
|
224
|
+
),
|
|
225
|
+
_irds.nfcorpus.NfCorpusQuery: tuple_constructor(
|
|
226
|
+
formats.NFCorpusTopic, "query_id", "title", "all"
|
|
227
|
+
),
|
|
228
|
+
TrecQuery: tuple_constructor(
|
|
229
|
+
formats.TrecQuery, "query_id", "title", "description", "narrative"
|
|
230
|
+
),
|
|
231
|
+
_irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
|
|
232
|
+
formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
|
|
233
|
+
),
|
|
234
|
+
_irds.tweets2013_ia.TrecMb14Query: tuple_constructor(
|
|
235
|
+
formats.TrecMb14Query,
|
|
236
|
+
"query_id",
|
|
237
|
+
"query",
|
|
238
|
+
"time",
|
|
239
|
+
"tweet_time",
|
|
240
|
+
"description",
|
|
241
|
+
),
|
|
161
242
|
}
|
|
162
243
|
|
|
163
244
|
def iter(self) -> Iterator[ir.Topic]:
|
datamaestro_text/version.py
CHANGED
|
@@ -12,5 +12,5 @@ __version__: str
|
|
|
12
12
|
__version_tuple__: VERSION_TUPLE
|
|
13
13
|
version_tuple: VERSION_TUPLE
|
|
14
14
|
|
|
15
|
-
__version__ = version = '2023.
|
|
16
|
-
__version_tuple__ = version_tuple = (2023,
|
|
15
|
+
__version__ = version = '2023.11.22'
|
|
16
|
+
__version_tuple__ = version_tuple = (2023, 11, 22)
|
|
@@ -1,16 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datamaestro-text
|
|
3
|
-
Version: 2023.
|
|
3
|
+
Version: 2023.11.22
|
|
4
4
|
Summary: Datamaestro module for text-related datasets
|
|
5
|
-
Home-page: https://github.com/experimaestro/datamaestro_text
|
|
6
|
-
Author: Benjamin Piwowarski
|
|
7
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
8
6
|
License: GPL-3
|
|
9
7
|
Project-URL: homepage, https://github.com/experimaestro/datamaestro_text
|
|
10
8
|
Project-URL: documentation, https://datamaestro-text.readthedocs.io/en/latest/
|
|
11
9
|
Project-URL: repository, https://github.com/experimaestro/datamaestro_text
|
|
12
10
|
Keywords: dataset manager,information retrieval,experiments
|
|
13
|
-
Platform: any
|
|
14
11
|
Classifier: Development Status :: 4 - Beta
|
|
15
12
|
Classifier: Intended Audience :: Science/Research
|
|
16
13
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
@@ -23,8 +20,6 @@ Description-Content-Type: text/markdown
|
|
|
23
20
|
License-File: LICENSE
|
|
24
21
|
Requires-Dist: datamaestro >=0.8.16
|
|
25
22
|
Requires-Dist: attrs
|
|
26
|
-
Provides-Extra: test
|
|
27
|
-
Requires-Dist: tox ; extra == 'test'
|
|
28
23
|
|
|
29
24
|
[](https://github.com/pre-commit/pre-commit) [](https://badge.fury.io/py/datamaestro-text)
|
|
30
25
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
|
|
2
|
-
datamaestro_text/version.py,sha256=
|
|
2
|
+
datamaestro_text/version.py,sha256=SfBDLSp-ExawWjfQ5F5ZPLPMVBHTFXz89upGMl-xJ38,421
|
|
3
3
|
datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
|
|
5
5
|
datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
|
|
@@ -52,12 +52,12 @@ datamaestro_text/data/ir/base.py,sha256=FOT0fk7_Kw-LRAPJnxNvuBHAhW6hhRCqWepqdBsp
|
|
|
52
52
|
datamaestro_text/data/ir/cord19.py,sha256=JN31EQeg0UFAJlIkg0Ie0_pq-f-oS1OstZGJLJBeKyY,1130
|
|
53
53
|
datamaestro_text/data/ir/csv.py,sha256=vgBNOeayEALwO01LmrzVOEVbs_MWJn3eIm-o0KiXjiE,1836
|
|
54
54
|
datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
|
|
55
|
-
datamaestro_text/data/ir/formats.py,sha256=
|
|
55
|
+
datamaestro_text/data/ir/formats.py,sha256=6344Tj2yTxQ5KW-YtkBbdbCgWTbSsO6f0AaJlvvibqM,3248
|
|
56
56
|
datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
|
|
57
57
|
datamaestro_text/data/ir/trec.py,sha256=n98_O_sPPdU2i037fAboD4lB_I7C-RJrOLmmkg3osL8,1741
|
|
58
58
|
datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
|
|
59
59
|
datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
|
|
60
|
-
datamaestro_text/datasets/irds/data.py,sha256=
|
|
60
|
+
datamaestro_text/datasets/irds/data.py,sha256=hDctKswyzD_VrCRcD6pNIoKiiwvapWQBUwxzdFHesIM,9348
|
|
61
61
|
datamaestro_text/datasets/irds/datasets.py,sha256=4tNTmlcF2OmUttCMyz5YTepi91pvaZB4syy5u-jAKh4,5556
|
|
62
62
|
datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
|
|
63
63
|
datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
|
|
@@ -72,10 +72,9 @@ datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1v
|
|
|
72
72
|
datamaestro_text/utils/iter.py,sha256=-m0Y_0YjSlEVbotzZYIA0Ca0Hq0G_bF9GfAZR2yxrAk,520
|
|
73
73
|
datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
|
|
74
74
|
datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
|
|
75
|
-
datamaestro_text-2023.
|
|
76
|
-
datamaestro_text-2023.
|
|
77
|
-
datamaestro_text-2023.
|
|
78
|
-
datamaestro_text-2023.
|
|
79
|
-
datamaestro_text-2023.
|
|
80
|
-
datamaestro_text-2023.
|
|
81
|
-
datamaestro_text-2023.10.10.dist-info/RECORD,,
|
|
75
|
+
datamaestro_text-2023.11.22.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
76
|
+
datamaestro_text-2023.11.22.dist-info/METADATA,sha256=227Q9AEKEkApflnZdzL5pC_J6fLPZtSqbBXFJ_A6adQ,1579
|
|
77
|
+
datamaestro_text-2023.11.22.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
|
|
78
|
+
datamaestro_text-2023.11.22.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
|
|
79
|
+
datamaestro_text-2023.11.22.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
|
|
80
|
+
datamaestro_text-2023.11.22.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
|
|
File without changes
|
{datamaestro_text-2023.10.10.dist-info → datamaestro_text-2023.11.22.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{datamaestro_text-2023.10.10.dist-info → datamaestro_text-2023.11.22.dist-info}/top_level.txt
RENAMED
|
File without changes
|