datamaestro-text 2023.10.10__py3-none-any.whl → 2023.11.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,8 @@
1
- from typing import ClassVar
1
+ from typing import ClassVar, Tuple
2
2
  from attrs import define
3
- from .base import IDHolder, Document, GenericTopic
3
+ from ir_datasets.datasets.wapo import WapoDocMedia
4
+ from .base import IDHolder, Document, GenericTopic, IDTopic
5
+ from ir_datasets.datasets.cord19 import Cord19FullTextSection
4
6
 
5
7
 
6
8
  @define
@@ -16,11 +18,172 @@ class CordDocument(IDHolder, Document):
16
18
  return f"{self.title} {self.text}"
17
19
 
18
20
 
21
+ @define
22
+ class DocumentWithTitle(IDHolder, Document):
23
+ """Web document with title and URL"""
24
+
25
+ title: str
26
+
27
+ text: str
28
+
29
+
30
+ @define
31
+ class CordFullTextDocument(IDHolder, Document):
32
+ title: str
33
+ doi: str
34
+ date: str
35
+ abstract: str
36
+ body: Tuple[Cord19FullTextSection, ...]
37
+
38
+ has_text: ClassVar[bool] = True
39
+
40
+ def get_text(self):
41
+ return f"{self.abstract}"
42
+
43
+
44
+ @define
45
+ class MsMarcoDocument(IDHolder, Document):
46
+ url: str
47
+ title: str
48
+ body: str
49
+
50
+ has_text: ClassVar[bool] = True
51
+
52
+ def get_text(self):
53
+ return f"{self.body}"
54
+
55
+
56
+ @define
57
+ class NFCorpusDocument(IDHolder, Document):
58
+ url: str
59
+ title: str
60
+ abstract: str
61
+
62
+ has_text: ClassVar[bool] = True
63
+
64
+ def get_text(self):
65
+ return f"{self.abstract}"
66
+
67
+
68
+ @define
69
+ class TitleDocument(IDHolder, Document):
70
+ text: str
71
+ title: str
72
+ has_text: ClassVar[bool] = True
73
+
74
+ def get_text(self):
75
+ return f"{self.title} {self.text}"
76
+
77
+
78
+ @define
79
+ class TitleUrlDocument(IDHolder, Document):
80
+ text: str
81
+ title: str
82
+ url: str
83
+ has_text: ClassVar[bool] = True
84
+
85
+ def get_text(self):
86
+ return f"{self.title} {self.text}"
87
+
88
+
89
+ @define
90
+ class TrecParsedDocument(IDHolder, Document):
91
+ title: str
92
+ body: str
93
+ marked_up_doc: bytes
94
+
95
+ has_text: ClassVar[bool] = True
96
+
97
+ def get_text(self):
98
+ return f"{self.title} {self.body}"
99
+
100
+
101
+ @define
102
+ class WapoDocument(IDHolder, Document):
103
+ url: str
104
+ title: str
105
+ author: str
106
+ published_date: int
107
+ kicker: str
108
+ body: str
109
+ body_paras_html: Tuple[str, ...]
110
+ body_media: Tuple[WapoDocMedia, ...]
111
+
112
+ has_text: ClassVar[bool] = True
113
+
114
+ def get_text(self):
115
+ return f"{self.body}"
116
+
117
+
118
+ @define
119
+ class TweetDoc(IDHolder, Document):
120
+ text: str
121
+ user_id: str
122
+ created_at: str
123
+ lang: str
124
+ reply_doc_id: str
125
+ retweet_doc_id: str
126
+ source: bytes
127
+ source_content_type: str
128
+
129
+ def get_text(self):
130
+ return f"{self.text}"
131
+
132
+
19
133
  @define
20
134
  class TrecTopic(GenericTopic):
21
135
  text: str
22
136
  query: str
23
137
  narrative: str
24
138
 
139
+ def get_text(self):
140
+ return f"{self.text}"
141
+
142
+
143
+ @define
144
+ class UrlTopic(GenericTopic):
145
+ text: str
146
+ url: str
147
+
148
+ def get_text(self):
149
+ return f"{self.text}"
150
+
151
+
152
+ @define
153
+ class NFCorpusTopic(IDTopic):
154
+ title: str
155
+ all: str
156
+
157
+ def get_text(self):
158
+ return f"{self.title}"
159
+
160
+
161
+ @define
162
+ class TrecQuery(IDTopic):
163
+ title: str
164
+ description: str
165
+ narrative: str
166
+
167
+ def get_text(self):
168
+ return f"{self.description}"
169
+
170
+
171
+ @define
172
+ class TrecMb13Query(IDTopic):
173
+ query: str
174
+ time: str
175
+ tweet_time: str
176
+
177
+ def get_text(self):
178
+ return f"{self.query}"
179
+
180
+
181
+ @define
182
+ class TrecMb14Query(IDTopic):
183
+ query: str
184
+ time: str
185
+ tweet_time: str
186
+ description: str
187
+
25
188
  def get_text(self):
26
189
  return f"{self.query}"
@@ -2,7 +2,13 @@ import logging
2
2
  from typing import Any, Iterator, Tuple, Type, List
3
3
  import attrs
4
4
  import ir_datasets
5
- from ir_datasets.formats import GenericDoc, GenericQuery, GenericDocPair
5
+ from ir_datasets.formats import (
6
+ GenericDoc,
7
+ GenericQuery,
8
+ GenericDocPair,
9
+ TrecParsedDoc,
10
+ TrecQuery,
11
+ )
6
12
  import ir_datasets.datasets as _irds
7
13
  from experimaestro import Config
8
14
  from experimaestro.compat import cached_property
@@ -72,9 +78,10 @@ class tuple_constructor:
72
78
  self.fields = fields
73
79
 
74
80
  def check(self, source_cls: Type):
75
- assert (
76
- source_cls._fields == self.fields
77
- ), f"Internal error: Fields do not match ({source_cls._fields} and {self.fields})"
81
+ assert source_cls._fields == self.fields, (
82
+ "Internal error: Fields do not match, "
83
+ f"source({source_cls.__qualname__})={source_cls._fields} [vs] target={self.fields}"
84
+ )
78
85
 
79
86
  def __call__(self, entry):
80
87
  return self.target_cls(*tuple(entry))
@@ -91,6 +98,54 @@ class Documents(ir.DocumentStore, IRDSId):
91
98
  _irds.beir.BeirCordDoc: tuple_constructor(
92
99
  formats.CordDocument, "doc_id", "text", "title", "url", "pubmed_id"
93
100
  ),
101
+ _irds.beir.BeirTitleDoc: tuple_constructor(
102
+ formats.TitleDocument, "doc_id", "text", "title"
103
+ ),
104
+ _irds.beir.BeirTitleUrlDoc: tuple_constructor(
105
+ formats.TitleUrlDocument, "doc_id", "text", "title", "url"
106
+ ),
107
+ _irds.msmarco_document.MsMarcoDocument: tuple_constructor(
108
+ formats.MsMarcoDocument, "doc_id", "url", "title", "body"
109
+ ),
110
+ _irds.cord19.Cord19FullTextDoc: tuple_constructor(
111
+ formats.CordFullTextDocument,
112
+ "doc_id",
113
+ "title",
114
+ "doi",
115
+ "date",
116
+ "abstract",
117
+ "body",
118
+ ),
119
+ _irds.nfcorpus.NfCorpusDoc: tuple_constructor(
120
+ formats.NFCorpusDocument, "doc_id", "url", "title", "abstract"
121
+ ),
122
+ TrecParsedDoc: tuple_constructor(
123
+ formats.TrecParsedDocument, "doc_id", "title", "body", "marked_up_doc"
124
+ ),
125
+ _irds.wapo.WapoDoc: tuple_constructor(
126
+ formats.WapoDocument,
127
+ "doc_id",
128
+ "url",
129
+ "title",
130
+ "author",
131
+ "published_date",
132
+ "kicker",
133
+ "body",
134
+ "body_paras_html",
135
+ "body_media",
136
+ ),
137
+ _irds.tweets2013_ia.TweetDoc: tuple_constructor(
138
+ formats.TweetDoc,
139
+ "doc_id",
140
+ "text",
141
+ "user_id",
142
+ "created_at",
143
+ "lang",
144
+ "reply_doc_id",
145
+ "retweet_doc_id",
146
+ "source",
147
+ "source_content_type",
148
+ ),
94
149
  }
95
150
 
96
151
  """Wraps an ir datasets collection -- and provide a default text
@@ -147,6 +202,12 @@ class Documents(ir.DocumentStore, IRDSId):
147
202
  return converter
148
203
 
149
204
 
205
+ if hasattr(_irds, "miracl"):
206
+ Documents.CONVERTERS[_irds.miracl.MiraclDoc] = tuple_constructor(
207
+ formats.DocumentWithTitle, "doc_id", "title", "text"
208
+ )
209
+
210
+
150
211
  @attrs.define()
151
212
  class IRDSQueryWrapper(ir.Topic):
152
213
  query: Any
@@ -158,6 +219,26 @@ class Topics(ir.TopicsStore, IRDSId):
158
219
  _irds.beir.BeirCovidQuery: tuple_constructor(
159
220
  formats.TrecTopic, "query_id", "text", "query", "narrative"
160
221
  ),
222
+ _irds.beir.BeirUrlQuery: tuple_constructor(
223
+ formats.UrlTopic, "query_id", "text", "url"
224
+ ),
225
+ _irds.nfcorpus.NfCorpusQuery: tuple_constructor(
226
+ formats.NFCorpusTopic, "query_id", "title", "all"
227
+ ),
228
+ TrecQuery: tuple_constructor(
229
+ formats.TrecQuery, "query_id", "title", "description", "narrative"
230
+ ),
231
+ _irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
232
+ formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
233
+ ),
234
+ _irds.tweets2013_ia.TrecMb14Query: tuple_constructor(
235
+ formats.TrecMb14Query,
236
+ "query_id",
237
+ "query",
238
+ "time",
239
+ "tweet_time",
240
+ "description",
241
+ ),
161
242
  }
162
243
 
163
244
  def iter(self) -> Iterator[ir.Topic]:
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '2023.10.10'
16
- __version_tuple__ = version_tuple = (2023, 10, 10)
15
+ __version__ = version = '2023.11.22'
16
+ __version_tuple__ = version_tuple = (2023, 11, 22)
@@ -1,16 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2023.10.10
3
+ Version: 2023.11.22
4
4
  Summary: Datamaestro module for text-related datasets
5
- Home-page: https://github.com/experimaestro/datamaestro_text
6
- Author: Benjamin Piwowarski
7
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
8
6
  License: GPL-3
9
7
  Project-URL: homepage, https://github.com/experimaestro/datamaestro_text
10
8
  Project-URL: documentation, https://datamaestro-text.readthedocs.io/en/latest/
11
9
  Project-URL: repository, https://github.com/experimaestro/datamaestro_text
12
10
  Keywords: dataset manager,information retrieval,experiments
13
- Platform: any
14
11
  Classifier: Development Status :: 4 - Beta
15
12
  Classifier: Intended Audience :: Science/Research
16
13
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
@@ -23,8 +20,6 @@ Description-Content-Type: text/markdown
23
20
  License-File: LICENSE
24
21
  Requires-Dist: datamaestro >=0.8.16
25
22
  Requires-Dist: attrs
26
- Provides-Extra: test
27
- Requires-Dist: tox ; extra == 'test'
28
23
 
29
24
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
30
25
 
@@ -1,5 +1,5 @@
1
1
  datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
2
- datamaestro_text/version.py,sha256=Ol68Jx3UH3rRn1Qf0YlokVrHAvWr1Q4Y2-Qb3UjbyLI,421
2
+ datamaestro_text/version.py,sha256=SfBDLSp-ExawWjfQ5F5ZPLPMVBHTFXz89upGMl-xJ38,421
3
3
  datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
5
5
  datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
@@ -52,12 +52,12 @@ datamaestro_text/data/ir/base.py,sha256=FOT0fk7_Kw-LRAPJnxNvuBHAhW6hhRCqWepqdBsp
52
52
  datamaestro_text/data/ir/cord19.py,sha256=JN31EQeg0UFAJlIkg0Ie0_pq-f-oS1OstZGJLJBeKyY,1130
53
53
  datamaestro_text/data/ir/csv.py,sha256=vgBNOeayEALwO01LmrzVOEVbs_MWJn3eIm-o0KiXjiE,1836
54
54
  datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
55
- datamaestro_text/data/ir/formats.py,sha256=RWmfghR91O3xSOzATXM-VtTY0Nyjj9-I8czuWEksCAE,465
55
+ datamaestro_text/data/ir/formats.py,sha256=6344Tj2yTxQ5KW-YtkBbdbCgWTbSsO6f0AaJlvvibqM,3248
56
56
  datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
57
57
  datamaestro_text/data/ir/trec.py,sha256=n98_O_sPPdU2i037fAboD4lB_I7C-RJrOLmmkg3osL8,1741
58
58
  datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
59
59
  datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
60
- datamaestro_text/datasets/irds/data.py,sha256=IN0JFrvMlTZFoFKNPx_3Y6dmO4D-3mq_R8nuC7MyiOM,6767
60
+ datamaestro_text/datasets/irds/data.py,sha256=hDctKswyzD_VrCRcD6pNIoKiiwvapWQBUwxzdFHesIM,9348
61
61
  datamaestro_text/datasets/irds/datasets.py,sha256=4tNTmlcF2OmUttCMyz5YTepi91pvaZB4syy5u-jAKh4,5556
62
62
  datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
63
63
  datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
@@ -72,10 +72,9 @@ datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1v
72
72
  datamaestro_text/utils/iter.py,sha256=-m0Y_0YjSlEVbotzZYIA0Ca0Hq0G_bF9GfAZR2yxrAk,520
73
73
  datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
74
74
  datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
75
- datamaestro_text-2023.10.10.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
76
- datamaestro_text-2023.10.10.dist-info/METADATA,sha256=Pl--IRKaXPNbZvg68ba_PNGdteFcl0Cu7tAxfoadPr4,1740
77
- datamaestro_text-2023.10.10.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
78
- datamaestro_text-2023.10.10.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
79
- datamaestro_text-2023.10.10.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
80
- datamaestro_text-2023.10.10.dist-info/zip-safe,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
81
- datamaestro_text-2023.10.10.dist-info/RECORD,,
75
+ datamaestro_text-2023.11.22.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
76
+ datamaestro_text-2023.11.22.dist-info/METADATA,sha256=227Q9AEKEkApflnZdzL5pC_J6fLPZtSqbBXFJ_A6adQ,1579
77
+ datamaestro_text-2023.11.22.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
78
+ datamaestro_text-2023.11.22.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
79
+ datamaestro_text-2023.11.22.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
80
+ datamaestro_text-2023.11.22.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.2)
2
+ Generator: bdist_wheel (0.41.3)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5