datamaestro-text 2024.5.31__py3-none-any.whl → 2025.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,20 @@
1
1
  # See documentation on https://datamaestro.readthedocs.io
2
2
 
3
+ import re
4
+ import json
3
5
  from pathlib import Path
4
6
  from datamaestro.definitions import datatasks, datatags, dataset
5
7
  from datamaestro.data.ml import Supervised
8
+ from datamaestro.download import reference
6
9
  from datamaestro.download.archive import zipdownloader
10
+ from datamaestro.download.wayback import wayback_documents
7
11
  from datamaestro.utils import HashCheck
8
12
  from datamaestro_text.data.conversation.qrecc import QReCCDataset
13
+ from datamaestro_text.datasets.irds.data import (
14
+ LZ4JSONLDocumentStore,
15
+ SimpleJsonDocument,
16
+ )
17
+ from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
9
18
 
10
19
 
11
20
  @datatags("conversation", "context", "query")
@@ -35,3 +44,44 @@ def main(data: Path):
35
44
  "train": QReCCDataset(path=data / "qrecc_train.json"),
36
45
  "test": QReCCDataset(path=data / "qrecc_test.json"),
37
46
  }
47
+
48
+
49
+ @dataset(
50
+ url="https://github.com/apple/ml-qrecc",
51
+ doi="https://doi.org/10.48550/arXiv.2010.04898",
52
+ )
53
+ class Content(LZ4JSONLDocumentStore):
54
+ """QReCC mentionned URLs content"""
55
+
56
+ @staticmethod
57
+ def __create_dataset__(dataset, options=None):
58
+ ds = reference(reference=main).setup(dataset, options)
59
+ documents_path = wayback_documents(
60
+ "20191127", lambda: Content._urls(ds), name="wayback.jsonl"
61
+ ).setup(dataset, options)
62
+
63
+ store_path = lz4docstore_builder(
64
+ "store",
65
+ lambda: Content._documents(documents_path),
66
+ SimpleJsonDocument,
67
+ "id",
68
+ ).setup(dataset, options)
69
+
70
+ return LZ4JSONLDocumentStore(jsonl_path=store_path)
71
+
72
+ @staticmethod
73
+ def _documents(path: Path):
74
+ """Iterates over documents from wayback"""
75
+ with path.open("rt") as fp:
76
+ for line in fp:
77
+ yield SimpleJsonDocument(**json.loads(line))
78
+
79
+ @staticmethod
80
+ def _urls(supervised: Supervised[QReCCDataset, None, QReCCDataset]):
81
+ urls = set()
82
+ for ds in [supervised.train, supervised.test]:
83
+ for entry in ds.entries():
84
+ if entry.answer_url:
85
+ url = re.sub("#.*$", "", entry.answer_url)
86
+ urls.add(url)
87
+ return urls
@@ -188,7 +188,7 @@ class SingleConversationTreeNode(ConversationNode):
188
188
  def history(self) -> Sequence[Record]:
189
189
  return self.tree.history[self.index + 1 :]
190
190
 
191
- def parent(self) -> ConversationNode | None:
191
+ def parent(self) -> Optional[ConversationNode]:
192
192
  return (
193
193
  SingleConversationTreeNode(self.tree, self.index + 1)
194
194
  if self.index < len(self.tree.history) - 1
@@ -235,7 +235,7 @@ class ConversationTreeNode(ConversationNode, ConversationTree):
235
235
  for child in self.children:
236
236
  yield from child
237
237
 
238
- def parent(self) -> ConversationNode | None:
238
+ def parent(self) -> Optional[ConversationNode]:
239
239
  return self.parent
240
240
 
241
241
  def children(self) -> List[ConversationNode]:
@@ -1,5 +1,5 @@
1
1
  from functools import cached_property
2
- from typing import ClassVar, Tuple
2
+ from typing import ClassVar, Tuple, List
3
3
  from attrs import define
4
4
  from datamaestro.record import record_type
5
5
  from ir_datasets.datasets.wapo import WapoDocMedia
@@ -117,7 +117,6 @@ class TweetDoc(TextItem):
117
117
 
118
118
  @define
119
119
  class OrConvQADocument(TextItem):
120
- id: str
121
120
  title: str
122
121
  body: str
123
122
  aid: str
@@ -127,12 +126,21 @@ class OrConvQADocument(TextItem):
127
126
  def text(self):
128
127
  return f"{self.title} {self.body}"
129
128
 
129
+ @define
130
+ class Touche2020(TextItem):
131
+ text: str
132
+ title: str
133
+ stance: str
134
+ url: str
130
135
 
131
136
  @define
132
- class TrecTopic(TextItem):
137
+ class SciDocs(TextItem):
133
138
  text: str
134
- query: str
135
- narrative: str
139
+ title: str
140
+ authors: List[str]
141
+ year: int
142
+ cited_by: List[str]
143
+ references: List[str]
136
144
 
137
145
 
138
146
  @define
@@ -167,6 +175,13 @@ class TrecMb14Query(TextItem):
167
175
  def get_text(self):
168
176
  return f"{self.query}"
169
177
 
178
+ @define
179
+ class SciDocsTopic(TextItem):
180
+ text: str
181
+ authors: List[str]
182
+ year: int
183
+ cited_by: List[str]
184
+ references: List[str]
170
185
 
171
186
  @define()
172
187
  class TrecTopic(SimpleTextItem):
@@ -1,17 +1,21 @@
1
1
  from collections import namedtuple
2
- from typing import List
2
+ from typing import List, NamedTuple
3
3
  from experimaestro import Constant
4
4
  import attrs
5
5
 
6
6
  from datamaestro.record import Record
7
+ from datamaestro_text.data.ir.base import IDItem
7
8
  from datamaestro_text.datasets.irds.data import LZ4DocumentStore
8
9
  from datamaestro_text.data.ir.formats import OrConvQADocument
9
10
 
10
11
 
11
12
  class OrConvQADocumentStore(LZ4DocumentStore):
12
- NAMED_TUPLE = namedtuple(
13
- "OrConvQADocument", [a.name for a in attrs.fields(OrConvQADocument)]
14
- )
13
+ class NAMED_TUPLE(NamedTuple):
14
+ id: str
15
+ title: str
16
+ body: str
17
+ aid: str
18
+ bid: int
15
19
 
16
20
  lookup_field: Constant[str] = "id"
17
21
  fields: Constant[List[str]] = list(NAMED_TUPLE._fields)
@@ -19,5 +23,7 @@ class OrConvQADocumentStore(LZ4DocumentStore):
19
23
 
20
24
  data_cls = NAMED_TUPLE
21
25
 
22
- def converter(self, data: NAMED_TUPLE) -> OrConvQADocument:
23
- return Record(OrConvQADocument(**data._asdict()))
26
+ def converter(self, data: NAMED_TUPLE) -> Record:
27
+ fields = data._asdict()
28
+ del fields["id"]
29
+ return Record(OrConvQADocument(**fields), IDItem(data.id))
@@ -1,36 +1,44 @@
1
+ import logging
1
2
  from abc import ABC, abstractmethod
3
+ from dataclasses import dataclass
2
4
  from functools import partial
3
- import logging
4
5
  from pathlib import Path
5
- from typing import Dict, Iterator, Tuple, Type, List
6
+ from typing import Dict, Iterator, List, NamedTuple, Tuple, Type
7
+
6
8
  import ir_datasets
7
- from ir_datasets.indices import PickleLz4FullStore
9
+ import ir_datasets.datasets as _irds
10
+ from datamaestro.record import RecordType, record_type
11
+ from experimaestro import Config, Meta, Option, Param
12
+ from experimaestro.compat import cached_property
8
13
  from ir_datasets.formats import (
9
14
  GenericDoc,
10
- GenericQuery,
11
15
  GenericDocPair,
16
+ GenericQuery,
12
17
  TrecParsedDoc,
13
18
  TrecQuery,
14
19
  )
15
- import ir_datasets.datasets as _irds
16
- from experimaestro import Config, Param
17
- from experimaestro.compat import cached_property
18
- from experimaestro import Option
19
- from datamaestro.record import RecordType, record_type
20
- from datamaestro_text.data.conversation.base import AnswerEntry
20
+ from ir_datasets.indices import PickleLz4FullStore
21
+
21
22
  import datamaestro_text.data.ir as ir
23
+ import datamaestro_text.data.ir.formats as formats
24
+ from datamaestro_text.data.conversation.base import (
25
+ AnswerDocumentID,
26
+ AnswerEntry,
27
+ ConversationHistoryItem,
28
+ ConversationTreeNode,
29
+ DecontextualizedDictItem,
30
+ EntryType,
31
+ )
22
32
  from datamaestro_text.data.ir.base import (
23
- Record,
24
- TopicRecord,
25
- DocumentRecord,
26
- SimpleTextItem,
27
33
  AdhocAssessedTopic,
28
- SimpleAdhocAssessment,
34
+ DocumentRecord,
29
35
  IDItem,
36
+ Record,
37
+ SimpleAdhocAssessment,
38
+ SimpleTextItem,
39
+ TopicRecord,
30
40
  create_record,
31
41
  )
32
- import datamaestro_text.data.ir.formats as formats
33
-
34
42
 
35
43
  # Interface between ir_datasets and datamaestro:
36
44
  # provides adapted data types
@@ -109,6 +117,12 @@ class Documents(ir.DocumentStore, IRDSId):
109
117
  _irds.beir.BeirTitleUrlDoc: tuple_constructor(
110
118
  formats.TitleUrlDocument, "doc_id", "text", "title", "url"
111
119
  ),
120
+ _irds.beir.BeirToucheDoc: tuple_constructor(
121
+ formats.Touche2020, "doc_id", "text", "title", "stance", "url"
122
+ ),
123
+ _irds.beir.BeirSciDoc: tuple_constructor(
124
+ formats.SciDocs, "doc_id", "text", "title", "authors", "year", "cited_by", "references"
125
+ ),
112
126
  _irds.msmarco_document.MsMarcoDocument: tuple_constructor(
113
127
  formats.MsMarcoDocument, "doc_id", "url", "title", "body"
114
128
  ),
@@ -216,20 +230,6 @@ if hasattr(_irds, "miracl"):
216
230
  )
217
231
 
218
232
 
219
- # Fix while PR https://github.com/allenai/ir_datasets/pull/252
220
- # is not in.
221
- class DMPickleLz4FullStore(PickleLz4FullStore):
222
- def get_many(self, doc_ids, field=None):
223
- result = {}
224
- field_idx = self._doc_cls._fields.index(field) if field is not None else None
225
- for doc in self.get_many_iter(doc_ids):
226
- if field is not None:
227
- result[getattr(doc, self._id_field)] = doc[field_idx]
228
- else:
229
- result[getattr(doc, self._id_field)] = doc
230
- return result
231
-
232
-
233
233
  class LZ4DocumentStore(ir.DocumentStore):
234
234
  """A LZ4-based document store"""
235
235
 
@@ -243,7 +243,7 @@ class LZ4DocumentStore(ir.DocumentStore):
243
243
 
244
244
  @cached_property
245
245
  def store(self):
246
- return DMPickleLz4FullStore(
246
+ return PickleLz4FullStore(
247
247
  self.path, None, self.data_cls, self.lookup_field, self.index_fields
248
248
  )
249
249
 
@@ -262,10 +262,10 @@ class LZ4DocumentStore(ir.DocumentStore):
262
262
  retrieved = self.store.get_many(docids)
263
263
  return [self.converter(retrieved[docid]) for docid in docids]
264
264
 
265
+ @abstractmethod
265
266
  def converter(self, data):
266
- """Converts a document from LZ4 tuples to any other format"""
267
- # By default, use identity
268
- return data
267
+ """Converts a document from LZ4 tuples to a document record"""
268
+ ...
269
269
 
270
270
  def iter(self) -> Iterator[DocumentRecord]:
271
271
  """Returns an iterator over documents"""
@@ -278,6 +278,25 @@ class LZ4DocumentStore(ir.DocumentStore):
278
278
  return self.store.count()
279
279
 
280
280
 
281
+ class SimpleJsonDocument(NamedTuple):
282
+ id: str
283
+ text: str
284
+
285
+
286
+ class LZ4JSONLDocumentStore(LZ4DocumentStore):
287
+ jsonl_path: Meta[Path]
288
+ """json-l based document store
289
+
290
+ Each line is of the form
291
+ ```json
292
+ { "id": "...", "text": "..." }
293
+ ```
294
+ """
295
+
296
+ def converter(self, data):
297
+ return DocumentRecord(IDItem(data["id"]), SimpleTextItem(data["text"]))
298
+
299
+
281
300
  class TopicsHandler(ABC):
282
301
  @abstractmethod
283
302
  def topic_int(self, internal_topic_id: int) -> TopicRecord:
@@ -349,6 +368,12 @@ class Topics(ir.TopicsStore, IRDSId):
349
368
  TrecQuery: tuple_constructor(
350
369
  formats.TrecTopic, "query_id", "title", "description", "narrative"
351
370
  ),
371
+ _irds.beir.BeirToucheQuery: tuple_constructor(
372
+ formats.TrecTopic, "query_id", "text", "description", "narrative"
373
+ ),
374
+ _irds.beir.BeirSciQuery: tuple_constructor(
375
+ formats.SciDocsTopic, "query_id", "text", "authors", "year", "cited_by", "references"
376
+ ),
352
377
  _irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
353
378
  formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
354
379
  ),
@@ -392,197 +417,190 @@ class Topics(ir.TopicsStore, IRDSId):
392
417
  return self.handler.iter()
393
418
 
394
419
 
395
- if hasattr(_irds.trec_cast, "Cast2022Query"):
396
- from datamaestro_text.data.conversation.base import (
397
- ConversationTreeNode,
398
- DecontextualizedDictItem,
399
- AnswerDocumentID,
400
- ConversationHistoryItem,
401
- EntryType,
402
- )
420
+ class CastTopicsHandler(TopicsHandler):
421
+ def __init__(self, dataset):
422
+ self.dataset = dataset
423
+
424
+ @cached_property
425
+ def ext2records(self):
426
+ return {record[IDItem].id: record for record in self.records}
427
+
428
+ def topic_int(self, internal_topic_id: int) -> TopicRecord:
429
+ """Returns a document given its internal ID"""
430
+ return self.records[internal_topic_id]
431
+
432
+ def topic_ext(self, external_topic_id: str) -> TopicRecord:
433
+ """Returns a document given its external ID"""
434
+ return self.ext2records[external_topic_id]
435
+
436
+ def iter(self) -> Iterator[ir.TopicRecord]:
437
+ """Returns an iterator over topics"""
438
+ return iter(self.records)
403
439
 
404
- class CastTopicsHandler(TopicsHandler):
405
- def __init__(self, dataset):
406
- self.dataset = dataset
407
-
408
- @property
409
- @abstractmethod
410
- def records(self):
411
- ...
412
-
413
- @cached_property
414
- def ext2records(self):
415
- return {record[IDItem].id: record for record in self.records}
416
-
417
- def topic_int(self, internal_topic_id: int) -> TopicRecord:
418
- """Returns a document given its internal ID"""
419
- return self.records[internal_topic_id]
420
-
421
- def topic_ext(self, external_topic_id: str) -> TopicRecord:
422
- """Returns a document given its external ID"""
423
- return self.ext2records[external_topic_id]
424
-
425
- def iter(self) -> Iterator[ir.TopicRecord]:
426
- """Returns an iterator over topics"""
427
- return iter(self.records)
428
-
429
- @cached_property
430
- def records(self):
431
- try:
432
- topic_number = None
433
- node = None
434
- conversation = []
435
- records = []
436
-
437
- for query in self.dataset.dataset.queries_iter():
438
- decontextualized = DecontextualizedDictItem(
439
- "manual",
440
- {
441
- "manual": query.manual_rewritten_utterance,
442
- "auto": query.automatic_rewritten_utterance,
443
- },
440
+ @cached_property
441
+ def records(self):
442
+ try:
443
+ topic_number = None
444
+ node = None
445
+ conversation = []
446
+ records = []
447
+
448
+ for query in self.dataset.dataset.queries_iter():
449
+ decontextualized = DecontextualizedDictItem(
450
+ "manual",
451
+ {
452
+ "manual": query.manual_rewritten_utterance,
453
+ "auto": query.automatic_rewritten_utterance,
454
+ },
455
+ )
456
+
457
+ is_new_conversation = topic_number != query.topic_number
458
+
459
+ topic = Record(
460
+ IDItem(query.query_id),
461
+ SimpleTextItem(query.raw_utterance),
462
+ decontextualized,
463
+ ConversationHistoryItem(
464
+ [] if is_new_conversation else node.conversation(False)
465
+ ),
466
+ EntryType.USER_QUERY,
467
+ )
468
+
469
+ if is_new_conversation:
470
+ conversation = []
471
+ node = ConversationTreeNode(topic)
472
+ topic_number = query.topic_number
473
+ else:
474
+ node = node.add(ConversationTreeNode(topic))
475
+
476
+ records.append(topic)
477
+
478
+ conversation.append(node)
479
+ node = node.add(
480
+ ConversationTreeNode(
481
+ Record(
482
+ AnswerDocumentID(self.get_canonical_result_id(query)),
483
+ EntryType.SYSTEM_ANSWER,
484
+ )
444
485
  )
486
+ )
487
+ conversation.append(node)
488
+ except Exception:
489
+ logging.exception("Error while computing topic records")
490
+ raise
491
+
492
+ return records
493
+
494
+ @staticmethod
495
+ def get_canonical_result_id():
496
+ return None
497
+
445
498
 
446
- is_new_conversation = topic_number != query.topic_number
499
+ class Cast2020TopicsHandler(CastTopicsHandler):
500
+ @staticmethod
501
+ def get_canonical_result_id(query: _irds.trec_cast.Cast2020Query):
502
+ return query.manual_canonical_result_id
447
503
 
504
+
505
+ class Cast2021TopicsHandler(CastTopicsHandler):
506
+ @staticmethod
507
+ def get_canonical_result_id(query: _irds.trec_cast.Cast2021Query):
508
+ return query.canonical_result_id
509
+
510
+
511
+ class Cast2022TopicsHandler(CastTopicsHandler):
512
+ def __init__(self, dataset):
513
+ self.dataset = dataset
514
+
515
+ @cached_property
516
+ def records(self):
517
+ try:
518
+ records = []
519
+ nodes: Dict[str, ConversationTreeNode] = {}
520
+
521
+ for (
522
+ query
523
+ ) in (
524
+ self.dataset.dataset.queries_iter()
525
+ ): # type: _irds.trec_cast.Cast2022Query
526
+ parent = nodes[query.parent_id] if query.parent_id else None
527
+
528
+ if query.participant == "User":
448
529
  topic = Record(
449
530
  IDItem(query.query_id),
450
531
  SimpleTextItem(query.raw_utterance),
451
- decontextualized,
532
+ DecontextualizedDictItem(
533
+ "manual",
534
+ {
535
+ "manual": query.manual_rewritten_utterance,
536
+ },
537
+ ),
452
538
  ConversationHistoryItem(
453
- [] if is_new_conversation else node.conversation(False)
539
+ parent.conversation(False) if parent else []
454
540
  ),
455
541
  EntryType.USER_QUERY,
456
542
  )
457
-
458
- if is_new_conversation:
459
- conversation = []
460
- node = ConversationTreeNode(topic)
461
- topic_number = query.topic_number
462
- else:
463
- node = node.add(ConversationTreeNode(topic))
464
-
543
+ node = ConversationTreeNode(topic)
465
544
  records.append(topic)
466
-
467
- conversation.append(node)
468
- node = node.add(
469
- ConversationTreeNode(
470
- Record(
471
- AnswerDocumentID(self.get_canonical_result_id(query)),
472
- EntryType.SYSTEM_ANSWER,
473
- )
545
+ else:
546
+ node = ConversationTreeNode(
547
+ Record(
548
+ AnswerEntry(query.response),
549
+ EntryType.SYSTEM_ANSWER,
474
550
  )
475
551
  )
476
- conversation.append(node)
477
- except Exception:
478
- logging.exception("Error while computing topic records")
479
- raise
480
-
481
- return records
482
-
483
- @staticmethod
484
- def get_canonical_result_id():
485
- return None
486
-
487
- class Cast2020TopicsHandler(CastTopicsHandler):
488
- @staticmethod
489
- def get_canonical_result_id(query: _irds.trec_cast.Cast2020Query):
490
- return query.manual_canonical_result_id
491
-
492
- class Cast2021TopicsHandler(CastTopicsHandler):
493
- @staticmethod
494
- def get_canonical_result_id(query: _irds.trec_cast.Cast2021Query):
495
- return query.canonical_result_id
496
-
497
- class Cast2022TopicsHandler(CastTopicsHandler):
498
- def __init__(self, dataset):
499
- self.dataset = dataset
500
-
501
- @cached_property
502
- def records(self):
503
- try:
504
- records = []
505
- nodes: Dict[str, ConversationTreeNode] = {}
506
-
507
- for (
508
- query
509
- ) in (
510
- self.dataset.dataset.queries_iter()
511
- ): # type: _irds.trec_cast.Cast2022Query
512
- parent = nodes[query.parent_id] if query.parent_id else None
513
-
514
- if query.participant == "User":
515
- topic = Record(
516
- IDItem(query.query_id),
517
- SimpleTextItem(query.raw_utterance),
518
- DecontextualizedDictItem(
519
- "manual",
520
- {
521
- "manual": query.manual_rewritten_utterance,
522
- },
523
- ),
524
- ConversationHistoryItem(
525
- parent.conversation(False) if parent else []
526
- ),
527
- EntryType.USER_QUERY,
528
- )
529
- node = ConversationTreeNode(topic)
530
- records.append(topic)
531
- else:
532
- node = ConversationTreeNode(
533
- Record(
534
- AnswerEntry(query.response),
535
- EntryType.SYSTEM_ANSWER,
536
- )
537
- )
538
552
 
539
- nodes[query.query_id] = node
540
- if parent:
541
- parent.add(node)
542
- except Exception:
543
- logging.exception("Error while computing topic records")
544
- raise
545
-
546
- return records
547
-
548
- Topics.HANDLERS.update(
549
- {
550
- # _irds.trec_cast.Cast2019Query: Cast2019TopicsHandler,
551
- _irds.trec_cast.Cast2020Query: Cast2020TopicsHandler,
552
- _irds.trec_cast.Cast2021Query: Cast2021TopicsHandler,
553
- _irds.trec_cast.Cast2022Query: Cast2022TopicsHandler,
554
- }
555
- )
553
+ nodes[query.query_id] = node
554
+ if parent:
555
+ parent.add(node)
556
+ except Exception:
557
+ logging.exception("Error while computing topic records")
558
+ raise
556
559
 
557
- class CastDocHandler:
558
- def check(self, cls):
559
- assert issubclass(cls, _irds.trec_cast.CastDoc)
560
+ return records
560
561
 
561
- @cached_property
562
- def target_cls(self):
563
- return formats.TitleUrlDocument
564
562
 
565
- def __call__(self, _, doc: _irds.trec_cast.CastDoc):
566
- return Record(
567
- IDItem(doc.doc_id), formats.SimpleTextItem(" ".join(doc.passages))
568
- )
563
+ Topics.HANDLERS.update(
564
+ {
565
+ # _irds.trec_cast.Cast2019Query: Cast2019TopicsHandler,
566
+ _irds.trec_cast.Cast2020Query: Cast2020TopicsHandler,
567
+ _irds.trec_cast.Cast2021Query: Cast2021TopicsHandler,
568
+ _irds.trec_cast.Cast2022Query: Cast2022TopicsHandler,
569
+ }
570
+ )
569
571
 
570
- class CastPassageDocHandler:
571
- def check(self, cls):
572
- assert issubclass(cls, _irds.trec_cast.CastPassageDoc)
573
572
 
574
- @cached_property
575
- def target_cls(self):
576
- return formats.TitleUrlDocument
573
+ class CastDocHandler:
574
+ def check(self, cls):
575
+ assert issubclass(cls, _irds.trec_cast.CastDoc)
576
+
577
+ @cached_property
578
+ def target_cls(self):
579
+ return formats.TitleUrlDocument
580
+
581
+ def __call__(self, _, doc: _irds.trec_cast.CastDoc):
582
+ return Record(
583
+ IDItem(doc.doc_id), formats.SimpleTextItem(" ".join(doc.passages))
584
+ )
585
+
586
+
587
+ class CastPassageDocHandler:
588
+ def check(self, cls):
589
+ assert issubclass(cls, _irds.trec_cast.CastPassageDoc)
590
+
591
+ @cached_property
592
+ def target_cls(self):
593
+ return formats.TitleUrlDocument
594
+
595
+ def __call__(self, _, doc: _irds.trec_cast.CastPassageDoc):
596
+ return Record(
597
+ IDItem(doc.doc_id),
598
+ formats.TitleUrlDocument(doc.text, doc.title, doc.url),
599
+ )
577
600
 
578
- def __call__(self, _, doc: _irds.trec_cast.CastPassageDoc):
579
- return Record(
580
- IDItem(doc.doc_id),
581
- formats.TitleUrlDocument(doc.text, doc.title, doc.url),
582
- )
583
601
 
584
- Documents.CONVERTERS[_irds.trec_cast.CastDoc] = CastDocHandler()
585
- Documents.CONVERTERS[_irds.trec_cast.CastPassageDoc] = CastPassageDocHandler()
602
+ Documents.CONVERTERS[_irds.trec_cast.CastDoc] = CastDocHandler()
603
+ Documents.CONVERTERS[_irds.trec_cast.CastPassageDoc] = CastPassageDocHandler()
586
604
 
587
605
 
588
606
  class Adhoc(ir.Adhoc, IRDSId):
@@ -1,13 +1,13 @@
1
1
  import logging
2
2
  from typing import Optional, Type, Callable, Iterator
3
3
  from ir_datasets.indices import PickleLz4FullStore
4
- from datamaestro.download import Download
4
+ from datamaestro.download import Resource
5
5
  from datamaestro.utils import FileChecker
6
6
  from pathlib import Path
7
7
  import urllib3
8
8
 
9
9
 
10
- class lz4docstore_downloader(Download):
10
+ class lz4docstore_downloader(Resource):
11
11
  """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents"""
12
12
 
13
13
  def __init__(
@@ -69,3 +69,59 @@ class lz4docstore_downloader(Download):
69
69
 
70
70
  # All good!
71
71
  (destination / "done").touch()
72
+
73
+
74
+ class lz4docstore_builder(Resource):
75
+ def __init__(
76
+ self,
77
+ name: str,
78
+ iter_factory: Callable[[], Iterator],
79
+ doc_cls: Type,
80
+ lookup_field: str,
81
+ *,
82
+ count_hint: Optional[int] = None,
83
+ ):
84
+ """Uses ir_datasets Lz4FullStore to build a document store for a stream of documents
85
+
86
+ :param name: The name of the variable for path construction
87
+ :param iter_factory: Iterator over documents
88
+ :param doc_cls: The class of documents (must be a dataclass because of how ir-datasets works)
89
+ :param lookup_field: Which field to use for lookup
90
+ :param count_hint: Number of documents (hint), defaults to None
91
+ """
92
+ super().__init__(name)
93
+ self.iter_factory = iter_factory
94
+ self.doc_cls = doc_cls
95
+ self.lookup_field = lookup_field
96
+ self.count_hint = count_hint
97
+
98
+ def prepare(self):
99
+ return self.definition.datapath / self.varname
100
+
101
+ def download(self, force=False):
102
+ # Creates directory if needed
103
+ destination = self.definition.datapath / self.varname
104
+ destination.mkdir(exist_ok=True)
105
+
106
+ # Early exit
107
+ if (destination / "done").is_file() and not force:
108
+ return True
109
+
110
+ # Download (cache)
111
+ logging.info("Building the document index")
112
+
113
+ # Builds the LZ4 store
114
+ store = PickleLz4FullStore(
115
+ destination,
116
+ lambda: self.iter_factory(),
117
+ self.doc_cls,
118
+ lookup_field=self.lookup_field,
119
+ index_fields=[self.lookup_field],
120
+ key_field_prefix=None,
121
+ size_hint=None,
122
+ count_hint=self.count_hint,
123
+ )
124
+ store.build()
125
+
126
+ # All good!
127
+ (destination / "done").touch()
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '2024.5.31'
16
- __version_tuple__ = version_tuple = (2024, 5, 31)
15
+ __version__ = version = '2025.1.7'
16
+ __version_tuple__ = version_tuple = (2025, 1, 7)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datamaestro-text
3
- Version: 2024.5.31
3
+ Version: 2025.1.7
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -18,8 +18,8 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
18
  Requires-Python: >=3.8
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
- Requires-Dist: datamaestro >=1.1.1
22
- Requires-Dist: ir-datasets
21
+ Requires-Dist: datamaestro>=1.2.1
22
+ Requires-Dist: ir_datasets>=0.5.8
23
23
  Requires-Dist: attrs
24
24
 
25
25
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
@@ -1,12 +1,12 @@
1
1
  datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
2
- datamaestro_text/version.py,sha256=fwtF2zbaLZ1a4gnsJGlnkD1w9QKZyClNJUGeq39EhTE,419
2
+ datamaestro_text/version.py,sha256=k65KHkS4PD3NjDDCJgWcTwRBE4yVcKs32B8SUxv2DvE,417
3
3
  datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
5
5
  datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
6
6
  datamaestro_text/config/com/sentiment140.py,sha256=iRV_rSjQcr9WNjjQ4EdznaEMGLADV_rwpeRx7ycQi0k,1383
7
7
  datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
8
8
  datamaestro_text/config/com/github/aagohary/canard.py,sha256=tIwb_KxMUR3st7rzQUkt6rIjolTl8uKvDq6t795b1nY,1468
9
- datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=prljhI25mZn4NqUwu5sfntvvzLI1-Twpe_tJYjUoWDo,1444
9
+ datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=_lufJlg-4zaQyjr5Ae-X-9hXzpl2JFjfxh_RhnBbva4,3068
10
10
  datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=Tg3hbnvilxW_Lwt7fpKvGMtu-6mc9oNIHM-LX6JTR1U,3026
11
11
  datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
12
12
  datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
@@ -46,7 +46,7 @@ datamaestro_text/data/recommendation.py,sha256=MatelpJQiCMpNDuaQVBkRY809J1qiQo-8
46
46
  datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
47
47
  datamaestro_text/data/text.py,sha256=pOI8nrEd6RoQA28DVH1JufHTunr9vG3FQzwElR8YirI,499
48
48
  datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
49
- datamaestro_text/data/conversation/base.py,sha256=FWA4-5corSZUuRMzpewOBXPDG2YR60j5geZmN-SaXrg,6451
49
+ datamaestro_text/data/conversation/base.py,sha256=zXX5sPMoVtDf4WTkLx45IvZz6lgKPgcUZFu6N4lVBlc,6457
50
50
  datamaestro_text/data/conversation/canard.py,sha256=IMxu5NfytWZLZ_cyT8UFOICbDE82HLJoNBT_6j36Faw,1808
51
51
  datamaestro_text/data/conversation/orconvqa.py,sha256=VG3GV5_IPQcvla9rrQPypDNcZYmKNMgozmr2oudeLA4,3802
52
52
  datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
@@ -55,15 +55,15 @@ datamaestro_text/data/ir/base.py,sha256=Cw8egjChpx4ksUwp-vTA70B2OWxROH7FIeJylsXg
55
55
  datamaestro_text/data/ir/cord19.py,sha256=7xDIzsy63WrA9lxxyNOMu5ECRymu5x23EzYG977nS6Y,1440
56
56
  datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
57
57
  datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
58
- datamaestro_text/data/ir/formats.py,sha256=wgjXIkNJjqRbHEMkkXyXRRMnxnho45jfUbPsJCazkZk,2866
58
+ datamaestro_text/data/ir/formats.py,sha256=rU9uJhdFDdlnQ2qfFowK_--1WVOH1fVgfXWOPPDF_FY,3160
59
59
  datamaestro_text/data/ir/huggingface.py,sha256=G71VFDN-SllZy4LFxumEbCumEJvb5-orAbLemHjWhiA,956
60
- datamaestro_text/data/ir/stores.py,sha256=odp1XoCq-FakKICXsMBCxzJlx77j71QPKzyLnMg0xGA,733
60
+ datamaestro_text/data/ir/stores.py,sha256=rlOogoBAfsP7o01KqvHiTF1XqzK2Fp6QbRmuasabKAE,859
61
61
  datamaestro_text/data/ir/trec.py,sha256=qDOzmgUn0hMqeP9KdI190-9IKsLl6yNRqIoBz6s-cBs,1898
62
62
  datamaestro_text/data/ir/utils.py,sha256=6-GhXVtgkBZGhIs2-ODZua_3DmKjSSVydStpHDqbAwE,833
63
63
  datamaestro_text/datasets/irds/__init__.py,sha256=Tq0HN1qojnZYLBumM59BuTkz7r0gcu-5OXmDDLgPpAc,707
64
- datamaestro_text/datasets/irds/data.py,sha256=EsqaY5UNbtQGMdEqUn5tlxW-k2LiaJ0jiD_6vVtZuU8,20261
64
+ datamaestro_text/datasets/irds/data.py,sha256=BTsFZdjFJ0XdlYRh2rcshAoeknHrypf7be2BGUbWWFk,19931
65
65
  datamaestro_text/datasets/irds/datasets.py,sha256=qtN-nX2_OV9FD339aZjwVL1jFf6I4T6MbNCcuRAjmtU,5682
66
- datamaestro_text/datasets/irds/helpers.py,sha256=KC-2nQPCIl4VnbfDkAkr4iFlhkknn8zvbADlClWZvwU,2207
66
+ datamaestro_text/datasets/irds/helpers.py,sha256=fGE-fbuJbXdTzl1qo55247jzn9cvApY-d82GJBgfY1E,3982
67
67
  datamaestro_text/datasets/irds/utils.py,sha256=m30JLIrV_HgilN11TvY9dGTyumES6LLzWZDUAMT915M,1425
68
68
  datamaestro_text/download/tmdb.py,sha256=kU_Vz9jhznlyeKMHziVu58IHoWv8zPu6CZTHVNQvmu4,4009
69
69
  datamaestro_text/interfaces/plaintext.py,sha256=cWfS_xjqZxQ0EV4Ax5BEarZ4lnhQ1I7mc_vgfBgE76w,885
@@ -78,9 +78,9 @@ datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1v
78
78
  datamaestro_text/utils/iter.py,sha256=2_UZ8y9Ma4k5U9ZD4w55Zfb6NGrKM1L4G40OygRm1is,2459
79
79
  datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
80
80
  datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
81
- datamaestro_text-2024.5.31.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
82
- datamaestro_text-2024.5.31.dist-info/METADATA,sha256=MGRuQbJdMtcfGAGdF0MqDiPcR7NABD7PhGIMVnf71aY,1604
83
- datamaestro_text-2024.5.31.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
84
- datamaestro_text-2024.5.31.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
85
- datamaestro_text-2024.5.31.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
86
- datamaestro_text-2024.5.31.dist-info/RECORD,,
81
+ datamaestro_text-2025.1.7.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
82
+ datamaestro_text-2025.1.7.dist-info/METADATA,sha256=-wsBcUcnEnOMiHb1ROIf43r55ZNNNjpIemuFXm5hHUY,1609
83
+ datamaestro_text-2025.1.7.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
84
+ datamaestro_text-2025.1.7.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
85
+ datamaestro_text-2025.1.7.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
86
+ datamaestro_text-2025.1.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: setuptools (75.7.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5