datamaestro-text 2025.1.7__py3-none-any.whl → 2025.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,11 @@
1
1
  """MS MARCO (Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. A variant of this task will be the part of TREC and AFIRM 2019. For Updates about TREC 2019 please follow This Repository Passage Reranking task Task Given a query q and a the 1000 most relevant passages P = p1, p2, p3,... p1000, as retrieved by BM25 a succeful system is expected to rerank the most relevant passage as high as possible. For this task not all 1000 relevant items have a human labeled relevant passage. Evaluation will be done using MRR.
2
2
 
3
- **Publication**:
4
- Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
5
- MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
3
+ **Publication**:
4
+ Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
5
+ MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
6
6
 
7
7
 
8
- See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
8
+ See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
9
9
  """
10
10
 
11
11
  from datamaestro.annotations.agreement import useragreement
@@ -35,6 +35,7 @@ http://www.msmarco.org/dataset.aspx""",
35
35
 
36
36
  # --- Document collection
37
37
 
38
+
38
39
  # TODO: Not ideal since it would be better to have small versions right away
39
40
  # instead of downloading the MS MARCO collection again
40
41
  @lua
@@ -43,10 +44,10 @@ http://www.msmarco.org/dataset.aspx""",
43
44
  url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
44
45
  checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
45
46
  )
46
- @dataset(Folder, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
47
- def collection_etc(data):
47
+ @dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
48
+ def collection_etc(data) -> Folder:
48
49
  """Documents and some more files"""
49
- return {"path": data}
50
+ return Folder(path=data)
50
51
 
51
52
 
52
53
  @lua
@@ -4,10 +4,6 @@ from datamaestro.download.archive import zipdownloader
4
4
  from datamaestro.data.ml import Supervised
5
5
  from datamaestro.utils import HashCheck
6
6
 
7
- # name: Sentiment140
8
- # web: http://help.sentiment140.com/for-students/
9
-
10
- # description: |
11
7
 
12
8
 
13
9
  @zipdownloader(
@@ -1,10 +1,9 @@
1
1
  """The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""
2
2
 
3
- from datamaestro.definitions import DatafolderPath
4
- from datamaestro.data import Base
5
- from datamaestro_text.data.ir.trec import TipsterCollection
6
- from datamaestro.definitions import argument, datatasks, datatags, dataset
3
+ from datamaestro.context import DatafolderPath
4
+ from datamaestro.definitions import dataset
7
5
  from datamaestro.download.links import links, linkfolder
6
+ from datamaestro_text.data.ir.trec import TipsterCollection
8
7
 
9
8
 
10
9
  URL = "https://catalog.ldc.upenn.edu/LDC2002T31"
@@ -22,8 +22,8 @@ from datamaestro_text.data.ir.trec import TipsterCollection
22
22
  from datamaestro.download.links import linkfolder
23
23
  from datamaestro.definitions import (
24
24
  dataset,
25
- DatafolderPath,
26
25
  )
26
+ from datamaestro.context import DatafolderPath
27
27
 
28
28
  # Store meta-information
29
29
  TIPSTER = dataset(TipsterCollection, url="https://catalog.ldc.upenn.edu/LDC93T3A")
@@ -1,5 +1,5 @@
1
- from pathlib import Path
2
- from datamaestro.data import Base, File, argument
1
+ from experimaestro import Meta
2
+ from datamaestro.data import Base, File
3
3
  from datamaestro.definitions import datatags
4
4
  import numpy as np
5
5
  from typing import Tuple, List
@@ -18,9 +18,9 @@ class WordEmbeddings(Base):
18
18
  raise NotImplementedError()
19
19
 
20
20
 
21
- @argument("encoding", str, ignored=True, default="utf-8")
22
21
  class WordEmbeddingsText(WordEmbeddings, File):
23
22
  """Word embeddings as a text word / values"""
23
+ encoding: Meta[str] = "utf-8"
24
24
 
25
25
  def load(self):
26
26
  words = []
@@ -1,7 +1,8 @@
1
1
  from csv import DictReader
2
2
  from typing import Iterator
3
3
 
4
- from datamaestro.data import File, documentation
4
+ from experimaestro import documentation
5
+ from datamaestro.data import File
5
6
  from datamaestro.record import Record
6
7
  from datamaestro_text.data.ir import Documents, TopicRecord, Topics, IDItem
7
8
  from datamaestro_text.data.ir.formats import (
@@ -1,14 +1,13 @@
1
- from datamaestro.data import Base, File, argument
1
+ from experimaestro import Param
2
+ from datamaestro.data import Base, File
2
3
  import datamaestro.data.csv as csv
3
4
 
4
5
 
5
- @argument("ratings", type=File)
6
6
  class RatedItems(Base):
7
- pass
7
+ ratings: Param[File]
8
8
 
9
9
 
10
- @argument("links", type=csv.Generic)
11
- @argument("movies", type=csv.Generic)
12
- @argument("tags", type=csv.Generic)
13
10
  class Movielens(RatedItems):
14
- pass
11
+ links: Param[csv.Generic]
12
+ movies: Param[csv.Generic]
13
+ tags: Param[csv.Generic]
@@ -1,15 +1,15 @@
1
- from pathlib import Path
2
- from datamaestro.data import Base, Folder, File, argument
1
+ from typing import Optional
2
+ from experimaestro import Param
3
+ from datamaestro.data import Base, Folder, File
3
4
  from datamaestro.data.ml import Supervised
4
5
 
5
6
 
6
- @argument("train", type=Base)
7
- @argument("test", type=Base, required=False)
8
- @argument("validation", type=Base, required=False)
9
7
  class TrainingText(Supervised):
10
8
  """ "A dataset used for training with a train and a test"""
11
9
 
12
- pass
10
+ train: Param[Base]
11
+ test: Param[Optional[Base]] = None
12
+ validation: Param[Optional[Base]] = None
13
13
 
14
14
 
15
15
  class TextFolder(Folder):
@@ -1,8 +1,13 @@
1
- # file generated by setuptools_scm
1
+ # file generated by setuptools-scm
2
2
  # don't change, don't track in version control
3
+
4
+ __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
5
+
3
6
  TYPE_CHECKING = False
4
7
  if TYPE_CHECKING:
5
- from typing import Tuple, Union
8
+ from typing import Tuple
9
+ from typing import Union
10
+
6
11
  VERSION_TUPLE = Tuple[Union[int, str], ...]
7
12
  else:
8
13
  VERSION_TUPLE = object
@@ -12,5 +17,5 @@ __version__: str
12
17
  __version_tuple__: VERSION_TUPLE
13
18
  version_tuple: VERSION_TUPLE
14
19
 
15
- __version__ = version = '2025.1.7'
16
- __version_tuple__ = version_tuple = (2025, 1, 7)
20
+ __version__ = version = '2025.4.3'
21
+ __version_tuple__ = version_tuple = (2025, 4, 3)
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: datamaestro-text
3
- Version: 2025.1.7
3
+ Version: 2025.4.3
4
4
  Summary: Datamaestro module for text-related datasets
5
5
  Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
6
6
  License: GPL-3
@@ -21,6 +21,7 @@ License-File: LICENSE
21
21
  Requires-Dist: datamaestro>=1.2.1
22
22
  Requires-Dist: ir_datasets>=0.5.8
23
23
  Requires-Dist: attrs
24
+ Dynamic: license-file
24
25
 
25
26
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![PyPI version](https://badge.fury.io/py/datamaestro-text.svg)](https://badge.fury.io/py/datamaestro-text)
26
27
 
@@ -1,16 +1,16 @@
1
1
  datamaestro_text/__init__.py,sha256=hU8jZpkXl3F74qIfqnJl7v4nJ9YxfoR7IpJpUREFNRI,248
2
- datamaestro_text/version.py,sha256=k65KHkS4PD3NjDDCJgWcTwRBE4yVcKs32B8SUxv2DvE,417
2
+ datamaestro_text/version.py,sha256=Ldmqy5wkUM54W7PO84xMGLTubji7Xl68QRaeaSSILS0,517
3
3
  datamaestro_text/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  datamaestro_text/config/ai/quac.yaml,sha256=h1D7UJo2z1nZ_9MXpDHuQNJG9Ma2oayUdJV6lyumAIg,1103
5
5
  datamaestro_text/config/com/oscar-corpus.py,sha256=dDfdy_uhG6esRQQzUOOORwK8KvEq4c4hZbWt22pv-zY,733
6
- datamaestro_text/config/com/sentiment140.py,sha256=iRV_rSjQcr9WNjjQ4EdznaEMGLADV_rwpeRx7ycQi0k,1383
6
+ datamaestro_text/config/com/sentiment140.py,sha256=bLxFY6xIOp3_9mn5H36V-jfa_vXdetRxi6sK4cghl9w,1294
7
7
  datamaestro_text/config/com/fastml/goodbooks-10k.yaml,sha256=5ZABxUnBFs2ZnCXtBH8YoBiPb3SocRRdH1DLSfVWF-Y,1172
8
8
  datamaestro_text/config/com/github/aagohary/canard.py,sha256=tIwb_KxMUR3st7rzQUkt6rIjolTl8uKvDq6t795b1nY,1468
9
9
  datamaestro_text/config/com/github/apple/ml-qrecc.py,sha256=_lufJlg-4zaQyjr5Ae-X-9hXzpl2JFjfxh_RhnBbva4,3068
10
10
  datamaestro_text/config/com/github/prdwb/orconvqa.py,sha256=Tg3hbnvilxW_Lwt7fpKvGMtu-6mc9oNIHM-LX6JTR1U,3026
11
11
  datamaestro_text/config/com/github/soskek/bookcorpus.yaml,sha256=qJKs35yeEIilEMgNvU3OEqMp1TSn7mDM2T-uYyA7kTU,1607
12
12
  datamaestro_text/config/com/microsoft/wikiqa.yaml,sha256=U7rU-W2Xz1MYv2YXT4jCTj5DsHyM0VssbQPNa3EISaM,540
13
- datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=ouNn2nivS6mUMaCyMzqxNv1YMoPrSEX-UcSZpG1v_uw,11645
13
+ datamaestro_text/config/com/microsoft/msmarco/passage.py,sha256=tVldwVh0pQpXXifqcMNEN9cLO1HXkXoEhToSazSx5RE,11643
14
14
  datamaestro_text/config/com/smashwords/bookcorpus.py,sha256=hCFjZg9t1AHET05qU31uatiAOD5EEzzWG9y_W90jcOE,861
15
15
  datamaestro_text/config/edu/cornell/nlvr.yaml,sha256=9Yk5VZMncSmrP7JNuGXqExksgX5nQ_Zfnlps8hWze3Q,921
16
16
  datamaestro_text/config/edu/stanford/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -19,7 +19,7 @@ datamaestro_text/config/edu/stanford/glove.py,sha256=ykkQ7nYWqhmgc2TeohNMliYSiX8
19
19
  datamaestro_text/config/edu/stanford/im2p.yaml,sha256=JoToNyEPpmwdyLFedCBot5ypyw7p9rzi12mGXJuZin0,2909
20
20
  datamaestro_text/config/edu/upenn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  datamaestro_text/config/edu/upenn/ldc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
- datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=Y_biKee8LA8arsHiKOUlPBWfylDDM9k-x5UgN-uJdLE,1658
22
+ datamaestro_text/config/edu/upenn/ldc/aquaint.py,sha256=YEU3kIXzv71Vjg9bUoXyQ-vSLvC-y4LlE3mJQf38XuY,1589
23
23
  datamaestro_text/config/gov/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  datamaestro_text/config/gov/nist/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  datamaestro_text/config/gov/nist/ir/covid.py,sha256=wn2E7sQ8M6pAucVD4sKJYImyzKUKphyiDFJD0oYRCbg,4004
@@ -28,7 +28,7 @@ datamaestro_text/config/gov/nist/trec/adhoc.py,sha256=vcFaLlZ-chwDt013MYI8bYZ2ug
28
28
  datamaestro_text/config/gov/nist/trec/clueweb.yaml,sha256=sm4UbdtMzWoDVPsewtVDS7Vj2jBOdgp18Xqo1X4ysQc,792
29
29
  datamaestro_text/config/gov/nist/trec/deeplearning.yaml,sha256=QGM7PtXLJRttNdOPE16o7-k3e5tA9HgcaM_-qFDV_5Q,2125
30
30
  datamaestro_text/config/gov/nist/trec/index.yaml,sha256=oSUhUmtukf5oXqUbJLhae8xZx7Uac5V6uZoUB-RJ7Sw,2711
31
- datamaestro_text/config/gov/nist/trec/tipster.py,sha256=rmVFcwUPAfD529rneZUlCLBke-edYjrBIH3n02-qfvc,5371
31
+ datamaestro_text/config/gov/nist/trec/tipster.py,sha256=t0w9lOBfvwt6YCYCyDj9fo1QiBXfs0q8qzKl74f40jU,5398
32
32
  datamaestro_text/config/gov/nist/trec/web.yaml,sha256=iNIJ-PCeLwpUaJByJesyCqfFJolWCts_VETACha3hfQ,563
33
33
  datamaestro_text/config/io/github/rajpurkar/squad.yaml,sha256=JchkTruEhVd0uOTGgek2tOgBL4UT3G3KVASvZcNigLg,1143
34
34
  datamaestro_text/config/io/github/thunlp/fewrel.py,sha256=LJYexk2ssZ5SqKIKLjPdybS9ju4g9fGIFy5eybtsvYQ,1224
@@ -41,10 +41,10 @@ datamaestro_text/config/org/grouplens/movielens.py,sha256=e78JoiHuwP6gbj7Q84UwPT
41
41
  datamaestro_text/config/org/universaldependencies/french.py,sha256=W_gDtfinjlw08qohX_PWvzQlacDwRFB7PeOzO33mRVU,2208
42
42
  datamaestro_text/config/uk/ac/ucl/cs/qangaroo.yaml,sha256=IBy82CDNNLjJPNPzues1EgDXu0A5WDvUFeVNSOyrIpI,1137
43
43
  datamaestro_text/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
- datamaestro_text/data/embeddings.py,sha256=AskX7Ggvkpqhb-Je_hBTFp_vfkiWzWtJH1gFQxuUTwM,1155
45
- datamaestro_text/data/recommendation.py,sha256=MatelpJQiCMpNDuaQVBkRY809J1qiQo-8QO9XD2J_dU,303
44
+ datamaestro_text/data/embeddings.py,sha256=qdeZ4m6drFNkqW_4pKxJKdJHtOnDRs0P7QVJ7AO6xFE,1127
45
+ datamaestro_text/data/recommendation.py,sha256=wHV_9SXSclouuXaBmvwg3ncZLOFfdKRZG3IHkkPJX9Y,279
46
46
  datamaestro_text/data/tagging.py,sha256=yWm7bNLks77cAySa1aZNCmLFxTTqhqXZ0PaoaEYU6hI,697
47
- datamaestro_text/data/text.py,sha256=pOI8nrEd6RoQA28DVH1JufHTunr9vG3FQzwElR8YirI,499
47
+ datamaestro_text/data/text.py,sha256=_9J7-j06BOIZ1HsOcBsWy_asanAKkDoZIsWWMMj4tm4,496
48
48
  datamaestro_text/data/conversation/__init__.py,sha256=esOWnSImMlQs5QtfxUt559ABLd6a5wwoNpj9XtIq71Y,159
49
49
  datamaestro_text/data/conversation/base.py,sha256=zXX5sPMoVtDf4WTkLx45IvZz6lgKPgcUZFu6N4lVBlc,6457
50
50
  datamaestro_text/data/conversation/canard.py,sha256=IMxu5NfytWZLZ_cyT8UFOICbDE82HLJoNBT_6j36Faw,1808
@@ -52,7 +52,7 @@ datamaestro_text/data/conversation/orconvqa.py,sha256=VG3GV5_IPQcvla9rrQPypDNcZY
52
52
  datamaestro_text/data/conversation/qrecc.py,sha256=es4GmqPtE63A7O_GARe8Zy3rQvuLEhAvUA7CfN_nMeA,2562
53
53
  datamaestro_text/data/ir/__init__.py,sha256=EOVnRv9oKEEucMOgrFjhXxeUWEkaJkbXBndpkKhZaiY,8701
54
54
  datamaestro_text/data/ir/base.py,sha256=Cw8egjChpx4ksUwp-vTA70B2OWxROH7FIeJylsXggAk,1429
55
- datamaestro_text/data/ir/cord19.py,sha256=7xDIzsy63WrA9lxxyNOMu5ECRymu5x23EzYG977nS6Y,1440
55
+ datamaestro_text/data/ir/cord19.py,sha256=yu1Khgy6AZjH2BPQKdnxDid0vQgQ8zvb8-FQlHH-GOU,1465
56
56
  datamaestro_text/data/ir/csv.py,sha256=tnxuqR_MZ3GQhuoXEMYyWLQw8PyD0gRqsnzIP5Gsziw,1212
57
57
  datamaestro_text/data/ir/data.py,sha256=ERmmOxz_9mUkIuccNbzUa5Y6gVLLVDdyc4cCxbCCUbY,20
58
58
  datamaestro_text/data/ir/formats.py,sha256=rU9uJhdFDdlnQ2qfFowK_--1WVOH1fVgfXWOPPDF_FY,3160
@@ -78,9 +78,9 @@ datamaestro_text/utils/files.py,sha256=n6ZGl5LNrZbHLcV9RFwd7cFT0vPUezit-2dsBzs1v
78
78
  datamaestro_text/utils/iter.py,sha256=2_UZ8y9Ma4k5U9ZD4w55Zfb6NGrKM1L4G40OygRm1is,2459
79
79
  datamaestro_text/utils/randomstream.py,sha256=_-boH4IIqN8qcl3IktjpNp9vmF4TWRzHUSNVwg7WAr8,973
80
80
  datamaestro_text/utils/shuffle.py,sha256=o8JTz3mr0lYWyv0zEh91jEK12ci1etMiUnzh5GkOHCM,3490
81
- datamaestro_text-2025.1.7.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
82
- datamaestro_text-2025.1.7.dist-info/METADATA,sha256=-wsBcUcnEnOMiHb1ROIf43r55ZNNNjpIemuFXm5hHUY,1609
83
- datamaestro_text-2025.1.7.dist-info/WHEEL,sha256=A3WOREP4zgxI0fKrHUG8DC8013e3dK3n7a6HDbcEIwE,91
84
- datamaestro_text-2025.1.7.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
85
- datamaestro_text-2025.1.7.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
86
- datamaestro_text-2025.1.7.dist-info/RECORD,,
81
+ datamaestro_text-2025.4.3.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
82
+ datamaestro_text-2025.4.3.dist-info/METADATA,sha256=M1XG19GB1RLCTJ0xICe47LYDjHzLGFPUvHXg9-bmZZM,1631
83
+ datamaestro_text-2025.4.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
84
+ datamaestro_text-2025.4.3.dist-info/entry_points.txt,sha256=lO1P5hE183L5qEEVHlG8d_ik0HNXnX7Eo87cQLdcl-Y,111
85
+ datamaestro_text-2025.4.3.dist-info/top_level.txt,sha256=gYSeqViE8r7eCxSdqFJL74OwljOwKsGPaIhEcCXqc-o,17
86
+ datamaestro_text-2025.4.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.7.0)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5