tfds-nightly 4.9.8.dev202504110044__py3-none-any.whl → 4.9.8.dev202504130103__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensorflow_datasets/datasets/lbpp/CITATIONS.bib +24 -0
- tensorflow_datasets/datasets/lbpp/README.md +6 -0
- tensorflow_datasets/datasets/lbpp/__init__.py +15 -0
- tensorflow_datasets/datasets/lbpp/lbpp_dataset_builder.py +174 -0
- {tfds_nightly-4.9.8.dev202504110044.dist-info → tfds_nightly-4.9.8.dev202504130103.dist-info}/METADATA +1 -1
- {tfds_nightly-4.9.8.dev202504110044.dist-info → tfds_nightly-4.9.8.dev202504130103.dist-info}/RECORD +11 -7
- {tfds_nightly-4.9.8.dev202504110044.dist-info → tfds_nightly-4.9.8.dev202504130103.dist-info}/WHEEL +0 -0
- {tfds_nightly-4.9.8.dev202504110044.dist-info → tfds_nightly-4.9.8.dev202504130103.dist-info}/entry_points.txt +0 -0
- {tfds_nightly-4.9.8.dev202504110044.dist-info → tfds_nightly-4.9.8.dev202504130103.dist-info}/licenses/AUTHORS +0 -0
- {tfds_nightly-4.9.8.dev202504110044.dist-info → tfds_nightly-4.9.8.dev202504130103.dist-info}/licenses/LICENSE +0 -0
- {tfds_nightly-4.9.8.dev202504110044.dist-info → tfds_nightly-4.9.8.dev202504130103.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,24 @@
|
|
1
|
+
@inproceedings{matton-etal-2024-leakage,
|
2
|
+
title = "On Leakage of Code Generation Evaluation Datasets",
|
3
|
+
author = "Matton, Alexandre and
|
4
|
+
Sherborne, Tom and
|
5
|
+
Aumiller, Dennis and
|
6
|
+
Tommasone, Elena and
|
7
|
+
Alizadeh, Milad and
|
8
|
+
He, Jingyi and
|
9
|
+
Ma, Raymond and
|
10
|
+
Voisin, Maxime and
|
11
|
+
Gilsenan-McMahon, Ellen and
|
12
|
+
Gall{\'e}, Matthias",
|
13
|
+
editor = "Al-Onaizan, Yaser and
|
14
|
+
Bansal, Mohit and
|
15
|
+
Chen, Yun-Nung",
|
16
|
+
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
|
17
|
+
month = nov,
|
18
|
+
year = "2024",
|
19
|
+
address = "Miami, Florida, USA",
|
20
|
+
publisher = "Association for Computational Linguistics",
|
21
|
+
url = "https://aclanthology.org/2024.findings-emnlp.772/",
|
22
|
+
doi = "10.18653/v1/2024.findings-emnlp.772",
|
23
|
+
pages = "13215--13223",
|
24
|
+
}
|
@@ -0,0 +1,6 @@
|
|
1
|
+
*Less Basic Python Programming* is a collection of 161 programming problems
|
2
|
+
with accompanying unit tests.
|
3
|
+
They were created with the aim of being fresh (not leaked at the time of
|
4
|
+
creation) and more difficult than similar datasets (e.g., HumanEval and MBPP).
|
5
|
+
It can serve as a drop-in replacement or enrichment of those datasets as they
|
6
|
+
are structured in an equivalent way.
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# coding=utf-8
|
2
|
+
# Copyright 2024 The TensorFlow Datasets Authors.
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
@@ -0,0 +1,174 @@
|
|
1
|
+
# coding=utf-8
|
2
|
+
# Copyright 2024 The TensorFlow Datasets Authors.
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
# Copyright 2024 Cohere and the current dataset script contributor.
|
17
|
+
#
|
18
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
19
|
+
# you may not use this file except in compliance with the License.
|
20
|
+
# You may obtain a copy of the License at
|
21
|
+
#
|
22
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
23
|
+
#
|
24
|
+
# Unless required by applicable law or agreed to in writing, software
|
25
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
26
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
27
|
+
# See the License for the specific language governing permissions and
|
28
|
+
# limitations under the License.
|
29
|
+
# Author Note: Data loader is heavily inspired by the builder in
|
30
|
+
# https://github.com/google-research/google-research/tree/main/lbpp_dataset
|
31
|
+
"""Cohere Less Basic Python Problems. All columns decoded."""
|
32
|
+
|
33
|
+
import base64
|
34
|
+
import json
|
35
|
+
import pickle
|
36
|
+
import zlib
|
37
|
+
|
38
|
+
from tensorflow_datasets.core.utils.lazy_imports_utils import pandas as pd
|
39
|
+
import tensorflow_datasets.public_api as tfds
|
40
|
+
|
41
|
+
|
42
|
+
# Homepage reported in the dataset info (ACL Anthology page of the paper).
_HOMEPAGE = "https://aclanthology.org/2024.findings-emnlp.772/"

# Single version shared by the builder and all of its configs.
_VERSION = tfds.core.Version("2.0.0")

# Field names of one example; handed to every builder config as `features`.
_COLUMNS = [
    "task_id",
    "language",
    "title",
    "instruction",
    "completion",
    "test_file",
    "test_list",
    "signature",
    "categories",
    "test_setup",
]

# Language sub-directories that each provide a `test.parquet` file.
_LANGUAGES = [
    "python",
    "cpp",
    "go",
    "java",
    "js",
    "rust",
]

# Config names that select every language at once.
_ALL_LANGUAGE_ALIASES = [
    "all",
    "multilingual",
]

# Config names that are aliases, mapped to the language they load.
_LANGUAGE_ALIAS_MAP = dict(
    default="python",
    javascript="js",
)
|
65
|
+
|
66
|
+
|
67
|
+
def decode_str(str_to_decode: str):
  """Decodes one encoded dataset column back into a Python object.

  The input is undone layer by layer: base64 text -> zlib-compressed bytes ->
  pickled payload -> JSON document -> Python object.

  Args:
    str_to_decode: Base64 text wrapping a zlib-compressed pickle whose payload
      is a JSON document.

  Returns:
    The object produced by `json.loads` on the unpickled payload.
  """
  compressed = base64.b64decode(str_to_decode.encode("utf-8"))
  pickled = zlib.decompress(compressed)
  # SECURITY NOTE(review): `pickle.loads` can execute arbitrary code while
  # deserializing. The bytes here originate from a downloaded file, so this is
  # only safe as long as the upstream source is trusted. Deliberately left in
  # place to stay compatible with the published encoding; consider a
  # restricted unpickler if the trust assumption changes.
  json_payload = pickle.loads(pickled)
  return json.loads(json_payload)
|
73
|
+
|
74
|
+
|
75
|
+
class LBPPConfig(tfds.core.BuilderConfig):
  """Builder configuration for LBPP; always pinned to `_VERSION`."""

  def __init__(self, name, description, features, **kwargs):
    """Initializes the config.

    Args:
      name: Config name; forwarded to the base class and stored on the config.
      description: Description stored on the config.
      features: Value stored unchanged on the config as `features`.
      **kwargs: Additional keyword arguments forwarded to the base class.
    """
    super().__init__(name=name, version=_VERSION, **kwargs)
    # Stored after base initialization so these values are the final ones.
    self.name = name
    self.description = description
    self.features = features
|
83
|
+
|
84
|
+
|
85
|
+
class Builder(tfds.core.GeneratorBasedBuilder):
  """Builder for LBPP dataset.

  One config exists per language plus a few aliases: "all"/"multilingual"
  load every language in `_LANGUAGES`, and names found in
  `_LANGUAGE_ALIAS_MAP` are resolved to the language they stand for.
  """

  VERSION = _VERSION
  LICENSE = "apache-2.0"
  # One config per (name, description) pair; all share the same columns.
  BUILDER_CONFIGS = [
      LBPPConfig(name=name, description=description, features=_COLUMNS)
      for name, description in (
          ("all", "Multilingual LBPP"),
          ("multilingual", "Multilingual LBPP"),
          ("default", "Python LBPP"),
          ("python", "Python LBPP"),
          ("cpp", "C++ LBPP"),
          ("go", "Go LBPP"),
          ("java", "Java LBPP"),
          ("js", "JavaScript LBPP"),
          ("javascript", "JavaScript LBPP"),
          # Previously mislabeled as "JavaScript LBPP" (copy-paste slip).
          ("rust", "Rust LBPP"),
      )
  ]
  DEFAULT_CONFIG_NAME = "python"

  def _info(self):
    """Returns the dataset info: ten text-based features, no supervised keys."""
    return self.dataset_info_from_configs(
        features=tfds.features.FeaturesDict({
            "task_id": tfds.features.Text(),
            "language": tfds.features.Text(),
            "title": tfds.features.Text(),
            "instruction": tfds.features.Text(),
            "completion": tfds.features.Text(),
            "test_file": tfds.features.Text(),
            "test_list": tfds.features.Sequence(tfds.features.Text()),
            "signature": tfds.features.Text(),
            "categories": tfds.features.Sequence(tfds.features.Text()),
            "test_setup": tfds.features.Text(),
        }),
        homepage=_HOMEPAGE,
        supervised_keys=None,
    )

  def _split_generators(self, dl_manager):
    """Downloads the parquet file(s) for the config and returns the TEST split.

    Args:
      dl_manager: Download manager used to fetch the parquet files.

    Returns:
      A list with a single `SplitGenerator` for the test split, whose
      `filepaths` argument holds the downloaded files.
    """
    # Map alias to actual language; non-alias names are used unchanged.
    config_name = self.builder_config.name
    data_loading_name = _LANGUAGE_ALIAS_MAP.get(config_name, config_name)
    hf_url_prefix = (
        "https://huggingface.co/datasets/CohereForAI/lbpp/resolve/main/"
    )
    if data_loading_name in _ALL_LANGUAGE_ALIASES:
      # Download all languages.
      download_targets = [
          f"{hf_url_prefix}{lang}/test.parquet" for lang in _LANGUAGES
      ]
    else:
      download_targets = [f"{hf_url_prefix}{data_loading_name}/test.parquet"]

    downloaded_files = dl_manager.download(download_targets)

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs={
                "filepaths": downloaded_files,
            },
        )
    ]

  def _generate_examples(self, filepaths: list[str]):
    """Yields `(key, example)` pairs from the given parquet files.

    Args:
      filepaths: Parquet files to read, one after another.

    Yields:
      Tuples of an integer key and the example dict. The key is a running
      counter that continues across files, so it stays unique when several
      languages are loaded together.
    """
    key = 0
    for filepath in filepaths:
      df = pd.read_parquet(filepath)
      for line in df.to_dict(orient="records"):
        yield key, {
            "task_id": line["task_id"],
            "language": line["language"],
            "title": line["title"],
            "instruction": line["instruction"],
            # These columns are stored encoded; see `decode_str`.
            "completion": decode_str(line["completion"]),
            "test_file": decode_str(line["test_file"]),
            "test_list": decode_str(line["test_list"]),
            # Falls back to an empty string when the signature is falsy.
            "signature": line["signature"] or "",
            "categories": line["categories"],
            "test_setup": decode_str(line["test_setup"]),
        }
        key += 1
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: tfds-nightly
|
3
|
-
Version: 4.9.8.dev202504110044
|
3
|
+
Version: 4.9.8.dev202504130103
|
4
4
|
Summary: tensorflow/datasets is a library of datasets ready to use with TensorFlow.
|
5
5
|
Home-page: https://github.com/tensorflow/datasets
|
6
6
|
Download-URL: https://github.com/tensorflow/datasets/tags
|
{tfds_nightly-4.9.8.dev202504110044.dist-info → tfds_nightly-4.9.8.dev202504130103.dist-info}/RECORD
RENAMED
@@ -818,6 +818,10 @@ tensorflow_datasets/datasets/lambada/__init__.py,sha256=UCI6C5ryp7VdOk-mbtcPxp6J
|
|
818
818
|
tensorflow_datasets/datasets/lambada/checksums.tsv,sha256=EgETmQzHlYMzJjZ5u71Pho16UdrbagX34a5jcpgmSmU,172
|
819
819
|
tensorflow_datasets/datasets/lambada/lambada_dataset_builder.py,sha256=BNYLj9XI0pdwm1c7wO5ZqJ7BMbMdnCmUr44nX2F8nxE,2269
|
820
820
|
tensorflow_datasets/datasets/lambada/lambada_dataset_builder_test.py,sha256=gJwJQ6CJYOt6QlO5c5-VALf804ByrDMwf4dGjOTjE5Y,1070
|
821
|
+
tensorflow_datasets/datasets/lbpp/CITATIONS.bib,sha256=-GCp4MjOuqbVCM6scAoXKhNVuRblq-9W1gJQ304FvPU,835
|
822
|
+
tensorflow_datasets/datasets/lbpp/README.md,sha256=g1DwOdrH-6todsxwhdnRoJGkB2NzXw5Mq-CgJ-Lsnjo,372
|
823
|
+
tensorflow_datasets/datasets/lbpp/__init__.py,sha256=UCI6C5ryp7VdOk-mbtcPxp6JjEup1pI68YhAiayrHjc,612
|
824
|
+
tensorflow_datasets/datasets/lbpp/lbpp_dataset_builder.py,sha256=OAG0Pm-jAdb37QB4Al70uXzu7KLqq5mXtT2EirzFWAE,5835
|
821
825
|
tensorflow_datasets/datasets/lfw/CITATIONS.bib,sha256=hYwRfGh52dQKV9Z4EX0ec-esXVWaOY3d3g-ulDGYyKQ,340
|
822
826
|
tensorflow_datasets/datasets/lfw/README.md,sha256=SKbgr4y6FxNy2Abei2kk2VQ6N1a7MHKcLOhnTGhLI3I,98
|
823
827
|
tensorflow_datasets/datasets/lfw/TAGS.txt,sha256=3k_aYCPHQfbOnta3Z41F8BFfpHbzanXx2ZLL7h59TpU,46
|
@@ -2455,10 +2459,10 @@ tensorflow_datasets/vision_language/wit/wit_test.py,sha256=DJ687VN9hAp6SLXnr_P12
|
|
2455
2459
|
tensorflow_datasets/vision_language/wit_kaggle/__init__.py,sha256=38tJQ73VHz8vOJn-AyZh2we2YJucbSRIgmgcrsC6bQM,719
|
2456
2460
|
tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle.py,sha256=k-7YD1SGr5bASfdR2_09rrqz-8cpWdIcBWWEXhCvzuk,16903
|
2457
2461
|
tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle_test.py,sha256=vLlluBW77ASNVC4ix7t8idkSUBI6q1-B7zmRV_ICCQM,1778
|
2458
|
-
tfds_nightly-4.9.8.
|
2459
|
-
tfds_nightly-4.9.8.
|
2460
|
-
tfds_nightly-4.9.8.
|
2461
|
-
tfds_nightly-4.9.8.
|
2462
|
-
tfds_nightly-4.9.8.
|
2463
|
-
tfds_nightly-4.9.8.
|
2464
|
-
tfds_nightly-4.9.8.
|
2462
|
+
tfds_nightly-4.9.8.dev202504130103.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
|
2463
|
+
tfds_nightly-4.9.8.dev202504130103.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
2464
|
+
tfds_nightly-4.9.8.dev202504130103.dist-info/METADATA,sha256=Xb2ci-GJ8aIVhADL-FVI0VIfcvUCVEUiPDRUNdZTGhI,11879
|
2465
|
+
tfds_nightly-4.9.8.dev202504130103.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
2466
|
+
tfds_nightly-4.9.8.dev202504130103.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
|
2467
|
+
tfds_nightly-4.9.8.dev202504130103.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
|
2468
|
+
tfds_nightly-4.9.8.dev202504130103.dist-info/RECORD,,
|
{tfds_nightly-4.9.8.dev202504110044.dist-info → tfds_nightly-4.9.8.dev202504130103.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|