tfds-nightly 4.9.8.dev202504110044__py3-none-any.whl → 4.9.8.dev202504130103__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ @inproceedings{matton-etal-2024-leakage,
2
+ title = "On Leakage of Code Generation Evaluation Datasets",
3
+ author = "Matton, Alexandre and
4
+ Sherborne, Tom and
5
+ Aumiller, Dennis and
6
+ Tommasone, Elena and
7
+ Alizadeh, Milad and
8
+ He, Jingyi and
9
+ Ma, Raymond and
10
+ Voisin, Maxime and
11
+ Gilsenan-McMahon, Ellen and
12
+ Gall{\'e}, Matthias",
13
+ editor = "Al-Onaizan, Yaser and
14
+ Bansal, Mohit and
15
+ Chen, Yun-Nung",
16
+ booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
17
+ month = nov,
18
+ year = "2024",
19
+ address = "Miami, Florida, USA",
20
+ publisher = "Association for Computational Linguistics",
21
+ url = "https://aclanthology.org/2024.findings-emnlp.772/",
22
+ doi = "10.18653/v1/2024.findings-emnlp.772",
23
+ pages = "13215--13223",
24
+ }
@@ -0,0 +1,6 @@
1
+ *Less Basic Python Programming* is a collection of 161 programming problems
2
+ with accompanying unit tests.
3
+ They were created with the aim of being fresh (not leaked at the time of
4
+ creation) and more difficult than similar datasets (e.g., HumanEval and MBPP).
5
+ It can serve as a drop-in replacement or enrichment of those datasets as they
6
+ are structured in an equivalent way.
@@ -0,0 +1,15 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The TensorFlow Datasets Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
@@ -0,0 +1,174 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The TensorFlow Datasets Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Copyright 2024 Cohere and the current dataset script contributor.
17
+ #
18
+ # Licensed under the Apache License, Version 2.0 (the "License");
19
+ # you may not use this file except in compliance with the License.
20
+ # You may obtain a copy of the License at
21
+ #
22
+ # http://www.apache.org/licenses/LICENSE-2.0
23
+ #
24
+ # Unless required by applicable law or agreed to in writing, software
25
+ # distributed under the License is distributed on an "AS IS" BASIS,
26
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
27
+ # See the License for the specific language governing permissions and
28
+ # limitations under the License.
29
+ # Author Note: Data loader is heavily inspired by the builder in
30
+ # https://github.com/google-research/google-research/tree/main/lbpp_dataset
31
+ """Cohere Less Basic Python Problems. All columns decoded."""
32
+
33
+ import base64
34
+ import json
35
+ import pickle
36
+ import zlib
37
+
38
+ from tensorflow_datasets.core.utils.lazy_imports_utils import pandas as pd
39
+ import tensorflow_datasets.public_api as tfds
40
+
41
+
42
+ _HOMEPAGE = "https://aclanthology.org/2024.findings-emnlp.772/"
43
+
44
+ _VERSION = tfds.core.Version("2.0.0")
45
+
46
+ _COLUMNS = [
47
+ "task_id",
48
+ "language",
49
+ "title",
50
+ "instruction",
51
+ "completion",
52
+ "test_file",
53
+ "test_list",
54
+ "signature",
55
+ "categories",
56
+ "test_setup",
57
+ ]
58
+
59
+ _LANGUAGES = ["python", "cpp", "go", "java", "js", "rust"]
60
+ _ALL_LANGUAGE_ALIASES = ["all", "multilingual"]
61
+ _LANGUAGE_ALIAS_MAP = {
62
+ "default": "python",
63
+ "javascript": "js",
64
+ }
65
+
66
+
67
+ def decode_str(str_to_decode: str):
68
+ return json.loads(
69
+ pickle.loads(
70
+ zlib.decompress(base64.b64decode(str_to_decode.encode("utf-8")))
71
+ )
72
+ )
73
+
74
+
75
+ class LBPPConfig(tfds.core.BuilderConfig):
76
+ """BuilderConfig."""
77
+
78
+ def __init__(self, name, description, features, **kwargs):
79
+ super(LBPPConfig, self).__init__(name=name, version=_VERSION, **kwargs)
80
+ self.name = name
81
+ self.description = description
82
+ self.features = features
83
+
84
+
85
+ class Builder(tfds.core.GeneratorBasedBuilder):
86
+ """Builder for LBPP dataset."""
87
+
88
+ VERSION = _VERSION
89
+ LICENSE = "apache-2.0"
90
+ BUILDER_CONFIGS = [
91
+ LBPPConfig(
92
+ name="all", description="Multilingual LBPP", features=_COLUMNS
93
+ ),
94
+ LBPPConfig(
95
+ name="multilingual",
96
+ description="Multilingual LBPP",
97
+ features=_COLUMNS,
98
+ ),
99
+ LBPPConfig(name="default", description="Python LBPP", features=_COLUMNS),
100
+ LBPPConfig(name="python", description="Python LBPP", features=_COLUMNS),
101
+ LBPPConfig(name="cpp", description="C++ LBPP", features=_COLUMNS),
102
+ LBPPConfig(name="go", description="Go LBPP", features=_COLUMNS),
103
+ LBPPConfig(name="java", description="Java LBPP", features=_COLUMNS),
104
+ LBPPConfig(name="js", description="JavaScript LBPP", features=_COLUMNS),
105
+ LBPPConfig(
106
+ name="javascript", description="JavaScript LBPP", features=_COLUMNS
107
+ ),
108
+ LBPPConfig(name="rust", description="Rust LBPP", features=_COLUMNS),
109
+ ]
110
+ DEFAULT_CONFIG_NAME = "python"
111
+
112
+ def _info(self):
113
+ return self.dataset_info_from_configs(
114
+ features=tfds.features.FeaturesDict({
115
+ "task_id": tfds.features.Text(),
116
+ "language": tfds.features.Text(),
117
+ "title": tfds.features.Text(),
118
+ "instruction": tfds.features.Text(),
119
+ "completion": tfds.features.Text(),
120
+ "test_file": tfds.features.Text(),
121
+ "test_list": tfds.features.Sequence(tfds.features.Text()),
122
+ "signature": tfds.features.Text(),
123
+ "categories": tfds.features.Sequence(tfds.features.Text()),
124
+ "test_setup": tfds.features.Text(),
125
+ }),
126
+ homepage=_HOMEPAGE,
127
+ supervised_keys=None,
128
+ )
129
+
130
+ def _split_generators(self, dl_manager):
131
+ # Map alias to actual language
132
+ data_loading_name = _LANGUAGE_ALIAS_MAP.get(
133
+ self.builder_config.name, self.builder_config.name
134
+ )
135
+ hf_url_prefix = (
136
+ "https://huggingface.co/datasets/CohereForAI/lbpp/resolve/main/"
137
+ )
138
+ if data_loading_name in _ALL_LANGUAGE_ALIASES:
139
+ # Download all languages
140
+ download_targets = [
141
+ f"{hf_url_prefix}{lang}/test.parquet" for lang in _LANGUAGES
142
+ ]
143
+ else:
144
+ download_targets = [f"{hf_url_prefix}{data_loading_name}/test.parquet"]
145
+
146
+ downloaded_files = dl_manager.download(download_targets)
147
+
148
+ return [
149
+ tfds.core.SplitGenerator(
150
+ name=tfds.Split.TEST,
151
+ gen_kwargs={
152
+ "filepaths": downloaded_files,
153
+ },
154
+ )
155
+ ]
156
+
157
+ def _generate_examples(self, filepaths: list[str]):
158
+ key = 0
159
+ for filepath in filepaths:
160
+ df = pd.read_parquet(filepath)
161
+ for line in df.to_dict(orient="records"):
162
+ yield key, {
163
+ "task_id": line["task_id"],
164
+ "language": line["language"],
165
+ "title": line["title"],
166
+ "instruction": line["instruction"],
167
+ "completion": decode_str(line["completion"]),
168
+ "test_file": decode_str(line["test_file"]),
169
+ "test_list": decode_str(line["test_list"]),
170
+ "signature": line["signature"] or "",
171
+ "categories": line["categories"],
172
+ "test_setup": decode_str(line["test_setup"]),
173
+ }
174
+ key += 1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tfds-nightly
3
- Version: 4.9.8.dev202504110044
3
+ Version: 4.9.8.dev202504130103
4
4
  Summary: tensorflow/datasets is a library of datasets ready to use with TensorFlow.
5
5
  Home-page: https://github.com/tensorflow/datasets
6
6
  Download-URL: https://github.com/tensorflow/datasets/tags
@@ -818,6 +818,10 @@ tensorflow_datasets/datasets/lambada/__init__.py,sha256=UCI6C5ryp7VdOk-mbtcPxp6J
818
818
  tensorflow_datasets/datasets/lambada/checksums.tsv,sha256=EgETmQzHlYMzJjZ5u71Pho16UdrbagX34a5jcpgmSmU,172
819
819
  tensorflow_datasets/datasets/lambada/lambada_dataset_builder.py,sha256=BNYLj9XI0pdwm1c7wO5ZqJ7BMbMdnCmUr44nX2F8nxE,2269
820
820
  tensorflow_datasets/datasets/lambada/lambada_dataset_builder_test.py,sha256=gJwJQ6CJYOt6QlO5c5-VALf804ByrDMwf4dGjOTjE5Y,1070
821
+ tensorflow_datasets/datasets/lbpp/CITATIONS.bib,sha256=-GCp4MjOuqbVCM6scAoXKhNVuRblq-9W1gJQ304FvPU,835
822
+ tensorflow_datasets/datasets/lbpp/README.md,sha256=g1DwOdrH-6todsxwhdnRoJGkB2NzXw5Mq-CgJ-Lsnjo,372
823
+ tensorflow_datasets/datasets/lbpp/__init__.py,sha256=UCI6C5ryp7VdOk-mbtcPxp6JjEup1pI68YhAiayrHjc,612
824
+ tensorflow_datasets/datasets/lbpp/lbpp_dataset_builder.py,sha256=OAG0Pm-jAdb37QB4Al70uXzu7KLqq5mXtT2EirzFWAE,5835
821
825
  tensorflow_datasets/datasets/lfw/CITATIONS.bib,sha256=hYwRfGh52dQKV9Z4EX0ec-esXVWaOY3d3g-ulDGYyKQ,340
822
826
  tensorflow_datasets/datasets/lfw/README.md,sha256=SKbgr4y6FxNy2Abei2kk2VQ6N1a7MHKcLOhnTGhLI3I,98
823
827
  tensorflow_datasets/datasets/lfw/TAGS.txt,sha256=3k_aYCPHQfbOnta3Z41F8BFfpHbzanXx2ZLL7h59TpU,46
@@ -2455,10 +2459,10 @@ tensorflow_datasets/vision_language/wit/wit_test.py,sha256=DJ687VN9hAp6SLXnr_P12
2455
2459
  tensorflow_datasets/vision_language/wit_kaggle/__init__.py,sha256=38tJQ73VHz8vOJn-AyZh2we2YJucbSRIgmgcrsC6bQM,719
2456
2460
  tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle.py,sha256=k-7YD1SGr5bASfdR2_09rrqz-8cpWdIcBWWEXhCvzuk,16903
2457
2461
  tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle_test.py,sha256=vLlluBW77ASNVC4ix7t8idkSUBI6q1-B7zmRV_ICCQM,1778
2458
- tfds_nightly-4.9.8.dev202504110044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
2459
- tfds_nightly-4.9.8.dev202504110044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
2460
- tfds_nightly-4.9.8.dev202504110044.dist-info/METADATA,sha256=H17CedG4r0iROeeuCajllqeuV30nomA-6fC7rTODtjQ,11879
2461
- tfds_nightly-4.9.8.dev202504110044.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
2462
- tfds_nightly-4.9.8.dev202504110044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
2463
- tfds_nightly-4.9.8.dev202504110044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
2464
- tfds_nightly-4.9.8.dev202504110044.dist-info/RECORD,,
2462
+ tfds_nightly-4.9.8.dev202504130103.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
2463
+ tfds_nightly-4.9.8.dev202504130103.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
2464
+ tfds_nightly-4.9.8.dev202504130103.dist-info/METADATA,sha256=Xb2ci-GJ8aIVhADL-FVI0VIfcvUCVEUiPDRUNdZTGhI,11879
2465
+ tfds_nightly-4.9.8.dev202504130103.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
2466
+ tfds_nightly-4.9.8.dev202504130103.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
2467
+ tfds_nightly-4.9.8.dev202504130103.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
2468
+ tfds_nightly-4.9.8.dev202504130103.dist-info/RECORD,,