data-prep-toolkit-transforms 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cc_net_prepro.py +168 -0
- code2parquet_local.py +51 -0
- code2parquet_local_python.py +60 -0
- code2parquet_s3_python.py +61 -0
- code2parquet_transform.py +222 -0
- code2parquet_transform_python.py +39 -0
- code_quality_local.py +37 -0
- code_quality_local_python.py +50 -0
- code_quality_transform.py +312 -0
- code_quality_transform_python.py +26 -0
- data_prep_toolkit_transforms-0.2.1.dist-info/METADATA +73 -0
- data_prep_toolkit_transforms-0.2.1.dist-info/RECORD +80 -0
- data_prep_toolkit_transforms-0.2.1.dist-info/WHEEL +5 -0
- data_prep_toolkit_transforms-0.2.1.dist-info/top_level.txt +76 -0
- doc_Gopher_statistics.py +158 -0
- doc_c4_statistics.py +167 -0
- doc_chunk_chunkers.py +68 -0
- doc_chunk_local.py +34 -0
- doc_chunk_local_python.py +49 -0
- doc_chunk_transform.py +227 -0
- doc_chunk_transform_python.py +43 -0
- doc_id_local.py +54 -0
- doc_id_local_python.py +52 -0
- doc_id_transform_base.py +177 -0
- doc_id_transform_python.py +120 -0
- doc_quality_local.py +43 -0
- doc_quality_local_python.py +57 -0
- doc_quality_transform.py +241 -0
- doc_quality_transform_python.py +42 -0
- doc_quality_utils.py +67 -0
- ededup_local.py +43 -0
- ededup_local_python.py +46 -0
- ededup_local_python_incremental.py +53 -0
- ededup_transform_base.py +249 -0
- ededup_transform_python.py +145 -0
- filter_local.py +58 -0
- filter_local_python.py +60 -0
- filter_test_support.py +135 -0
- filter_transform.py +192 -0
- filter_transform_python.py +31 -0
- flair_recognizer.py +149 -0
- header_cleanser_local.py +52 -0
- header_cleanser_local_python.py +53 -0
- header_cleanser_test_support.py +94 -0
- header_cleanser_transform.py +224 -0
- header_cleanser_transform_python.py +31 -0
- lang_id_local.py +49 -0
- lang_id_local_python.py +55 -0
- lang_id_transform.py +141 -0
- lang_id_transform_python.py +42 -0
- lang_models.py +52 -0
- nlp.py +46 -0
- pdf2parquet_local.py +39 -0
- pdf2parquet_local_python.py +55 -0
- pdf2parquet_transform.py +332 -0
- pdf2parquet_transform_python.py +42 -0
- pii_analyzer.py +71 -0
- pii_anonymizer.py +27 -0
- pii_redactor_local.py +37 -0
- pii_redactor_local_python.py +37 -0
- pii_redactor_transform.py +152 -0
- pii_redactor_transform_python.py +35 -0
- proglang_select_local.py +51 -0
- proglang_select_local_python.py +61 -0
- proglang_select_transform.py +167 -0
- proglang_select_transform_python.py +32 -0
- resize_local.py +36 -0
- resize_local_python.py +46 -0
- resize_transform.py +193 -0
- resize_transform_python.py +40 -0
- text_encoder_local.py +44 -0
- text_encoder_local_python.py +44 -0
- text_encoder_transform.py +127 -0
- text_encoder_transform_python.py +44 -0
- tokenization_local_long_doc_python.py +49 -0
- tokenization_local_python.py +40 -0
- tokenization_s3_long_doc_python.py +52 -0
- tokenization_transform.py +258 -0
- tokenization_transform_python.py +27 -0
- tokenization_utils.py +143 -0
cc_net_prepro.py
ADDED
@@ -0,0 +1,168 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

"""
@ Create on: 2023/04/25
@ Description:
    To incorporate preprocessing steps from cc_net/

@ Reference:
    https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py
    https://github.com/bigscience-workshop/data_tooling/blob/master/kenlm_training/cc_net/text_normalizer.py
"""

import re
import unicodedata
from typing import Dict


unicode_punct_dict: Dict[str, str] = {
    "，": ",",
    "。": ".",
    "、": ",",
    "„": '"',
    "”": '"',
    "“": '"',
    "«": '"',
    "»": '"',
    "１": '"',
    "」": '"',
    "「": '"',
    "《": '"',
    "》": '"',
    "´": "'",
    "∶": ":",
    "：": ":",
    "？": "?",
    "！": "!",
    "（": "(",
    "）": ")",
    "；": ";",
    "–": "-",
    "—": " - ",
    "．": ". ",
    "～": "~",
    "’": "'",
    "…": "...",
    "━": "-",
    "〈": "<",
    "〉": ">",
    "【": "[",
    "】": "]",
    "％": "%",
    "►": "-",
}

_unicode_punct_re = re.compile(f"[{''.join(unicode_punct_dict.keys())}]")

"""
Generate regex pattern obj for later searching using re.search() or re.match()
(r'[\x00\x01...\x9e\x9f]', re.UNICODE)
"""
_non_printing_chars_re = re.compile(f"[{''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))}]")


def unicode_normalization(line: str, language="en") -> str:
    """
    For some languages like 'ja' or 'en', the text should be normalized with the right unicode format
    prior to tokenization by a sentence piece tokenizer
    """
    if language == "ja":
        line = unicodedata.normalize("NFKC", line)
    elif language == "en":
        line = unicodedata.normalize("NFD", line)  # normalize line using Unicode Normalization Form D
    else:
        """
        TODO: add relevant unicodedata normalization here for other languages as needed
        """
        return line

    return line


def _strip_accents(line) -> str:
    """
    This is currently applied for the `en` and `ja` languages to strip accents from text.
    The given text 'line' should be normalized with the right unicode format prior to calling this method.
    For example:
        line = "Café élevàtor ôperàtor naïve Noël façade don't"
        -> "Cafe elevator operator naive Noel facade don't" if line was normalized with unicode format NFD
        -> "Cafe elevator operator naïve Noël façade don't" if line was NOT normalized with any unicode format
    """

    """Keep chars whose category is NOT "Mn", i.e.,
    "Mn" is the category for Mark, Non-Spacing characters such as diacritics/accents or other non-spacing marks
    Example of some diacritic/non-spacing marks: ^, ´, ` as they don't occupy space
    """
    output = [c for c in line if unicodedata.category(c) != "Mn"]  # drop combining (diacritical) marks
    if len(output) == len(line):
        return line
    return "".join(output)


def _replace_unicode_punct(line: str) -> str:
    """
    Replace unicode punctuation defined in `unicode_punct_dict`
    """
    return "".join(unicode_punct_dict.get(c, c) for c in line)


def _remove_unicode_punct(line: str) -> str:
    """
    More aggressive variant of _replace_unicode_punct: removes the punctuation instead of replacing it
    """
    return _unicode_punct_re.sub("", line)


def _remove_non_printing_char(line: str) -> str:
    return _non_printing_chars_re.sub("", line)


def cc_net_normalize(
    line: str,
    strip_accent: bool = True,
    lower_case: bool = True,
    digit_2_zero: bool = True,
    punct_level: int = 1,
    language: str = "en",
) -> str:
    line = line.strip()

    if not line:
        return line

    line = unicode_normalization(line=line, language=language)

    if lower_case:
        line = line.lower()

    if strip_accent:
        line = _strip_accents(line)

    if digit_2_zero:
        # eg, "int 10 float 2.01 scientific 1.2e10" -> "int 00 float 0.00 scientific 0.0e00"
        line = re.compile(r"\d").sub("0", line)

    if punct_level == 1:
        line = _replace_unicode_punct(line)

    elif punct_level == 2:
        line = _remove_unicode_punct(line)

    line = _remove_non_printing_char(line)
    return line


if __name__ == "__main__":
    line = "Int 10 float 2.01 scientific 1.2e10 Café ôperàtor"
    new_line = cc_net_normalize(line)
    print(f"== {line} -> {new_line}")
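For reference, a minimal usage sketch of cc_net_normalize with its default arguments; the expected result in the comment is inferred from the steps above (lowercasing, NFD normalization plus accent stripping, digit-to-zero replacement), not taken from the package's test data:

from cc_net_prepro import cc_net_normalize

sample = "Int 10 float 2.01 scientific 1.2e10 Café ôperàtor"
# defaults: language="en", lower_case=True, strip_accent=True, digit_2_zero=True, punct_level=1
print(cc_net_normalize(sample))
# expected: "int 00 float 0.00 scientific 0.0e00 cafe operator"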
code2parquet_local.py
ADDED
@@ -0,0 +1,51 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import ast
import os

from code2parquet_transform import (  # domain_key,; snapshot_key,
    CodeToParquetTransform,
    data_factory_key,
    detect_programming_lang_key,
    supported_langs_file_key,
)
from data_processing.data_access import DataAccessFactory, DataAccessLocal


supported_languages_file = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "../../ray/test-data/languages/lang_extensions.json")
)
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))

params = {
    supported_langs_file_key: supported_languages_file,
    detect_programming_lang_key: True,
    # snapshot_key: "github",
    # domain_key: "code",
    "data_files_to_use": ast.literal_eval("['.zip']"),
    data_factory_key: DataAccessFactory(),  # Expect to create DataAccessLocal
}

if __name__ == "__main__":
    # Here we show how to run outside of ray
    # Create and configure the transform.
    # transform = CodeToParquetPythonTransform(params)
    transform = CodeToParquetTransform(params)
    # Use the local data access to read the input zip file.
    data_access = DataAccessLocal()
    file_to_process = os.path.join(input_folder, "application-java.zip")
    byte_array, _ = data_access.get_file(file_to_process)
    # Transform the file
    files_list, metadata = transform.transform_binary(file_name=file_to_process, byte_array=byte_array)
    print(f"Got {len(files_list)} output files")
    print(f"output metadata : {metadata}")
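Per transform_binary in code2parquet_transform.py (shown later in this diff), files_list holds (bytes, extension) tuples, one Parquet payload per input ZIP, and metadata carries a row count. A hedged sketch of persisting that output from the __main__ block above; the output file name is illustrative only:

for i, (parquet_bytes, extension) in enumerate(files_list):
    out_path = f"application-java_{i}{extension}"  # illustrative name, e.g. application-java_0.parquet
    with open(out_path, "wb") as f:
        f.write(parquet_bytes)
print(metadata.get("number of rows"))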
code2parquet_local_python.py
ADDED
@@ -0,0 +1,60 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import ast
import os
import sys

from code2parquet_transform import (  # domain_key,; snapshot_key,
    detect_programming_lang_cli_key,
    supported_langs_file_cli_key,
)
from code2parquet_transform_python import CodeToParquetPythonConfiguration
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils


# create parameters
supported_languages_file = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "../test-data/languages/lang_extensions.json")
)
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output"))
local_conf = {
    "input_folder": input_folder,
    "output_folder": output_folder,
}
worker_options = {"num_cpus": 0.8}
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
ingest_config = {
    supported_langs_file_cli_key: supported_languages_file,
    detect_programming_lang_cli_key: True,
    # snapshot_key: "github",
    # domain_key: "code",
}

params = {
    # Data access. Only required parameters are specified
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "data_files_to_use": ast.literal_eval("['.zip']"),
    # orchestrator
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
}

if __name__ == "__main__":
    sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))
    # create launcher
    launcher = PythonTransformLauncher(CodeToParquetPythonConfiguration())
    # launch
    launcher.launch()
code2parquet_s3_python.py
ADDED
@@ -0,0 +1,61 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import ast
import sys

from code2parquet_transform import (  # domain_key,; snapshot_key,
    detect_programming_lang_cli_key,
    supported_langs_file_cli_key,
)
from code2parquet_transform_python import CodeToParquetPythonConfiguration
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import GB, ParamsUtils


# create parameters
s3_cred = {
    "access_key": "localminioaccesskey",
    "secret_key": "localminiosecretkey",
    "url": "http://localhost:9000",
}
s3_conf = {
    "input_folder": "test/ingest_2_parquet/input",
    "output_folder": "test/ingest_2_parquet/output",
}
worker_options = {"num_cpus": 0.8, "memory": 2 * GB}
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
ingest_config = {
    supported_langs_file_cli_key: "test/ingest_2_parquet/languages/lang_extensions.json",
    detect_programming_lang_cli_key: True,
    # snapshot_key: "github",
    # domain_key: "code",
    "code2parquet_s3_cred": ParamsUtils.convert_to_ast(s3_cred),
}

params = {
    # Data access. Only required parameters are specified
    "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred),
    "data_s3_config": ParamsUtils.convert_to_ast(s3_conf),
    "data_files_to_use": ast.literal_eval("['.zip']"),
    # orchestrator
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
}

if __name__ == "__main__":
    sys.argv = ParamsUtils.dict_to_req(d=(params | ingest_config))
    # create launcher
    launcher = PythonTransformLauncher(CodeToParquetPythonConfiguration())
    # launch
    launcher.launch()
code2parquet_transform.py
ADDED
@@ -0,0 +1,222 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import io
import json
import logging
import os
import uuid
import zipfile
from argparse import ArgumentParser, Namespace
from datetime import datetime
from typing import Any

import pyarrow as pa
from data_processing.data_access import DataAccess, DataAccessFactory
from data_processing.transform import AbstractBinaryTransform, TransformConfiguration
from data_processing.utils import CLIArgumentProvider, TransformUtils, str2bool


shortname = "code2parquet"
cli_prefix = f"{shortname}_"

supported_langs_file_key = "supported_langs_file"
supported_langs_file_cli_key = f"{cli_prefix}{supported_langs_file_key}"

supported_languages_key = "supported_languages"
supported_languages_cli_key = f"{cli_prefix}{supported_languages_key}"

detect_programming_lang_key = "detect_programming_lang"
detect_programming_lang_cli_key = f"{cli_prefix}{detect_programming_lang_key}"
detect_programming_lang_default = True

data_factory_key = "data_factory"

domain_key = "domain"
domain_cli_key = f"{cli_prefix}{domain_key}"
snapshot_key = "snapshot"
snapshot_cli_key = f"{cli_prefix}{snapshot_key}"


def get_supported_languages(lang_file: str, data_access: DataAccess, logger: logging.Logger) -> dict[str, str]:
    logger.debug(f"Getting supported languages from file {lang_file}")
    json_data, _ = data_access.get_file(lang_file)
    lang_dict = json.loads(json_data.decode("utf-8"))
    reversed_dict = {ext: langs for langs, exts in lang_dict.items() for ext in exts}
    logger.debug(f"Supported languages {reversed_dict}")
    return reversed_dict


class CodeToParquetTransform(AbstractBinaryTransform):
    def __init__(self, config: dict):
        """
        Args:
            config: dictionary of configuration data
                supported_langs - dictionary of file extensions to language names.
                supported_langs_file - if supported_langs is not provided, then read a map
                    of language names keyed to a list of extensions, from this json file. The file is read using
                    the DataAccessFactory, under the code2parquet_data_factory key.
        """
        from data_processing.utils import get_logger

        self.logger = get_logger(__name__)
        super().__init__(config)
        self.languages_supported = config.get(supported_languages_key, None)
        if self.languages_supported is None:
            path = config.get(supported_langs_file_key, None)
            if path is not None:
                daf = config.get(data_factory_key, None)
                if daf is None:
                    raise ValueError(f"Neither {supported_languages_key} nor {data_factory_key} were provided.")
                data_access = daf.create_data_access()
                self.languages_supported = get_supported_languages(
                    lang_file=path, data_access=data_access, logger=self.logger
                )
        self.detect_programming_lang = config.get(detect_programming_lang_key, detect_programming_lang_default)
        if self.detect_programming_lang and self.languages_supported is None:
            raise RuntimeError(
                "Programming language detection requested without providing a mapping of extensions to languages"
            )
        domain = config.get(domain_key, None)
        snapshot = config.get(snapshot_key, None)
        self.shared_columns = {}
        if domain is not None:
            self.shared_columns["domain"] = domain
        if snapshot is not None:
            self.shared_columns["snapshot"] = snapshot

    def _get_lang_from_ext(self, ext):
        lang = "unknown"
        if ext is not None:
            lang = self.languages_supported.get(ext, lang)
        return lang

    def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
        """
        Converts a raw data file (ZIP) to Parquet format
        """
        # We currently only process .zip files
        if TransformUtils.get_file_extension(file_name)[1] != ".zip":
            self.logger.warning(f"Got unsupported file type {file_name}, skipping")
            return [], {}
        data = []
        number_of_rows = 0
        with zipfile.ZipFile(io.BytesIO(bytes(byte_array))) as opened_zip:
            # Loop through each file member in the ZIP archive
            for member in opened_zip.infolist():
                if not member.is_dir():
                    with opened_zip.open(member) as file:
                        try:
                            # Read the content of the file
                            content_bytes = file.read()
                            # Decode the content
                            content_string = TransformUtils.decode_content(content_bytes)
                            if content_string and len(content_string) > 0:
                                ext = TransformUtils.get_file_extension(member.filename)[1]
                                row_data = {
                                    "title": member.filename,
                                    "document": TransformUtils.get_file_basename(file_name),
                                    "contents": content_string,
                                    "document_id": str(uuid.uuid4()),
                                    "ext": ext,
                                    "hash": TransformUtils.str_to_hash(content_string),
                                    "size": len(content_string),
                                    "date_acquired": datetime.now().isoformat(),
                                    "repo_name": os.path.splitext(os.path.basename(file_name))[0],
                                } | self.shared_columns
                                if self.detect_programming_lang:
                                    lang = self._get_lang_from_ext(ext)
                                    row_data["programming_language"] = lang  # TODO column name should be configurable
                                data.append(row_data)
                                number_of_rows += 1
                            else:
                                self.logger.warning(
                                    f"file {member.filename} is empty. content {content_string}, skipping"
                                )
                        except Exception as e:
                            self.logger.warning(f"Exception {str(e)} processing file {member.filename}, skipping")
        table = pa.Table.from_pylist(data)
        return [(TransformUtils.convert_arrow_to_binary(table=table), ".parquet")], {"number of rows": number_of_rows}


class CodeToParquetTransformConfiguration(TransformConfiguration):
    """
    Provides support for configuring and using the associated Transform class, including
    configuration with CLI args and combining of metadata.
    """

    def __init__(self, transform_class: type[AbstractBinaryTransform] = CodeToParquetTransform):
        super().__init__(
            name=shortname,
            transform_class=transform_class,
            remove_from_metadata=[data_factory_key],
        )
        from data_processing.utils import get_logger

        self.logger = get_logger(__name__)
        self.daf = None

    def add_input_params(self, parser: ArgumentParser) -> None:
        """
        Add Transform-specific arguments to the given parser.
        These will be included in a dictionary used to initialize the CodeToParquetTransform.
        By convention a common prefix should be used for all mutator-specific CLI args
        (e.g., noop_, pii_, etc.)
        """
        parser.add_argument(
            f"--{cli_prefix}{supported_langs_file_key}",
            type=str,
            default=None,
            help="Path to file containing the list of supported languages",
        )
        parser.add_argument(
            f"--{cli_prefix}{detect_programming_lang_key}",
            type=lambda x: bool(str2bool(x)),
            default=detect_programming_lang_default,
            help="Infer the programming lang from the file extension using the file of supported languages",
        )
        parser.add_argument(
            f"--{snapshot_cli_key}", type=str, help="Snapshot value assigned to all imported documents.", default=None
        )
        parser.add_argument(
            f"--{domain_cli_key}",
            type=str,
            help="Domain value assigned to all imported documents.",
            default=None,
        )
        # Create the DataAccessFactory to use CLI args
        self.daf = DataAccessFactory(cli_prefix, False)
        # Add the DataAccessFactory parameters to the transform's configuration parameters.
        self.daf.add_input_params(parser)

    def apply_input_params(self, args: Namespace) -> bool:
        """
        Validate and apply the arguments that have been parsed
        :param args: user defined arguments.
        :return: True if validation passes, False otherwise
        """
        captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
        if captured.get(supported_langs_file_key, None) is None:
            self.logger.warning(f"{supported_langs_file_key} is required, but got None")
            return False
        self.params = captured | {
            # detect_programming_lang_key: captured.get(detect_programming_lang_key, None),
            # supported_langs_file_key: captured.get(supported_langs_file_key, ""),
            # domain_key: dargs.get(domain_key, ""),
            # snapshot_key: dargs.get(snapshot_key, ""),
            data_factory_key: self.daf,
        }
        # self.logger.info(f"Transform configuration {self.params}")  # Uhmm, let's NOT print out S3 keys please!

        # Validate and populate the transform's DataAccessFactory
        return self.daf.apply_input_params(args)
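The supported_langs_file consumed by get_supported_languages above is expected to map language names to lists of file extensions, which the function inverts into an extension-to-language lookup. A small sketch with hypothetical content; the lang_extensions.json shipped with the test data may list different languages:

import json

# hypothetical lang_extensions.json content
raw = '{"Java": [".java"], "Python": [".py"], "C": [".c", ".h"]}'
lang_dict = json.loads(raw)
# the same inversion performed by get_supported_languages()
ext_to_lang = {ext: lang for lang, exts in lang_dict.items() for ext in exts}
print(ext_to_lang)  # {'.java': 'Java', '.py': 'Python', '.c': 'C', '.h': 'C'}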
code2parquet_transform_python.py
ADDED
@@ -0,0 +1,39 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

from code2parquet_transform import (
    CodeToParquetTransform,
    CodeToParquetTransformConfiguration,
    data_factory_key,
    get_supported_languages,
    supported_langs_file_key,
)
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.runtime.pure_python.runtime_configuration import (
    PythonTransformRuntimeConfiguration,
)
from data_processing.utils import get_logger


logger = get_logger(__name__)


class CodeToParquetPythonConfiguration(PythonTransformRuntimeConfiguration):
    def __init__(self):
        super().__init__(transform_config=CodeToParquetTransformConfiguration(transform_class=CodeToParquetTransform))


if __name__ == "__main__":
    # launcher = NOOPRayLauncher()
    launcher = PythonTransformLauncher(CodeToParquetPythonConfiguration())
    logger.info("Launching code2parquet transform")
    launcher.launch()
code_quality_local.py
ADDED
@@ -0,0 +1,37 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import os

from code_quality_transform import CodeQualityTransform
from data_processing.data_access import DataAccessLocal


input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))

if __name__ == "__main__":
    codequality_params = {
        "code_quality_params": {
            "contents_column_name": "contents",
            "language_column_name": "language",
            "tokenizer": "codeparrot/codeparrot",
            "hf_token": None,
        }
    }
    transform = CodeQualityTransform(codequality_params)

    data_access = DataAccessLocal()
    table, _ = data_access.get_table(os.path.join(input_folder, "sample_1.parquet"))
    print(f"input table: {table}")
    # Transform the table
    table_list, metadata = transform.transform(table)
    print(f"\noutput table: {table_list}")