data-prep-toolkit-transforms 0.2.2__1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- UAST.py +270 -0
- UAST_parser.py +271 -0
- __init__.py +0 -0
- base_tokenizer.py +36 -0
- cc_net_prepro.py +168 -0
- cluster_estimator.py +59 -0
- code2parquet_local.py +51 -0
- code2parquet_local_python.py +60 -0
- code2parquet_local_ray.py +63 -0
- code2parquet_s3_python.py +61 -0
- code2parquet_s3_ray.py +64 -0
- code2parquet_transform.py +222 -0
- code2parquet_transform_python.py +39 -0
- code2parquet_transform_ray.py +123 -0
- code_profiler_local.py +66 -0
- code_profiler_local_python.py +46 -0
- code_profiler_local_ray.py +59 -0
- code_profiler_transform.py +208 -0
- code_profiler_transform_python.py +47 -0
- code_profiler_transform_ray.py +47 -0
- code_quality_local.py +37 -0
- code_quality_local_python.py +50 -0
- code_quality_local_ray.py +56 -0
- code_quality_s3_ray.py +57 -0
- code_quality_transform.py +312 -0
- code_quality_transform_python.py +27 -0
- code_quality_transform_ray.py +29 -0
- compute_shingles.py +50 -0
- data_prep_toolkit_transforms-0.2.2.dist-info/METADATA +199 -0
- data_prep_toolkit_transforms-0.2.2.dist-info/RECORD +211 -0
- data_prep_toolkit_transforms-0.2.2.dist-info/WHEEL +5 -0
- data_prep_toolkit_transforms-0.2.2.dist-info/top_level.txt +1 -0
- doc_Gopher_statistics.py +158 -0
- doc_c4_statistics.py +167 -0
- doc_chunk_chunkers.py +138 -0
- doc_chunk_local.py +34 -0
- doc_chunk_local_python.py +56 -0
- doc_chunk_local_ray.py +50 -0
- doc_chunk_s3_ray.py +57 -0
- doc_chunk_transform.py +255 -0
- doc_chunk_transform_python.py +43 -0
- doc_chunk_transform_ray.py +50 -0
- doc_id_local.py +54 -0
- doc_id_local_python.py +52 -0
- doc_id_local_ray.py +57 -0
- doc_id_s3_ray.py +60 -0
- doc_id_transform_base.py +177 -0
- doc_id_transform_python.py +120 -0
- doc_id_transform_ray.py +116 -0
- doc_quality_local.py +43 -0
- doc_quality_local_python.py +57 -0
- doc_quality_local_ray.py +59 -0
- doc_quality_s3_ray.py +70 -0
- doc_quality_transform.py +241 -0
- doc_quality_transform_python.py +42 -0
- doc_quality_transform_ray.py +43 -0
- doc_quality_utils.py +67 -0
- dpk_repo_level_order/__init__.py +2 -0
- dpk_repo_level_order/internal/check_languages.py +93 -0
- dpk_repo_level_order/internal/repo_grouper.py +137 -0
- dpk_repo_level_order/internal/repo_level_wrappers.py +227 -0
- dpk_repo_level_order/internal/sorting/semantic_ordering/__init__.py +5 -0
- dpk_repo_level_order/internal/sorting/semantic_ordering/build_dep_graph.py +460 -0
- dpk_repo_level_order/internal/sorting/semantic_ordering/sort_by_semantic_dep.py +91 -0
- dpk_repo_level_order/internal/sorting/semantic_ordering/topological_sort.py +204 -0
- dpk_repo_level_order/internal/sorting/semantic_ordering/utils.py +132 -0
- dpk_repo_level_order/internal/store/ray_store.py +152 -0
- dpk_repo_level_order/internal/store/store.py +139 -0
- dpk_repo_level_order/internal/store/store_factory.py +139 -0
- dpk_web2parquet/config.py +81 -0
- dpk_web2parquet/local.py +26 -0
- dpk_web2parquet/local_python.py +49 -0
- dpk_web2parquet/python_runtime.py +44 -0
- dpk_web2parquet/transform.py +126 -0
- dpk_web2parquet/utils.py +38 -0
- ededup_compute_execution_params.py +131 -0
- ededup_local.py +43 -0
- ededup_local_python.py +46 -0
- ededup_local_python_incremental.py +53 -0
- ededup_local_ray.py +55 -0
- ededup_local_ray_incremental.py +62 -0
- ededup_s3_ray.py +58 -0
- ededup_transform_base.py +249 -0
- ededup_transform_python.py +145 -0
- ededup_transform_ray.py +241 -0
- fdedup_compute_execution_params.py +232 -0
- fdedup_local_ray.py +71 -0
- fdedup_s3_ray.py +76 -0
- fdedup_support.py +621 -0
- fdedup_transform_ray.py +803 -0
- filter_local.py +58 -0
- filter_local_python.py +60 -0
- filter_local_ray.py +71 -0
- filter_s3_ray.py +74 -0
- filter_test_support.py +135 -0
- filter_transform.py +192 -0
- filter_transform_python.py +31 -0
- filter_transform_ray.py +32 -0
- flair_recognizer.py +160 -0
- hap_local.py +49 -0
- hap_local_python.py +53 -0
- hap_local_ray.py +60 -0
- hap_s3_ray.py +64 -0
- hap_transform.py +176 -0
- hap_transform_python.py +35 -0
- hap_transform_ray.py +39 -0
- header_cleanser_local.py +52 -0
- header_cleanser_local_python.py +53 -0
- header_cleanser_local_ray.py +64 -0
- header_cleanser_s3_ray.py +67 -0
- header_cleanser_test_support.py +94 -0
- header_cleanser_transform.py +237 -0
- header_cleanser_transform_python.py +31 -0
- header_cleanser_transform_ray.py +32 -0
- html2parquet_local.py +35 -0
- html2parquet_local_python.py +46 -0
- html2parquet_local_ray.py +55 -0
- html2parquet_s3_ray.py +57 -0
- html2parquet_transform.py +238 -0
- html2parquet_transform_python.py +42 -0
- html2parquet_transform_ray.py +70 -0
- lang_id_local.py +49 -0
- lang_id_local_python.py +55 -0
- lang_id_local_ray.py +65 -0
- lang_id_s3_ray.py +71 -0
- lang_id_transform.py +141 -0
- lang_id_transform_python.py +42 -0
- lang_id_transform_ray.py +43 -0
- lang_models.py +52 -0
- license_select_local.py +43 -0
- license_select_local_python.py +54 -0
- license_select_local_ray.py +61 -0
- license_select_s3_ray.py +56 -0
- license_select_transform.py +181 -0
- license_select_transform_python.py +27 -0
- license_select_transform_ray.py +30 -0
- malware_local.py +40 -0
- malware_local_python.py +55 -0
- malware_local_ray.py +62 -0
- malware_transform.py +194 -0
- malware_transform_python.py +31 -0
- malware_transform_ray.py +31 -0
- nlp.py +46 -0
- noop_local.py +34 -0
- noop_local_python.py +45 -0
- noop_local_python_multiprocessor.py +46 -0
- noop_local_ray.py +51 -0
- noop_s3_ray.py +57 -0
- noop_transform.py +118 -0
- noop_transform_python.py +45 -0
- noop_transform_ray.py +42 -0
- offline-customizations/config_LLM_runner_app.py +22 -0
- offline-customizations/generic_LLM_runner_app.py +583 -0
- pdf2parquet_local.py +39 -0
- pdf2parquet_local_python.py +56 -0
- pdf2parquet_local_ray.py +52 -0
- pdf2parquet_s3_ray.py +57 -0
- pdf2parquet_transform.py +494 -0
- pdf2parquet_transform_python.py +42 -0
- pdf2parquet_transform_ray.py +72 -0
- pii_analyzer.py +83 -0
- pii_anonymizer.py +38 -0
- pii_redactor_local.py +35 -0
- pii_redactor_local_python.py +37 -0
- pii_redactor_local_ray.py +54 -0
- pii_redactor_s3_ray.py +59 -0
- pii_redactor_transform.py +162 -0
- pii_redactor_transform_python.py +34 -0
- pii_redactor_transform_ray.py +49 -0
- profiler_compute_execution_params.py +122 -0
- profiler_local.py +44 -0
- profiler_local_python.py +45 -0
- profiler_local_ray.py +52 -0
- profiler_s3_ray.py +55 -0
- profiler_transform_base.py +176 -0
- profiler_transform_python.py +125 -0
- profiler_transform_ray.py +209 -0
- proglang_select_local.py +51 -0
- proglang_select_local_python.py +61 -0
- proglang_select_local_ray.py +67 -0
- proglang_select_transform.py +167 -0
- proglang_select_transform_python.py +32 -0
- proglang_select_transform_ray.py +88 -0
- repo_level_order_local_ray.py +63 -0
- repo_level_order_s3_ray.py +64 -0
- repo_level_order_transform.py +467 -0
- repo_level_order_transform_ray.py +24 -0
- resize_local.py +36 -0
- resize_local_python.py +46 -0
- resize_local_ray.py +51 -0
- resize_s3_ray.py +57 -0
- resize_transform.py +193 -0
- resize_transform_python.py +40 -0
- resize_transform_ray.py +40 -0
- text_encoder_local.py +44 -0
- text_encoder_local_python.py +44 -0
- text_encoder_local_ray.py +50 -0
- text_encoder_s3_ray.py +56 -0
- text_encoder_transform.py +127 -0
- text_encoder_transform_python.py +44 -0
- text_encoder_transform_ray.py +51 -0
- tokenization_local_long_doc_python.py +49 -0
- tokenization_local_python.py +40 -0
- tokenization_local_ray.py +49 -0
- tokenization_s3_long_doc_python.py +52 -0
- tokenization_s3_ray.py +59 -0
- tokenization_transform.py +258 -0
- tokenization_transform_python.py +27 -0
- tokenization_transform_ray.py +32 -0
- tokenization_utils.py +143 -0
- transformer.py +151 -0
UAST.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import networkx
|
|
15
|
+
import matplotlib.pyplot as plt
|
|
16
|
+
|
|
17
|
+
class UASTNode:
    """
    Represents a node in the Universal Abstract Syntax Tree (UAST).

    Attributes:
        id (int): The unique identifier of the node.
        code_snippet (str): The line(s) of code associated with the node.
        node_type (str): The type of the node.
        parents (list): The list of parent node ids.
        children (list): The list of child node ids.
        metadata (dict): The associated information/metadata of the node.
        start_point (tuple[int, int]): The start line number and byte of the line of the node.
        end_point (tuple[int, int]): The end line number and byte of the node.
    """

    def __init__(self,
                 id: int = 0,
                 code_snippet: str = None,
                 node_type: str = None,
                 parents: list = None,
                 children: list = None,
                 metadata: dict = None,
                 start_point: tuple[int, int] = (None, None),
                 end_point: tuple[int, int] = (None, None)) -> None:
        self.id = id
        self.code_snippet = code_snippet
        self.node_type = node_type
        # Build fresh containers per instance: mutable default arguments
        # (list()/dict() evaluated once in the signature) would be shared
        # by every node constructed with the defaults.
        self.parents = parents if parents is not None else list()
        self.children = children if children is not None else list()
        self.metadata = metadata if metadata is not None else dict()
        self.start_point = start_point
        self.end_point = end_point

    def __str__(self) -> str:
        return f"ID: {self.id}, Type: {self.node_type}, Snippet: {repr(self.code_snippet)}, Parents: {self.parents}, Children: {self.children}, Metadata = {self.metadata}"

    def __repr__(self) -> str:
        return f"ID: {self.id}, Type: {self.node_type}, Snippet: {repr(self.code_snippet)}, Parents: {self.parents}, Children: {self.children}, Metadata = {self.metadata}"

    def __eq__(self, other) -> bool:
        # Full structural equality over every attribute.
        return (self.id == other.id
                and self.code_snippet == other.code_snippet
                and self.node_type == other.node_type
                and self.parents == other.parents
                and self.children == other.children
                and self.metadata == other.metadata
                and self.start_point == other.start_point
                and self.end_point == other.end_point)
class UASTEdge:
    """
    Represents an edge in the UAST (Universal Abstract Syntax Tree).

    Attributes:
        start_id (int): The ID of the starting node of the edge.
        end_id (int): The ID of the ending node of the edge.
        directed_relation (str): The directed relation between the nodes.
        metadata (dict): The metadata information associated with the edge.
    """

    def __init__(self,
                 start_id: int = None,
                 end_id: int = None,
                 directed_relation: str = None,
                 metadata: dict = None):
        self.start_id = start_id
        self.end_id = end_id
        self.directed_relation = directed_relation
        # Fresh dict per edge: a dict() default argument would be shared
        # by every edge constructed without explicit metadata.
        self.metadata = metadata if metadata is not None else dict()

    def __str__(self) -> str:
        # The original string duplicated the metadata segment
        # ("Metadata = ..., Metadata: ...") -- emit it once.
        return f"Start: {self.start_id}, End: {self.end_id}, Relation: {self.directed_relation}, Metadata: {self.metadata}"

    def __repr__(self) -> str:
        return f"Start: {self.start_id}, End: {self.end_id}, Relation: {self.directed_relation}, Metadata: {self.metadata}"

    def __eq__(self, other) -> bool:
        return (self.start_id == other.start_id
                and self.end_id == other.end_id
                and self.directed_relation == other.directed_relation
                and self.metadata == other.metadata)

    def __hash__(self) -> int:
        # metadata is a dict and therefore unhashable; including it in the
        # tuple (as the original did) raised TypeError on every hash() call.
        # Hashing a subset of the fields compared by __eq__ is valid: equal
        # edges still hash equal.
        return hash((self.start_id, self.end_id, self.directed_relation))
class UAST:
    """
    Represents a graph of a Universal Abstract Syntax Tree (UAST).

    Attributes:
        nodes (dict[int, UASTNode]): A dictionary mapping node IDs to UASTNode objects.
        edges (list[UASTEdge]): A list of UASTEdge objects representing the edges between nodes.
        assigned_id (int): The ID to be assigned to the next node added to the UAST.
        nodes_of_type (dict): Maps a node type to the list of IDs of nodes of that type.
        root (UASTNode): The synthetic root node created for every UAST.

    Methods:
        add_node(node): Adds a node to the UAST.
        create_node(node_type, code_snippet, metadata, start_point, end_point): Creates a node, adds it and returns it.
        add_edge(node1, node2, directed_relation, metadata): Adds an edge between two nodes.
        get_node(id): Retrieves a node by ID.
        get_nodes_of_type(node_type): Retrieves the IDs of all nodes of the input type.
        get_children(node) / get_parents(node): Retrieve a node's child/parent IDs.
        print_graph(id): Prints the UAST starting from the specified node ID.
        save_to_file(file_path) / load_from_file(file_path): JSON (de)serialization.
        get_json() / load_from_json_string(obj): In-memory JSON (de)serialization.
        visualize(): Visualizes the graph using NetworkX.
    """

    def __init__(self):
        self.nodes: dict[int, UASTNode] = dict()
        self.edges: list[UASTEdge] = list()
        self.assigned_id: int = 0
        self.nodes_of_type: dict = dict()
        # Every UAST starts with a synthetic root all top-level nodes hang off.
        self.root = self._create_root()

    def __len__(self) -> int:
        return len(self.nodes)

    def __str__(self) -> str:
        return f"Nodes: {self.nodes} \nEdges: {self.edges}"

    def __repr__(self) -> str:
        return f"Nodes: {self.nodes} \nEdges: {self.edges}"

    def __eq__(self, other) -> bool:
        return self.nodes == other.nodes and self.edges == other.edges

    def add_node(self, node: UASTNode) -> None:
        """
        Register a node in the graph and in the per-type index.

        The node is stored under the running assigned_id counter;
        create_node() guarantees node.id == assigned_id at this point.
        """
        self.nodes[self.assigned_id] = node
        self.assigned_id += 1
        if node.node_type not in self.nodes_of_type:
            self.nodes_of_type[node.node_type] = list()
        self.nodes_of_type[node.node_type].append(node.id)
        return

    def _create_root(self) -> UASTNode:
        # The (-1, ...) start/end points mark the root as synthetic,
        # i.e. not belonging to the parsed source text.
        return self.create_node(node_type="uast_root", code_snippet="root", metadata={"info": "links to all"}, start_point=(-1, 0), end_point=(-1, 3))

    def create_node(self,
                    node_type: str = None,
                    code_snippet: str = None,
                    metadata: dict = None,
                    start_point: tuple[int, int] = (None, None),
                    end_point: tuple[int, int] = (None, None)) -> UASTNode:
        """
        Create a new node, add it to the UAST and return the node object.

        metadata defaults to None rather than a dict() literal: a mutable
        default argument would be shared by every node created without
        explicit metadata.
        """
        if metadata is None:
            metadata = dict()
        node = UASTNode(id=self.assigned_id, node_type=node_type, code_snippet=code_snippet,
                        metadata=metadata, start_point=start_point, end_point=end_point,
                        children=list(), parents=list())
        self.add_node(node)
        return node

    def add_edge(self, node1: UASTNode = None, node2: UASTNode = None,
                 directed_relation: str = None, metadata: dict = None) -> UASTEdge:
        """
        Add a directed edge node1 -> node2, updating both nodes'
        parent/child ID lists. Returns the created edge.
        """
        # Fresh dict per edge instead of a shared mutable default.
        if metadata is None:
            metadata = dict()
        edge = UASTEdge(start_id=node1.id, end_id=node2.id, directed_relation=directed_relation, metadata=metadata)
        node2.parents.append(node1.id)
        node1.children.append(node2.id)
        self.edges.append(edge)
        return edge

    def get_node(self, id: int) -> UASTNode:
        """Retrieve a node by its ID."""
        return self.nodes[id]

    def get_nodes_of_type(self, node_type: str) -> list[int]:
        """Return the IDs of all nodes of the given type (KeyError if none)."""
        return self.nodes_of_type[node_type]

    def get_children(self, node: UASTNode) -> list[int]:
        """Return the IDs of the node's children."""
        return node.children

    def get_parents(self, node: UASTNode) -> int:
        """Return the IDs of the node's parents."""
        return node.parents

    def print_graph(self, id):
        """Print every node reachable from the given node ID, DFS order."""
        if id not in self.nodes:
            return
        visited = set()

        def dfs(id, visited):
            visited.add(id)
            print(self.nodes[id])
            for child in self.nodes[id].children:
                if child not in visited:
                    dfs(child, visited)

        dfs(id, visited)
        del visited

    def save_to_file(self, file_path):
        """Serialize the UAST to a JSON file at file_path."""
        # Normalize children/parents to plain lists in place so they are
        # JSON-serializable regardless of how they were built.
        for v in self.nodes.values():
            v.children = list(v.children)
            v.parents = list(v.parents)

        data = {
            "nodes": {str(k): v.__dict__ for k, v in self.nodes.items()},
            "edges": [edge.__dict__ for edge in self.edges]
        }

        with open(file_path, 'w') as f:
            json.dump(data, f, indent=4)

        return

    def get_json(self):
        """Return the UAST as a JSON-serializable dict (same shape as save_to_file)."""
        # Normalize children/parents to plain lists in place, as in save_to_file.
        for v in self.nodes.values():
            v.children = list(v.children)
            v.parents = list(v.parents)

        data = {
            "nodes": {str(k): v.__dict__ for k, v in self.nodes.items()},
            "edges": [edge.__dict__ for edge in self.edges]
        }

        return data

    def load_from_json_string(self, obj: str):
        """Rebuild the UAST from a JSON string produced by get_json()/save_to_file()."""
        data = json.loads(obj)
        self.nodes = {int(k): UASTNode(**v) for k, v in data["nodes"].items()}
        self.edges = [UASTEdge(**edge) for edge in data["edges"]]
        self.assigned_id = max(self.nodes.keys()) + 1
        # JSON stores tuples as lists; restore the tuple invariant.
        for node in self.nodes.values():
            node.start_point = tuple(node.start_point)
            node.end_point = tuple(node.end_point)
        return

    def load_from_file(self, file_path):
        """Rebuild the UAST from a JSON file produced by save_to_file()."""
        with open(file_path, 'r') as f:
            data = json.load(f)
        self.nodes = {int(k): UASTNode(**v) for k, v in data["nodes"].items()}
        self.edges = [UASTEdge(**edge) for edge in data["edges"]]
        self.assigned_id = max(self.nodes.keys()) + 1
        # JSON stores tuples as lists; restore the tuple invariant.
        for node in self.nodes.values():
            node.start_point = tuple(node.start_point)
            node.end_point = tuple(node.end_point)
        return

    def visualize(self):
        """Draw the graph with NetworkX/matplotlib, labeling nodes by type."""
        edges_viz = []
        labeldict = {}
        for edge in self.edges:
            edges_viz.append([edge.start_id, edge.end_id])
            labeldict[edge.start_id] = self.nodes[edge.start_id].node_type
            labeldict[edge.end_id] = self.nodes[edge.end_id].node_type
        print(labeldict)
        plt.figure(figsize=(10, 10))
        plt.rcParams["font.size"] = 20
        G = networkx.Graph()
        G.add_edges_from(edges_viz)
        pos = networkx.spring_layout(G)
        networkx.draw_networkx_labels(G, pos, labels=labeldict, font_size=12, )
        networkx.draw_networkx_nodes(G, pos, nodelist=self.nodes.keys(), node_size=300)
        networkx.draw_networkx_edges(G, pos, edgelist=edges_viz)
        plt.show()
        return
UAST_parser.py
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from UAST import UAST
|
|
14
|
+
import json
|
|
15
|
+
from tree_sitter import Tree
|
|
16
|
+
import os
|
|
17
|
+
import sys
|
|
18
|
+
sys.setrecursionlimit(10000)
|
|
19
|
+
|
|
20
|
+
"""
|
|
21
|
+
Initialize the parser with a path for rules and grammar.
|
|
22
|
+
"""
|
|
23
|
+
class UASTParser():
    """
    Converts a tree-sitter AST into a UAST using per-language rule files
    and a shared grammar definition loaded from disk at construction time.

    Raises:
        FileNotFoundError: if the grammar file or ruleset directory is missing.
    """

    def __init__(self):
        self.language: str = None
        self.uast: UAST = None
        self.rules: dict = None
        # Per-language rule files are loaded once and cached here.
        self.cached_rules = dict()

        # Compute the absolute path to the grammar file relative to this module.
        grammar_dir = os.path.dirname(os.path.abspath(__file__))
        self.grammar_path = os.path.join(grammar_dir, '..', '..', 'python', 'src', 'grammar', 'UAST_Grammar.json')

        if not os.path.exists(self.grammar_path):
            print("Current working directory:", os.getcwd())
            raise FileNotFoundError(f"UAST Grammar file not found at {self.grammar_path}. Please ensure it exists.")

        with open(self.grammar_path, "r") as grammar_file:
            self.grammar = json.load(grammar_file)

        # Compute the absolute path to the ruleset directory based on the script's location.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        self.rule_directory = os.path.join(script_dir, 'ruleset/')

        if not os.path.isdir(self.rule_directory):
            print("Script directory:", script_dir)
            raise FileNotFoundError(f"Ruleset directory not found at {self.rule_directory}. Please ensure it exists.")

        # Rule files are named <rule_file_name><language>.json
        self.rule_file_name: str = "UAST_rules_"

        self.AST: Tree = None
        self.extracted: str = None
        # Maps of user-defined function/class snippets to their node ids,
        # populated by the exec'd statements in user_defined_entity.
        self.function_info = dict()
        self.class_info = dict()
        self.user_defined_entity = {"uast_function": "self.function_info[snippet] = id",
                                    "uast_class": "self.class_info[snippet] = id"}

    def set_rule_dir_path(self, path: str):
        """Override the directory the per-language rule files are read from."""
        self.rule_directory = path

    def set_grammar_path(self, path: str):
        """Point the parser at a different grammar file and reload it."""
        self.grammar_path = path
        # Context manager closes the handle promptly (the original leaked
        # it via json.load(open(...))).
        with open(self.grammar_path, "r") as grammar_file:
            self.grammar = json.load(grammar_file)

    def set_language(self, language: str):
        """Select the language to parse, loading (and caching) its rule file."""
        self.language = language

        if language not in self.cached_rules:
            rule_path = self.rule_directory + self.rule_file_name + language + '.json'
            with open(rule_path, "r") as rule_file:
                self.cached_rules[language] = json.load(rule_file)

        self.rules = self.cached_rules[language]

    def parse(self, AST, code_snippet):
        """Run a DFS over the tree-sitter AST, building and returning a UAST."""
        if self.language is None:
            print("Language not loaded")
            return
        self.AST = AST
        self.uast = UAST()
        self.uast.root.metadata["language"] = self.language
        self.uast.root.metadata["loc_snippet"] = self.count_loc(code_snippet, self.language)
        self._dfs(AST_node=self.AST.root_node, parent=self.uast.root)
        return self.uast

    def calculate_code_to_comment_ratio(self, root_node):
        """
        Return loc_snippet / total comment LOC over the subtree rooted at
        root_node, or None when the subtree contains no comment lines.
        """
        loc_snippet = root_node.metadata.get("loc_snippet", 0)
        total_comment_loc = 0

        # Recursively accumulate loc_original_code of uast_comment nodes.
        def sum_comment_loc(node):
            nonlocal total_comment_loc
            if node.node_type == "uast_comment":
                total_comment_loc += node.metadata.get("loc_original_code", 0)
            for child_id in node.children:
                # children holds ids; resolve them through the uast.
                sum_comment_loc(self.uast.get_node(child_id))

        sum_comment_loc(root_node)

        if total_comment_loc > 0:
            return loc_snippet / total_comment_loc
        else:
            return None  # No comments in the subtree.

    def count_lo_comments(self, code_snippet):
        """Count non-blank lines of a (comment) snippet."""
        lines = code_snippet.split('\n')
        loc_count = 0
        for line in lines:
            stripped_line = line.strip()
            # Count all lines except blank ones.
            if stripped_line:
                loc_count += 1
        return loc_count

    def count_loc(self, code_snippet, language):
        """
        Count lines of code in code_snippet, skipping blank lines and
        single-/multi-line comments per the language's comment markers.

        Raises:
            ValueError: if the language has no registered comment markers.
        """
        # (single-line marker, multi-line start, multi-line end) per language.
        language_comment_markers = {
            "c": ('//', '/*', '*/'),
            "java": ('//', '/*', '*/'),
            "C#": ('//', '/*', '*/'),
            "c_sharp": ('//', '/*', '*/'),
            "cpp": ('//', '/*', '*/'),
            "objc": ('//', '/*', '*/'),
            "rust": ('//', '/*', '*/'),
            "go": ('//', '/*', '*/'),
            "kotlin": ('//', '/*', '*/'),
            "VHDL": ('--', None, None),
            "py": ('#', '"""', '"""'),
            "js": ('//', '/*', '*/'),
            "dart": ('//', '/*', '*/'),
            "QML": ('//', None, None),
            "typescript": ('//', '/*', '*/'),
            "perl": ('#', None, None),
            "haskell": ('--', '{-', '-}'),
            "elm": ('--', '{-', '-}'),
            "agda": ('--', '{-', '-}'),
            "d": ('//', '/*', '*/'),
            "nim": ('#', '##', None),
            "ocaml": ('(*', '(*', '*)'),
            "scala": ('//', '/*', '*/')
        }

        single_line_comment, multi_line_comment_start, multi_line_comment_end = language_comment_markers.get(language, (None, None, None))

        if not single_line_comment:
            raise ValueError(f"Unsupported language: {language}")

        lines = code_snippet.split('\n')
        loc_count = 0
        inside_multiline_comment = False

        for line in lines:
            stripped_line = line.strip()

            # Skip empty lines.
            if not stripped_line:
                continue

            # Handle multi-line comments.
            # NOTE(review): a line that both opens and closes a multi-line
            # comment ("/* x */") leaves the flag set -- limitation kept as-is.
            if multi_line_comment_start and multi_line_comment_end:
                if inside_multiline_comment:
                    if multi_line_comment_end in stripped_line:
                        inside_multiline_comment = False
                    continue
                elif multi_line_comment_start in stripped_line:
                    inside_multiline_comment = True
                    continue

            # Skip single-line comments.
            if stripped_line.startswith(single_line_comment):
                continue

            # Neither a comment nor blank: count it as LOC.
            loc_count += 1

        return loc_count

    def _add_user_defined(self, node):
        """
        Record function/class definition nodes in function_info/class_info
        and tag the node metadata as user_defined.
        """
        id = node.id
        type = node.node_type

        if node.code_snippet is not None:
            # Strip the node type keyword out of the snippet to get the name part.
            snippet = node.code_snippet.replace(type, '').strip()
        else:
            snippet = ""
            print(f"Warning: node.code_snippet is None for node type: {type}")

        if type in self.user_defined_entity:
            # NOTE(security): exec() of a statement string; these strings are
            # fixed in __init__ but are still executed dynamically. The exec'd
            # code reads the local variables `snippet` and `id`.
            exec(self.user_defined_entity[type])
            node.metadata["user_defined"] = True

        del id
        del type
        del snippet
        return

    def _dfs(self, AST_node, parent):
        """
        Recursively walk the tree-sitter AST. AST node types with a rule
        produce a UAST node linked (edge "parent_node") to the closest
        emitted ancestor; other nodes are traversed transparently.
        """
        if AST_node.type in self.rules:
            ast_snippet = AST_node.text.decode("utf8")
            node_type = self.rules[AST_node.type]["uast_node_type"]
            exec_string = self.rules[AST_node.type]["extractor"]
            uast_snippet = self._extract(ast_snippet=ast_snippet, node_type=node_type, exec_string=exec_string)

            # Comment nodes count every non-blank line; code nodes skip comments.
            if node_type == "uast_comment":
                loc_original_code = self.count_lo_comments(ast_snippet)
            else:
                loc_original_code = self.count_loc(ast_snippet, self.language)

            node = self.uast.create_node(
                node_type=node_type,
                code_snippet=uast_snippet,
                # choose to enable or disable the storage of original code by removing the following line.
                metadata={
                    "original_code": ast_snippet,
                    "loc_original_code": loc_original_code
                },
            )
            self._add_user_defined(node)
            self.uast.add_edge(node1=parent, node2=node, directed_relation="parent_node")
            parent = node

        for child in AST_node.children:
            try:
                self._dfs(AST_node=child, parent=parent)
            except RecursionError as e:
                print(f"RecursionError caught: {str(e)}")

    def _extract(self, ast_snippet, node_type, exec_string):
        """
        Run the rule's extractor snippet (which reads `code_snippet` and
        sets self.extracted) and prefix the grammar keyword for node_type.
        """
        code_snippet = ast_snippet  # read by the exec'd extractor code
        # NOTE(security): exec() runs code loaded from the ruleset files;
        # only trusted rule files must be used.
        try:
            exec(exec_string)
        except Exception as e:
            print(e)
        try:
            return self.grammar[node_type]["keyword"] + " " + self.extracted
        except Exception as e:
            print(e)
__init__.py
ADDED
|
File without changes
|
base_tokenizer.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import string
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
"""
|
|
17
|
+
This implements the most simplistic tokenizer based on the white spaces
|
|
18
|
+
that can be overwritten by a different a different one. This method is
|
|
19
|
+
build in the library and can be overwritten using approach described at
|
|
20
|
+
https://stackoverflow.com/questions/37553545/how-do-i-override-a-function-of-a-python-library
|
|
21
|
+
|
|
22
|
+
import base_tokenizer
|
|
23
|
+
base_tokenizer.tokenize = my_local_tokenize
|
|
24
|
+
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def tokenize(text: str) -> list[str]:
    """
    Tokenize string

    :param text: source text
    :return: list of tokens (words)
    """
    # Normalize in stages: trim surrounding whitespace, lowercase, then
    # drop every punctuation character in one translate() pass.
    trimmed = text.strip()
    lowered = trimmed.lower()
    punctuation_table = str.maketrans("", "", string.punctuation)
    cleaned = lowered.translate(punctuation_table)
    # Whitespace-delimited words are the tokens.
    return cleaned.split()
|