data-prep-toolkit-transforms 0.2.2__1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211) hide show
  1. UAST.py +270 -0
  2. UAST_parser.py +271 -0
  3. __init__.py +0 -0
  4. base_tokenizer.py +36 -0
  5. cc_net_prepro.py +168 -0
  6. cluster_estimator.py +59 -0
  7. code2parquet_local.py +51 -0
  8. code2parquet_local_python.py +60 -0
  9. code2parquet_local_ray.py +63 -0
  10. code2parquet_s3_python.py +61 -0
  11. code2parquet_s3_ray.py +64 -0
  12. code2parquet_transform.py +222 -0
  13. code2parquet_transform_python.py +39 -0
  14. code2parquet_transform_ray.py +123 -0
  15. code_profiler_local.py +66 -0
  16. code_profiler_local_python.py +46 -0
  17. code_profiler_local_ray.py +59 -0
  18. code_profiler_transform.py +208 -0
  19. code_profiler_transform_python.py +47 -0
  20. code_profiler_transform_ray.py +47 -0
  21. code_quality_local.py +37 -0
  22. code_quality_local_python.py +50 -0
  23. code_quality_local_ray.py +56 -0
  24. code_quality_s3_ray.py +57 -0
  25. code_quality_transform.py +312 -0
  26. code_quality_transform_python.py +27 -0
  27. code_quality_transform_ray.py +29 -0
  28. compute_shingles.py +50 -0
  29. data_prep_toolkit_transforms-0.2.2.dist-info/METADATA +199 -0
  30. data_prep_toolkit_transforms-0.2.2.dist-info/RECORD +211 -0
  31. data_prep_toolkit_transforms-0.2.2.dist-info/WHEEL +5 -0
  32. data_prep_toolkit_transforms-0.2.2.dist-info/top_level.txt +1 -0
  33. doc_Gopher_statistics.py +158 -0
  34. doc_c4_statistics.py +167 -0
  35. doc_chunk_chunkers.py +138 -0
  36. doc_chunk_local.py +34 -0
  37. doc_chunk_local_python.py +56 -0
  38. doc_chunk_local_ray.py +50 -0
  39. doc_chunk_s3_ray.py +57 -0
  40. doc_chunk_transform.py +255 -0
  41. doc_chunk_transform_python.py +43 -0
  42. doc_chunk_transform_ray.py +50 -0
  43. doc_id_local.py +54 -0
  44. doc_id_local_python.py +52 -0
  45. doc_id_local_ray.py +57 -0
  46. doc_id_s3_ray.py +60 -0
  47. doc_id_transform_base.py +177 -0
  48. doc_id_transform_python.py +120 -0
  49. doc_id_transform_ray.py +116 -0
  50. doc_quality_local.py +43 -0
  51. doc_quality_local_python.py +57 -0
  52. doc_quality_local_ray.py +59 -0
  53. doc_quality_s3_ray.py +70 -0
  54. doc_quality_transform.py +241 -0
  55. doc_quality_transform_python.py +42 -0
  56. doc_quality_transform_ray.py +43 -0
  57. doc_quality_utils.py +67 -0
  58. dpk_repo_level_order/__init__.py +2 -0
  59. dpk_repo_level_order/internal/check_languages.py +93 -0
  60. dpk_repo_level_order/internal/repo_grouper.py +137 -0
  61. dpk_repo_level_order/internal/repo_level_wrappers.py +227 -0
  62. dpk_repo_level_order/internal/sorting/semantic_ordering/__init__.py +5 -0
  63. dpk_repo_level_order/internal/sorting/semantic_ordering/build_dep_graph.py +460 -0
  64. dpk_repo_level_order/internal/sorting/semantic_ordering/sort_by_semantic_dep.py +91 -0
  65. dpk_repo_level_order/internal/sorting/semantic_ordering/topological_sort.py +204 -0
  66. dpk_repo_level_order/internal/sorting/semantic_ordering/utils.py +132 -0
  67. dpk_repo_level_order/internal/store/ray_store.py +152 -0
  68. dpk_repo_level_order/internal/store/store.py +139 -0
  69. dpk_repo_level_order/internal/store/store_factory.py +139 -0
  70. dpk_web2parquet/config.py +81 -0
  71. dpk_web2parquet/local.py +26 -0
  72. dpk_web2parquet/local_python.py +49 -0
  73. dpk_web2parquet/python_runtime.py +44 -0
  74. dpk_web2parquet/transform.py +126 -0
  75. dpk_web2parquet/utils.py +38 -0
  76. ededup_compute_execution_params.py +131 -0
  77. ededup_local.py +43 -0
  78. ededup_local_python.py +46 -0
  79. ededup_local_python_incremental.py +53 -0
  80. ededup_local_ray.py +55 -0
  81. ededup_local_ray_incremental.py +62 -0
  82. ededup_s3_ray.py +58 -0
  83. ededup_transform_base.py +249 -0
  84. ededup_transform_python.py +145 -0
  85. ededup_transform_ray.py +241 -0
  86. fdedup_compute_execution_params.py +232 -0
  87. fdedup_local_ray.py +71 -0
  88. fdedup_s3_ray.py +76 -0
  89. fdedup_support.py +621 -0
  90. fdedup_transform_ray.py +803 -0
  91. filter_local.py +58 -0
  92. filter_local_python.py +60 -0
  93. filter_local_ray.py +71 -0
  94. filter_s3_ray.py +74 -0
  95. filter_test_support.py +135 -0
  96. filter_transform.py +192 -0
  97. filter_transform_python.py +31 -0
  98. filter_transform_ray.py +32 -0
  99. flair_recognizer.py +160 -0
  100. hap_local.py +49 -0
  101. hap_local_python.py +53 -0
  102. hap_local_ray.py +60 -0
  103. hap_s3_ray.py +64 -0
  104. hap_transform.py +176 -0
  105. hap_transform_python.py +35 -0
  106. hap_transform_ray.py +39 -0
  107. header_cleanser_local.py +52 -0
  108. header_cleanser_local_python.py +53 -0
  109. header_cleanser_local_ray.py +64 -0
  110. header_cleanser_s3_ray.py +67 -0
  111. header_cleanser_test_support.py +94 -0
  112. header_cleanser_transform.py +237 -0
  113. header_cleanser_transform_python.py +31 -0
  114. header_cleanser_transform_ray.py +32 -0
  115. html2parquet_local.py +35 -0
  116. html2parquet_local_python.py +46 -0
  117. html2parquet_local_ray.py +55 -0
  118. html2parquet_s3_ray.py +57 -0
  119. html2parquet_transform.py +238 -0
  120. html2parquet_transform_python.py +42 -0
  121. html2parquet_transform_ray.py +70 -0
  122. lang_id_local.py +49 -0
  123. lang_id_local_python.py +55 -0
  124. lang_id_local_ray.py +65 -0
  125. lang_id_s3_ray.py +71 -0
  126. lang_id_transform.py +141 -0
  127. lang_id_transform_python.py +42 -0
  128. lang_id_transform_ray.py +43 -0
  129. lang_models.py +52 -0
  130. license_select_local.py +43 -0
  131. license_select_local_python.py +54 -0
  132. license_select_local_ray.py +61 -0
  133. license_select_s3_ray.py +56 -0
  134. license_select_transform.py +181 -0
  135. license_select_transform_python.py +27 -0
  136. license_select_transform_ray.py +30 -0
  137. malware_local.py +40 -0
  138. malware_local_python.py +55 -0
  139. malware_local_ray.py +62 -0
  140. malware_transform.py +194 -0
  141. malware_transform_python.py +31 -0
  142. malware_transform_ray.py +31 -0
  143. nlp.py +46 -0
  144. noop_local.py +34 -0
  145. noop_local_python.py +45 -0
  146. noop_local_python_multiprocessor.py +46 -0
  147. noop_local_ray.py +51 -0
  148. noop_s3_ray.py +57 -0
  149. noop_transform.py +118 -0
  150. noop_transform_python.py +45 -0
  151. noop_transform_ray.py +42 -0
  152. offline-customizations/config_LLM_runner_app.py +22 -0
  153. offline-customizations/generic_LLM_runner_app.py +583 -0
  154. pdf2parquet_local.py +39 -0
  155. pdf2parquet_local_python.py +56 -0
  156. pdf2parquet_local_ray.py +52 -0
  157. pdf2parquet_s3_ray.py +57 -0
  158. pdf2parquet_transform.py +494 -0
  159. pdf2parquet_transform_python.py +42 -0
  160. pdf2parquet_transform_ray.py +72 -0
  161. pii_analyzer.py +83 -0
  162. pii_anonymizer.py +38 -0
  163. pii_redactor_local.py +35 -0
  164. pii_redactor_local_python.py +37 -0
  165. pii_redactor_local_ray.py +54 -0
  166. pii_redactor_s3_ray.py +59 -0
  167. pii_redactor_transform.py +162 -0
  168. pii_redactor_transform_python.py +34 -0
  169. pii_redactor_transform_ray.py +49 -0
  170. profiler_compute_execution_params.py +122 -0
  171. profiler_local.py +44 -0
  172. profiler_local_python.py +45 -0
  173. profiler_local_ray.py +52 -0
  174. profiler_s3_ray.py +55 -0
  175. profiler_transform_base.py +176 -0
  176. profiler_transform_python.py +125 -0
  177. profiler_transform_ray.py +209 -0
  178. proglang_select_local.py +51 -0
  179. proglang_select_local_python.py +61 -0
  180. proglang_select_local_ray.py +67 -0
  181. proglang_select_transform.py +167 -0
  182. proglang_select_transform_python.py +32 -0
  183. proglang_select_transform_ray.py +88 -0
  184. repo_level_order_local_ray.py +63 -0
  185. repo_level_order_s3_ray.py +64 -0
  186. repo_level_order_transform.py +467 -0
  187. repo_level_order_transform_ray.py +24 -0
  188. resize_local.py +36 -0
  189. resize_local_python.py +46 -0
  190. resize_local_ray.py +51 -0
  191. resize_s3_ray.py +57 -0
  192. resize_transform.py +193 -0
  193. resize_transform_python.py +40 -0
  194. resize_transform_ray.py +40 -0
  195. text_encoder_local.py +44 -0
  196. text_encoder_local_python.py +44 -0
  197. text_encoder_local_ray.py +50 -0
  198. text_encoder_s3_ray.py +56 -0
  199. text_encoder_transform.py +127 -0
  200. text_encoder_transform_python.py +44 -0
  201. text_encoder_transform_ray.py +51 -0
  202. tokenization_local_long_doc_python.py +49 -0
  203. tokenization_local_python.py +40 -0
  204. tokenization_local_ray.py +49 -0
  205. tokenization_s3_long_doc_python.py +52 -0
  206. tokenization_s3_ray.py +59 -0
  207. tokenization_transform.py +258 -0
  208. tokenization_transform_python.py +27 -0
  209. tokenization_transform_ray.py +32 -0
  210. tokenization_utils.py +143 -0
  211. transformer.py +151 -0
UAST.py ADDED
@@ -0,0 +1,270 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import json
14
+ import networkx
15
+ import matplotlib.pyplot as plt
16
+
17
class UASTNode:
    """
    Represents a node in the Universal Abstract Syntax Tree (UAST).

    Attributes:
        id (int): The unique identifier of the node.
        code_snippet (str): The line(s) of code associated with the node.
        node_type (str): The type of the node.
        parents (list): The list of parent node IDs.
        children (list): The list of child node IDs.
        metadata (dict): The associated information/metadata of the node.
        start_point (tuple[int, int]): The start line number and byte of the node.
        end_point (tuple[int, int]): The end line number and byte of the node.
    """

    def __init__(self,
                 id: int = 0,
                 code_snippet: str = None,
                 node_type: str = None,
                 parents: list = None,
                 children: list = None,
                 metadata: dict = None,
                 start_point: tuple[int, int] = (None, None),
                 end_point: tuple[int, int] = (None, None)) -> None:
        self.id = id
        self.code_snippet = code_snippet
        self.node_type = node_type
        # Bug fix: the original defaults were `list()`/`dict()` evaluated once
        # at function-definition time, so every node constructed without
        # explicit arguments shared the SAME parents/children/metadata
        # objects. Use None sentinels and allocate fresh containers instead.
        self.parents = list() if parents is None else parents
        self.children = list() if children is None else children
        self.metadata = dict() if metadata is None else metadata
        self.start_point = start_point
        self.end_point = end_point

    def __str__(self) -> str:
        return f"ID: {self.id}, Type: {self.node_type}, Snippet: {repr(self.code_snippet)}, Parents: {self.parents}, Children: {self.children}, Metadata = {self.metadata}"

    def __repr__(self) -> str:
        # Mirrors __str__ (the original duplicated the format string verbatim)
        # so nodes render identically inside containers and on their own.
        return self.__str__()

    def __eq__(self, other) -> bool:
        return self.id == other.id and self.code_snippet == other.code_snippet and self.node_type == other.node_type and self.parents == other.parents and self.children == other.children and self.metadata == other.metadata and self.start_point == other.start_point and self.end_point == other.end_point
59
+
60
class UASTEdge:
    """
    Represents an edge in the UAST (Universal Abstract Syntax Tree).

    Attributes:
        start_id (int): The ID of the starting node of the edge.
        end_id (int): The ID of the ending node of the edge.
        directed_relation (str): The directed relation between the nodes.
        metadata (dict): The metadata information associated with the edge.
    """

    def __init__(self,
                 start_id: int = None,
                 end_id: int = None,
                 directed_relation: str = None,
                 metadata: dict = None):
        self.start_id = start_id
        self.end_id = end_id
        self.directed_relation = directed_relation
        # Bug fix: `metadata: dict = dict()` was a mutable default shared by
        # every edge constructed without an explicit dict.
        self.metadata = dict() if metadata is None else metadata

    def __str__(self) -> str:
        # Bug fix: the original printed the metadata field twice
        # ("Metadata = ..., Metadata: ...") due to a copy/paste slip.
        return f"Start: {self.start_id}, End: {self.end_id}, Relation: {self.directed_relation}, Metadata = {self.metadata}"

    def __repr__(self) -> str:
        return self.__str__()

    def __eq__(self, other) -> bool:
        return self.start_id == other.start_id and self.end_id == other.end_id and self.directed_relation == other.directed_relation and self.metadata == other.metadata

    def __hash__(self) -> int:
        # Bug fix: the original hashed self.metadata, but dicts are
        # unhashable, so hash() raised TypeError on every edge carrying its
        # (default) dict metadata. Hash only the immutable identifying
        # fields; equal edges per __eq__ still produce equal hashes, which
        # is all the __hash__/__eq__ contract requires.
        return hash((self.start_id, self.end_id, self.directed_relation))
93
+
94
class UAST:
    """
    Represents a graph of a Universal Abstract Syntax Tree (UAST).

    Attributes:
        nodes (dict[int, UASTNode]): A dictionary mapping node IDs to UASTNode objects.
        edges (list[UASTEdge]): A list of UASTEdge objects representing the edges between nodes.
        assigned_id (int): The ID to be assigned to the next node added to the UAST.
        nodes_of_type (dict): Maps each node type to the list of IDs of nodes of that type.
        root (UASTNode): Synthetic "uast_root" node that every top-level node hangs off.
    """

    def __init__(self):
        self.nodes: dict[int, UASTNode] = dict()
        self.edges: list[UASTEdge] = list()
        self.assigned_id: int = 0
        self.nodes_of_type: dict = dict()
        self.root = self._create_root()

    def __len__(self) -> int:
        return len(self.nodes)

    def __str__(self) -> str:
        return f"Nodes: {self.nodes} \nEdges: {self.edges}"

    def __repr__(self) -> str:
        return self.__str__()

    def __eq__(self, other) -> bool:
        return self.nodes == other.nodes and self.edges == other.edges

    def add_node(self, node: UASTNode) -> None:
        """Register `node` under the next free ID and index it by node type."""
        self.nodes[self.assigned_id] = node
        self.assigned_id += 1
        if node.node_type not in self.nodes_of_type:
            self.nodes_of_type[node.node_type] = list()
        self.nodes_of_type[node.node_type].append(node.id)

    def _create_root(self) -> UASTNode:
        """Create the synthetic root node the whole tree links from."""
        return self.create_node(node_type="uast_root", code_snippet="root", metadata={"info": "links to all"}, start_point=(-1, 0), end_point=(-1, 3))

    def create_node(self,
                    node_type: str = None,
                    code_snippet: str = None,
                    metadata: dict = None,
                    start_point: tuple[int, int] = (None, None),
                    end_point: tuple[int, int] = (None, None)) -> UASTNode:
        """Create a new node with the next free ID, add it to the UAST, and return it."""
        # Bug fix: `metadata: dict = dict()` was a mutable default shared by
        # every call omitting metadata; allocate a fresh dict per call.
        if metadata is None:
            metadata = dict()
        node = UASTNode(id=self.assigned_id, node_type=node_type, code_snippet=code_snippet, metadata=metadata, start_point=start_point, end_point=end_point, children=list(), parents=list())
        self.add_node(node)
        return node

    def add_edge(self, node1: UASTNode = None, node2: UASTNode = None, directed_relation: str = None, metadata: dict = None) -> UASTEdge:
        """Add a directed edge node1 -> node2, updating both nodes' parent/child ID lists."""
        # Bug fix: shared mutable default for metadata (see create_node).
        if metadata is None:
            metadata = dict()
        edge = UASTEdge(start_id=node1.id, end_id=node2.id, directed_relation=directed_relation, metadata=metadata)
        node2.parents.append(node1.id)
        node1.children.append(node2.id)
        self.edges.append(edge)
        return edge

    def get_node(self, id: int) -> UASTNode:
        """Return the node with the given ID (KeyError when absent)."""
        return self.nodes[id]

    def get_nodes_of_type(self, node_type: str) -> list[int]:
        """Return the IDs of all nodes of `node_type` (KeyError when absent)."""
        return self.nodes_of_type[node_type]

    def get_children(self, node: UASTNode) -> list[int]:
        """Return the child IDs of `node`."""
        return node.children

    def get_parents(self, node: UASTNode) -> list[int]:
        # Annotation fix: the original declared `-> int` but returns the list
        # of parent IDs.
        return node.parents

    def print_graph(self, id) -> None:
        """Print the subtree rooted at node `id` in depth-first order."""
        if id not in self.nodes:
            return
        visited = set()

        def dfs(id, visited):
            visited.add(id)
            print(self.nodes[id])
            for child in self.nodes[id].children:
                if child not in visited:
                    dfs(child, visited)

        dfs(id, visited)

    def _as_dict(self) -> dict:
        """Build the JSON-serializable dict shared by save_to_file and get_json."""
        # Normalize children/parents to plain lists so json can serialize
        # them even if a caller stored another iterable type.
        # (Cleanup: the original duplicated this loop in save_to_file and
        # get_json and built an unused `copy_nodes` dict along the way.)
        for node in self.nodes.values():
            node.children = list(node.children)
            node.parents = list(node.parents)
        return {
            "nodes": {str(k): v.__dict__ for k, v in self.nodes.items()},
            "edges": [edge.__dict__ for edge in self.edges],
        }

    def save_to_file(self, file_path) -> None:
        """Save the UAST to `file_path` in JSON format."""
        with open(file_path, 'w') as f:
            json.dump(self._as_dict(), f, indent=4)

    def get_json(self) -> dict:
        """Return the UAST as a JSON-serializable dict."""
        return self._as_dict()

    def _restore(self, data: dict) -> None:
        """Rebuild nodes/edges from a decoded JSON dict (shared by both loaders)."""
        self.nodes = {int(k): UASTNode(**v) for k, v in data["nodes"].items()}
        self.edges = [UASTEdge(**edge) for edge in data["edges"]]
        self.assigned_id = max(self.nodes.keys()) + 1
        # JSON round-trips tuples as lists; restore the (line, byte) tuples.
        for node in self.nodes.values():
            node.start_point = tuple(node.start_point)
            node.end_point = tuple(node.end_point)

    def load_from_json_string(self, obj: str) -> None:
        """Load the UAST from a JSON string produced by get_json/save_to_file."""
        self._restore(json.loads(obj))

    def load_from_file(self, file_path) -> None:
        """Load the UAST from a JSON file written by save_to_file."""
        with open(file_path, 'r') as f:
            self._restore(json.load(f))

    def visualize(self) -> None:
        """Draw the UAST with networkx/matplotlib, labelling nodes by their type."""
        edges_viz = []
        labeldict = {}
        for edge in self.edges:
            edges_viz.append([edge.start_id, edge.end_id])
            labeldict[edge.start_id] = self.nodes[edge.start_id].node_type
            labeldict[edge.end_id] = self.nodes[edge.end_id].node_type
        print(labeldict)
        plt.figure(figsize=(10, 10))
        plt.rcParams["font.size"] = 20
        G = networkx.Graph()
        G.add_edges_from(edges_viz)
        pos = networkx.spring_layout(G)
        networkx.draw_networkx_labels(G, pos, labels=labeldict, font_size=12)
        networkx.draw_networkx_nodes(G, pos, nodelist=self.nodes.keys(), node_size=300)
        networkx.draw_networkx_edges(G, pos, edgelist=edges_viz)
        plt.show()
UAST_parser.py ADDED
@@ -0,0 +1,271 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from UAST import UAST
14
+ import json
15
+ from tree_sitter import Tree
16
+ import os
17
+ import sys
18
+ sys.setrecursionlimit(10000)
19
+
20
+ """
21
+ Initialize the parser with a path for rules and grammar.
22
+ """
23
class UASTParser():
    """
    Parses a tree-sitter AST into a UAST, driven by per-language rule files.

    On construction the parser loads a grammar file (UAST_Grammar.json) and
    resolves the directory holding per-language ruleset files named
    "UAST_rules_<language>.json". Each rule maps a tree-sitter node type to a
    UAST node type plus an "extractor" code string that is exec()'d to pull
    the UAST snippet out of the raw AST text.

    NOTE(review): the rule/grammar JSON content is exec()'d — these files
    must come from a trusted source; never load untrusted rulesets.
    """
    def __init__(self):
        # Per-parse state; language/rules are selected via set_language().
        self.language : str = None
        self.uast : UAST = None
        self.rules : dict = None
        # Cache of loaded rulesets keyed by language, so set_language() only
        # reads each ruleset file from disk once.
        self.cached_rules = dict()

        # Compute the absolute path to the tree-sitter-bindings directory
        grammar_dir = os.path.dirname(os.path.abspath(__file__))
        self.grammar_path = os.path.join(grammar_dir, '..', '..', 'python', 'src', 'grammar', 'UAST_Grammar.json')

        if not os.path.exists(self.grammar_path):
            print("Current working directory:", os.getcwd())
            raise FileNotFoundError(f"UAST Grammar file not found at {self.grammar_path}. Please ensure it exists.")

        with open(self.grammar_path, "r") as grammar_file:
            self.grammar = json.load(grammar_file)

        # Compute the absolute path to the ruleset directory based on the script's location
        script_dir = os.path.dirname(os.path.abspath(__file__))
        self.rule_directory = os.path.join(script_dir, 'ruleset/')

        if not os.path.isdir(self.rule_directory):
            print("Script directory:", script_dir)
            raise FileNotFoundError(f"Ruleset directory not found at {self.rule_directory}. Please ensure it exists.")

        '''
        # Rule directory and file
        self.rule_directory = "../../python/src/ruleset/"
        if not os.path.isdir(self.rule_directory):
            print("Current working directory:", os.getcwd())
            raise FileNotFoundError(f"Ruleset directory not found at {self.rule_directory}. Please ensure it exists.")
        '''
        # Ruleset files are named "<rule_file_name><language>.json".
        self.rule_file_name: str = "UAST_rules_"

        self.AST : Tree = None
        # self.offset : int = None
        # self.prev_line : int = -1
        # Scratch slot the exec()'d extractor strings write into (see _extract).
        self.extracted : str = None
        # Maps of user-defined function/class snippets to their node IDs,
        # populated by the exec()'d statements below in _add_user_defined.
        self.function_info = dict()
        self.class_info = dict()
        # WARNING: these statements are exec()'d inside _add_user_defined and
        # read its locals named `snippet` and `id` — do not rename those.
        self.user_defined_entity = {"uast_function": "self.function_info[snippet] = id",
                                    "uast_class": "self.class_info[snippet] = id"}


    def set_rule_dir_path(self, path: str):
        """Override the directory the per-language ruleset files are read from."""
        self.rule_directory = path

    def set_grammar_path(self, path : str):
        """Point the parser at a different grammar file and reload it."""
        self.grammar_path = path
        self.grammar = json.load(open(self.grammar_path, "r"))

    # set language for the parser
    def set_language(self, language : str):
        """Select the language whose ruleset drives subsequent parse() calls."""
        self.language = language

        # Load and cache the ruleset for this language on first use.
        if (language not in self.cached_rules):
            rules_cache = json.load(open(self.rule_directory + self.rule_file_name + self.language + '.json', "r"))
            self.cached_rules[language] = rules_cache

        self.rules = self.cached_rules[language]

    # initialise a DFS traversal on the AST and an empty UAST.
    def parse(self, AST, code_snippet) :
        """
        Convert `AST` (a tree-sitter Tree for `code_snippet`) into a UAST.

        Returns the populated UAST, or None (with a message printed) when no
        language has been set via set_language().
        """
        if(self.language == None) :
            print("Language not loaded")
            return
        self.AST = AST
        self.uast = UAST()
        self.uast.root.metadata["language"] = self.language
        self.uast.root.metadata["loc_snippet"] = self.count_loc(code_snippet, self.language)
        self._dfs(AST_node = self.AST.root_node, parent = self.uast.root)
        '''
        # commenting this block temporarily
        # Call the new modularized function to calculate the code-to-comment ratio
        code_to_comment_ratio = self.calculate_code_to_comment_ratio(self.uast.root)
        # Add the code_to_comment_ratio to the root node's metadata
        self.uast.root.metadata["code_to_comment_ratio"] = code_to_comment_ratio
        '''
        return self.uast

    def calculate_code_to_comment_ratio(self, root_node):
        """
        Return root loc_snippet divided by the total comment LOC found under
        `root_node`, or None when the tree contains no comment lines.
        """
        # Get the loc_snippet from the root node's metadata
        loc_snippet = root_node.metadata.get("loc_snippet", 0)

        # Sum all loc_original_code for uast_comment nodes
        total_comment_loc = 0

        # Recursive function to sum comment LOC
        def sum_comment_loc(node):
            nonlocal total_comment_loc

            # Check if the node is a comment node
            if node.node_type == "uast_comment":
                total_comment_loc += node.metadata.get("loc_original_code", 0)

            # Traverse the children, ensuring we get the actual node objects
            for child_id in node.children:
                child_node = self.uast.get_node(child_id)  # Fetch the actual child node using self.uast
                sum_comment_loc(child_node)  # Recursively sum for the child node

        # Start summing loc_original_code from the root node
        sum_comment_loc(root_node)

        # Calculate the code-to-comment ratio (handling division by zero)
        if total_comment_loc > 0:
            return loc_snippet / total_comment_loc
        else:
            return None  # Handle no comments

    def count_lo_comments(self, code_snippet):
        """Count non-blank lines of `code_snippet` (used for comment node bodies)."""
        lines = code_snippet.split('\n')
        loc_count = 0
        for line in lines:
            stripped_line = line.strip()
            # Count all lines except blank ones
            if stripped_line:
                loc_count += 1
        return loc_count

    def count_loc(self, code_snippet, language):
        """
        Count lines of code in `code_snippet`, skipping blank lines and lines
        recognized as comments by `language`'s markers.

        :raises ValueError: when `language` has no known comment markers.
        """
        # Define the comment markers for each language
        language_comment_markers = {
            "c": ('//', '/*', '*/'),
            "java": ('//', '/*', '*/'),
            "C#": ('//', '/*', '*/'),
            "c_sharp": ('//', '/*', '*/'),
            "cpp": ('//', '/*', '*/'),
            "objc": ('//', '/*', '*/'),
            "rust": ('//', '/*', '*/'),
            "go": ('//', '/*', '*/'),
            "kotlin": ('//', '/*', '*/'),
            "VHDL": ('--', None, None),
            "py": ('#', '"""', '"""'),
            "js": ('//', '/*', '*/'),
            "dart": ('//', '/*', '*/'),
            "QML": ('//', None, None),
            "typescript": ('//', '/*', '*/'),
            "perl": ('#', None, None),
            "haskell": ('--', '{-', '-}'),
            "elm": ('--', '{-', '-}'),
            "agda": ('--', '{-', '-}'),
            "d": ('//', '/*', '*/'),
            "nim": ('#', '##', None),
            "ocaml": ('(*', '(*', '*)'),
            "scala": ('//', '/*', '*/')
        }

        single_line_comment, multi_line_comment_start, multi_line_comment_end = language_comment_markers.get(language, (None, None, None))

        if not single_line_comment:
            raise ValueError(f"Unsupported language: {language}")

        lines = code_snippet.split('\n')
        loc_count = 0
        inside_multiline_comment = False

        for line in lines:
            stripped_line = line.strip()

            # Skip empty lines
            if not stripped_line:
                continue

            # Handle multi-line comments
            if multi_line_comment_start and multi_line_comment_end:
                if inside_multiline_comment:
                    # Check if the line contains the end of a multi-line comment
                    if multi_line_comment_end in stripped_line:
                        inside_multiline_comment = False
                    continue
                elif multi_line_comment_start in stripped_line:
                    # If the line starts a multi-line comment
                    inside_multiline_comment = True
                    continue

            # Skip single-line comments
            if stripped_line.startswith(single_line_comment):
                continue

            # If the line is neither a comment nor blank, count it as LOC
            loc_count += 1

        return loc_count

    def _add_user_defined(self, node):
        """Record user-defined function/class snippets and flag the node's metadata."""
        id = node.id
        type = node.node_type

        if node.code_snippet is not None:
            snippet = node.code_snippet.replace(type, '').strip()
            # Add further processing with the snippet
        else:
            # Handle the case where code_snippet is None
            snippet = ""
            # You can log a warning or take other appropriate action
            print(f"Warning: node.code_snippet is None for node type: {type}")

        # The exec()'d statement (from self.user_defined_entity) reads the
        # locals `snippet` and `id` defined above — their names are part of
        # the contract with the rule strings.
        if (type in self.user_defined_entity):
            exec(self.user_defined_entity[type])
            node.metadata["user_defined"] = True

        del id
        del type
        del snippet
        return

    # Traversing through the AST to create nodes recursively.
    def _dfs(self, AST_node, parent) :
        """Walk the tree-sitter AST depth-first, emitting a UAST node per rule hit."""
        if (AST_node.type in self.rules) :
            ast_snippet = AST_node.text.decode("utf8")
            node_type = self.rules[AST_node.type]["uast_node_type"]
            exec_string = self.rules[AST_node.type]["extractor"]
            uast_snippet = self._extract(ast_snippet = ast_snippet, node_type = node_type, exec_string = exec_string)

            # Comments get a simple non-blank line count; everything else is
            # counted with language-aware comment skipping.
            if node_type == "uast_comment":
                loc_original_code = self.count_lo_comments(ast_snippet)
            else:
                loc_original_code = self.count_loc(ast_snippet, self.language)

            node = self.uast.create_node(
                node_type = node_type,
                code_snippet = uast_snippet,
                # choose to enable or disable the storage of original code by removing the following line.
                metadata = {
                    "original_code" : ast_snippet,
                    "loc_original_code": loc_original_code
                },
            )
            self._add_user_defined(node)
            self.uast.add_edge(node1 = parent, node2 = node, directed_relation = "parent_node")
            # Matched nodes become the parent for everything beneath them.
            parent = node

        for child in AST_node.children:
            try:
                self._dfs(AST_node= child, parent = parent)
            except RecursionError as e:
                print(f"RecursionError caught: {str(e)}")

    def _extract(self, ast_snippet, node_type, exec_string):
        """
        Run the ruleset's extractor string over `ast_snippet` and return the
        grammar keyword for `node_type` joined with the extracted text.
        """
        # The exec()'d extractor reads the local `code_snippet` and writes
        # its result into self.extracted — both names are part of the
        # contract with the ruleset files.
        code_snippet = ast_snippet
        try:
            exec(exec_string)
        except Exception as e:
            print(e)
        try:
            return self.grammar[node_type]["keyword"] + " " + self.extracted
        except Exception as e:
            print(e)
        # NOTE(review): on failure this prints the error and implicitly
        # returns None; callers store that None as the node's code_snippet.
__init__.py ADDED
File without changes
base_tokenizer.py ADDED
@@ -0,0 +1,36 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import string
14
+
15
+
16
+ """
17
+ This implements the most simplistic tokenizer based on the white spaces
18
+ that can be overwritten by a different one. This method is
19
+ built in the library and can be overwritten using the approach described at
20
+ https://stackoverflow.com/questions/37553545/how-do-i-override-a-function-of-a-python-library
21
+
22
+ import base_tokenizer
23
+ base_tokenizer.tokenize = my_local_tokenize
24
+
25
+ """
26
+
27
+
28
def tokenize(text: str) -> list[str]:
    """
    Tokenize a string into lowercase, punctuation-free words.

    :param text: source text
    :return: list of tokens (words)
    """
    # Normalize first: trim surrounding whitespace, lowercase, then strip
    # every punctuation character in a single translate() pass.
    cleaned = text.strip().lower()
    cleaned = cleaned.translate(str.maketrans("", "", string.punctuation))
    # Whitespace-delimited split yields the final tokens.
    return cleaned.split()