janus-llm 1.0.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
- janus/__init__.py +9 -1
- janus/__main__.py +4 -0
- janus/_tests/test_cli.py +128 -0
- janus/_tests/test_translate.py +49 -7
- janus/cli.py +530 -46
- janus/converter.py +50 -19
- janus/embedding/_tests/test_collections.py +2 -8
- janus/embedding/_tests/test_database.py +32 -0
- janus/embedding/_tests/test_vectorize.py +9 -4
- janus/embedding/collections.py +49 -6
- janus/embedding/embedding_models_info.py +120 -0
- janus/embedding/vectorize.py +53 -62
- janus/language/_tests/__init__.py +0 -0
- janus/language/_tests/test_combine.py +62 -0
- janus/language/_tests/test_splitter.py +16 -0
- janus/language/binary/_tests/test_binary.py +16 -1
- janus/language/binary/binary.py +10 -3
- janus/language/block.py +31 -30
- janus/language/combine.py +26 -34
- janus/language/mumps/_tests/test_mumps.py +2 -2
- janus/language/mumps/mumps.py +93 -9
- janus/language/naive/__init__.py +4 -0
- janus/language/naive/basic_splitter.py +14 -0
- janus/language/naive/chunk_splitter.py +26 -0
- janus/language/naive/registry.py +13 -0
- janus/language/naive/simple_ast.py +18 -0
- janus/language/naive/tag_splitter.py +61 -0
- janus/language/splitter.py +168 -74
- janus/language/treesitter/_tests/test_treesitter.py +9 -6
- janus/language/treesitter/treesitter.py +37 -13
- janus/llm/model_callbacks.py +177 -0
- janus/llm/models_info.py +134 -70
- janus/metrics/__init__.py +8 -0
- janus/metrics/_tests/__init__.py +0 -0
- janus/metrics/_tests/reference.py +2 -0
- janus/metrics/_tests/target.py +2 -0
- janus/metrics/_tests/test_bleu.py +56 -0
- janus/metrics/_tests/test_chrf.py +67 -0
- janus/metrics/_tests/test_file_pairing.py +59 -0
- janus/metrics/_tests/test_llm.py +91 -0
- janus/metrics/_tests/test_reading.py +28 -0
- janus/metrics/_tests/test_rouge_score.py +65 -0
- janus/metrics/_tests/test_similarity_score.py +23 -0
- janus/metrics/_tests/test_treesitter_metrics.py +110 -0
- janus/metrics/bleu.py +66 -0
- janus/metrics/chrf.py +55 -0
- janus/metrics/cli.py +7 -0
- janus/metrics/complexity_metrics.py +208 -0
- janus/metrics/file_pairing.py +113 -0
- janus/metrics/llm_metrics.py +202 -0
- janus/metrics/metric.py +466 -0
- janus/metrics/reading.py +70 -0
- janus/metrics/rouge_score.py +96 -0
- janus/metrics/similarity.py +53 -0
- janus/metrics/splitting.py +38 -0
- janus/parsers/_tests/__init__.py +0 -0
- janus/parsers/_tests/test_code_parser.py +32 -0
- janus/parsers/code_parser.py +24 -253
- janus/parsers/doc_parser.py +169 -0
- janus/parsers/eval_parser.py +80 -0
- janus/parsers/reqs_parser.py +72 -0
- janus/prompts/prompt.py +103 -30
- janus/translate.py +636 -111
- janus/utils/_tests/__init__.py +0 -0
- janus/utils/_tests/test_logger.py +67 -0
- janus/utils/_tests/test_progress.py +20 -0
- janus/utils/enums.py +56 -3
- janus/utils/progress.py +56 -0
- {janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/METADATA +23 -10
- janus_llm-2.0.0.dist-info/RECORD +94 -0
- {janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/WHEEL +1 -1
- janus_llm-1.0.0.dist-info/RECORD +0 -48
- {janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/LICENSE +0 -0
- {janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/entry_points.txt +0 -0
janus/language/splitter.py
CHANGED

@@ -1,4 +1,5 @@
 import re
+from itertools import compress
 from pathlib import Path
 from typing import List

@@ -21,6 +22,20 @@ class TokenLimitError(Exception):
     pass


+class EmptyTreeError(Exception):
+    """An exception raised when the tree is empty or does not exist (can happen
+    when there are no nodes of interest in the tree)
+    """
+
+    pass
+
+
+class FileSizeError(Exception):
+    """An exception raised when the file size is too large for the splitter"""
+
+    pass
+
+
 class Splitter(FileManager):
     """A class for splitting code into functional blocks to prompt with for
     transcoding.
@@ -31,7 +46,10 @@ class Splitter(FileManager):
         language: str,
         model: None | BaseLanguageModel = None,
         max_tokens: int = 4096,
-
+        skip_merge: bool = False,
+        protected_node_types: tuple[str] = (),
+        prune_node_types: tuple[str] = (),
+        prune_unprotected: bool = False,
     ):
         """
         Arguments:
@@ -39,14 +57,22 @@ class Splitter(FileManager):
             model: The name of the model to use for counting tokens. If the model is None,
                 will use tiktoken's default tokenizer to count tokens.
             max_tokens: The maximum number of tokens to use for each functional block.
-
+            skip_merge: Whether to merge child nodes up to the max_token length.
+                May be used for situations like documentation where function-level
+                documentation is preferred.
+                TODO: Maybe instead support something like a list of node types that
+                shouldnt be merged (e.g. functions, classes)?
+            prune_unprotected: Whether to prune unprotected nodes from the tree.
         """
         super().__init__(language=language)
         self.model = model
         if self.model is None:
             self._encoding = tiktoken.get_encoding("cl100k_base")
-        self.
+        self.skip_merge = skip_merge
         self.max_tokens: int = max_tokens
+        self._protected_node_types = set(protected_node_types)
+        self._prune_node_types = set(prune_node_types)
+        self.prune_unprotected = prune_unprotected

     def split(self, file: Path | str) -> CodeBlock:
         """Split the given file into functional code blocks.
@@ -59,11 +85,27 @@ class Splitter(FileManager):
         """
         path = Path(file)
         code = path.read_text()
+        return self.split_string(code, path.name)
+
+    def split_string(self, code: str, name: str) -> CodeBlock:
+        """Split the given code into functional code blocks.
+
+        Arguments:
+            code: The code as a string to split into functional blocks.
+            name: The filename of the code block.
+
+        Returns:
+            A `CodeBlock` made up of nested `CodeBlock`s.
+        """

         root = self._get_ast(code)
-        self.
+        self._prune(root)
+        if self.prune_unprotected:
+            self._prune_unprotected(root)
+        self._set_identifiers(root, name)
         self._segment_leaves(root)
-        self.
+        if not self.skip_merge:
+            self._merge_tree(root)

         return root
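As a usage illustration of the new keyword arguments and the `split_string` entry point (a minimal sketch, not taken from the project's documentation): the snippet below builds the concrete `TreeSitterSplitter` changed later in this diff and splits an in-memory string. The Fortran snippet and the node-type names passed to `protected_node_types`/`prune_node_types` are illustrative assumptions, and splitting requires the matching tree-sitter grammar to be available (the splitter clones and builds it if needed).

```python
from janus.language.treesitter.treesitter import TreeSitterSplitter

splitter = TreeSitterSplitter(
    language="fortran",
    model=None,                              # None falls back to tiktoken's cl100k_base encoding
    max_tokens=512,
    protected_node_types=("subroutine",),    # assumed grammar node type: never merged away
    prune_node_types=("comment",),           # assumed grammar node type: dropped before splitting
)

code = "program hello\n  print *, 'hello'\nend program hello\n"
root = splitter.split_string(code, "hello.f90")  # new in 2.0.0; split() still takes a file path
print(root.name, len(root.children))
```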
@@ -79,12 +121,25 @@ class Splitter(FileManager):
         """
         raise NotImplementedError()

-    def
+    def _all_node_types(self, root: CodeBlock) -> set[NodeType]:
+        types = set()
+        stack = [root]
+        while stack:
+            node = stack.pop()
+            types.add(node.node_type)
+            stack.extend(node.children)
+        return types
+
+    def _set_identifiers(self, root: CodeBlock, name: str):
         """Set the IDs and names of each node in the given tree. By default,
         node IDs take the form `child_<i>`, where <i> is an integer counter which
         increments in breadth-first order, and node names take the form
-        `<
+        `<name>:<ID>`. Child classes should override this function to use
         more informative names based on the particular programming language.
+
+        Arguments:
+            root: The root of the tree to set identifiers for.
+            name: The name of the file being split.
         """
         seen_ids = 0
         queue = [root]
@@ -92,7 +147,7 @@ class Splitter(FileManager):
             node = queue.pop(0)  # BFS order to keep lower IDs toward the root
             node.id = f"child_{seen_ids}"
             seen_ids += 1
-            node.name = f"{
+            node.name = f"{name}:{node.id}"
             queue.extend(node.children)

     def _merge_tree(self, root: CodeBlock):
@@ -102,76 +157,90 @@ class Splitter(FileManager):
         the represented code is present in the text of exactly one node in the
         tree.
         """
+        # Simulate recursion with a stack
         stack = [root]
         while stack:
             node = stack.pop()
-            self._merge_children(node)
-            stack.extend(node.children)

-
-
-
-
-
-
-
-        """
-        # If the text at the function input is less than the max tokens, then
-        # we can just return it as a CodeBlock with no children.
-        if node.tokens <= self.max_tokens:
-            node.children = []
-            return
-
-        node.complete = False
+            # If the text of this node can fit in context, then we can just
+            # prune its children, making it a leaf node.
+            if node.tokens <= self.max_tokens and not self._has_protected_descendents(
+                node
+            ):
+                node.children = []
+                continue

-
-
-
+            # Otherwise, this is an internal node. Mark it as incomplete, and
+            # drop its text (which will be represented in its children)
+            node.complete = False
+            node.text = None
+            node.tokens = 0

-
-
-
+            # If this node has no children but cannot fit into context, then we
+            # have a problem. Oversized nodes have already been segmented into
+            # lines, so this node contains a single line too long to send to
+            # the LLM. If this happens, the source code is probably malformed.
+            # We have no choice but to log an error and simply ignore the node.
             if not node.children:
                 log.error(f"[{node.name}] Childless node too long for context!")
-
-                node.tokens = 0
-                return
+                continue

-
-
-
+            # Consolidate nodes into groups, and then merge each group into a new node
+            node_groups = self._group_nodes(node.children)
+            node.children = list(map(self.merge_nodes, node_groups))

-
-
-        if node.tokens > self.max_tokens:
-            node.text = None
-            node.tokens = 0
-            return
+            # "Recurse" by pushing the children onto the stack
+            stack.extend(node.children)

-
-
-            key=lambda idx: node.children[idx].tokens,
-        )
+    def _should_prune(self, node: CodeBlock) -> bool:
+        return node.node_type in self._prune_node_types

-
-
-
-
-
-
-        if
-
+    def _prune(self, root: CodeBlock) -> None:
+        stack = [root]
+        traversal = []
+        while stack:
+            node = stack.pop()
+            traversal.append(node)
+            node.children = [c for c in node.children if not self._should_prune(c)]
+            stack.extend(node.children)

-
-        node.
-
+        for node in traversal[::-1]:
+            node.rebuild_text_from_children()
+            node.tokens = self._count_tokens(node.text)

-
-        node.
-
-
-
-
+    def _is_protected(self, node: CodeBlock) -> bool:
+        return node.node_type in self._protected_node_types
+
+    def _has_protected_descendents(self, node: CodeBlock) -> bool:
+        if not self._protected_node_types:
+            return False
+
+        queue = [*node.children]
+        while queue:
+            node = queue.pop(0)
+            if self._is_protected(node):
+                return True
+            queue.extend(node.children)
+        return False
+
+    def _prune_unprotected(self, root: CodeBlock) -> None:
+        if not self._has_protected_descendents(root):
+            if not self._is_protected(root):
+                raise EmptyTreeError("No protected nodes in tree!")
+            root.children = []
+            return
+
+        stack = [root]
+        while stack:
+            node = stack.pop()
+            if self._is_protected(node):
+                node.children = []
+            node.children = [
+                c
+                for c in node.children
+                if self._is_protected(c) or self._has_protected_descendents(c)
+            ]
+            stack.extend(node.children)

     def _group_nodes(self, nodes: List[CodeBlock]) -> List[List[CodeBlock]]:
         """Consolidate a list of tree_sitter nodes into groups. Each group should fit
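To make the new protected-node logic concrete, here is a standalone toy model (plain dataclasses, not the package's `CodeBlock`; the node-type names are invented for the illustration). It shows what `_prune_unprotected` keeps: only nodes that are protected themselves or that lie on a path to a protected node.

```python
from dataclasses import dataclass, field


@dataclass
class Node:
    node_type: str
    children: list["Node"] = field(default_factory=list)


PROTECTED = {"function_definition"}  # illustrative node type


def is_protected(n: Node) -> bool:
    return n.node_type in PROTECTED


def has_protected_descendents(n: Node) -> bool:
    queue = list(n.children)
    while queue:
        c = queue.pop(0)
        if is_protected(c):
            return True
        queue.extend(c.children)
    return False


def prune_unprotected(root: Node) -> None:
    # Keep a child only if it is protected or leads to a protected node
    stack = [root]
    while stack:
        n = stack.pop()
        if is_protected(n):
            n.children = []  # protected nodes are kept whole
        n.children = [
            c for c in n.children
            if is_protected(c) or has_protected_descendents(c)
        ]
        stack.extend(n.children)


tree = Node("module", [
    Node("comment"),
    Node("class_definition", [Node("function_definition"), Node("comment")]),
])
prune_unprotected(tree)
print([c.node_type for c in tree.children])              # ['class_definition']
print([c.node_type for c in tree.children[0].children])  # ['function_definition']
```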
@@ -197,11 +266,19 @@ class Splitter(FileManager):
         # Estimate the length of each adjacent pair were they merged
         adj_sums = [lengths[i] + lengths[i + 1] for i in range(len(lengths) - 1)]

+        # Create list of booleans parallel with adj_sums indicating whether that
+        # merge is allowed (according to the protected node types list)
+        protected = list(map(self._is_protected, nodes))
+        merge_allowed = [
+            not (protected[i] or protected[i + 1]) for i in range(len(protected) - 1)
+        ]
+
         groups = [[n] for n in nodes]
-        while len(groups) > 1 and min(adj_sums) <= self.max_tokens:
+        while len(groups) > 1 and min(adj_sums) <= self.max_tokens and any(merge_allowed):
             # Get the indices of the adjacent nodes that would result in the
-            # smallest possible merged snippet
-
+            # smallest possible merged snippet. Ignore protected nodes.
+            mergeable_indices = compress(range(len(adj_sums)), merge_allowed)
+            i0 = int(min(mergeable_indices, key=adj_sums.__getitem__))
             i1 = i0 + 1

             # Recalculate the length. We can't simply use the adj_sum, because
@@ -222,8 +299,13 @@ class Splitter(FileManager):
             if i1 < len(adj_sums) - 1:
                 adj_sums[i1 + 1] += merged_text_length

+            if i0 > 0 and i1 < len(merge_allowed) - 1:
+                if not (merge_allowed[i0 - 1] and merge_allowed[i1 + 1]):
+                    merge_allowed[i0 - 1] = merge_allowed[i1 + 1] = False
+
             # The potential merge length for this pair is removed
             adj_sums.pop(i0)
+            merge_allowed.pop(i0)

             # Merge the pair of node groups
             groups[i0 : i1 + 1] = [groups[i0] + groups[i1]]
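The grouping change is easier to see in isolation. The sketch below reimplements the same greedy idea in a simplified, standalone form (it recomputes the pair sums on every pass instead of maintaining `adj_sums` incrementally, so it is an illustration rather than the package's implementation): repeatedly merge the cheapest adjacent pair that fits in the token budget, and never merge across a protected item.

```python
from itertools import compress


def group_adjacent(sizes: list[int], protected: list[bool], max_tokens: int) -> list[list[int]]:
    """Greedily merge adjacent items under a token budget, never across a protected item."""
    groups = [[s] for s in sizes]
    prot = list(protected)
    while len(groups) > 1:
        totals = [sum(g) for g in groups]
        adj_sums = [totals[i] + totals[i + 1] for i in range(len(groups) - 1)]
        merge_allowed = [
            not (prot[i] or prot[i + 1]) and adj_sums[i] <= max_tokens
            for i in range(len(groups) - 1)
        ]
        if not any(merge_allowed):
            break
        # Cheapest allowed adjacent pair, as in the diff's compress/min combination
        i0 = min(compress(range(len(adj_sums)), merge_allowed), key=adj_sums.__getitem__)
        groups[i0 : i0 + 2] = [groups[i0] + groups[i0 + 1]]
        prot[i0 : i0 + 2] = [False]
    return groups


# The protected middle item (50) is never merged into a neighbour:
print(group_adjacent([30, 20, 50, 40, 10], [False, False, True, False, False], max_tokens=60))
# [[30, 20], [50], [40, 10]]
```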
@@ -269,7 +351,7 @@ class Splitter(FileManager):
             start_byte=nodes[0].start_byte,
             end_byte=nodes[-1].end_byte,
             affixes=(prefix, suffix),
-
+            node_type=NodeType("merge"),
             children=sorted(sum([node.children for node in nodes], [])),
             language=language,
             tokens=tokens,
@@ -301,6 +383,9 @@ class Splitter(FileManager):
         if node.tokens <= self.max_tokens:
             return

+        if self._is_protected(node):
+            raise TokenLimitError(r"Irreducible node too large for context!")
+
         if node.children:
             for child in node.children:
                 self._segment_leaves(child)
@@ -309,9 +394,19 @@ class Splitter(FileManager):
         if node.start_point is None or node.end_point is None:
             raise ValueError("Node has no start or end point")

+        self._split_into_lines(node)
+
+    def _split_into_lines(self, node: CodeBlock):
         split_text = re.split(r"(\n+)", node.text)
-
-
+
+        # If the string didn't start/end with newlines, make sure to include
+        # empty strings for the prefix/suffixes
+        if split_text[0].strip("\n"):
+            split_text = [""] + split_text
+        if split_text[-1].strip("\n"):
+            split_text.append("")
+        betweens = split_text[::2]
+        lines = split_text[1::2]

         start_byte = node.start_byte
         node_line = 0
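`_split_into_lines` leans on the fact that `re.split` keeps captured separators in its result. A quick standalone illustration of that behavior and of the padding that keeps `betweens` and `lines` aligned:

```python
import re

text = "first line\n\nsecond line\nthird line"
parts = re.split(r"(\n+)", text)
# Captured separators are kept:
# ['first line', '\n\n', 'second line', '\n', 'third line']

# Pad so the list starts and ends with a (possibly empty) separator; then the
# even indices are the separators ("betweens") and the odd indices are the lines.
if parts[0].strip("\n"):
    parts = [""] + parts
if parts[-1].strip("\n"):
    parts.append("")

betweens = parts[::2]  # ['', '\n\n', '\n', '']
lines = parts[1::2]    # ['first line', 'second line', 'third line']
print(betweens)
print(lines)
```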
@@ -322,7 +417,7 @@ class Splitter(FileManager):
             end_byte = start_byte + len(bytes(line, "utf-8"))
             end_char = len(line)

-            name = f"{node.name}L#{node_line}"
+            name = f"{node.name}-L#{node_line}"
             tokens = self._count_tokens(line)
             if tokens > self.max_tokens:
                 raise TokenLimitError(r"Irreducible node too large for context!")
@@ -337,14 +432,13 @@ class Splitter(FileManager):
                     start_byte=start_byte,
                     end_byte=end_byte,
                     affixes=(prefix, suffix),
-
+                    node_type=NodeType(f"{node.node_type}__segment"),
                     children=[],
                     language=self.language,
                     tokens=tokens,
                 )
             )
-            start_byte = end_byte
-            node_line += len(suffix)
+            start_byte = end_byte

         # Keep the first child's prefix
         node.children[0].omit_prefix = False
janus/language/treesitter/_tests/test_treesitter.py
CHANGED

@@ -11,7 +11,7 @@ class TestTreeSitterSplitter(unittest.TestCase):

     def setUp(self):
         """Set up the tests."""
-        model_name = "gpt-3.5-turbo"
+        model_name = "gpt-3.5-turbo-0125"
         self.maxDiff = None
         self.llm, _, _ = load_model(model_name)

@@ -31,7 +31,13 @@ class TestTreeSitterSplitter(unittest.TestCase):
         self.test_file = Path("janus/language/treesitter/_tests/languages/fortran.f90")
         self._split()

-
+    def test_split_ibmhlasm(self):
+        """Test the split method."""
+        self.splitter = TreeSitterSplitter(
+            language="ibmhlasm", model=self.llm, max_tokens=100
+        )
+        self.combiner = Combiner(language="ibmhlasm")
+        self.test_file = Path("janus/language/treesitter/_tests/languages/ibmhlasm.asm")
         self._split()

     def test_split_matlab(self):
@@ -39,13 +45,10 @@ class TestTreeSitterSplitter(unittest.TestCase):
         self.splitter = TreeSitterSplitter(
             language="matlab",
             model=self.llm,
-            max_tokens=(4096 // 3)
+            max_tokens=(4096 // 3),
             # max_tokens used to be / 3 always in TreeSitterSplitter to leave just as
             # much space for the prompt as for the translated code.
         )
         self.combiner = Combiner(language="matlab")
         self.test_file = Path("janus/language/treesitter/_tests/languages/matlab.m")
         self._split()
-
-        self.splitter.use_placeholders = False
-        self._split()
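To run just the tests touched above (an assumption: the package, its `_tests` packages, and the required tree-sitter grammars are importable in the current environment), the standard-library unittest loader can be used directly:

```python
import unittest

# Load and run only the TreeSitterSplitter tests changed in this release
suite = unittest.defaultTestLoader.loadTestsFromName(
    "janus.language.treesitter._tests.test_treesitter"
)
unittest.TextTestRunner(verbosity=2).run(suite)
```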
janus/language/treesitter/treesitter.py
CHANGED

@@ -2,6 +2,7 @@ import os
 import platform
 from collections import defaultdict
 from pathlib import Path
+from typing import Optional

 import tree_sitter
 from git import Repo
@@ -9,7 +10,7 @@ from langchain.schema.language_model import BaseLanguageModel

 from ...utils.enums import LANGUAGES
 from ...utils.logger import create_logger
-from ..block import CodeBlock
+from ..block import CodeBlock, NodeType
 from ..splitter import Splitter

 log = create_logger(__name__)
@@ -25,7 +26,9 @@ class TreeSitterSplitter(Splitter):
         language: str,
         model: None | BaseLanguageModel = None,
         max_tokens: int = 4096,
-
+        protected_node_types: tuple[str] = (),
+        prune_node_types: tuple[str] = (),
+        prune_unprotected: bool = False,
     ) -> None:
         """Initialize a TreeSitterSplitter instance.

@@ -38,26 +41,39 @@ class TreeSitterSplitter(Splitter):
             language=language,
             model=model,
             max_tokens=max_tokens,
-
+            protected_node_types=protected_node_types,
+            prune_node_types=prune_node_types,
+            prune_unprotected=prune_unprotected,
         )
         self._load_parser()

     def _get_ast(self, code: str) -> CodeBlock:
         code = bytes(code, "utf-8")
-
         tree = self.parser.parse(code)
         root = tree.walk().node
         root = self._node_to_block(root, code)
         return root

-    def
+    # Recursively print tree to view parsed output (dev helper function)
+    # Example call: self._print_tree(tree.walk(), "")
+    def _print_tree(self, cursor: tree_sitter.TreeCursor, indent: str) -> None:
+        node = cursor.node
+        print(f"{indent}{node.type} {node.start_point}-{node.end_point}")
+        if cursor.goto_first_child():
+            while True:
+                self._print_tree(cursor, indent + " ")
+                if not cursor.goto_next_sibling():
+                    break
+            cursor.goto_parent()
+
+    def _set_identifiers(self, root: CodeBlock, name: str):
         seen_types = defaultdict(int)
         queue = [root]
         while queue:
             node = queue.pop(0)  # BFS order to keep lower IDs toward the root
-            node.id = f"{node.
-            seen_types[node.
-            node.name = f"{
+            node.id = f"{node.node_type}[{seen_types[node.node_type]}]"
+            seen_types[node.node_type] += 1
+            node.name = f"{name}:{node.id}"
             queue.extend(node.children)

     def _node_to_block(self, node: tree_sitter.Node, original_text: bytes) -> CodeBlock:
@@ -84,14 +100,14 @@ class TreeSitterSplitter(Splitter):
         children = [self._node_to_block(child, original_text) for child in node.children]
         node = CodeBlock(
             id=node.id,
-            name=node.id,
+            name=str(node.id),
             text=text,
             affixes=(prefix, suffix),
             start_point=node.start_point,
             end_point=node.end_point,
             start_byte=node.start_byte,
             end_byte=node.end_byte,
-
+            node_type=NodeType(node.type),
             children=children,
             language=self.language,
             tokens=self._count_tokens(text),
@@ -142,14 +158,22 @@ class TreeSitterSplitter(Splitter):
             message = f"Tree-sitter does not support {self.language} yet."
             log.error(message)
             raise ValueError(message)
-        self.
+        if LANGUAGES[self.language].get("branch"):
+            self._git_clone(github_url, lang_dir, LANGUAGES[self.language]["branch"])
+        else:
+            self._git_clone(github_url, lang_dir)

         tree_sitter.Language.build_library(str(so_file), [str(lang_dir)])

     @staticmethod
-    def _git_clone(
+    def _git_clone(
+        repository_url: str, destination_folder: Path | str, branch: Optional[str] = None
+    ) -> None:
         try:
-
+            if branch:
+                Repo.clone_from(repository_url, destination_folder, branch=branch)
+            else:
+                Repo.clone_from(repository_url, destination_folder)
             log.debug(f"{repository_url} cloned to {destination_folder}")
         except Exception as e:
             log.error(f"Error: {e}")
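The branch handling added to `_git_clone` uses GitPython's standard `Repo.clone_from` keyword. A minimal standalone sketch of the same pattern (the grammar repository URL below is only an example):

```python
from pathlib import Path
from typing import Optional

from git import Repo  # GitPython, already imported by this module


def clone_grammar(url: str, destination: Path | str, branch: Optional[str] = None) -> None:
    """Clone a tree-sitter grammar repository, optionally pinning a branch."""
    if branch:
        Repo.clone_from(url, destination, branch=branch)
    else:
        Repo.clone_from(url, destination)


# clone_grammar("https://github.com/tree-sitter/tree-sitter-fortran", "build/tree-sitter-fortran")
```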