janus-llm 4.3.1__py3-none-any.whl → 4.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- janus/__init__.py +1 -1
- janus/__main__.py +1 -1
- janus/_tests/evaluator_tests/EvalReadMe.md +85 -0
- janus/_tests/evaluator_tests/incose_tests/incose_large_test.json +39 -0
- janus/_tests/evaluator_tests/incose_tests/incose_small_test.json +17 -0
- janus/_tests/evaluator_tests/inline_comment_tests/mumps_inline_comment_test.m +71 -0
- janus/_tests/test_cli.py +3 -2
- janus/cli/aggregate.py +135 -0
- janus/cli/cli.py +117 -0
- janus/cli/constants.py +49 -0
- janus/cli/database.py +289 -0
- janus/cli/diagram.py +207 -0
- janus/cli/document.py +183 -0
- janus/cli/embedding.py +122 -0
- janus/cli/llm.py +191 -0
- janus/cli/partition.py +134 -0
- janus/cli/pipeline.py +123 -0
- janus/cli/self_eval.py +147 -0
- janus/cli/translate.py +192 -0
- janus/converter/__init__.py +1 -1
- janus/converter/_tests/test_translate.py +7 -5
- janus/converter/chain.py +180 -0
- janus/converter/converter.py +444 -153
- janus/converter/diagram.py +8 -6
- janus/converter/document.py +27 -16
- janus/converter/evaluate.py +143 -144
- janus/converter/partition.py +2 -10
- janus/converter/requirements.py +4 -40
- janus/converter/translate.py +3 -59
- janus/embedding/collections.py +1 -1
- janus/language/alc/_tests/alc.asm +3779 -0
- janus/language/binary/_tests/hello.bin +0 -0
- janus/language/block.py +78 -14
- janus/language/file.py +1 -1
- janus/language/mumps/_tests/mumps.m +235 -0
- janus/language/treesitter/_tests/languages/fortran.f90 +416 -0
- janus/language/treesitter/_tests/languages/ibmhlasm.asm +16 -0
- janus/language/treesitter/_tests/languages/matlab.m +225 -0
- janus/llm/models_info.py +9 -1
- janus/metrics/_tests/asm_test_file.asm +10 -0
- janus/metrics/_tests/mumps_test_file.m +6 -0
- janus/metrics/_tests/test_treesitter_metrics.py +1 -1
- janus/metrics/metric.py +47 -124
- janus/metrics/prompts/clarity.txt +8 -0
- janus/metrics/prompts/completeness.txt +16 -0
- janus/metrics/prompts/faithfulness.txt +10 -0
- janus/metrics/prompts/hallucination.txt +16 -0
- janus/metrics/prompts/quality.txt +8 -0
- janus/metrics/prompts/readability.txt +16 -0
- janus/metrics/prompts/usefulness.txt +16 -0
- janus/parsers/code_parser.py +4 -4
- janus/parsers/doc_parser.py +12 -9
- janus/parsers/parser.py +7 -0
- janus/parsers/partition_parser.py +6 -4
- janus/parsers/reqs_parser.py +11 -8
- janus/parsers/uml.py +5 -4
- janus/prompts/prompt.py +2 -2
- janus/prompts/templates/README.md +30 -0
- janus/prompts/templates/basic_aggregation/human.txt +6 -0
- janus/prompts/templates/basic_aggregation/system.txt +1 -0
- janus/prompts/templates/basic_refinement/human.txt +14 -0
- janus/prompts/templates/basic_refinement/system.txt +1 -0
- janus/prompts/templates/diagram/human.txt +9 -0
- janus/prompts/templates/diagram/system.txt +1 -0
- janus/prompts/templates/diagram_with_documentation/human.txt +15 -0
- janus/prompts/templates/diagram_with_documentation/system.txt +1 -0
- janus/prompts/templates/document/human.txt +10 -0
- janus/prompts/templates/document/system.txt +1 -0
- janus/prompts/templates/document_cloze/human.txt +11 -0
- janus/prompts/templates/document_cloze/system.txt +1 -0
- janus/prompts/templates/document_cloze/variables.json +4 -0
- janus/prompts/templates/document_cloze/variables_asm.json +4 -0
- janus/prompts/templates/document_inline/human.txt +13 -0
- janus/prompts/templates/eval_prompts/incose/human.txt +32 -0
- janus/prompts/templates/eval_prompts/incose/system.txt +1 -0
- janus/prompts/templates/eval_prompts/incose/variables.json +3 -0
- janus/prompts/templates/eval_prompts/inline_comments/human.txt +49 -0
- janus/prompts/templates/eval_prompts/inline_comments/system.txt +1 -0
- janus/prompts/templates/eval_prompts/inline_comments/variables.json +3 -0
- janus/prompts/templates/micromanaged_mumps_v1.0/human.txt +23 -0
- janus/prompts/templates/micromanaged_mumps_v1.0/system.txt +3 -0
- janus/prompts/templates/micromanaged_mumps_v2.0/human.txt +28 -0
- janus/prompts/templates/micromanaged_mumps_v2.0/system.txt +3 -0
- janus/prompts/templates/micromanaged_mumps_v2.1/human.txt +29 -0
- janus/prompts/templates/micromanaged_mumps_v2.1/system.txt +3 -0
- janus/prompts/templates/multidocument/human.txt +15 -0
- janus/prompts/templates/multidocument/system.txt +1 -0
- janus/prompts/templates/partition/human.txt +22 -0
- janus/prompts/templates/partition/system.txt +1 -0
- janus/prompts/templates/partition/variables.json +4 -0
- janus/prompts/templates/pseudocode/human.txt +7 -0
- janus/prompts/templates/pseudocode/system.txt +7 -0
- janus/prompts/templates/refinement/fix_exceptions/human.txt +19 -0
- janus/prompts/templates/refinement/fix_exceptions/system.txt +1 -0
- janus/prompts/templates/refinement/format/code_format/human.txt +12 -0
- janus/prompts/templates/refinement/format/code_format/system.txt +1 -0
- janus/prompts/templates/refinement/format/requirements_format/human.txt +14 -0
- janus/prompts/templates/refinement/format/requirements_format/system.txt +1 -0
- janus/prompts/templates/refinement/hallucination/human.txt +13 -0
- janus/prompts/templates/refinement/hallucination/system.txt +1 -0
- janus/prompts/templates/refinement/reflection/human.txt +15 -0
- janus/prompts/templates/refinement/reflection/incose/human.txt +26 -0
- janus/prompts/templates/refinement/reflection/incose/system.txt +1 -0
- janus/prompts/templates/refinement/reflection/incose_deduplicate/human.txt +16 -0
- janus/prompts/templates/refinement/reflection/incose_deduplicate/system.txt +1 -0
- janus/prompts/templates/refinement/reflection/system.txt +1 -0
- janus/prompts/templates/refinement/revision/human.txt +16 -0
- janus/prompts/templates/refinement/revision/incose/human.txt +16 -0
- janus/prompts/templates/refinement/revision/incose/system.txt +1 -0
- janus/prompts/templates/refinement/revision/incose_deduplicate/human.txt +17 -0
- janus/prompts/templates/refinement/revision/incose_deduplicate/system.txt +1 -0
- janus/prompts/templates/refinement/revision/system.txt +1 -0
- janus/prompts/templates/refinement/uml/alc_fix_variables/human.txt +15 -0
- janus/prompts/templates/refinement/uml/alc_fix_variables/system.txt +2 -0
- janus/prompts/templates/refinement/uml/fix_connections/human.txt +15 -0
- janus/prompts/templates/refinement/uml/fix_connections/system.txt +2 -0
- janus/prompts/templates/requirements/human.txt +13 -0
- janus/prompts/templates/requirements/system.txt +2 -0
- janus/prompts/templates/retrieval/language_docs/human.txt +10 -0
- janus/prompts/templates/retrieval/language_docs/system.txt +1 -0
- janus/prompts/templates/simple/human.txt +16 -0
- janus/prompts/templates/simple/system.txt +3 -0
- janus/refiners/format.py +49 -0
- janus/refiners/refiner.py +113 -4
- janus/utils/enums.py +127 -112
- janus/utils/logger.py +2 -0
- {janus_llm-4.3.1.dist-info → janus_llm-4.4.5.dist-info}/METADATA +18 -18
- janus_llm-4.4.5.dist-info/RECORD +210 -0
- {janus_llm-4.3.1.dist-info → janus_llm-4.4.5.dist-info}/WHEEL +1 -1
- janus_llm-4.4.5.dist-info/entry_points.txt +3 -0
- janus/cli.py +0 -1488
- janus/metrics/_tests/test_llm.py +0 -90
- janus/metrics/llm_metrics.py +0 -202
- janus_llm-4.3.1.dist-info/RECORD +0 -115
- janus_llm-4.3.1.dist-info/entry_points.txt +0 -3
- {janus_llm-4.3.1.dist-info → janus_llm-4.4.5.dist-info}/LICENSE +0 -0
janus/converter/diagram.py
CHANGED
@@ -14,6 +14,7 @@ class DiagramGenerator(Documenter):
|
|
14
14
|
self,
|
15
15
|
diagram_type="Activity",
|
16
16
|
add_documentation=False,
|
17
|
+
extract_variables=False,
|
17
18
|
**kwargs,
|
18
19
|
) -> None:
|
19
20
|
"""Initialize the DiagramGenerator class
|
@@ -28,24 +29,25 @@ class DiagramGenerator(Documenter):
|
|
28
29
|
self._documenter = Documenter(**kwargs)
|
29
30
|
|
30
31
|
super().__init__(**kwargs)
|
31
|
-
|
32
|
-
|
32
|
+
prompts = []
|
33
|
+
if extract_variables:
|
34
|
+
prompts.append("extract_variables")
|
35
|
+
prompts += ["diagram_with_documentation" if add_documentation else "diagram"]
|
36
|
+
self.set_prompts(prompts)
|
33
37
|
self._parser = UMLSyntaxParser(language="plantuml")
|
34
38
|
|
35
39
|
self._load_parameters()
|
36
40
|
|
37
|
-
def _load_prompt(self):
|
38
|
-
super()._load_prompt()
|
39
|
-
self._prompt = self._prompt.partial(DIAGRAM_TYPE=self._diagram_type)
|
40
|
-
|
41
41
|
def _input_runnable(self) -> Runnable:
|
42
42
|
if self._add_documentation:
|
43
43
|
return RunnableParallel(
|
44
44
|
SOURCE_CODE=self._parser.parse_input,
|
45
45
|
DOCUMENTATION=self._documenter.chain,
|
46
46
|
context=self._retriever,
|
47
|
+
DIAGRAM_TYPE=lambda x: self._diagram_type,
|
47
48
|
)
|
48
49
|
return RunnableParallel(
|
49
50
|
SOURCE_CODE=self._parser.parse_input,
|
50
51
|
context=self._retriever,
|
52
|
+
DIAGRAM_TYPE=lambda x: self._diagram_type,
|
51
53
|
)
|
janus/converter/document.py
CHANGED
@@ -5,10 +5,8 @@ from copy import deepcopy
|
|
5
5
|
from janus.converter.converter import Converter
|
6
6
|
from janus.language.block import TranslatedCodeBlock
|
7
7
|
from janus.language.combine import JsonCombiner
|
8
|
-
from janus.parsers.doc_parser import
|
9
|
-
|
10
|
-
MultiDocumentationParser,
|
11
|
-
)
|
8
|
+
from janus.parsers.doc_parser import ClozeDocumentationParser, MultiDocumentationParser
|
9
|
+
from janus.parsers.parser import JanusParserException
|
12
10
|
from janus.utils.enums import LANGUAGES
|
13
11
|
from janus.utils.logger import create_logger
|
14
12
|
|
@@ -21,7 +19,7 @@ class Documenter(Converter):
|
|
21
19
|
):
|
22
20
|
kwargs.update(source_language=source_language)
|
23
21
|
super().__init__(**kwargs)
|
24
|
-
self.
|
22
|
+
self.set_prompts("document")
|
25
23
|
|
26
24
|
if drop_comments:
|
27
25
|
comment_node_type = LANGUAGES[source_language].get(
|
@@ -35,12 +33,14 @@ class Documenter(Converter):
|
|
35
33
|
class MultiDocumenter(Documenter):
|
36
34
|
def __init__(self, **kwargs):
|
37
35
|
super().__init__(**kwargs)
|
38
|
-
self.
|
36
|
+
self.set_prompts("multidocument")
|
39
37
|
self._combiner = JsonCombiner()
|
40
38
|
self._parser = MultiDocumentationParser()
|
41
39
|
|
40
|
+
self._load_parameters()
|
41
|
+
|
42
42
|
|
43
|
-
class
|
43
|
+
class ClozeDocumenter(Documenter):
|
44
44
|
def __init__(
|
45
45
|
self,
|
46
46
|
comments_per_request: int | None = None,
|
@@ -48,12 +48,14 @@ class MadLibsDocumenter(Documenter):
|
|
48
48
|
) -> None:
|
49
49
|
kwargs.update(drop_comments=False)
|
50
50
|
super().__init__(**kwargs)
|
51
|
-
self.
|
51
|
+
self.set_prompts("document_cloze")
|
52
52
|
self._combiner = JsonCombiner()
|
53
|
-
self._parser =
|
53
|
+
self._parser = ClozeDocumentationParser()
|
54
54
|
|
55
55
|
self.comments_per_request = comments_per_request
|
56
56
|
|
57
|
+
self._load_parameters()
|
58
|
+
|
57
59
|
def _add_translation(self, block: TranslatedCodeBlock):
|
58
60
|
if block.translated:
|
59
61
|
return
|
@@ -92,7 +94,6 @@ class MadLibsDocumenter(Documenter):
|
|
92
94
|
|
93
95
|
block.processing_time = 0
|
94
96
|
block.cost = 0
|
95
|
-
block.retries = 0
|
96
97
|
obj = {}
|
97
98
|
for i in range(0, len(comments), self.comments_per_request):
|
98
99
|
# Split the text into the section containing comments of interest,
|
@@ -114,16 +115,26 @@ class MadLibsDocumenter(Documenter):
|
|
114
115
|
working_block = TranslatedCodeBlock(working_copy, self._target_language)
|
115
116
|
|
116
117
|
# Run the LLM on the working text
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
118
|
+
try:
|
119
|
+
super()._add_translation(working_block)
|
120
|
+
except JanusParserException as e:
|
121
|
+
block.text += "\n===============\n" + working_block.text
|
122
|
+
block.tokens = self._llm.get_num_tokens(block.text)
|
123
|
+
raise e
|
124
|
+
finally:
|
125
|
+
# Update metadata to include for all runs
|
126
|
+
block.num_requests += working_block.num_requests
|
127
|
+
block.cost += working_block.cost
|
128
|
+
block.processing_time += working_block.processing_time
|
129
|
+
block.request_input_tokens += working_block.request_input_tokens
|
130
|
+
block.request_output_tokens += working_block.request_output_tokens
|
123
131
|
|
124
132
|
# Update the output text to merge this section's output in
|
125
133
|
out_text = self._parser.parse(working_block.text)
|
126
134
|
obj.update(json.loads(out_text))
|
135
|
+
# Set intermediate text, will be overwritten if file
|
136
|
+
# successfully completes
|
137
|
+
block.text = json.dumps(obj)
|
127
138
|
|
128
139
|
self._parser.parse_input(block.original)
|
129
140
|
block.text = self._parser.parse(json.dumps(obj))
|
janus/converter/evaluate.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
import json
|
2
2
|
import re
|
3
3
|
from copy import deepcopy
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Any
|
4
6
|
|
5
7
|
from langchain_core.runnables import Runnable, RunnableLambda, RunnableParallel
|
6
8
|
|
7
9
|
from janus.converter.converter import Converter
|
8
|
-
from janus.language.block import TranslatedCodeBlock
|
9
10
|
from janus.language.combine import JsonCombiner
|
10
11
|
from janus.parsers.eval_parsers.incose_parser import IncoseParser
|
11
12
|
from janus.parsers.eval_parsers.inline_comment_parser import InlineCommentParser
|
@@ -30,11 +31,11 @@ class Evaluator(Converter):
|
|
30
31
|
|
31
32
|
Arguments:
|
32
33
|
model: The LLM to use for translation. If an OpenAI model, the
|
33
|
-
`OPENAI_API_KEY` environment variable must be set
|
34
|
-
`OPENAI_ORG_ID` environment variable should be set if needed.
|
34
|
+
`OPENAI_API_KEY` environment variable must be set.
|
35
35
|
model_arguments: Additional arguments to pass to the LLM constructor.
|
36
36
|
max_prompts: The maximum number of prompts to try before giving up.
|
37
37
|
"""
|
38
|
+
kwargs.update(use_janus_inputs=True)
|
38
39
|
super().__init__(**kwargs)
|
39
40
|
self._combiner = JsonCombiner()
|
40
41
|
self._load_parameters()
|
@@ -55,15 +56,14 @@ class RequirementEvaluator(Evaluator):
|
|
55
56
|
|
56
57
|
Arguments:
|
57
58
|
model: The LLM to use for translation. If an OpenAI model, the
|
58
|
-
`OPENAI_API_KEY` environment variable must be set
|
59
|
-
`OPENAI_ORG_ID` environment variable should be set if needed.
|
59
|
+
`OPENAI_API_KEY` environment variable must be set.
|
60
60
|
model_arguments: Additional arguments to pass to the LLM constructor.
|
61
61
|
max_prompts: The maximum number of prompts to try before giving up.
|
62
62
|
"""
|
63
63
|
super().__init__(**kwargs)
|
64
64
|
self.eval_items_per_request = eval_items_per_request
|
65
65
|
self._parser = IncoseParser()
|
66
|
-
self.
|
66
|
+
self.set_prompts("eval_prompts/incose")
|
67
67
|
|
68
68
|
def _input_runnable(self) -> Runnable:
|
69
69
|
def _get_code(json_text: str) -> str:
|
@@ -78,67 +78,55 @@ class RequirementEvaluator(Evaluator):
|
|
78
78
|
context=self._retriever,
|
79
79
|
)
|
80
80
|
|
81
|
-
def
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
block.processing_time += working_block.processing_time
|
131
|
-
|
132
|
-
# Update the output text to merge this section's output in
|
133
|
-
obj.update(json.loads(working_block.text))
|
134
|
-
|
135
|
-
block.text = json.dumps(obj)
|
136
|
-
block.tokens = self._llm.get_num_tokens(block.text)
|
137
|
-
block.translated = True
|
138
|
-
|
139
|
-
log.debug(
|
140
|
-
f"[{block.name}] Output code:\n{json.dumps(json.loads(block.text), indent=2)}"
|
141
|
-
)
|
81
|
+
def translate_janus_obj(self, obj: Any, name: str, failure_path: Path | None = None):
|
82
|
+
results = []
|
83
|
+
for o in obj["outputs"]:
|
84
|
+
if isinstance(o, dict):
|
85
|
+
results += self.translate_janus_obj(o, name, failure_path)
|
86
|
+
elif isinstance(o, str):
|
87
|
+
temp_obj = deepcopy(obj)
|
88
|
+
requirements = json.loads(o)
|
89
|
+
if not requirements:
|
90
|
+
log.debug(f"[{name}] Skipping empty output")
|
91
|
+
continue
|
92
|
+
if (
|
93
|
+
not self.eval_items_per_request
|
94
|
+
or len(requirements) < self.eval_items_per_request
|
95
|
+
):
|
96
|
+
obj_str = json.dumps(
|
97
|
+
dict(
|
98
|
+
requirements=requirements,
|
99
|
+
code=obj["input"],
|
100
|
+
)
|
101
|
+
)
|
102
|
+
temp_obj["outputs"] = [obj_str]
|
103
|
+
temp_block = self._janus_object_to_codeblock(temp_obj, name)
|
104
|
+
translated_block = self.translate_block(temp_block, failure_path)
|
105
|
+
translated_block.previous_generations[-1] = obj
|
106
|
+
translated_block.original = self._janus_object_to_codeblock(obj, name)
|
107
|
+
results.append(translated_block)
|
108
|
+
else:
|
109
|
+
for i in range(0, len(requirements), self.eval_items_per_request):
|
110
|
+
working_requirements = requirements[
|
111
|
+
i : i + self.eval_items_per_request
|
112
|
+
]
|
113
|
+
obj_str = json.dumps(
|
114
|
+
dict(
|
115
|
+
requirements=working_requirements,
|
116
|
+
code=obj["input"],
|
117
|
+
)
|
118
|
+
)
|
119
|
+
temp_obj["outputs"] = [obj_str]
|
120
|
+
temp_block = self._janus_object_to_codeblock(temp_obj, name)
|
121
|
+
translated_block = self.translate_block(temp_block, failure_path)
|
122
|
+
translated_block.previous_generations[-1] = obj
|
123
|
+
translated_block.original = self._janus_object_to_codeblock(
|
124
|
+
obj, name
|
125
|
+
)
|
126
|
+
results.append(translated_block)
|
127
|
+
else:
|
128
|
+
raise ValueError(f"Error: unable to find janus object: {type(o)}")
|
129
|
+
return results
|
142
130
|
|
143
131
|
|
144
132
|
class InlineCommentEvaluator(Evaluator):
|
@@ -153,89 +141,100 @@ class InlineCommentEvaluator(Evaluator):
|
|
153
141
|
|
154
142
|
Arguments:
|
155
143
|
model: The LLM to use for translation. If an OpenAI model, the
|
156
|
-
`OPENAI_API_KEY` environment variable must be set
|
157
|
-
`OPENAI_ORG_ID` environment variable should be set if needed.
|
144
|
+
`OPENAI_API_KEY` environment variable must be set.
|
158
145
|
model_arguments: Additional arguments to pass to the LLM constructor.
|
159
146
|
max_prompts: The maximum number of prompts to try before giving up.
|
160
147
|
"""
|
161
148
|
super().__init__(**kwargs)
|
162
149
|
self._combiner = JsonCombiner()
|
163
|
-
self._load_parameters()
|
164
150
|
self._parser = InlineCommentParser()
|
165
|
-
self.
|
151
|
+
self.set_prompts("eval_prompts/inline_comments")
|
166
152
|
self.eval_items_per_request = eval_items_per_request
|
153
|
+
self._load_parameters()
|
167
154
|
|
168
|
-
def
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
155
|
+
def _process_comments(self, input_str: str, generated_comments: dict[str, str]):
|
156
|
+
comment_patterns = [
|
157
|
+
(r"<BLOCK_COMMENT (\w{8})>", "<BLOCK_COMMENT {}>", "<BLOCK_COMMENT {}>"),
|
158
|
+
(r"<INLINE_COMMENT (\w{8})>", "<INLINE_COMMENT {}>", "<INLINE_COMMENT {}>"),
|
159
|
+
(r"<MODULE (\w{8})>", "<MODULE {}>", "<BLOCK_COMMENT {}>"),
|
160
|
+
]
|
161
|
+
missing_comments = 0
|
162
|
+
for pattern, find_template, repl_template in comment_patterns:
|
163
|
+
matches = re.findall(pattern, input_str)
|
164
|
+
|
165
|
+
for comment_id in matches:
|
166
|
+
find_tag = find_template.format(comment_id)
|
167
|
+
repl_tag = repl_template.format(comment_id)
|
168
|
+
|
169
|
+
if comment_id not in generated_comments:
|
170
|
+
missing_comments += 1
|
171
|
+
comment = generated_comments.get(comment_id, "[comment missing]")
|
172
|
+
comment = comment.replace("\n", "\\n")
|
173
|
+
|
174
|
+
# Replace the tag in the code with the comment appended.
|
175
|
+
input_str = input_str.replace(find_tag, f"{repl_tag} {comment}")
|
176
|
+
processed_str = re.sub(r"\s*<JANUS_PARTITION>\s*\n", "\n", input_str)
|
177
|
+
return processed_str.strip("\n"), missing_comments
|
178
|
+
|
179
|
+
def translate_janus_obj(self, obj: Any, name: str, failure_path: Path | None = None):
|
179
180
|
comment_pattern = r"<(?:INLINE|BLOCK)_COMMENT \w{8}>.*$"
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
f"[{block.name}] Output code:\n{json.dumps(json.loads(block.text), indent=2)}"
|
241
|
-
)
|
181
|
+
results = []
|
182
|
+
input_str = obj["input"]
|
183
|
+
for o in obj["outputs"]:
|
184
|
+
if isinstance(o, dict):
|
185
|
+
results += self.translate_janus_obj(o, name, failure_path)
|
186
|
+
elif isinstance(o, str):
|
187
|
+
temp_obj = deepcopy(obj)
|
188
|
+
generated_comments = json.loads(o)
|
189
|
+
processed_input, missing_comments = self._process_comments(
|
190
|
+
input_str, generated_comments
|
191
|
+
)
|
192
|
+
if missing_comments:
|
193
|
+
log.info(f"[{name}] Warning: missing {missing_comments} comments")
|
194
|
+
comments = list(
|
195
|
+
re.finditer(comment_pattern, processed_input, flags=re.MULTILINE)
|
196
|
+
)
|
197
|
+
if not comments:
|
198
|
+
log.info(f"[{name}] Skipping commentless block")
|
199
|
+
continue
|
200
|
+
if (
|
201
|
+
self.eval_items_per_request is None
|
202
|
+
or len(comments) < self.eval_items_per_request
|
203
|
+
):
|
204
|
+
temp_obj["outputs"] = [processed_input]
|
205
|
+
temp_block = self._janus_object_to_codeblock(temp_obj, name)
|
206
|
+
translated_block = self.translate_block(temp_block, failure_path)
|
207
|
+
translated_block.previous_generations[-1] = obj
|
208
|
+
translated_block.original = self._janus_object_to_codeblock(obj, name)
|
209
|
+
results.append(translated_block)
|
210
|
+
continue
|
211
|
+
comment_group_indices = list(
|
212
|
+
range(0, len(comments), self.eval_items_per_request)
|
213
|
+
)
|
214
|
+
log.debug(
|
215
|
+
f"[{name}] Block contains more than {self.eval_items_per_request}"
|
216
|
+
f" comments, splitting {len(comments)} comments into"
|
217
|
+
f" {len(comment_group_indices)} groups"
|
218
|
+
)
|
219
|
+
for comment_ind in comment_group_indices:
|
220
|
+
working_comments = comments[
|
221
|
+
comment_ind : comment_ind + self.eval_items_per_request
|
222
|
+
]
|
223
|
+
start_idx = working_comments[0].start()
|
224
|
+
end_idx = working_comments[-1].end()
|
225
|
+
prefix = processed_input[:start_idx]
|
226
|
+
keeper = processed_input[start_idx:end_idx]
|
227
|
+
suffix = processed_input[end_idx:]
|
228
|
+
|
229
|
+
# Strip all comment placeholders outside of the section of interest
|
230
|
+
prefix = re.sub(comment_pattern, "", prefix, flags=re.MULTILINE)
|
231
|
+
suffix = re.sub(comment_pattern, "", suffix, flags=re.MULTILINE)
|
232
|
+
temp_obj["outputs"] = [prefix + keeper + suffix]
|
233
|
+
temp_block = self._janus_object_to_codeblock(temp_obj, name)
|
234
|
+
translated_block = self.translate_block(temp_block, failure_path)
|
235
|
+
translated_block.previous_generations[-1] = obj
|
236
|
+
translated_block.original = self._janus_object_to_codeblock(obj, name)
|
237
|
+
results.append(translated_block)
|
238
|
+
else:
|
239
|
+
raise ValueError(f"Error: unrecognized janus object type: {type(o)}")
|
240
|
+
return results
|
janus/converter/partition.py
CHANGED
@@ -1,7 +1,4 @@
|
|
1
|
-
from pathlib import Path
|
2
|
-
|
3
1
|
from janus.converter.converter import Converter
|
4
|
-
from janus.language.block import TranslatedCodeBlock
|
5
2
|
from janus.parsers.partition_parser import PartitionParser
|
6
3
|
from janus.utils.logger import create_logger
|
7
4
|
|
@@ -11,17 +8,12 @@ log = create_logger(__name__)
|
|
11
8
|
class Partitioner(Converter):
|
12
9
|
def __init__(self, partition_token_limit: int, **kwargs):
|
13
10
|
super().__init__(**kwargs)
|
14
|
-
self.
|
11
|
+
self.set_prompts("partition")
|
15
12
|
self._load_model()
|
16
13
|
self._parser = PartitionParser(
|
17
14
|
token_limit=partition_token_limit,
|
18
15
|
model=self._llm,
|
19
16
|
)
|
20
17
|
self._target_language = self._source_language
|
21
|
-
self._target_suffix = self.
|
18
|
+
self._target_suffix = self._source_suffixes[0]
|
22
19
|
self._load_parameters()
|
23
|
-
|
24
|
-
def _save_to_file(self, block: TranslatedCodeBlock, out_path: Path) -> None:
|
25
|
-
output_str = self._parser.parse_combined_output(block.complete_text)
|
26
|
-
out_path.parent.mkdir(parents=True, exist_ok=True)
|
27
|
-
out_path.write_text(output_str, encoding="utf-8")
|
janus/converter/requirements.py
CHANGED
@@ -1,8 +1,4 @@
|
|
1
|
-
import json
|
2
|
-
from pathlib import Path
|
3
|
-
|
4
1
|
from janus.converter.document import Documenter
|
5
|
-
from janus.language.block import TranslatedCodeBlock
|
6
2
|
from janus.language.combine import ChunkCombiner
|
7
3
|
from janus.parsers.reqs_parser import RequirementsParser
|
8
4
|
from janus.utils.logger import create_logger
|
@@ -16,41 +12,9 @@ class RequirementsDocumenter(Documenter):
|
|
16
12
|
A class that translates code from one programming language to its requirements.
|
17
13
|
"""
|
18
14
|
|
19
|
-
def __init__(self, **kwargs):
|
20
|
-
super().__init__(**kwargs)
|
21
|
-
self.
|
15
|
+
def __init__(self, combine_output: bool = False, **kwargs):
|
16
|
+
super().__init__(combine_output=combine_output, **kwargs)
|
17
|
+
self.set_prompts("requirements")
|
22
18
|
self._combiner = ChunkCombiner()
|
23
19
|
self._parser = RequirementsParser()
|
24
|
-
|
25
|
-
@staticmethod
|
26
|
-
def get_prompt_replacements(block) -> dict[str, str]:
|
27
|
-
prompt_replacements: dict[str, str] = {"SOURCE_CODE": block.original.text}
|
28
|
-
return prompt_replacements
|
29
|
-
|
30
|
-
def _save_to_file(self, block: TranslatedCodeBlock, out_path: Path) -> None:
|
31
|
-
"""Save a file to disk.
|
32
|
-
|
33
|
-
Arguments:
|
34
|
-
block: The `CodeBlock` to save to a file.
|
35
|
-
"""
|
36
|
-
output_list = list()
|
37
|
-
# For each chunk of code, get generation metadata, the text of the code,
|
38
|
-
# and the LLM generated requirements
|
39
|
-
blocks = [block for block in block.children] if len(block.children) else [block]
|
40
|
-
for block in blocks:
|
41
|
-
code = block.original.text
|
42
|
-
requirements = self._parser.parse_combined_output(block.complete_text)
|
43
|
-
metadata = dict(
|
44
|
-
retries=block.total_retries,
|
45
|
-
cost=block.total_cost,
|
46
|
-
processing_time=block.processing_time,
|
47
|
-
)
|
48
|
-
# Put them all in a top level 'output' key
|
49
|
-
output_list.append(
|
50
|
-
dict(metadata=metadata, code=code, requirements=requirements)
|
51
|
-
)
|
52
|
-
obj = dict(
|
53
|
-
output=output_list,
|
54
|
-
)
|
55
|
-
out_path.parent.mkdir(parents=True, exist_ok=True)
|
56
|
-
out_path.write_text(json.dumps(obj, indent=2), encoding="utf-8")
|
20
|
+
self._load_parameters()
|
janus/converter/translate.py
CHANGED
@@ -1,8 +1,5 @@
|
|
1
1
|
from janus.converter.converter import Converter, run_if_changed
|
2
|
-
from janus.llm.models_info import MODEL_PROMPT_ENGINES
|
3
2
|
from janus.parsers.code_parser import CodeParser
|
4
|
-
from janus.prompts.prompt import SAME_OUTPUT
|
5
|
-
from janus.utils.enums import LANGUAGES
|
6
3
|
from janus.utils.logger import create_logger
|
7
4
|
|
8
5
|
log = create_logger(__name__)
|
@@ -21,8 +18,7 @@ class Translator(Converter):
|
|
21
18
|
|
22
19
|
Arguments:
|
23
20
|
model: The LLM to use for translation. If an OpenAI model, the
|
24
|
-
`OPENAI_API_KEY` environment variable must be set
|
25
|
-
`OPENAI_ORG_ID` environment variable should be set if needed.
|
21
|
+
`OPENAI_API_KEY` environment variable must be set.
|
26
22
|
model_arguments: Additional arguments to pass to the LLM constructor.
|
27
23
|
source_language: The source programming language.
|
28
24
|
target_language: The target programming language.
|
@@ -30,13 +26,11 @@ class Translator(Converter):
|
|
30
26
|
max_prompts: The maximum number of prompts to try before giving up.
|
31
27
|
max_tokens: The maximum number of tokens the model will take in.
|
32
28
|
If unspecified, model's default max will be used.
|
33
|
-
|
34
|
-
(see janus/prompts/templates) or
|
29
|
+
prompt_templates: name of prompt template directories
|
30
|
+
(see janus/prompts/templates) or paths to directories.
|
35
31
|
"""
|
36
32
|
super().__init__(**kwargs)
|
37
33
|
|
38
|
-
self._target_version: str | None
|
39
|
-
|
40
34
|
self.set_target_language(
|
41
35
|
target_language=target_language,
|
42
36
|
target_version=target_version,
|
@@ -48,56 +42,6 @@ class Translator(Converter):
|
|
48
42
|
self._load_parser()
|
49
43
|
super()._load_parameters()
|
50
44
|
|
51
|
-
def set_target_language(
|
52
|
-
self, target_language: str, target_version: str | None
|
53
|
-
) -> None:
|
54
|
-
"""Validate and set the target language.
|
55
|
-
|
56
|
-
The affected objects will not be updated until translate() is called.
|
57
|
-
|
58
|
-
Arguments:
|
59
|
-
target_language: The target programming language.
|
60
|
-
target_version: The target version of the target programming language.
|
61
|
-
"""
|
62
|
-
target_language = target_language.lower()
|
63
|
-
if target_language not in LANGUAGES:
|
64
|
-
raise ValueError(
|
65
|
-
f"Invalid target language: {target_language}. "
|
66
|
-
"Valid target languages are found in `janus.utils.enums.LANGUAGES`."
|
67
|
-
)
|
68
|
-
self._target_language = target_language
|
69
|
-
self._target_version = target_version
|
70
|
-
self._target_suffix = f".{LANGUAGES[target_language]['suffix']}"
|
71
|
-
|
72
|
-
@run_if_changed(
|
73
|
-
"_prompt_template_name",
|
74
|
-
"_source_language",
|
75
|
-
"_target_language",
|
76
|
-
"_target_version",
|
77
|
-
"_model_name",
|
78
|
-
)
|
79
|
-
def _load_prompt(self) -> None:
|
80
|
-
"""Load the prompt according to this instance's attributes.
|
81
|
-
|
82
|
-
If the relevant fields have not been changed since the last time this
|
83
|
-
method was called, nothing happens.
|
84
|
-
"""
|
85
|
-
if self._prompt_template_name in SAME_OUTPUT:
|
86
|
-
if self._target_language != self._source_language:
|
87
|
-
raise ValueError(
|
88
|
-
f"Prompt template ({self._prompt_template_name}) suggests "
|
89
|
-
f"source and target languages should match, but do not "
|
90
|
-
f"({self._source_language} != {self._target_language})"
|
91
|
-
)
|
92
|
-
|
93
|
-
prompt_engine = MODEL_PROMPT_ENGINES[self._llm.short_model_id](
|
94
|
-
source_language=self._source_language,
|
95
|
-
target_language=self._target_language,
|
96
|
-
target_version=self._target_version,
|
97
|
-
prompt_template=self._prompt_template_name,
|
98
|
-
)
|
99
|
-
self._prompt = prompt_engine.prompt
|
100
|
-
|
101
45
|
@run_if_changed("_target_language")
|
102
46
|
def _load_parser(self) -> None:
|
103
47
|
"""Load the parser according to this instance's attributes.
|