janus-llm 1.0.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- janus/__init__.py +9 -1
- janus/__main__.py +4 -0
- janus/_tests/test_cli.py +128 -0
- janus/_tests/test_translate.py +49 -7
- janus/cli.py +530 -46
- janus/converter.py +50 -19
- janus/embedding/_tests/test_collections.py +2 -8
- janus/embedding/_tests/test_database.py +32 -0
- janus/embedding/_tests/test_vectorize.py +9 -4
- janus/embedding/collections.py +49 -6
- janus/embedding/embedding_models_info.py +120 -0
- janus/embedding/vectorize.py +53 -62
- janus/language/_tests/__init__.py +0 -0
- janus/language/_tests/test_combine.py +62 -0
- janus/language/_tests/test_splitter.py +16 -0
- janus/language/binary/_tests/test_binary.py +16 -1
- janus/language/binary/binary.py +10 -3
- janus/language/block.py +31 -30
- janus/language/combine.py +26 -34
- janus/language/mumps/_tests/test_mumps.py +2 -2
- janus/language/mumps/mumps.py +93 -9
- janus/language/naive/__init__.py +4 -0
- janus/language/naive/basic_splitter.py +14 -0
- janus/language/naive/chunk_splitter.py +26 -0
- janus/language/naive/registry.py +13 -0
- janus/language/naive/simple_ast.py +18 -0
- janus/language/naive/tag_splitter.py +61 -0
- janus/language/splitter.py +168 -74
- janus/language/treesitter/_tests/test_treesitter.py +9 -6
- janus/language/treesitter/treesitter.py +37 -13
- janus/llm/model_callbacks.py +177 -0
- janus/llm/models_info.py +134 -70
- janus/metrics/__init__.py +8 -0
- janus/metrics/_tests/__init__.py +0 -0
- janus/metrics/_tests/reference.py +2 -0
- janus/metrics/_tests/target.py +2 -0
- janus/metrics/_tests/test_bleu.py +56 -0
- janus/metrics/_tests/test_chrf.py +67 -0
- janus/metrics/_tests/test_file_pairing.py +59 -0
- janus/metrics/_tests/test_llm.py +91 -0
- janus/metrics/_tests/test_reading.py +28 -0
- janus/metrics/_tests/test_rouge_score.py +65 -0
- janus/metrics/_tests/test_similarity_score.py +23 -0
- janus/metrics/_tests/test_treesitter_metrics.py +110 -0
- janus/metrics/bleu.py +66 -0
- janus/metrics/chrf.py +55 -0
- janus/metrics/cli.py +7 -0
- janus/metrics/complexity_metrics.py +208 -0
- janus/metrics/file_pairing.py +113 -0
- janus/metrics/llm_metrics.py +202 -0
- janus/metrics/metric.py +466 -0
- janus/metrics/reading.py +70 -0
- janus/metrics/rouge_score.py +96 -0
- janus/metrics/similarity.py +53 -0
- janus/metrics/splitting.py +38 -0
- janus/parsers/_tests/__init__.py +0 -0
- janus/parsers/_tests/test_code_parser.py +32 -0
- janus/parsers/code_parser.py +24 -253
- janus/parsers/doc_parser.py +169 -0
- janus/parsers/eval_parser.py +80 -0
- janus/parsers/reqs_parser.py +72 -0
- janus/prompts/prompt.py +103 -30
- janus/translate.py +636 -111
- janus/utils/_tests/__init__.py +0 -0
- janus/utils/_tests/test_logger.py +67 -0
- janus/utils/_tests/test_progress.py +20 -0
- janus/utils/enums.py +56 -3
- janus/utils/progress.py +56 -0
- {janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/METADATA +23 -10
- janus_llm-2.0.0.dist-info/RECORD +94 -0
- {janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/WHEEL +1 -1
- janus_llm-1.0.0.dist-info/RECORD +0 -48
- {janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/LICENSE +0 -0
- {janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/entry_points.txt +0 -0
janus/translate.py
CHANGED
@@ -1,34 +1,62 @@
|
|
1
|
-
import
|
1
|
+
import json
|
2
|
+
import math
|
3
|
+
import re
|
4
|
+
import time
|
5
|
+
from copy import deepcopy
|
2
6
|
from pathlib import Path
|
3
|
-
from typing import Any
|
7
|
+
from typing import Any
|
4
8
|
|
5
|
-
from
|
6
|
-
from langchain.
|
9
|
+
from langchain.output_parsers import RetryWithErrorOutputParser
|
10
|
+
from langchain.output_parsers.fix import OutputFixingParser
|
11
|
+
from langchain_core.exceptions import OutputParserException
|
12
|
+
from langchain_core.language_models import BaseLanguageModel
|
13
|
+
from langchain_core.output_parsers import BaseOutputParser
|
14
|
+
from langchain_core.prompts import ChatPromptTemplate
|
15
|
+
from langchain_core.runnables import RunnableLambda, RunnableParallel
|
16
|
+
from openai import BadRequestError, RateLimitError
|
17
|
+
from text_generation.errors import ValidationError
|
18
|
+
|
19
|
+
from janus.language.naive.registry import CUSTOM_SPLITTERS
|
7
20
|
|
8
21
|
from .converter import Converter, run_if_changed
|
22
|
+
from .embedding.vectorize import ChromaDBVectorizer
|
9
23
|
from .language.block import CodeBlock, TranslatedCodeBlock
|
24
|
+
from .language.combine import ChunkCombiner, Combiner, JsonCombiner
|
25
|
+
from .language.splitter import EmptyTreeError, FileSizeError, TokenLimitError
|
10
26
|
from .llm import load_model
|
11
|
-
from .
|
12
|
-
from .
|
27
|
+
from .llm.model_callbacks import get_model_callback
|
28
|
+
from .llm.models_info import MODEL_PROMPT_ENGINES
|
29
|
+
from .parsers.code_parser import CodeParser, GenericParser
|
30
|
+
from .parsers.doc_parser import MadlibsDocumentationParser, MultiDocumentationParser
|
31
|
+
from .parsers.eval_parser import EvaluationParser
|
32
|
+
from .parsers.reqs_parser import RequirementsParser
|
33
|
+
from .prompts.prompt import SAME_OUTPUT, TEXT_OUTPUT
|
13
34
|
from .utils.enums import LANGUAGES
|
14
35
|
from .utils.logger import create_logger
|
15
36
|
|
16
37
|
log = create_logger(__name__)
|
17
38
|
|
18
39
|
|
40
|
+
PARSER_TYPES: set[str] = {"code", "text", "eval", "madlibs", "multidoc", "requirements"}
|
41
|
+
|
42
|
+
|
19
43
|
class Translator(Converter):
|
20
44
|
"""A class that translates code from one programming language to another."""
|
21
45
|
|
22
46
|
def __init__(
|
23
47
|
self,
|
24
|
-
model: str = "gpt-3.5-turbo",
|
25
|
-
model_arguments:
|
48
|
+
model: str = "gpt-3.5-turbo-0125",
|
49
|
+
model_arguments: dict[str, Any] = {},
|
26
50
|
source_language: str = "fortran",
|
27
51
|
target_language: str = "python",
|
28
|
-
target_version: str = "3.10",
|
52
|
+
target_version: str | None = "3.10",
|
29
53
|
max_prompts: int = 10,
|
54
|
+
max_tokens: int | None = None,
|
30
55
|
prompt_template: str | Path = "simple",
|
31
56
|
parser_type: str = "code",
|
57
|
+
db_path: str | None = None,
|
58
|
+
db_config: dict[str, Any] | None = None,
|
59
|
+
custom_splitter: str | None = None,
|
32
60
|
) -> None:
|
33
61
|
"""Initialize a Translator instance.
|
34
62
|
|
@@ -41,37 +69,53 @@ class Translator(Converter):
|
|
41
69
|
target_language: The target programming language.
|
42
70
|
target_version: The target version of the target programming language.
|
43
71
|
max_prompts: The maximum number of prompts to try before giving up.
|
72
|
+
max_tokens: The maximum number of tokens the model will take in.
|
73
|
+
If unspecificed, model's default max will be used.
|
44
74
|
prompt_template: name of prompt template directory
|
45
75
|
(see janus/prompts/templates) or path to a directory.
|
46
76
|
parser_type: The type of parser to use for parsing the LLM output. Valid
|
47
77
|
values are "code" (default), "text", and "eval".
|
48
78
|
"""
|
79
|
+
self._custom_splitter = custom_splitter
|
49
80
|
super().__init__(source_language=source_language)
|
50
81
|
|
51
|
-
self._parser_type:
|
52
|
-
self.
|
53
|
-
self.
|
54
|
-
self.
|
55
|
-
self.
|
56
|
-
self.
|
57
|
-
self.
|
58
|
-
self.
|
82
|
+
self._parser_type: str | None
|
83
|
+
self._model_name: str | None
|
84
|
+
self._custom_model_arguments: dict[str, Any] | None
|
85
|
+
self._target_language: str | None
|
86
|
+
self._target_version: str | None
|
87
|
+
self._target_glob: str | None
|
88
|
+
self._prompt_template_name: str | None
|
89
|
+
self._db_path: str | None
|
90
|
+
self._db_config: dict[str, Any] | None
|
91
|
+
|
92
|
+
self._llm: BaseLanguageModel | None
|
93
|
+
self._parser: BaseOutputParser | None
|
94
|
+
self._combiner: Combiner | None
|
95
|
+
self._prompt: ChatPromptTemplate | None
|
96
|
+
|
97
|
+
self.max_prompts = max_prompts
|
98
|
+
self.override_token_limit = False if max_tokens is None else True
|
99
|
+
self._max_tokens = max_tokens
|
59
100
|
|
60
101
|
self.set_model(model_name=model, **model_arguments)
|
61
102
|
self.set_parser_type(parser_type=parser_type)
|
62
103
|
self.set_prompt(prompt_template=prompt_template)
|
63
104
|
self.set_target_language(
|
64
|
-
target_language=target_language,
|
105
|
+
target_language=target_language,
|
106
|
+
target_version=target_version,
|
65
107
|
)
|
108
|
+
self.set_db_path(db_path=db_path)
|
109
|
+
self.set_db_config(db_config=db_config)
|
66
110
|
|
67
111
|
self._load_parameters()
|
68
112
|
|
69
|
-
self.max_prompts = max_prompts
|
70
|
-
|
71
113
|
def _load_parameters(self) -> None:
|
72
114
|
self._load_model()
|
73
|
-
self.
|
115
|
+
self._load_prompt()
|
74
116
|
self._load_parser()
|
117
|
+
self._load_combiner()
|
118
|
+
self._load_vectorizer()
|
75
119
|
super()._load_parameters() # will call self._changed_attrs.clear()
|
76
120
|
|
77
121
|
def translate(
|
@@ -79,7 +123,7 @@ class Translator(Converter):
|
|
79
123
|
input_directory: str | Path,
|
80
124
|
output_directory: str | Path | None = None,
|
81
125
|
overwrite: bool = False,
|
82
|
-
|
126
|
+
collection_name: str | None = None,
|
83
127
|
) -> None:
|
84
128
|
"""Translate code in the input directory from the source language to the target
|
85
129
|
language, and write the resulting files to the output directory.
|
@@ -88,6 +132,7 @@ class Translator(Converter):
|
|
88
132
|
input_directory: The directory containing the code to translate.
|
89
133
|
output_directory: The directory to write the translated code to.
|
90
134
|
overwrite: Whether to overwrite existing files (vs skip them)
|
135
|
+
collection_name: Collection to add to
|
91
136
|
"""
|
92
137
|
# Convert paths to pathlib Paths if needed
|
93
138
|
if isinstance(input_directory, str):
|
@@ -99,25 +144,77 @@ class Translator(Converter):
|
|
99
144
|
if output_directory is not None and not output_directory.exists():
|
100
145
|
output_directory.mkdir(parents=True)
|
101
146
|
|
147
|
+
source_suffix = LANGUAGES[self._source_language]["suffix"]
|
102
148
|
target_suffix = LANGUAGES[self._target_language]["suffix"]
|
103
149
|
|
104
|
-
input_paths = input_directory.rglob(self._source_glob)
|
150
|
+
input_paths = [p for p in input_directory.rglob(self._source_glob)]
|
151
|
+
|
152
|
+
log.info(f"Input directory: {input_directory.absolute()}")
|
153
|
+
log.info(
|
154
|
+
f"{self._source_language.capitalize()} '*.{source_suffix}' files: "
|
155
|
+
f"{len(input_paths)}"
|
156
|
+
)
|
157
|
+
log.info(
|
158
|
+
"Other files (skipped): "
|
159
|
+
f"{len(list(input_directory.iterdir())) - len(input_paths)}\n"
|
160
|
+
)
|
161
|
+
if output_directory is not None:
|
162
|
+
output_paths = [
|
163
|
+
output_directory
|
164
|
+
/ p.relative_to(input_directory).with_suffix(f".{target_suffix}")
|
165
|
+
for p in input_paths
|
166
|
+
]
|
167
|
+
in_out_pairs = list(zip(input_paths, output_paths))
|
168
|
+
if not overwrite:
|
169
|
+
n_files = len(in_out_pairs)
|
170
|
+
in_out_pairs = [
|
171
|
+
(inp, outp) for inp, outp in in_out_pairs if not outp.exists()
|
172
|
+
]
|
173
|
+
log.info(
|
174
|
+
f"Skipping {n_files - len(in_out_pairs)} existing "
|
175
|
+
f"'*.{source_suffix}' files"
|
176
|
+
)
|
177
|
+
else:
|
178
|
+
in_out_pairs = [(f, None) for f in input_paths]
|
179
|
+
log.info(f"Translating {len(in_out_pairs)} '*.{source_suffix}' files")
|
105
180
|
|
106
181
|
# Now, loop through every code block in every file and translate it with an LLM
|
107
182
|
total_cost = 0.0
|
108
|
-
for in_path in
|
109
|
-
|
110
|
-
|
111
|
-
if output_directory is not None:
|
112
|
-
out_path = output_directory / relative.with_suffix(f".{target_suffix}")
|
113
|
-
else:
|
114
|
-
out_path = None
|
115
|
-
# Track the cost of translating the file
|
116
|
-
# TODO: If non-OpenAI models with prices are added, this will need
|
117
|
-
# to be updated.
|
118
|
-
with get_openai_callback() as cb:
|
183
|
+
for in_path, out_path in in_out_pairs:
|
184
|
+
# Translate the file, skip it if there's a rate limit error
|
185
|
+
try:
|
119
186
|
out_block = self.translate_file(in_path)
|
120
|
-
total_cost +=
|
187
|
+
total_cost += out_block.total_cost
|
188
|
+
except RateLimitError:
|
189
|
+
continue
|
190
|
+
except OutputParserException as e:
|
191
|
+
log.error(f"Skipping {in_path.name}, failed to parse output: {e}.")
|
192
|
+
continue
|
193
|
+
except BadRequestError as e:
|
194
|
+
if str(e).startswith("Detected an error in the prompt"):
|
195
|
+
log.warning("Malformed input, skipping")
|
196
|
+
continue
|
197
|
+
raise e
|
198
|
+
except ValidationError as e:
|
199
|
+
# Only allow ValidationError to pass if token limit is manually set
|
200
|
+
if self.override_token_limit:
|
201
|
+
log.warning(
|
202
|
+
"Current file and manually set token "
|
203
|
+
"limit is too large for this model, skipping"
|
204
|
+
)
|
205
|
+
continue
|
206
|
+
raise e
|
207
|
+
except TokenLimitError:
|
208
|
+
log.warning("Ran into irreducible node too large for context, skipping")
|
209
|
+
continue
|
210
|
+
except EmptyTreeError:
|
211
|
+
log.warning(
|
212
|
+
f'Input file "{in_path.name}" has no nodes of interest, skipping'
|
213
|
+
)
|
214
|
+
continue
|
215
|
+
except FileSizeError:
|
216
|
+
log.warning("Current tile is too large for basic splitter, skipping")
|
217
|
+
continue
|
121
218
|
|
122
219
|
# Don't attempt to write files for which translation failed
|
123
220
|
if not out_block.translated:
|
@@ -139,19 +236,36 @@ class Translator(Converter):
|
|
139
236
|
#
|
140
237
|
# self._embed_nodes_recursively(out_block, embedding_type, filename)
|
141
238
|
|
239
|
+
if collection_name is not None:
|
240
|
+
self._vectorizer.add_nodes_recursively(
|
241
|
+
out_block,
|
242
|
+
collection_name,
|
243
|
+
in_path.name,
|
244
|
+
)
|
245
|
+
# out_text = self.parser.parse_combined_output(out_block.complete_text)
|
246
|
+
# # Using same id naming convention from vectorize.py
|
247
|
+
# ids = [str(uuid.uuid3(uuid.NAMESPACE_DNS, out_text))]
|
248
|
+
# output_collection.upsert(ids=ids, documents=[out_text])
|
249
|
+
|
142
250
|
# Make sure the tree's code has been consolidated at the top level
|
143
251
|
# before writing to file
|
144
252
|
self._combiner.combine(out_block)
|
145
253
|
if out_path is not None and (overwrite or not out_path.exists()):
|
146
254
|
self._save_to_file(out_block, out_path)
|
147
|
-
if output_collection is not None:
|
148
|
-
out_text = self.parser.parse_combined_output(out_block.complete_text)
|
149
|
-
# Using same id naming convention from vectorize.py
|
150
|
-
ids = [str(uuid.uuid3(uuid.NAMESPACE_DNS, out_text))]
|
151
|
-
output_collection.upsert(ids=ids, documents=[out_text])
|
152
255
|
|
153
256
|
log.info(f"Total cost: ${total_cost:,.2f}")
|
154
257
|
|
258
|
+
def _split_file(self, file: Path) -> CodeBlock:
|
259
|
+
filename = file.name
|
260
|
+
log.info(f"[{filename}] Splitting file")
|
261
|
+
root = self._splitter.split(file)
|
262
|
+
log.info(
|
263
|
+
f"[{filename}] File split into {root.n_descendents:,} blocks, "
|
264
|
+
f"tree of height {root.height}"
|
265
|
+
)
|
266
|
+
log.info(f"[{filename}] Input CodeBlock Structure:\n{root.tree_str()}")
|
267
|
+
return root
|
268
|
+
|
155
269
|
def translate_file(self, file: Path) -> TranslatedCodeBlock:
|
156
270
|
"""Translate a single file.
|
157
271
|
|
@@ -164,18 +278,12 @@ class Translator(Converter):
|
|
164
278
|
`Combiner.combine_children` on the block.
|
165
279
|
"""
|
166
280
|
self._load_parameters()
|
167
|
-
|
168
281
|
filename = file.name
|
169
|
-
|
170
|
-
input_block = self.
|
171
|
-
|
172
|
-
f"[{filename}] File split into {input_block.n_descendents:,} blocks, "
|
173
|
-
f"tree of height {input_block.height}"
|
174
|
-
)
|
175
|
-
log.info(f"[{filename}] Input CodeBlock Structure:\n{input_block.tree_str()}")
|
176
|
-
# (temporarily?) comment-out adding embeddings; will be moved
|
177
|
-
# self._embed_nodes_recursively(input_block, EmbeddingType.SOURCE, filename)
|
282
|
+
|
283
|
+
input_block = self._split_file(file)
|
284
|
+
t0 = time.time()
|
178
285
|
output_block = self._iterative_translate(input_block)
|
286
|
+
output_block.processing_time = time.time() - t0
|
179
287
|
if output_block.translated:
|
180
288
|
completeness = output_block.translation_completeness
|
181
289
|
log.info(
|
@@ -183,7 +291,9 @@ class Translator(Converter):
|
|
183
291
|
f" {completeness:.2%} of input successfully translated\n"
|
184
292
|
f" Total cost: ${output_block.total_cost:,.2f}\n"
|
185
293
|
f" Total retries: {output_block.total_retries:,d}\n"
|
294
|
+
f" Output CodeBlock Structure:\n{input_block.tree_str()}\n"
|
186
295
|
)
|
296
|
+
|
187
297
|
else:
|
188
298
|
log.error(
|
189
299
|
f"[{filename}] Translation failed\n"
|
@@ -223,25 +333,14 @@ class Translator(Converter):
|
|
223
333
|
while stack:
|
224
334
|
translated_block = stack.pop()
|
225
335
|
|
226
|
-
|
227
|
-
# TODO: If non-OpenAI models with prices are added, this will need
|
228
|
-
# to be updated.
|
229
|
-
with get_openai_callback() as cb:
|
230
|
-
self._add_translation(translated_block)
|
231
|
-
translated_block.cost = cb.total_cost
|
232
|
-
translated_block.retries = max(0, cb.successful_requests - 1)
|
336
|
+
self._add_translation(translated_block)
|
233
337
|
|
234
338
|
# If translating this block was unsuccessful, don't bother with its
|
235
339
|
# children (they wouldn't show up in the final text anyway)
|
236
340
|
if not translated_block.translated:
|
237
341
|
continue
|
238
342
|
|
239
|
-
|
240
|
-
# Don't bother translating children if they aren't used
|
241
|
-
if self._combiner.contains_child(translated_block.text, child):
|
242
|
-
stack.append(child)
|
243
|
-
else:
|
244
|
-
log.warning(f"Skipping {child.id} (not referenced in parent code)")
|
343
|
+
stack.extend(translated_block.children)
|
245
344
|
|
246
345
|
progress = translated_root.translation_completeness
|
247
346
|
if progress - last_prog > prog_delta:
|
@@ -266,11 +365,6 @@ class Translator(Converter):
|
|
266
365
|
block.translated = True
|
267
366
|
return
|
268
367
|
|
269
|
-
log.debug(f"[{block.name}] Translating...")
|
270
|
-
log.debug(f"[{block.name}] Input text:\n{block.original.text}")
|
271
|
-
prompt = self._prompt_engine.create(block.original)
|
272
|
-
top_score = -1.0
|
273
|
-
|
274
368
|
if self._llm is None:
|
275
369
|
message = (
|
276
370
|
"Model not configured correctly, cannot translate. Try setting "
|
@@ -279,51 +373,85 @@ class Translator(Converter):
|
|
279
373
|
log.error(message)
|
280
374
|
raise ValueError(message)
|
281
375
|
|
282
|
-
|
283
|
-
|
284
|
-
output = self._llm.predict_messages(prompt)
|
285
|
-
try:
|
286
|
-
parsed_output = self.parser.parse(output.content)
|
287
|
-
except ValueError as e:
|
288
|
-
log.warning(f"[{block.name}] Failed to parse output: {e}")
|
289
|
-
log.debug(f"[{block.name}] Failed output:\n{output.content}")
|
290
|
-
continue
|
291
|
-
|
292
|
-
score = self.parser.score(block.original, parsed_output)
|
293
|
-
if score > top_score:
|
294
|
-
block.text = parsed_output
|
295
|
-
top_score = score
|
296
|
-
|
297
|
-
if score >= 1.0:
|
298
|
-
break
|
299
|
-
|
300
|
-
else:
|
301
|
-
if block.text is None:
|
302
|
-
error_msg = (
|
303
|
-
f"[{block.name}] Failed to parse output after "
|
304
|
-
f"{self.max_prompts} retries. Marking as untranslated."
|
305
|
-
)
|
306
|
-
log.warning(error_msg)
|
307
|
-
return
|
376
|
+
log.debug(f"[{block.name}] Translating...")
|
377
|
+
log.debug(f"[{block.name}] Input text:\n{block.original.text}")
|
308
378
|
|
309
|
-
|
379
|
+
# Track the cost of translating this block
|
380
|
+
# TODO: If non-OpenAI models with prices are added, this will need
|
381
|
+
# to be updated.
|
382
|
+
with get_model_callback() as cb:
|
383
|
+
t0 = time.time()
|
384
|
+
block.text = self._run_chain(block)
|
385
|
+
block.processing_time = time.time() - t0
|
386
|
+
block.cost = cb.total_cost
|
387
|
+
block.retries = max(0, cb.successful_requests - 1)
|
310
388
|
|
311
389
|
block.tokens = self._llm.get_num_tokens(block.text)
|
312
390
|
block.translated = True
|
313
391
|
|
314
392
|
log.debug(f"[{block.name}] Output code:\n{block.text}")
|
315
393
|
|
394
|
+
def _run_chain(self, block: TranslatedCodeBlock) -> str:
|
395
|
+
"""Run the model with three nested error fixing schemes.
|
396
|
+
First, try to fix simple formatting errors by giving the model just
|
397
|
+
the output and the parsing error. After a number of attempts, try
|
398
|
+
giving the model the output, the parsing error, and the original
|
399
|
+
input. Again check/retry this output to solve for formatting errors.
|
400
|
+
If we still haven't succeeded after several attempts, the model may
|
401
|
+
be getting thrown off by a bad initial output; start from scratch
|
402
|
+
and try again.
|
403
|
+
|
404
|
+
The number of tries for each layer of this scheme is roughly equal
|
405
|
+
to the cube root of self.max_retries, so the total calls to the
|
406
|
+
LLM will be roughly as expected (up to sqrt(self.max_retries) over)
|
407
|
+
"""
|
408
|
+
self._parser.set_reference(block.original)
|
409
|
+
|
410
|
+
# Retries with just the output and the error
|
411
|
+
n1 = round(self.max_prompts ** (1 / 3))
|
412
|
+
|
413
|
+
# Retries with the input, output, and error
|
414
|
+
n2 = round((self.max_prompts // n1) ** (1 / 2))
|
415
|
+
|
416
|
+
# Retries with just the input
|
417
|
+
n3 = math.ceil(self.max_prompts / (n1 * n2))
|
418
|
+
|
419
|
+
fix_format = OutputFixingParser.from_llm(
|
420
|
+
llm=self._llm,
|
421
|
+
parser=self._parser,
|
422
|
+
max_retries=n1,
|
423
|
+
)
|
424
|
+
retry = RetryWithErrorOutputParser.from_llm(
|
425
|
+
llm=self._llm,
|
426
|
+
parser=fix_format,
|
427
|
+
max_retries=n2,
|
428
|
+
)
|
429
|
+
|
430
|
+
completion_chain = self._prompt | self._llm
|
431
|
+
chain = RunnableParallel(
|
432
|
+
completion=completion_chain, prompt_value=self._prompt
|
433
|
+
) | RunnableLambda(lambda x: retry.parse_with_prompt(**x))
|
434
|
+
|
435
|
+
for _ in range(n3):
|
436
|
+
try:
|
437
|
+
return chain.invoke({"SOURCE_CODE": block.original.text})
|
438
|
+
except OutputParserException:
|
439
|
+
pass
|
440
|
+
|
441
|
+
raise OutputParserException(f"Failed to parse after {n1*n2*n3} retries")
|
442
|
+
|
316
443
|
def _save_to_file(self, block: CodeBlock, out_path: Path) -> None:
|
317
444
|
"""Save a file to disk.
|
318
445
|
|
319
446
|
Arguments:
|
320
447
|
block: The `CodeBlock` to save to a file.
|
321
448
|
"""
|
322
|
-
|
449
|
+
# TODO: can't use output fixer and this system for combining output
|
450
|
+
out_text = self._parser.parse_combined_output(block.complete_text)
|
323
451
|
out_path.parent.mkdir(parents=True, exist_ok=True)
|
324
452
|
out_path.write_text(out_text, encoding="utf-8")
|
325
453
|
|
326
|
-
def set_model(self, model_name: str, **custom_arguments:
|
454
|
+
def set_model(self, model_name: str, **custom_arguments: dict[str, Any]):
|
327
455
|
"""Validate and set the model name.
|
328
456
|
|
329
457
|
The affected objects will not be updated until translate() is called.
|
@@ -363,7 +491,9 @@ class Translator(Converter):
|
|
363
491
|
"""
|
364
492
|
self._prompt_template_name = prompt_template
|
365
493
|
|
366
|
-
def set_target_language(
|
494
|
+
def set_target_language(
|
495
|
+
self, target_language: str, target_version: str | None
|
496
|
+
) -> None:
|
367
497
|
"""Validate and set the target language.
|
368
498
|
|
369
499
|
The affected objects will not be updated until translate() is called.
|
@@ -382,8 +512,14 @@ class Translator(Converter):
|
|
382
512
|
self._target_language = target_language
|
383
513
|
self._target_version = target_version
|
384
514
|
|
515
|
+
def set_db_path(self, db_path: str) -> None:
|
516
|
+
self._db_path = db_path
|
517
|
+
|
518
|
+
def set_db_config(self, db_config: dict[str, Any] | None) -> None:
|
519
|
+
self._db_config = db_config
|
520
|
+
|
385
521
|
@run_if_changed("_model_name", "_custom_model_arguments")
|
386
|
-
def _load_model(self):
|
522
|
+
def _load_model(self) -> None:
|
387
523
|
"""Load the model according to this instance's attributes.
|
388
524
|
|
389
525
|
If the relevant fields have not been changed since the last time this method was
|
@@ -398,7 +534,9 @@ class Translator(Converter):
|
|
398
534
|
self._llm, token_limit, self.model_cost = load_model(self._model_name)
|
399
535
|
# Set the max_tokens to less than half the model's limit to allow for enough
|
400
536
|
# tokens at output
|
401
|
-
|
537
|
+
# Only modify max_tokens if it is not specified by user
|
538
|
+
if not self.override_token_limit:
|
539
|
+
self._max_tokens = token_limit // 2.5
|
402
540
|
|
403
541
|
@run_if_changed("_parser_type", "_target_language")
|
404
542
|
def _load_parser(self) -> None:
|
@@ -412,14 +550,26 @@ class Translator(Converter):
|
|
412
550
|
f"Target language ({self._target_language}) suggests target "
|
413
551
|
f"parser should be 'text', but is '{self._parser_type}'"
|
414
552
|
)
|
553
|
+
if (
|
554
|
+
self._parser_type in {"eval", "multidoc", "madlibs"}
|
555
|
+
and "json" != self._target_language
|
556
|
+
):
|
557
|
+
raise ValueError(
|
558
|
+
f"Parser type ({self._parser_type}) suggests target language"
|
559
|
+
f" should be 'json', but is '{self._target_language}'"
|
560
|
+
)
|
415
561
|
if "code" == self._parser_type:
|
416
|
-
self.
|
562
|
+
self._parser = CodeParser(language=self._target_language)
|
417
563
|
elif "eval" == self._parser_type:
|
418
|
-
self.
|
419
|
-
|
420
|
-
)
|
564
|
+
self._parser = EvaluationParser()
|
565
|
+
elif "multidoc" == self._parser_type:
|
566
|
+
self._parser = MultiDocumentationParser()
|
567
|
+
elif "madlibs" == self._parser_type:
|
568
|
+
self._parser = MadlibsDocumentationParser()
|
421
569
|
elif "text" == self._parser_type:
|
422
|
-
self.
|
570
|
+
self._parser = GenericParser()
|
571
|
+
elif "requirements" == self._parser_type:
|
572
|
+
self._parser = RequirementsParser()
|
423
573
|
else:
|
424
574
|
raise ValueError(
|
425
575
|
f"Unsupported parser type: {self._parser_type}. Can be: "
|
@@ -427,13 +577,17 @@ class Translator(Converter):
|
|
427
577
|
)
|
428
578
|
|
429
579
|
@run_if_changed(
|
430
|
-
"_prompt_template_name",
|
580
|
+
"_prompt_template_name",
|
581
|
+
"_source_language",
|
582
|
+
"_target_language",
|
583
|
+
"_target_version",
|
584
|
+
"_model_name",
|
431
585
|
)
|
432
|
-
def
|
433
|
-
"""Load the prompt
|
586
|
+
def _load_prompt(self) -> None:
|
587
|
+
"""Load the prompt according to this instance's attributes.
|
434
588
|
|
435
|
-
If the relevant fields have not been changed since the last time this
|
436
|
-
called, nothing happens.
|
589
|
+
If the relevant fields have not been changed since the last time this
|
590
|
+
method was called, nothing happens.
|
437
591
|
"""
|
438
592
|
if self._prompt_template_name in SAME_OUTPUT:
|
439
593
|
if self._target_language != self._source_language:
|
@@ -448,9 +602,380 @@ class Translator(Converter):
|
|
448
602
|
f"language should be 'text', but is '{self._target_language}'"
|
449
603
|
)
|
450
604
|
|
451
|
-
|
605
|
+
prompt_engine = MODEL_PROMPT_ENGINES[self._model_name](
|
452
606
|
source_language=self._source_language,
|
453
607
|
target_language=self._target_language,
|
454
608
|
target_version=self._target_version,
|
455
609
|
prompt_template=self._prompt_template_name,
|
456
610
|
)
|
611
|
+
self._prompt = prompt_engine.prompt
|
612
|
+
|
613
|
+
@run_if_changed("_db_path")
|
614
|
+
def _load_vectorizer(self) -> None:
|
615
|
+
if self._db_path is None:
|
616
|
+
self._vectorizer = None
|
617
|
+
return
|
618
|
+
vectorizer_factory = ChromaDBVectorizer()
|
619
|
+
self._vectorizer = vectorizer_factory.create_vectorizer(
|
620
|
+
self._db_path, self._db_config
|
621
|
+
)
|
622
|
+
|
623
|
+
@run_if_changed(
|
624
|
+
"_source_language",
|
625
|
+
"_max_tokens",
|
626
|
+
"_llm",
|
627
|
+
"_protected_node_types",
|
628
|
+
"_prune_node_types",
|
629
|
+
)
|
630
|
+
def _load_splitter(self) -> None:
|
631
|
+
if self._custom_splitter is None:
|
632
|
+
super()._load_splitter()
|
633
|
+
else:
|
634
|
+
kwargs = dict(
|
635
|
+
max_tokens=self._max_tokens,
|
636
|
+
model=self._llm,
|
637
|
+
protected_node_types=self._protected_node_types,
|
638
|
+
prune_node_types=self._prune_node_types,
|
639
|
+
)
|
640
|
+
# TODO: This should be configurable
|
641
|
+
if self._custom_splitter == "tag":
|
642
|
+
kwargs["tag"] = "<ITMOD_ALC_SPLIT>"
|
643
|
+
self._splitter = CUSTOM_SPLITTERS[self._custom_splitter](
|
644
|
+
language=self._source_language, **kwargs
|
645
|
+
)
|
646
|
+
|
647
|
+
@run_if_changed("_target_language", "_parser_type")
|
648
|
+
def _load_combiner(self) -> None:
|
649
|
+
if self._parser_type == "requirements":
|
650
|
+
self._combiner = ChunkCombiner()
|
651
|
+
elif self._target_language == "json":
|
652
|
+
self._combiner = JsonCombiner()
|
653
|
+
else:
|
654
|
+
self._combiner = Combiner()
|
655
|
+
|
656
|
+
|
657
|
+
class Documenter(Translator):
|
658
|
+
def __init__(
|
659
|
+
self, source_language: str = "fortran", drop_comments: bool = True, **kwargs
|
660
|
+
):
|
661
|
+
kwargs.update(
|
662
|
+
source_language=source_language,
|
663
|
+
target_language="text",
|
664
|
+
target_version=None,
|
665
|
+
prompt_template="document",
|
666
|
+
parser_type="text",
|
667
|
+
)
|
668
|
+
super().__init__(**kwargs)
|
669
|
+
|
670
|
+
if drop_comments:
|
671
|
+
comment_node_type = LANGUAGES[source_language].get(
|
672
|
+
"comment_node_type", "comment"
|
673
|
+
)
|
674
|
+
self.set_prune_node_types([comment_node_type])
|
675
|
+
|
676
|
+
|
677
|
+
class MultiDocumenter(Documenter):
|
678
|
+
def __init__(self, **kwargs):
|
679
|
+
super().__init__(**kwargs)
|
680
|
+
self.set_prompt("multidocument")
|
681
|
+
self.set_parser_type("multidoc")
|
682
|
+
self.set_target_language("json", None)
|
683
|
+
|
684
|
+
|
685
|
+
class MadLibsDocumenter(Documenter):
|
686
|
+
def __init__(
|
687
|
+
self,
|
688
|
+
comments_per_request: int | None = None,
|
689
|
+
**kwargs,
|
690
|
+
) -> None:
|
691
|
+
kwargs.update(drop_comments=False)
|
692
|
+
super().__init__(**kwargs)
|
693
|
+
self.set_prompt("document_madlibs")
|
694
|
+
self.set_parser_type("madlibs")
|
695
|
+
self.set_target_language("json", None)
|
696
|
+
self.comments_per_request = comments_per_request
|
697
|
+
|
698
|
+
def _add_translation(self, block: TranslatedCodeBlock):
|
699
|
+
if block.translated:
|
700
|
+
return
|
701
|
+
|
702
|
+
if block.original.text is None:
|
703
|
+
block.translated = True
|
704
|
+
return
|
705
|
+
|
706
|
+
if self.comments_per_request is None:
|
707
|
+
return super()._add_translation(block)
|
708
|
+
|
709
|
+
comment_pattern = r"<(?:INLINE|BLOCK)_COMMENT \w{8}>"
|
710
|
+
comments = list(
|
711
|
+
re.finditer(
|
712
|
+
comment_pattern,
|
713
|
+
block.original.text,
|
714
|
+
)
|
715
|
+
)
|
716
|
+
|
717
|
+
if not comments:
|
718
|
+
log.info(f"[{block.name}] Skipping commentless block")
|
719
|
+
block.translated = True
|
720
|
+
block.text = None
|
721
|
+
block.complete = True
|
722
|
+
return
|
723
|
+
|
724
|
+
if len(comments) <= self.comments_per_request:
|
725
|
+
return super()._add_translation(block)
|
726
|
+
|
727
|
+
comment_group_indices = list(range(0, len(comments), self.comments_per_request))
|
728
|
+
log.debug(
|
729
|
+
f"[{block.name}] Block contains more than {self.comments_per_request}"
|
730
|
+
f" comments, splitting {len(comments)} comments into"
|
731
|
+
f" {len(comment_group_indices)} groups"
|
732
|
+
)
|
733
|
+
|
734
|
+
block.processing_time = 0
|
735
|
+
block.cost = 0
|
736
|
+
block.retries = 0
|
737
|
+
obj = {}
|
738
|
+
for i in range(0, len(comments), self.comments_per_request):
|
739
|
+
# Split the text into the section containing comments of interest,
|
740
|
+
# all the text prior to those comments, and all the text after them
|
741
|
+
working_comments = comments[i : i + self.comments_per_request]
|
742
|
+
start_idx = working_comments[0].start()
|
743
|
+
end_idx = working_comments[-1].end()
|
744
|
+
prefix = block.original.text[:start_idx]
|
745
|
+
keeper = block.original.text[start_idx:end_idx]
|
746
|
+
suffix = block.original.text[end_idx:]
|
747
|
+
|
748
|
+
# Strip all comment placeholders outside of the section of interest
|
749
|
+
prefix = re.sub(comment_pattern, "", prefix)
|
750
|
+
suffix = re.sub(comment_pattern, "", suffix)
|
751
|
+
|
752
|
+
# Build a new TranslatedBlock using the new working text
|
753
|
+
working_copy = deepcopy(block.original)
|
754
|
+
working_copy.text = prefix + keeper + suffix
|
755
|
+
working_block = TranslatedCodeBlock(working_copy, self._target_language)
|
756
|
+
|
757
|
+
# Run the LLM on the working text
|
758
|
+
super()._add_translation(working_block)
|
759
|
+
|
760
|
+
# Update metadata to include for all runs
|
761
|
+
block.retries += working_block.retries
|
762
|
+
block.cost += working_block.cost
|
763
|
+
block.processing_time += working_block.processing_time
|
764
|
+
|
765
|
+
# Update the output text to merge this section's output in
|
766
|
+
out_text = self._parser.parse(working_block.text)
|
767
|
+
obj.update(json.loads(out_text))
|
768
|
+
|
769
|
+
self._parser.set_reference(block.original)
|
770
|
+
block.text = self._parser.parse(json.dumps(obj))
|
771
|
+
block.tokens = self._llm.get_num_tokens(block.text)
|
772
|
+
block.translated = True
|
773
|
+
|
774
|
+
def _get_obj(
|
775
|
+
self, block: TranslatedCodeBlock
|
776
|
+
) -> dict[str, int | float | dict[str, str]]:
|
777
|
+
out_text = self._parser.parse_combined_output(block.complete_text)
|
778
|
+
obj = dict(
|
779
|
+
retries=block.total_retries,
|
780
|
+
cost=block.total_cost,
|
781
|
+
processing_time=block.processing_time,
|
782
|
+
comments=json.loads(out_text),
|
783
|
+
)
|
784
|
+
return obj
|
785
|
+
|
786
|
+
def _save_to_file(self, block: TranslatedCodeBlock, out_path: Path) -> None:
|
787
|
+
"""Save a file to disk.
|
788
|
+
|
789
|
+
Arguments:
|
790
|
+
block: The `CodeBlock` to save to a file.
|
791
|
+
"""
|
792
|
+
obj = self._get_obj(block)
|
793
|
+
out_path.parent.mkdir(parents=True, exist_ok=True)
|
794
|
+
out_path.write_text(json.dumps(obj, indent=2), encoding="utf-8")
|
795
|
+
|
796
|
+
|
797
|
+
class DiagramGenerator(Documenter):
|
798
|
+
"""DiagramGenerator
|
799
|
+
|
800
|
+
A class that translates code from one programming language to a set of diagrams.
|
801
|
+
"""
|
802
|
+
|
803
|
+
def __init__(
|
804
|
+
self,
|
805
|
+
model: str = "gpt-3.5-turbo-0125",
|
806
|
+
model_arguments: dict[str, Any] = {},
|
807
|
+
source_language: str = "fortran",
|
808
|
+
max_prompts: int = 10,
|
809
|
+
db_path: str | None = None,
|
810
|
+
db_config: dict[str, Any] | None = None,
|
811
|
+
diagram_type="Activity",
|
812
|
+
add_documentation=False,
|
813
|
+
custom_splitter: str | None = None,
|
814
|
+
) -> None:
|
815
|
+
"""Initialize the DiagramGenerator class
|
816
|
+
|
817
|
+
Arguments:
|
818
|
+
model: The LLM to use for translation. If an OpenAI model, the
|
819
|
+
`OPENAI_API_KEY` environment variable must be set and the
|
820
|
+
`OPENAI_ORG_ID` environment variable should be set if needed.
|
821
|
+
model_arguments: Additional arguments to pass to the LLM constructor.
|
822
|
+
source_language: The source programming language.
|
823
|
+
max_prompts: The maximum number of prompts to try before giving up.
|
824
|
+
db_path: path to chroma database
|
825
|
+
db_config: database configuraiton
|
826
|
+
diagram_type: type of PLANTUML diagram to generate
|
827
|
+
"""
|
828
|
+
super().__init__(
|
829
|
+
model=model,
|
830
|
+
model_arguments=model_arguments,
|
831
|
+
source_language=source_language,
|
832
|
+
max_prompts=max_prompts,
|
833
|
+
db_path=db_path,
|
834
|
+
db_config=db_config,
|
835
|
+
custom_splitter=custom_splitter,
|
836
|
+
)
|
837
|
+
self._diagram_type = diagram_type
|
838
|
+
self._add_documentation = add_documentation
|
839
|
+
self._documenter = None
|
840
|
+
self._model = model
|
841
|
+
self._model_arguments = model_arguments
|
842
|
+
self._max_prompts = max_prompts
|
843
|
+
if add_documentation:
|
844
|
+
self._diagram_prompt_template_name = "diagram_with_documentation"
|
845
|
+
else:
|
846
|
+
self._diagram_prompt_template_name = "diagram"
|
847
|
+
self._load_diagram_prompt_engine()
|
848
|
+
|
849
|
+
def _add_translation(self, block: TranslatedCodeBlock) -> None:
|
850
|
+
"""Given an "empty" `TranslatedCodeBlock`, translate the code represented in
|
851
|
+
`block.original`, setting the relevant fields in the translated block. The
|
852
|
+
`TranslatedCodeBlock` is updated in-pace, nothing is returned. Note that this
|
853
|
+
translates *only* the code for this block, not its children.
|
854
|
+
|
855
|
+
Arguments:
|
856
|
+
block: An empty `TranslatedCodeBlock`
|
857
|
+
"""
|
858
|
+
if block.translated:
|
859
|
+
return
|
860
|
+
|
861
|
+
if block.original.text is None:
|
862
|
+
block.translated = True
|
863
|
+
return
|
864
|
+
|
865
|
+
if self._add_documentation:
|
866
|
+
documentation_block = deepcopy(block)
|
867
|
+
super()._add_translation(documentation_block)
|
868
|
+
if not documentation_block.translated:
|
869
|
+
message = "Error: unable to produce documentation for code block"
|
870
|
+
log.message(message)
|
871
|
+
raise ValueError(message)
|
872
|
+
documentation = json.loads(documentation_block.text)["docstring"]
|
873
|
+
|
874
|
+
if self._llm is None:
|
875
|
+
message = (
|
876
|
+
"Model not configured correctly, cannot translate. Try setting "
|
877
|
+
"the model"
|
878
|
+
)
|
879
|
+
log.error(message)
|
880
|
+
raise ValueError(message)
|
881
|
+
|
882
|
+
log.debug(f"[{block.name}] Translating...")
|
883
|
+
log.debug(f"[{block.name}] Input text:\n{block.original.text}")
|
884
|
+
|
885
|
+
self._parser.set_reference(block.original)
|
886
|
+
|
887
|
+
query_and_parse = self.diagram_prompt | self._llm | self._parser
|
888
|
+
|
889
|
+
if self._add_documentation:
|
890
|
+
block.text = query_and_parse.invoke(
|
891
|
+
{
|
892
|
+
"SOURCE_CODE": block.original.text,
|
893
|
+
"DIAGRAM_TYPE": self._diagram_type,
|
894
|
+
"DOCUMENTATION": documentation,
|
895
|
+
}
|
896
|
+
)
|
897
|
+
else:
|
898
|
+
block.text = query_and_parse.invoke(
|
899
|
+
{
|
900
|
+
"SOURCE_CODE": block.original.text,
|
901
|
+
"DIAGRAM_TYPE": self._diagram_type,
|
902
|
+
}
|
903
|
+
)
|
904
|
+
block.tokens = self._llm.get_num_tokens(block.text)
|
905
|
+
block.translated = True
|
906
|
+
|
907
|
+
log.debug(f"[{block.name}] Output code:\n{block.text}")
|
908
|
+
|
909
|
+
@run_if_changed(
|
910
|
+
"_diagram_prompt_template_name",
|
911
|
+
"_source_language",
|
912
|
+
)
|
913
|
+
def _load_diagram_prompt_engine(self) -> None:
|
914
|
+
"""Load the prompt engine according to this instance's attributes.
|
915
|
+
|
916
|
+
If the relevant fields have not been changed since the last time this method was
|
917
|
+
called, nothing happens.
|
918
|
+
"""
|
919
|
+
if self._diagram_prompt_template_name in SAME_OUTPUT:
|
920
|
+
if self._target_language != self._source_language:
|
921
|
+
raise ValueError(
|
922
|
+
f"Prompt template ({self._prompt_template_name}) suggests "
|
923
|
+
f"source and target languages should match, but do not "
|
924
|
+
f"({self._source_language} != {self._target_language})"
|
925
|
+
)
|
926
|
+
if (
|
927
|
+
self._diagram_prompt_template_name in TEXT_OUTPUT
|
928
|
+
and self._target_language != "text"
|
929
|
+
):
|
930
|
+
raise ValueError(
|
931
|
+
f"Prompt template ({self._prompt_template_name}) suggests target "
|
932
|
+
f"language should be 'text', but is '{self._target_language}'"
|
933
|
+
)
|
934
|
+
|
935
|
+
self._diagram_prompt_engine = MODEL_PROMPT_ENGINES[self._model_name](
|
936
|
+
source_language=self._source_language,
|
937
|
+
target_language=self._target_language,
|
938
|
+
target_version=self._target_version,
|
939
|
+
prompt_template=self._diagram_prompt_template_name,
|
940
|
+
)
|
941
|
+
self.diagram_prompt = self._diagram_prompt_engine.prompt
|
942
|
+
|
943
|
+
|
944
|
+
class RequirementsDocumenter(Documenter):
|
945
|
+
"""RequirementsGenerator
|
946
|
+
|
947
|
+
A class that translates code from one programming language to its requirements.
|
948
|
+
"""
|
949
|
+
|
950
|
+
def __init__(self, **kwargs):
|
951
|
+
super().__init__(**kwargs)
|
952
|
+
self.set_prompt("chunk_requirements")
|
953
|
+
self.set_target_language("json", None)
|
954
|
+
self.set_parser_type("requirements")
|
955
|
+
|
956
|
+
def _save_to_file(self, block: TranslatedCodeBlock, out_path: Path) -> None:
|
957
|
+
"""Save a file to disk.
|
958
|
+
|
959
|
+
Arguments:
|
960
|
+
block: The `CodeBlock` to save to a file.
|
961
|
+
"""
|
962
|
+
output_list = list()
|
963
|
+
# For each chunk of code, get generation metadata, the text of the code,
|
964
|
+
# and the LLM generated requirements
|
965
|
+
for child in block.children:
|
966
|
+
code = child.original.text
|
967
|
+
requirements = self._parser.parse_combined_output(child.complete_text)
|
968
|
+
metadata = dict(
|
969
|
+
retries=child.total_retries,
|
970
|
+
cost=child.total_cost,
|
971
|
+
processing_time=child.processing_time,
|
972
|
+
)
|
973
|
+
# Put them all in a top level 'output' key
|
974
|
+
output_list.append(
|
975
|
+
dict(metadata=metadata, code=code, requirements=requirements)
|
976
|
+
)
|
977
|
+
obj = dict(
|
978
|
+
output=output_list,
|
979
|
+
)
|
980
|
+
out_path.parent.mkdir(parents=True, exist_ok=True)
|
981
|
+
out_path.write_text(json.dumps(obj, indent=2), encoding="utf-8")
|