janus-llm 4.3.1__py3-none-any.whl → 4.3.5__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- janus/__init__.py +1 -1
- janus/__main__.py +1 -1
- janus/_tests/evaluator_tests/EvalReadMe.md +85 -0
- janus/_tests/evaluator_tests/incose_tests/incose_large_test.json +39 -0
- janus/_tests/evaluator_tests/incose_tests/incose_small_test.json +17 -0
- janus/_tests/evaluator_tests/inline_comment_tests/mumps_inline_comment_test.m +71 -0
- janus/_tests/test_cli.py +3 -2
- janus/cli/aggregate.py +135 -0
- janus/cli/cli.py +111 -0
- janus/cli/constants.py +43 -0
- janus/cli/database.py +289 -0
- janus/cli/diagram.py +178 -0
- janus/cli/document.py +174 -0
- janus/cli/embedding.py +122 -0
- janus/cli/llm.py +187 -0
- janus/cli/partition.py +125 -0
- janus/cli/self_eval.py +149 -0
- janus/cli/translate.py +183 -0
- janus/converter/__init__.py +1 -1
- janus/converter/_tests/test_translate.py +2 -0
- janus/converter/converter.py +129 -93
- janus/converter/document.py +21 -14
- janus/converter/evaluate.py +20 -13
- janus/converter/translate.py +3 -3
- janus/embedding/collections.py +1 -1
- janus/language/alc/_tests/alc.asm +3779 -0
- janus/language/binary/_tests/hello.bin +0 -0
- janus/language/block.py +47 -12
- janus/language/file.py +1 -1
- janus/language/mumps/_tests/mumps.m +235 -0
- janus/language/treesitter/_tests/languages/fortran.f90 +416 -0
- janus/language/treesitter/_tests/languages/ibmhlasm.asm +16 -0
- janus/language/treesitter/_tests/languages/matlab.m +225 -0
- janus/llm/models_info.py +9 -1
- janus/metrics/_tests/asm_test_file.asm +10 -0
- janus/metrics/_tests/mumps_test_file.m +6 -0
- janus/metrics/_tests/test_treesitter_metrics.py +1 -1
- janus/metrics/prompts/clarity.txt +8 -0
- janus/metrics/prompts/completeness.txt +16 -0
- janus/metrics/prompts/faithfulness.txt +10 -0
- janus/metrics/prompts/hallucination.txt +16 -0
- janus/metrics/prompts/quality.txt +8 -0
- janus/metrics/prompts/readability.txt +16 -0
- janus/metrics/prompts/usefulness.txt +16 -0
- janus/parsers/code_parser.py +4 -4
- janus/parsers/doc_parser.py +12 -9
- janus/parsers/parser.py +7 -0
- janus/parsers/partition_parser.py +6 -4
- janus/parsers/reqs_parser.py +8 -5
- janus/parsers/uml.py +5 -4
- janus/prompts/prompt.py +2 -2
- janus/prompts/templates/README.md +30 -0
- janus/prompts/templates/basic_aggregation/human.txt +6 -0
- janus/prompts/templates/basic_aggregation/system.txt +1 -0
- janus/prompts/templates/basic_refinement/human.txt +14 -0
- janus/prompts/templates/basic_refinement/system.txt +1 -0
- janus/prompts/templates/diagram/human.txt +9 -0
- janus/prompts/templates/diagram/system.txt +1 -0
- janus/prompts/templates/diagram_with_documentation/human.txt +15 -0
- janus/prompts/templates/diagram_with_documentation/system.txt +1 -0
- janus/prompts/templates/document/human.txt +10 -0
- janus/prompts/templates/document/system.txt +1 -0
- janus/prompts/templates/document_cloze/human.txt +11 -0
- janus/prompts/templates/document_cloze/system.txt +1 -0
- janus/prompts/templates/document_cloze/variables.json +4 -0
- janus/prompts/templates/document_cloze/variables_asm.json +4 -0
- janus/prompts/templates/document_inline/human.txt +13 -0
- janus/prompts/templates/eval_prompts/incose/human.txt +32 -0
- janus/prompts/templates/eval_prompts/incose/system.txt +1 -0
- janus/prompts/templates/eval_prompts/incose/variables.json +3 -0
- janus/prompts/templates/eval_prompts/inline_comments/human.txt +49 -0
- janus/prompts/templates/eval_prompts/inline_comments/system.txt +1 -0
- janus/prompts/templates/eval_prompts/inline_comments/variables.json +3 -0
- janus/prompts/templates/micromanaged_mumps_v1.0/human.txt +23 -0
- janus/prompts/templates/micromanaged_mumps_v1.0/system.txt +3 -0
- janus/prompts/templates/micromanaged_mumps_v2.0/human.txt +28 -0
- janus/prompts/templates/micromanaged_mumps_v2.0/system.txt +3 -0
- janus/prompts/templates/micromanaged_mumps_v2.1/human.txt +29 -0
- janus/prompts/templates/micromanaged_mumps_v2.1/system.txt +3 -0
- janus/prompts/templates/multidocument/human.txt +15 -0
- janus/prompts/templates/multidocument/system.txt +1 -0
- janus/prompts/templates/partition/human.txt +22 -0
- janus/prompts/templates/partition/system.txt +1 -0
- janus/prompts/templates/partition/variables.json +4 -0
- janus/prompts/templates/pseudocode/human.txt +7 -0
- janus/prompts/templates/pseudocode/system.txt +7 -0
- janus/prompts/templates/refinement/fix_exceptions/human.txt +19 -0
- janus/prompts/templates/refinement/fix_exceptions/system.txt +1 -0
- janus/prompts/templates/refinement/format/code_format/human.txt +12 -0
- janus/prompts/templates/refinement/format/code_format/system.txt +1 -0
- janus/prompts/templates/refinement/format/requirements_format/human.txt +14 -0
- janus/prompts/templates/refinement/format/requirements_format/system.txt +1 -0
- janus/prompts/templates/refinement/hallucination/human.txt +13 -0
- janus/prompts/templates/refinement/hallucination/system.txt +1 -0
- janus/prompts/templates/refinement/reflection/human.txt +15 -0
- janus/prompts/templates/refinement/reflection/incose/human.txt +26 -0
- janus/prompts/templates/refinement/reflection/incose/system.txt +1 -0
- janus/prompts/templates/refinement/reflection/incose_deduplicate/human.txt +16 -0
- janus/prompts/templates/refinement/reflection/incose_deduplicate/system.txt +1 -0
- janus/prompts/templates/refinement/reflection/system.txt +1 -0
- janus/prompts/templates/refinement/revision/human.txt +16 -0
- janus/prompts/templates/refinement/revision/incose/human.txt +16 -0
- janus/prompts/templates/refinement/revision/incose/system.txt +1 -0
- janus/prompts/templates/refinement/revision/incose_deduplicate/human.txt +17 -0
- janus/prompts/templates/refinement/revision/incose_deduplicate/system.txt +1 -0
- janus/prompts/templates/refinement/revision/system.txt +1 -0
- janus/prompts/templates/refinement/uml/alc_fix_variables/human.txt +15 -0
- janus/prompts/templates/refinement/uml/alc_fix_variables/system.txt +2 -0
- janus/prompts/templates/refinement/uml/fix_connections/human.txt +15 -0
- janus/prompts/templates/refinement/uml/fix_connections/system.txt +2 -0
- janus/prompts/templates/requirements/human.txt +13 -0
- janus/prompts/templates/requirements/system.txt +2 -0
- janus/prompts/templates/retrieval/language_docs/human.txt +10 -0
- janus/prompts/templates/retrieval/language_docs/system.txt +1 -0
- janus/prompts/templates/simple/human.txt +16 -0
- janus/prompts/templates/simple/system.txt +3 -0
- janus/refiners/format.py +49 -0
- janus/refiners/refiner.py +113 -4
- janus/utils/enums.py +127 -112
- janus/utils/logger.py +2 -0
- {janus_llm-4.3.1.dist-info → janus_llm-4.3.5.dist-info}/METADATA +7 -7
- janus_llm-4.3.5.dist-info/RECORD +210 -0
- {janus_llm-4.3.1.dist-info → janus_llm-4.3.5.dist-info}/WHEEL +1 -1
- janus_llm-4.3.5.dist-info/entry_points.txt +3 -0
- janus/cli.py +0 -1488
- janus_llm-4.3.1.dist-info/RECORD +0 -115
- janus_llm-4.3.1.dist-info/entry_points.txt +0 -3
- {janus_llm-4.3.1.dist-info → janus_llm-4.3.5.dist-info}/LICENSE +0 -0
janus/cli/translate.py
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
import click
|
5
|
+
import typer
|
6
|
+
from typing_extensions import Annotated
|
7
|
+
|
8
|
+
from janus.cli.constants import REFINERS
|
9
|
+
from janus.language.naive.registry import CUSTOM_SPLITTERS
|
10
|
+
from janus.utils.enums import LANGUAGES
|
11
|
+
from janus.utils.logger import create_logger
|
12
|
+
|
13
|
+
log = create_logger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
def translate(
|
17
|
+
input_dir: Annotated[
|
18
|
+
Path,
|
19
|
+
typer.Option(
|
20
|
+
"--input",
|
21
|
+
"-i",
|
22
|
+
help="The directory containing the source code to be translated. "
|
23
|
+
"The files should all be in one flat directory.",
|
24
|
+
),
|
25
|
+
],
|
26
|
+
source_lang: Annotated[
|
27
|
+
str,
|
28
|
+
typer.Option(
|
29
|
+
"--source-language",
|
30
|
+
"-s",
|
31
|
+
help="The language of the source code.",
|
32
|
+
click_type=click.Choice(sorted(LANGUAGES)),
|
33
|
+
),
|
34
|
+
],
|
35
|
+
output_dir: Annotated[
|
36
|
+
Path,
|
37
|
+
typer.Option(
|
38
|
+
"--output", "-o", help="The directory to store the translated code in."
|
39
|
+
),
|
40
|
+
],
|
41
|
+
target_lang: Annotated[
|
42
|
+
str,
|
43
|
+
typer.Option(
|
44
|
+
"--target-language",
|
45
|
+
"-t",
|
46
|
+
help="The desired output language to translate the source code to. The "
|
47
|
+
"format can follow a 'language-version' syntax. Use 'text' to get plaintext"
|
48
|
+
"results as returned by the LLM. Examples: `python-3.10`, `mumps`, `java-10`,"
|
49
|
+
"text.",
|
50
|
+
),
|
51
|
+
],
|
52
|
+
llm_name: Annotated[
|
53
|
+
str,
|
54
|
+
typer.Option(
|
55
|
+
"--llm",
|
56
|
+
"-L",
|
57
|
+
help="The custom name of the model set with 'janus llm add'.",
|
58
|
+
),
|
59
|
+
],
|
60
|
+
failure_dir: Annotated[
|
61
|
+
Optional[Path],
|
62
|
+
typer.Option(
|
63
|
+
"--failure-directory",
|
64
|
+
"-f",
|
65
|
+
help="The directory to store failure files during translation",
|
66
|
+
),
|
67
|
+
] = None,
|
68
|
+
max_prompts: Annotated[
|
69
|
+
int,
|
70
|
+
typer.Option(
|
71
|
+
"--max-prompts",
|
72
|
+
"-m",
|
73
|
+
help="The maximum number of times to prompt a model on one functional block "
|
74
|
+
"before exiting the application. This is to prevent wasting too much money.",
|
75
|
+
),
|
76
|
+
] = 10,
|
77
|
+
overwrite: Annotated[
|
78
|
+
bool,
|
79
|
+
typer.Option(
|
80
|
+
"--overwrite/--preserve",
|
81
|
+
help="Whether to overwrite existing files in the output directory",
|
82
|
+
),
|
83
|
+
] = False,
|
84
|
+
skip_context: Annotated[
|
85
|
+
bool,
|
86
|
+
typer.Option(
|
87
|
+
"--skip-context",
|
88
|
+
help="Prompts will include any context information associated with source"
|
89
|
+
" code blocks, unless this option is specified",
|
90
|
+
),
|
91
|
+
] = False,
|
92
|
+
temp: Annotated[
|
93
|
+
float,
|
94
|
+
typer.Option("--temperature", "-T", help="Sampling temperature.", min=0, max=2),
|
95
|
+
] = 0.7,
|
96
|
+
prompt_template: Annotated[
|
97
|
+
str,
|
98
|
+
typer.Option(
|
99
|
+
"--prompt-template",
|
100
|
+
"-p",
|
101
|
+
help="Name of the Janus prompt template directory or "
|
102
|
+
"path to a directory containing those template files.",
|
103
|
+
),
|
104
|
+
] = "simple",
|
105
|
+
collection: Annotated[
|
106
|
+
str,
|
107
|
+
typer.Option(
|
108
|
+
"--collection",
|
109
|
+
"-c",
|
110
|
+
help="If set, will put the translated result into a Chroma DB "
|
111
|
+
"collection with the name provided.",
|
112
|
+
),
|
113
|
+
] = None,
|
114
|
+
splitter_type: Annotated[
|
115
|
+
str,
|
116
|
+
typer.Option(
|
117
|
+
"-S",
|
118
|
+
"--splitter",
|
119
|
+
help="Name of custom splitter to use",
|
120
|
+
click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
|
121
|
+
),
|
122
|
+
] = "file",
|
123
|
+
refiner_types: Annotated[
|
124
|
+
list[str],
|
125
|
+
typer.Option(
|
126
|
+
"-r",
|
127
|
+
"--refiner",
|
128
|
+
help="List of refiner types to use. Add -r for each refiner to use in\
|
129
|
+
refinement chain",
|
130
|
+
click_type=click.Choice(list(REFINERS.keys())),
|
131
|
+
),
|
132
|
+
] = ["JanusRefiner"],
|
133
|
+
retriever_type: Annotated[
|
134
|
+
str,
|
135
|
+
typer.Option(
|
136
|
+
"-R",
|
137
|
+
"--retriever",
|
138
|
+
help="Name of custom retriever to use",
|
139
|
+
click_type=click.Choice(["active_usings", "language_docs"]),
|
140
|
+
),
|
141
|
+
] = None,
|
142
|
+
max_tokens: Annotated[
|
143
|
+
int,
|
144
|
+
typer.Option(
|
145
|
+
"--max-tokens",
|
146
|
+
"-M",
|
147
|
+
help="The maximum number of tokens the model will take in. "
|
148
|
+
"If unspecificed, model's default max will be used.",
|
149
|
+
),
|
150
|
+
] = None,
|
151
|
+
):
|
152
|
+
from janus.cli.constants import db_loc, get_collections_config
|
153
|
+
from janus.converter.translate import Translator
|
154
|
+
|
155
|
+
refiner_types = [REFINERS[r] for r in refiner_types]
|
156
|
+
try:
|
157
|
+
target_language, target_version = target_lang.split("-")
|
158
|
+
except ValueError:
|
159
|
+
target_language = target_lang
|
160
|
+
target_version = None
|
161
|
+
# make sure not overwriting input
|
162
|
+
if source_lang.lower() == target_language.lower() and input_dir == output_dir:
|
163
|
+
log.error("Output files would overwrite input! Aborting...")
|
164
|
+
raise ValueError
|
165
|
+
|
166
|
+
model_arguments = dict(temperature=temp)
|
167
|
+
collections_config = get_collections_config()
|
168
|
+
translator = Translator(
|
169
|
+
model=llm_name,
|
170
|
+
model_arguments=model_arguments,
|
171
|
+
source_language=source_lang,
|
172
|
+
target_language=target_language,
|
173
|
+
target_version=target_version,
|
174
|
+
max_prompts=max_prompts,
|
175
|
+
max_tokens=max_tokens,
|
176
|
+
prompt_template=prompt_template,
|
177
|
+
db_path=db_loc,
|
178
|
+
db_config=collections_config,
|
179
|
+
splitter_type=splitter_type,
|
180
|
+
refiner_types=refiner_types,
|
181
|
+
retriever_type=retriever_type,
|
182
|
+
)
|
183
|
+
translator.translate(input_dir, output_dir, failure_dir, overwrite, collection)
|
janus/converter/__init__.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from janus.converter.converter import Converter
|
2
2
|
from janus.converter.diagram import DiagramGenerator
|
3
|
-
from janus.converter.document import
|
3
|
+
from janus.converter.document import ClozeDocumenter, Documenter, MultiDocumenter
|
4
4
|
from janus.converter.evaluate import Evaluator
|
5
5
|
from janus.converter.partition import Partitioner
|
6
6
|
from janus.converter.requirements import RequirementsDocumenter
|
@@ -11,6 +11,7 @@ from janus.converter.diagram import DiagramGenerator
|
|
11
11
|
from janus.converter.requirements import RequirementsDocumenter
|
12
12
|
from janus.converter.translate import Translator
|
13
13
|
from janus.language.block import CodeBlock, TranslatedCodeBlock
|
14
|
+
from janus.refiners.format import CodeFormatRefiner
|
14
15
|
|
15
16
|
|
16
17
|
class MockCollection(VectorStore):
|
@@ -50,6 +51,7 @@ class TestTranslator(unittest.TestCase):
|
|
50
51
|
target_language="python",
|
51
52
|
target_version="3.10",
|
52
53
|
splitter_type="ast-flex",
|
54
|
+
refiner_types=[CodeFormatRefiner],
|
53
55
|
)
|
54
56
|
self.test_file = Path("janus/language/treesitter/_tests/languages/fortran.f90")
|
55
57
|
self.TEST_FILE_EMBEDDING_COUNT = 14
|
janus/converter/converter.py
CHANGED
@@ -27,7 +27,7 @@ from janus.language.splitter import (
|
|
27
27
|
)
|
28
28
|
from janus.llm.model_callbacks import get_model_callback
|
29
29
|
from janus.llm.models_info import MODEL_PROMPT_ENGINES, JanusModel, load_model
|
30
|
-
from janus.parsers.parser import GenericParser, JanusParser
|
30
|
+
from janus.parsers.parser import GenericParser, JanusParser, JanusParserException
|
31
31
|
from janus.refiners.refiner import JanusRefiner
|
32
32
|
|
33
33
|
# from janus.refiners.refiner import BasicRefiner, Refiner
|
@@ -122,7 +122,7 @@ class Converter:
|
|
122
122
|
self._custom_model_arguments: dict[str, Any]
|
123
123
|
|
124
124
|
self._source_language: str
|
125
|
-
self.
|
125
|
+
self._source_suffixes: list[str]
|
126
126
|
|
127
127
|
self._target_language = "json"
|
128
128
|
self._target_suffix = ".json"
|
@@ -245,8 +245,10 @@ class Converter:
|
|
245
245
|
"Valid source languages are found in `janus.utils.enums.LANGUAGES`."
|
246
246
|
)
|
247
247
|
|
248
|
-
|
249
|
-
|
248
|
+
self._source_suffixes = [
|
249
|
+
f".{ext}" for ext in LANGUAGES[source_language]["suffixes"]
|
250
|
+
]
|
251
|
+
|
250
252
|
self._source_language = source_language
|
251
253
|
|
252
254
|
def set_protected_node_types(self, protected_node_types: tuple[str, ...]) -> None:
|
@@ -324,7 +326,7 @@ class Converter:
|
|
324
326
|
# tokens at output
|
325
327
|
# Only modify max_tokens if it is not specified by user
|
326
328
|
if not self.override_token_limit:
|
327
|
-
self._max_tokens = int(token_limit
|
329
|
+
self._max_tokens = int(token_limit * self._llm.input_token_proportion)
|
328
330
|
|
329
331
|
@run_if_changed(
|
330
332
|
"_prompt_template_name",
|
@@ -406,6 +408,7 @@ class Converter:
|
|
406
408
|
self,
|
407
409
|
input_directory: str | Path,
|
408
410
|
output_directory: str | Path | None = None,
|
411
|
+
failure_directory: str | Path | None = None,
|
409
412
|
overwrite: bool = False,
|
410
413
|
collection_name: str | None = None,
|
411
414
|
) -> None:
|
@@ -423,16 +426,22 @@ class Converter:
|
|
423
426
|
input_directory = Path(input_directory)
|
424
427
|
if isinstance(output_directory, str):
|
425
428
|
output_directory = Path(output_directory)
|
429
|
+
if isinstance(failure_directory, str):
|
430
|
+
failure_directory = Path(failure_directory)
|
426
431
|
|
427
432
|
# Make sure the output directory exists
|
428
433
|
if output_directory is not None and not output_directory.exists():
|
429
434
|
output_directory.mkdir(parents=True)
|
435
|
+
if failure_directory is not None and not failure_directory.exists():
|
436
|
+
failure_directory.mkdir(parents=True)
|
430
437
|
|
431
|
-
input_paths = [
|
438
|
+
input_paths = []
|
439
|
+
for ext in self._source_suffixes:
|
440
|
+
input_paths.extend(input_directory.rglob(f"**/*{ext}"))
|
432
441
|
|
433
442
|
log.info(f"Input directory: {input_directory.absolute()}")
|
434
443
|
log.info(
|
435
|
-
f"{self._source_language}
|
444
|
+
f"{self._source_language} {self._source_suffixes} files: "
|
436
445
|
f"{len(input_paths)}"
|
437
446
|
)
|
438
447
|
log.info(
|
@@ -445,67 +454,39 @@ class Converter:
|
|
445
454
|
/ p.relative_to(input_directory).with_suffix(self._target_suffix)
|
446
455
|
for p in input_paths
|
447
456
|
]
|
448
|
-
in_out_pairs = list(zip(input_paths, output_paths))
|
449
|
-
if not overwrite:
|
450
|
-
n_files = len(in_out_pairs)
|
451
|
-
in_out_pairs = [
|
452
|
-
(inp, outp) for inp, outp in in_out_pairs if not outp.exists()
|
453
|
-
]
|
454
|
-
log.info(
|
455
|
-
f"Skipping {n_files - len(in_out_pairs)} existing "
|
456
|
-
f"'*{self._source_suffix}' files"
|
457
|
-
)
|
458
457
|
else:
|
459
|
-
|
460
|
-
|
458
|
+
output_paths = [None for _ in input_paths]
|
459
|
+
|
460
|
+
if failure_directory is not None:
|
461
|
+
failure_paths = [
|
462
|
+
failure_directory
|
463
|
+
/ p.relative_to(input_directory).with_suffix(self._target_suffix)
|
464
|
+
for p in input_paths
|
465
|
+
]
|
466
|
+
else:
|
467
|
+
failure_paths = [None for _ in input_paths]
|
468
|
+
in_out_pairs = list(zip(input_paths, output_paths, failure_paths))
|
469
|
+
if not overwrite:
|
470
|
+
n_files = len(in_out_pairs)
|
471
|
+
in_out_pairs = [
|
472
|
+
(inp, outp, failp)
|
473
|
+
for inp, outp, failp in in_out_pairs
|
474
|
+
if outp is None or not outp.exists()
|
475
|
+
]
|
476
|
+
log.info(
|
477
|
+
f"Skipping {n_files - len(in_out_pairs)} existing "
|
478
|
+
f"{self._source_suffixes} files"
|
479
|
+
)
|
480
|
+
log.info(f"Translating {len(in_out_pairs)} {self._source_suffixes} files")
|
461
481
|
|
462
482
|
# Loop through each input file, convert and save it
|
463
483
|
total_cost = 0.0
|
464
|
-
for in_path, out_path in in_out_pairs:
|
484
|
+
for in_path, out_path, fail_path in in_out_pairs:
|
465
485
|
# Translate the file, skip it if there's a rate limit error
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
except RateLimitError:
|
471
|
-
continue
|
472
|
-
except OutputParserException as e:
|
473
|
-
log.error(f"Skipping {in_path.name}, failed to parse output: {e}.")
|
474
|
-
continue
|
475
|
-
except BadRequestError as e:
|
476
|
-
if str(e).startswith("Detected an error in the prompt"):
|
477
|
-
log.warning("Malformed input, skipping")
|
478
|
-
continue
|
479
|
-
raise e
|
480
|
-
except ValidationError as e:
|
481
|
-
# Only allow ValidationError to pass if token limit is manually set
|
482
|
-
if self.override_token_limit:
|
483
|
-
log.warning(
|
484
|
-
"Current file and manually set token "
|
485
|
-
"limit is too large for this model, skipping"
|
486
|
-
)
|
487
|
-
continue
|
488
|
-
raise e
|
489
|
-
except TokenLimitError:
|
490
|
-
log.warning("Ran into irreducible node too large for context, skipping")
|
491
|
-
continue
|
492
|
-
except EmptyTreeError:
|
493
|
-
log.warning(
|
494
|
-
f'Input file "{in_path.name}" has no nodes of interest, skipping'
|
495
|
-
)
|
496
|
-
continue
|
497
|
-
except FileSizeError:
|
498
|
-
log.warning("Current tile is too large for basic splitter, skipping")
|
499
|
-
continue
|
500
|
-
except ValueError as e:
|
501
|
-
if str(e).startswith(
|
502
|
-
"Error raised by bedrock service"
|
503
|
-
) and "maximum context length" in str(e):
|
504
|
-
log.warning(
|
505
|
-
"Input is too large for this model's context length, skipping"
|
506
|
-
)
|
507
|
-
continue
|
508
|
-
raise e
|
486
|
+
log.info(f"Processing {in_path.relative_to(input_directory)}")
|
487
|
+
out_block = self.translate_file(in_path, fail_path)
|
488
|
+
total_cost += out_block.total_cost
|
489
|
+
log.info(f"Current Running Cost: {total_cost}")
|
509
490
|
|
510
491
|
# Don't attempt to write files for which translation failed
|
511
492
|
if not out_block.translated:
|
@@ -526,11 +507,14 @@ class Converter:
|
|
526
507
|
|
527
508
|
log.info(f"Total cost: ${total_cost:,.2f}")
|
528
509
|
|
529
|
-
def translate_file(
|
510
|
+
def translate_file(
|
511
|
+
self, file: Path, failure_path: Path | None = None
|
512
|
+
) -> TranslatedCodeBlock:
|
530
513
|
"""Translate a single file.
|
531
514
|
|
532
515
|
Arguments:
|
533
516
|
file: Input path to file
|
517
|
+
failure_path: path to directory to store failure summaries`
|
534
518
|
|
535
519
|
Returns:
|
536
520
|
A `TranslatedCodeBlock` object. This block does not have a path set, and its
|
@@ -542,7 +526,7 @@ class Converter:
|
|
542
526
|
|
543
527
|
input_block = self._split_file(file)
|
544
528
|
t0 = time.time()
|
545
|
-
output_block = self._iterative_translate(input_block)
|
529
|
+
output_block = self._iterative_translate(input_block, failure_path)
|
546
530
|
output_block.processing_time = time.time() - t0
|
547
531
|
if output_block.translated:
|
548
532
|
completeness = output_block.translation_completeness
|
@@ -550,7 +534,6 @@ class Converter:
|
|
550
534
|
f"[{filename}] Translation complete\n"
|
551
535
|
f" {completeness:.2%} of input successfully translated\n"
|
552
536
|
f" Total cost: ${output_block.total_cost:,.2f}\n"
|
553
|
-
f" Total retries: {output_block.total_retries:,d}\n"
|
554
537
|
f" Output CodeBlock Structure:\n{input_block.tree_str()}\n"
|
555
538
|
)
|
556
539
|
|
@@ -558,15 +541,17 @@ class Converter:
|
|
558
541
|
log.error(
|
559
542
|
f"[{filename}] Translation failed\n"
|
560
543
|
f" Total cost: ${output_block.total_cost:,.2f}\n"
|
561
|
-
f" Total retries: {output_block.total_retries:,d}\n"
|
562
544
|
)
|
563
545
|
return output_block
|
564
546
|
|
565
|
-
def _iterative_translate(
|
547
|
+
def _iterative_translate(
|
548
|
+
self, root: CodeBlock, failure_path: Path | None = None
|
549
|
+
) -> TranslatedCodeBlock:
|
566
550
|
"""Translate the passed CodeBlock representing a full file.
|
567
551
|
|
568
552
|
Arguments:
|
569
553
|
root: A root block representing the top-level block of a file
|
554
|
+
failure_path: path to store data files for failed translations
|
570
555
|
|
571
556
|
Returns:
|
572
557
|
A `TranslatedCodeBlock`
|
@@ -574,22 +559,60 @@ class Converter:
|
|
574
559
|
translated_root = TranslatedCodeBlock(root, self._target_language)
|
575
560
|
last_prog, prog_delta = 0, 0.1
|
576
561
|
stack = [translated_root]
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
self._add_translation(translated_block)
|
562
|
+
try:
|
563
|
+
while stack:
|
564
|
+
translated_block = stack.pop()
|
581
565
|
|
582
|
-
|
583
|
-
# children (they wouldn't show up in the final text anyway)
|
584
|
-
if not translated_block.translated:
|
585
|
-
continue
|
566
|
+
self._add_translation(translated_block)
|
586
567
|
|
587
|
-
|
568
|
+
# If translating this block was unsuccessful, don't bother with its
|
569
|
+
# children (they wouldn't show up in the final text anyway)
|
570
|
+
if not translated_block.translated:
|
571
|
+
continue
|
588
572
|
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
573
|
+
stack.extend(translated_block.children)
|
574
|
+
|
575
|
+
progress = translated_root.translation_completeness
|
576
|
+
if progress - last_prog > prog_delta:
|
577
|
+
last_prog = int(progress / prog_delta) * prog_delta
|
578
|
+
log.info(f"[{root.name}] progress: {progress:.2%}")
|
579
|
+
except RateLimitError:
|
580
|
+
pass
|
581
|
+
except OutputParserException as e:
|
582
|
+
log.error(f"Skipping file, failed to parse output: {e}.")
|
583
|
+
except BadRequestError as e:
|
584
|
+
if str(e).startswith("Detected an error in the prompt"):
|
585
|
+
log.warning("Malformed input, skipping")
|
586
|
+
raise e
|
587
|
+
except ValidationError as e:
|
588
|
+
# Only allow ValidationError to pass if token limit is manually set
|
589
|
+
if self.override_token_limit:
|
590
|
+
log.warning(
|
591
|
+
"Current file and manually set token "
|
592
|
+
"limit is too large for this model, skipping"
|
593
|
+
)
|
594
|
+
raise e
|
595
|
+
except TokenLimitError:
|
596
|
+
log.warning("Ran into irreducible node too large for context, skipping")
|
597
|
+
except EmptyTreeError:
|
598
|
+
log.warning("Input file has no nodes of interest, skipping")
|
599
|
+
except FileSizeError:
|
600
|
+
log.warning("Current tile is too large for basic splitter, skipping")
|
601
|
+
except ValueError as e:
|
602
|
+
if str(e).startswith(
|
603
|
+
"Error raised by bedrock service"
|
604
|
+
) and "maximum context length" in str(e):
|
605
|
+
log.warning(
|
606
|
+
"Input is too large for this model's context length, skipping"
|
607
|
+
)
|
608
|
+
raise e
|
609
|
+
finally:
|
610
|
+
log.debug(
|
611
|
+
f"Resulting Block: {json.dumps(self._get_output_obj(translated_root))}"
|
612
|
+
)
|
613
|
+
if not translated_root.translated:
|
614
|
+
if failure_path is not None:
|
615
|
+
self._save_to_file(translated_root, failure_path)
|
593
616
|
|
594
617
|
return translated_root
|
595
618
|
|
@@ -624,11 +647,19 @@ class Converter:
|
|
624
647
|
# TODO: If non-OpenAI models with prices are added, this will need
|
625
648
|
# to be updated.
|
626
649
|
with get_model_callback() as cb:
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
650
|
+
try:
|
651
|
+
t0 = time.time()
|
652
|
+
block.text = self._run_chain(block)
|
653
|
+
except JanusParserException as e:
|
654
|
+
block.text = e.unparsed_output
|
655
|
+
block.tokens = self._llm.get_num_tokens(block.text)
|
656
|
+
raise e
|
657
|
+
finally:
|
658
|
+
block.processing_time = time.time() - t0
|
659
|
+
block.cost = cb.total_cost
|
660
|
+
block.request_input_tokens = cb.prompt_tokens
|
661
|
+
block.request_output_tokens = cb.completion_tokens
|
662
|
+
block.num_requests = cb.successful_requests
|
632
663
|
|
633
664
|
block.tokens = self._llm.get_num_tokens(block.text)
|
634
665
|
block.translated = True
|
@@ -652,20 +683,25 @@ class Converter:
|
|
652
683
|
def _get_output_obj(
|
653
684
|
self, block: TranslatedCodeBlock
|
654
685
|
) -> dict[str, int | float | str | dict[str, str] | dict[str, float]]:
|
655
|
-
output_str = self._parser.parse_combined_output(block.complete_text)
|
656
|
-
|
657
686
|
output_obj: str | dict[str, str]
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
687
|
+
if not block.translation_completed:
|
688
|
+
# translation wasn't completed, so combined parsing will likely fail
|
689
|
+
output_obj = block.complete_text
|
690
|
+
else:
|
691
|
+
output_str = self._parser.parse_combined_output(block.complete_text)
|
692
|
+
try:
|
693
|
+
output_obj = json.loads(output_str)
|
694
|
+
except json.JSONDecodeError:
|
695
|
+
output_obj = output_str
|
662
696
|
|
663
697
|
return dict(
|
664
698
|
input=block.original.text or "",
|
665
699
|
metadata=dict(
|
666
|
-
retries=block.total_retries,
|
667
700
|
cost=block.total_cost,
|
668
701
|
processing_time=block.processing_time,
|
702
|
+
num_requests=block.total_num_requests,
|
703
|
+
input_tokens=block.total_request_input_tokens,
|
704
|
+
output_tokens=block.total_request_output_tokens,
|
669
705
|
),
|
670
706
|
output=output_obj,
|
671
707
|
)
|
janus/converter/document.py
CHANGED
@@ -5,10 +5,8 @@ from copy import deepcopy
|
|
5
5
|
from janus.converter.converter import Converter
|
6
6
|
from janus.language.block import TranslatedCodeBlock
|
7
7
|
from janus.language.combine import JsonCombiner
|
8
|
-
from janus.parsers.doc_parser import
|
9
|
-
|
10
|
-
MultiDocumentationParser,
|
11
|
-
)
|
8
|
+
from janus.parsers.doc_parser import ClozeDocumentationParser, MultiDocumentationParser
|
9
|
+
from janus.parsers.parser import JanusParserException
|
12
10
|
from janus.utils.enums import LANGUAGES
|
13
11
|
from janus.utils.logger import create_logger
|
14
12
|
|
@@ -40,7 +38,7 @@ class MultiDocumenter(Documenter):
|
|
40
38
|
self._parser = MultiDocumentationParser()
|
41
39
|
|
42
40
|
|
43
|
-
class
|
41
|
+
class ClozeDocumenter(Documenter):
|
44
42
|
def __init__(
|
45
43
|
self,
|
46
44
|
comments_per_request: int | None = None,
|
@@ -48,9 +46,9 @@ class MadLibsDocumenter(Documenter):
|
|
48
46
|
) -> None:
|
49
47
|
kwargs.update(drop_comments=False)
|
50
48
|
super().__init__(**kwargs)
|
51
|
-
self.set_prompt("
|
49
|
+
self.set_prompt("document_cloze")
|
52
50
|
self._combiner = JsonCombiner()
|
53
|
-
self._parser =
|
51
|
+
self._parser = ClozeDocumentationParser()
|
54
52
|
|
55
53
|
self.comments_per_request = comments_per_request
|
56
54
|
|
@@ -92,7 +90,6 @@ class MadLibsDocumenter(Documenter):
|
|
92
90
|
|
93
91
|
block.processing_time = 0
|
94
92
|
block.cost = 0
|
95
|
-
block.retries = 0
|
96
93
|
obj = {}
|
97
94
|
for i in range(0, len(comments), self.comments_per_request):
|
98
95
|
# Split the text into the section containing comments of interest,
|
@@ -114,16 +111,26 @@ class MadLibsDocumenter(Documenter):
|
|
114
111
|
working_block = TranslatedCodeBlock(working_copy, self._target_language)
|
115
112
|
|
116
113
|
# Run the LLM on the working text
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
114
|
+
try:
|
115
|
+
super()._add_translation(working_block)
|
116
|
+
except JanusParserException as e:
|
117
|
+
block.text += "\n===============\n" + working_block.text
|
118
|
+
block.tokens = self._llm.get_num_tokens(block.text)
|
119
|
+
raise e
|
120
|
+
finally:
|
121
|
+
# Update metadata to include for all runs
|
122
|
+
block.num_requests += working_block.num_requests
|
123
|
+
block.cost += working_block.cost
|
124
|
+
block.processing_time += working_block.processing_time
|
125
|
+
block.request_input_tokens += working_block.request_input_tokens
|
126
|
+
block.request_output_tokens += working_block.request_output_tokens
|
123
127
|
|
124
128
|
# Update the output text to merge this section's output in
|
125
129
|
out_text = self._parser.parse(working_block.text)
|
126
130
|
obj.update(json.loads(out_text))
|
131
|
+
# Set intermediate text, will be overwritten if file
|
132
|
+
# successfully completes
|
133
|
+
block.text = json.dumps(obj)
|
127
134
|
|
128
135
|
self._parser.parse_input(block.original)
|
129
136
|
block.text = self._parser.parse(json.dumps(obj))
|