janus-llm 4.3.1__py3-none-any.whl → 4.3.5__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (128) hide show
  1. janus/__init__.py +1 -1
  2. janus/__main__.py +1 -1
  3. janus/_tests/evaluator_tests/EvalReadMe.md +85 -0
  4. janus/_tests/evaluator_tests/incose_tests/incose_large_test.json +39 -0
  5. janus/_tests/evaluator_tests/incose_tests/incose_small_test.json +17 -0
  6. janus/_tests/evaluator_tests/inline_comment_tests/mumps_inline_comment_test.m +71 -0
  7. janus/_tests/test_cli.py +3 -2
  8. janus/cli/aggregate.py +135 -0
  9. janus/cli/cli.py +111 -0
  10. janus/cli/constants.py +43 -0
  11. janus/cli/database.py +289 -0
  12. janus/cli/diagram.py +178 -0
  13. janus/cli/document.py +174 -0
  14. janus/cli/embedding.py +122 -0
  15. janus/cli/llm.py +187 -0
  16. janus/cli/partition.py +125 -0
  17. janus/cli/self_eval.py +149 -0
  18. janus/cli/translate.py +183 -0
  19. janus/converter/__init__.py +1 -1
  20. janus/converter/_tests/test_translate.py +2 -0
  21. janus/converter/converter.py +129 -93
  22. janus/converter/document.py +21 -14
  23. janus/converter/evaluate.py +20 -13
  24. janus/converter/translate.py +3 -3
  25. janus/embedding/collections.py +1 -1
  26. janus/language/alc/_tests/alc.asm +3779 -0
  27. janus/language/binary/_tests/hello.bin +0 -0
  28. janus/language/block.py +47 -12
  29. janus/language/file.py +1 -1
  30. janus/language/mumps/_tests/mumps.m +235 -0
  31. janus/language/treesitter/_tests/languages/fortran.f90 +416 -0
  32. janus/language/treesitter/_tests/languages/ibmhlasm.asm +16 -0
  33. janus/language/treesitter/_tests/languages/matlab.m +225 -0
  34. janus/llm/models_info.py +9 -1
  35. janus/metrics/_tests/asm_test_file.asm +10 -0
  36. janus/metrics/_tests/mumps_test_file.m +6 -0
  37. janus/metrics/_tests/test_treesitter_metrics.py +1 -1
  38. janus/metrics/prompts/clarity.txt +8 -0
  39. janus/metrics/prompts/completeness.txt +16 -0
  40. janus/metrics/prompts/faithfulness.txt +10 -0
  41. janus/metrics/prompts/hallucination.txt +16 -0
  42. janus/metrics/prompts/quality.txt +8 -0
  43. janus/metrics/prompts/readability.txt +16 -0
  44. janus/metrics/prompts/usefulness.txt +16 -0
  45. janus/parsers/code_parser.py +4 -4
  46. janus/parsers/doc_parser.py +12 -9
  47. janus/parsers/parser.py +7 -0
  48. janus/parsers/partition_parser.py +6 -4
  49. janus/parsers/reqs_parser.py +8 -5
  50. janus/parsers/uml.py +5 -4
  51. janus/prompts/prompt.py +2 -2
  52. janus/prompts/templates/README.md +30 -0
  53. janus/prompts/templates/basic_aggregation/human.txt +6 -0
  54. janus/prompts/templates/basic_aggregation/system.txt +1 -0
  55. janus/prompts/templates/basic_refinement/human.txt +14 -0
  56. janus/prompts/templates/basic_refinement/system.txt +1 -0
  57. janus/prompts/templates/diagram/human.txt +9 -0
  58. janus/prompts/templates/diagram/system.txt +1 -0
  59. janus/prompts/templates/diagram_with_documentation/human.txt +15 -0
  60. janus/prompts/templates/diagram_with_documentation/system.txt +1 -0
  61. janus/prompts/templates/document/human.txt +10 -0
  62. janus/prompts/templates/document/system.txt +1 -0
  63. janus/prompts/templates/document_cloze/human.txt +11 -0
  64. janus/prompts/templates/document_cloze/system.txt +1 -0
  65. janus/prompts/templates/document_cloze/variables.json +4 -0
  66. janus/prompts/templates/document_cloze/variables_asm.json +4 -0
  67. janus/prompts/templates/document_inline/human.txt +13 -0
  68. janus/prompts/templates/eval_prompts/incose/human.txt +32 -0
  69. janus/prompts/templates/eval_prompts/incose/system.txt +1 -0
  70. janus/prompts/templates/eval_prompts/incose/variables.json +3 -0
  71. janus/prompts/templates/eval_prompts/inline_comments/human.txt +49 -0
  72. janus/prompts/templates/eval_prompts/inline_comments/system.txt +1 -0
  73. janus/prompts/templates/eval_prompts/inline_comments/variables.json +3 -0
  74. janus/prompts/templates/micromanaged_mumps_v1.0/human.txt +23 -0
  75. janus/prompts/templates/micromanaged_mumps_v1.0/system.txt +3 -0
  76. janus/prompts/templates/micromanaged_mumps_v2.0/human.txt +28 -0
  77. janus/prompts/templates/micromanaged_mumps_v2.0/system.txt +3 -0
  78. janus/prompts/templates/micromanaged_mumps_v2.1/human.txt +29 -0
  79. janus/prompts/templates/micromanaged_mumps_v2.1/system.txt +3 -0
  80. janus/prompts/templates/multidocument/human.txt +15 -0
  81. janus/prompts/templates/multidocument/system.txt +1 -0
  82. janus/prompts/templates/partition/human.txt +22 -0
  83. janus/prompts/templates/partition/system.txt +1 -0
  84. janus/prompts/templates/partition/variables.json +4 -0
  85. janus/prompts/templates/pseudocode/human.txt +7 -0
  86. janus/prompts/templates/pseudocode/system.txt +7 -0
  87. janus/prompts/templates/refinement/fix_exceptions/human.txt +19 -0
  88. janus/prompts/templates/refinement/fix_exceptions/system.txt +1 -0
  89. janus/prompts/templates/refinement/format/code_format/human.txt +12 -0
  90. janus/prompts/templates/refinement/format/code_format/system.txt +1 -0
  91. janus/prompts/templates/refinement/format/requirements_format/human.txt +14 -0
  92. janus/prompts/templates/refinement/format/requirements_format/system.txt +1 -0
  93. janus/prompts/templates/refinement/hallucination/human.txt +13 -0
  94. janus/prompts/templates/refinement/hallucination/system.txt +1 -0
  95. janus/prompts/templates/refinement/reflection/human.txt +15 -0
  96. janus/prompts/templates/refinement/reflection/incose/human.txt +26 -0
  97. janus/prompts/templates/refinement/reflection/incose/system.txt +1 -0
  98. janus/prompts/templates/refinement/reflection/incose_deduplicate/human.txt +16 -0
  99. janus/prompts/templates/refinement/reflection/incose_deduplicate/system.txt +1 -0
  100. janus/prompts/templates/refinement/reflection/system.txt +1 -0
  101. janus/prompts/templates/refinement/revision/human.txt +16 -0
  102. janus/prompts/templates/refinement/revision/incose/human.txt +16 -0
  103. janus/prompts/templates/refinement/revision/incose/system.txt +1 -0
  104. janus/prompts/templates/refinement/revision/incose_deduplicate/human.txt +17 -0
  105. janus/prompts/templates/refinement/revision/incose_deduplicate/system.txt +1 -0
  106. janus/prompts/templates/refinement/revision/system.txt +1 -0
  107. janus/prompts/templates/refinement/uml/alc_fix_variables/human.txt +15 -0
  108. janus/prompts/templates/refinement/uml/alc_fix_variables/system.txt +2 -0
  109. janus/prompts/templates/refinement/uml/fix_connections/human.txt +15 -0
  110. janus/prompts/templates/refinement/uml/fix_connections/system.txt +2 -0
  111. janus/prompts/templates/requirements/human.txt +13 -0
  112. janus/prompts/templates/requirements/system.txt +2 -0
  113. janus/prompts/templates/retrieval/language_docs/human.txt +10 -0
  114. janus/prompts/templates/retrieval/language_docs/system.txt +1 -0
  115. janus/prompts/templates/simple/human.txt +16 -0
  116. janus/prompts/templates/simple/system.txt +3 -0
  117. janus/refiners/format.py +49 -0
  118. janus/refiners/refiner.py +113 -4
  119. janus/utils/enums.py +127 -112
  120. janus/utils/logger.py +2 -0
  121. {janus_llm-4.3.1.dist-info → janus_llm-4.3.5.dist-info}/METADATA +7 -7
  122. janus_llm-4.3.5.dist-info/RECORD +210 -0
  123. {janus_llm-4.3.1.dist-info → janus_llm-4.3.5.dist-info}/WHEEL +1 -1
  124. janus_llm-4.3.5.dist-info/entry_points.txt +3 -0
  125. janus/cli.py +0 -1488
  126. janus_llm-4.3.1.dist-info/RECORD +0 -115
  127. janus_llm-4.3.1.dist-info/entry_points.txt +0 -3
  128. {janus_llm-4.3.1.dist-info → janus_llm-4.3.5.dist-info}/LICENSE +0 -0
janus/cli/translate.py ADDED
@@ -0,0 +1,183 @@
1
+ from pathlib import Path
2
+ from typing import Optional
3
+
4
+ import click
5
+ import typer
6
+ from typing_extensions import Annotated
7
+
8
+ from janus.cli.constants import REFINERS
9
+ from janus.language.naive.registry import CUSTOM_SPLITTERS
10
+ from janus.utils.enums import LANGUAGES
11
+ from janus.utils.logger import create_logger
12
+
13
+ log = create_logger(__name__)
14
+
15
+
16
def translate(
    input_dir: Annotated[
        Path,
        typer.Option(
            "--input",
            "-i",
            help="The directory containing the source code to be translated. "
            "The files should all be in one flat directory.",
        ),
    ],
    source_lang: Annotated[
        str,
        typer.Option(
            "--source-language",
            "-s",
            help="The language of the source code.",
            click_type=click.Choice(sorted(LANGUAGES)),
        ),
    ],
    output_dir: Annotated[
        Path,
        typer.Option(
            "--output", "-o", help="The directory to store the translated code in."
        ),
    ],
    target_lang: Annotated[
        str,
        typer.Option(
            "--target-language",
            "-t",
            # Each fragment ends with a space so the implicitly concatenated
            # literals do not run words together in the rendered help.
            help="The desired output language to translate the source code to. The "
            "format can follow a 'language-version' syntax. Use 'text' to get "
            "plaintext results as returned by the LLM. Examples: `python-3.10`, "
            "`mumps`, `java-10`, text.",
        ),
    ],
    llm_name: Annotated[
        str,
        typer.Option(
            "--llm",
            "-L",
            help="The custom name of the model set with 'janus llm add'.",
        ),
    ],
    failure_dir: Annotated[
        Optional[Path],
        typer.Option(
            "--failure-directory",
            "-f",
            help="The directory to store failure files during translation",
        ),
    ] = None,
    max_prompts: Annotated[
        int,
        typer.Option(
            "--max-prompts",
            "-m",
            help="The maximum number of times to prompt a model on one functional block "
            "before exiting the application. This is to prevent wasting too much money.",
        ),
    ] = 10,
    overwrite: Annotated[
        bool,
        typer.Option(
            "--overwrite/--preserve",
            help="Whether to overwrite existing files in the output directory",
        ),
    ] = False,
    skip_context: Annotated[
        bool,
        typer.Option(
            "--skip-context",
            help="Prompts will include any context information associated with source"
            " code blocks, unless this option is specified",
        ),
    ] = False,
    temp: Annotated[
        float,
        typer.Option("--temperature", "-T", help="Sampling temperature.", min=0, max=2),
    ] = 0.7,
    prompt_template: Annotated[
        str,
        typer.Option(
            "--prompt-template",
            "-p",
            help="Name of the Janus prompt template directory or "
            "path to a directory containing those template files.",
        ),
    ] = "simple",
    collection: Annotated[
        Optional[str],
        typer.Option(
            "--collection",
            "-c",
            help="If set, will put the translated result into a Chroma DB "
            "collection with the name provided.",
        ),
    ] = None,
    splitter_type: Annotated[
        str,
        typer.Option(
            "-S",
            "--splitter",
            help="Name of custom splitter to use",
            click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
        ),
    ] = "file",
    # The list default is only read here (the name is rebound below, never
    # mutated in place), so sharing it across calls is harmless.
    refiner_types: Annotated[
        list[str],
        typer.Option(
            "-r",
            "--refiner",
            help="List of refiner types to use. Add -r for each refiner to use in "
            "refinement chain",
            click_type=click.Choice(list(REFINERS.keys())),
        ),
    ] = ["JanusRefiner"],
    retriever_type: Annotated[
        Optional[str],
        typer.Option(
            "-R",
            "--retriever",
            help="Name of custom retriever to use",
            click_type=click.Choice(["active_usings", "language_docs"]),
        ),
    ] = None,
    max_tokens: Annotated[
        Optional[int],
        typer.Option(
            "--max-tokens",
            "-M",
            help="The maximum number of tokens the model will take in. "
            "If unspecified, model's default max will be used.",
        ),
    ] = None,
):
    """Translate source code from one language to another using an LLM.

    Option semantics are described by the help text attached to each
    parameter. A ValueError is raised when the source and target languages
    match and the input and output directories are the same location.
    """
    # Imported lazily, as in the original; presumably to keep CLI start-up
    # cheap -- TODO confirm.
    from janus.cli.constants import db_loc, get_collections_config
    from janus.converter.translate import Translator

    # Map the refiner names chosen on the command line to the REFINERS entries.
    refiner_types = [REFINERS[r] for r in refiner_types]

    # Accept either "language" or "language-version". Anything that does not
    # split into exactly two parts is treated as a bare language name.
    try:
        target_language, target_version = target_lang.split("-")
    except ValueError:
        target_language = target_lang
        target_version = None

    # Make sure we are not overwriting the input. Resolve both paths so that
    # different spellings of the same directory (relative vs. absolute,
    # "..", symlinks) are still recognised as identical.
    same_language = source_lang.lower() == target_language.lower()
    if same_language and input_dir.resolve() == output_dir.resolve():
        log.error("Output files would overwrite input! Aborting...")
        raise ValueError(
            "Output files would overwrite input: source and target languages "
            "match and the input and output directories are the same."
        )

    model_arguments = dict(temperature=temp)
    collections_config = get_collections_config()
    translator = Translator(
        model=llm_name,
        model_arguments=model_arguments,
        source_language=source_lang,
        target_language=target_language,
        target_version=target_version,
        max_prompts=max_prompts,
        max_tokens=max_tokens,
        prompt_template=prompt_template,
        db_path=db_loc,
        db_config=collections_config,
        splitter_type=splitter_type,
        refiner_types=refiner_types,
        retriever_type=retriever_type,
    )
    translator.translate(input_dir, output_dir, failure_dir, overwrite, collection)
@@ -1,6 +1,6 @@
1
1
  from janus.converter.converter import Converter
2
2
  from janus.converter.diagram import DiagramGenerator
3
- from janus.converter.document import Documenter, MadLibsDocumenter, MultiDocumenter
3
+ from janus.converter.document import ClozeDocumenter, Documenter, MultiDocumenter
4
4
  from janus.converter.evaluate import Evaluator
5
5
  from janus.converter.partition import Partitioner
6
6
  from janus.converter.requirements import RequirementsDocumenter
@@ -11,6 +11,7 @@ from janus.converter.diagram import DiagramGenerator
11
11
  from janus.converter.requirements import RequirementsDocumenter
12
12
  from janus.converter.translate import Translator
13
13
  from janus.language.block import CodeBlock, TranslatedCodeBlock
14
+ from janus.refiners.format import CodeFormatRefiner
14
15
 
15
16
 
16
17
  class MockCollection(VectorStore):
@@ -50,6 +51,7 @@ class TestTranslator(unittest.TestCase):
50
51
  target_language="python",
51
52
  target_version="3.10",
52
53
  splitter_type="ast-flex",
54
+ refiner_types=[CodeFormatRefiner],
53
55
  )
54
56
  self.test_file = Path("janus/language/treesitter/_tests/languages/fortran.f90")
55
57
  self.TEST_FILE_EMBEDDING_COUNT = 14
@@ -27,7 +27,7 @@ from janus.language.splitter import (
27
27
  )
28
28
  from janus.llm.model_callbacks import get_model_callback
29
29
  from janus.llm.models_info import MODEL_PROMPT_ENGINES, JanusModel, load_model
30
- from janus.parsers.parser import GenericParser, JanusParser
30
+ from janus.parsers.parser import GenericParser, JanusParser, JanusParserException
31
31
  from janus.refiners.refiner import JanusRefiner
32
32
 
33
33
  # from janus.refiners.refiner import BasicRefiner, Refiner
@@ -122,7 +122,7 @@ class Converter:
122
122
  self._custom_model_arguments: dict[str, Any]
123
123
 
124
124
  self._source_language: str
125
- self._source_suffix: str
125
+ self._source_suffixes: list[str]
126
126
 
127
127
  self._target_language = "json"
128
128
  self._target_suffix = ".json"
@@ -245,8 +245,10 @@ class Converter:
245
245
  "Valid source languages are found in `janus.utils.enums.LANGUAGES`."
246
246
  )
247
247
 
248
- ext = LANGUAGES[source_language]["suffix"]
249
- self._source_suffix = f".{ext}"
248
+ self._source_suffixes = [
249
+ f".{ext}" for ext in LANGUAGES[source_language]["suffixes"]
250
+ ]
251
+
250
252
  self._source_language = source_language
251
253
 
252
254
  def set_protected_node_types(self, protected_node_types: tuple[str, ...]) -> None:
@@ -324,7 +326,7 @@ class Converter:
324
326
  # tokens at output
325
327
  # Only modify max_tokens if it is not specified by user
326
328
  if not self.override_token_limit:
327
- self._max_tokens = int(token_limit // 2.5)
329
+ self._max_tokens = int(token_limit * self._llm.input_token_proportion)
328
330
 
329
331
  @run_if_changed(
330
332
  "_prompt_template_name",
@@ -406,6 +408,7 @@ class Converter:
406
408
  self,
407
409
  input_directory: str | Path,
408
410
  output_directory: str | Path | None = None,
411
+ failure_directory: str | Path | None = None,
409
412
  overwrite: bool = False,
410
413
  collection_name: str | None = None,
411
414
  ) -> None:
@@ -423,16 +426,22 @@ class Converter:
423
426
  input_directory = Path(input_directory)
424
427
  if isinstance(output_directory, str):
425
428
  output_directory = Path(output_directory)
429
+ if isinstance(failure_directory, str):
430
+ failure_directory = Path(failure_directory)
426
431
 
427
432
  # Make sure the output directory exists
428
433
  if output_directory is not None and not output_directory.exists():
429
434
  output_directory.mkdir(parents=True)
435
+ if failure_directory is not None and not failure_directory.exists():
436
+ failure_directory.mkdir(parents=True)
430
437
 
431
- input_paths = [p for p in input_directory.rglob(f"**/*{self._source_suffix}")]
438
+ input_paths = []
439
+ for ext in self._source_suffixes:
440
+ input_paths.extend(input_directory.rglob(f"**/*{ext}"))
432
441
 
433
442
  log.info(f"Input directory: {input_directory.absolute()}")
434
443
  log.info(
435
- f"{self._source_language} '*{self._source_suffix}' files: "
444
+ f"{self._source_language} {self._source_suffixes} files: "
436
445
  f"{len(input_paths)}"
437
446
  )
438
447
  log.info(
@@ -445,67 +454,39 @@ class Converter:
445
454
  / p.relative_to(input_directory).with_suffix(self._target_suffix)
446
455
  for p in input_paths
447
456
  ]
448
- in_out_pairs = list(zip(input_paths, output_paths))
449
- if not overwrite:
450
- n_files = len(in_out_pairs)
451
- in_out_pairs = [
452
- (inp, outp) for inp, outp in in_out_pairs if not outp.exists()
453
- ]
454
- log.info(
455
- f"Skipping {n_files - len(in_out_pairs)} existing "
456
- f"'*{self._source_suffix}' files"
457
- )
458
457
  else:
459
- in_out_pairs = [(f, None) for f in input_paths]
460
- log.info(f"Translating {len(in_out_pairs)} '*{self._source_suffix}' files")
458
+ output_paths = [None for _ in input_paths]
459
+
460
+ if failure_directory is not None:
461
+ failure_paths = [
462
+ failure_directory
463
+ / p.relative_to(input_directory).with_suffix(self._target_suffix)
464
+ for p in input_paths
465
+ ]
466
+ else:
467
+ failure_paths = [None for _ in input_paths]
468
+ in_out_pairs = list(zip(input_paths, output_paths, failure_paths))
469
+ if not overwrite:
470
+ n_files = len(in_out_pairs)
471
+ in_out_pairs = [
472
+ (inp, outp, failp)
473
+ for inp, outp, failp in in_out_pairs
474
+ if outp is None or not outp.exists()
475
+ ]
476
+ log.info(
477
+ f"Skipping {n_files - len(in_out_pairs)} existing "
478
+ f"{self._source_suffixes} files"
479
+ )
480
+ log.info(f"Translating {len(in_out_pairs)} {self._source_suffixes} files")
461
481
 
462
482
  # Loop through each input file, convert and save it
463
483
  total_cost = 0.0
464
- for in_path, out_path in in_out_pairs:
484
+ for in_path, out_path, fail_path in in_out_pairs:
465
485
  # Translate the file, skip it if there's a rate limit error
466
- try:
467
- log.info(f"Processing {in_path.relative_to(input_directory)}")
468
- out_block = self.translate_file(in_path)
469
- total_cost += out_block.total_cost
470
- except RateLimitError:
471
- continue
472
- except OutputParserException as e:
473
- log.error(f"Skipping {in_path.name}, failed to parse output: {e}.")
474
- continue
475
- except BadRequestError as e:
476
- if str(e).startswith("Detected an error in the prompt"):
477
- log.warning("Malformed input, skipping")
478
- continue
479
- raise e
480
- except ValidationError as e:
481
- # Only allow ValidationError to pass if token limit is manually set
482
- if self.override_token_limit:
483
- log.warning(
484
- "Current file and manually set token "
485
- "limit is too large for this model, skipping"
486
- )
487
- continue
488
- raise e
489
- except TokenLimitError:
490
- log.warning("Ran into irreducible node too large for context, skipping")
491
- continue
492
- except EmptyTreeError:
493
- log.warning(
494
- f'Input file "{in_path.name}" has no nodes of interest, skipping'
495
- )
496
- continue
497
- except FileSizeError:
498
- log.warning("Current tile is too large for basic splitter, skipping")
499
- continue
500
- except ValueError as e:
501
- if str(e).startswith(
502
- "Error raised by bedrock service"
503
- ) and "maximum context length" in str(e):
504
- log.warning(
505
- "Input is too large for this model's context length, skipping"
506
- )
507
- continue
508
- raise e
486
+ log.info(f"Processing {in_path.relative_to(input_directory)}")
487
+ out_block = self.translate_file(in_path, fail_path)
488
+ total_cost += out_block.total_cost
489
+ log.info(f"Current Running Cost: {total_cost}")
509
490
 
510
491
  # Don't attempt to write files for which translation failed
511
492
  if not out_block.translated:
@@ -526,11 +507,14 @@ class Converter:
526
507
 
527
508
  log.info(f"Total cost: ${total_cost:,.2f}")
528
509
 
529
- def translate_file(self, file: Path) -> TranslatedCodeBlock:
510
+ def translate_file(
511
+ self, file: Path, failure_path: Path | None = None
512
+ ) -> TranslatedCodeBlock:
530
513
  """Translate a single file.
531
514
 
532
515
  Arguments:
533
516
  file: Input path to file
517
+ failure_path: path to directory to store failure summaries
534
518
 
535
519
  Returns:
536
520
  A `TranslatedCodeBlock` object. This block does not have a path set, and its
@@ -542,7 +526,7 @@ class Converter:
542
526
 
543
527
  input_block = self._split_file(file)
544
528
  t0 = time.time()
545
- output_block = self._iterative_translate(input_block)
529
+ output_block = self._iterative_translate(input_block, failure_path)
546
530
  output_block.processing_time = time.time() - t0
547
531
  if output_block.translated:
548
532
  completeness = output_block.translation_completeness
@@ -550,7 +534,6 @@ class Converter:
550
534
  f"[{filename}] Translation complete\n"
551
535
  f" {completeness:.2%} of input successfully translated\n"
552
536
  f" Total cost: ${output_block.total_cost:,.2f}\n"
553
- f" Total retries: {output_block.total_retries:,d}\n"
554
537
  f" Output CodeBlock Structure:\n{input_block.tree_str()}\n"
555
538
  )
556
539
 
@@ -558,15 +541,17 @@ class Converter:
558
541
  log.error(
559
542
  f"[{filename}] Translation failed\n"
560
543
  f" Total cost: ${output_block.total_cost:,.2f}\n"
561
- f" Total retries: {output_block.total_retries:,d}\n"
562
544
  )
563
545
  return output_block
564
546
 
565
- def _iterative_translate(self, root: CodeBlock) -> TranslatedCodeBlock:
547
+ def _iterative_translate(
548
+ self, root: CodeBlock, failure_path: Path | None = None
549
+ ) -> TranslatedCodeBlock:
566
550
  """Translate the passed CodeBlock representing a full file.
567
551
 
568
552
  Arguments:
569
553
  root: A root block representing the top-level block of a file
554
+ failure_path: path to store data files for failed translations
570
555
 
571
556
  Returns:
572
557
  A `TranslatedCodeBlock`
@@ -574,22 +559,60 @@ class Converter:
574
559
  translated_root = TranslatedCodeBlock(root, self._target_language)
575
560
  last_prog, prog_delta = 0, 0.1
576
561
  stack = [translated_root]
577
- while stack:
578
- translated_block = stack.pop()
579
-
580
- self._add_translation(translated_block)
562
+ try:
563
+ while stack:
564
+ translated_block = stack.pop()
581
565
 
582
- # If translating this block was unsuccessful, don't bother with its
583
- # children (they wouldn't show up in the final text anyway)
584
- if not translated_block.translated:
585
- continue
566
+ self._add_translation(translated_block)
586
567
 
587
- stack.extend(translated_block.children)
568
+ # If translating this block was unsuccessful, don't bother with its
569
+ # children (they wouldn't show up in the final text anyway)
570
+ if not translated_block.translated:
571
+ continue
588
572
 
589
- progress = translated_root.translation_completeness
590
- if progress - last_prog > prog_delta:
591
- last_prog = int(progress / prog_delta) * prog_delta
592
- log.info(f"[{root.name}] progress: {progress:.2%}")
573
+ stack.extend(translated_block.children)
574
+
575
+ progress = translated_root.translation_completeness
576
+ if progress - last_prog > prog_delta:
577
+ last_prog = int(progress / prog_delta) * prog_delta
578
+ log.info(f"[{root.name}] progress: {progress:.2%}")
579
+ except RateLimitError:
580
+ pass
581
+ except OutputParserException as e:
582
+ log.error(f"Skipping file, failed to parse output: {e}.")
583
+ except BadRequestError as e:
584
+ if str(e).startswith("Detected an error in the prompt"):
585
+ log.warning("Malformed input, skipping")
586
+ raise e
587
+ except ValidationError as e:
588
+ # Only allow ValidationError to pass if token limit is manually set
589
+ if self.override_token_limit:
590
+ log.warning(
591
+ "Current file and manually set token "
592
+ "limit is too large for this model, skipping"
593
+ )
594
+ raise e
595
+ except TokenLimitError:
596
+ log.warning("Ran into irreducible node too large for context, skipping")
597
+ except EmptyTreeError:
598
+ log.warning("Input file has no nodes of interest, skipping")
599
+ except FileSizeError:
600
+ log.warning("Current tile is too large for basic splitter, skipping")
601
+ except ValueError as e:
602
+ if str(e).startswith(
603
+ "Error raised by bedrock service"
604
+ ) and "maximum context length" in str(e):
605
+ log.warning(
606
+ "Input is too large for this model's context length, skipping"
607
+ )
608
+ raise e
609
+ finally:
610
+ log.debug(
611
+ f"Resulting Block: {json.dumps(self._get_output_obj(translated_root))}"
612
+ )
613
+ if not translated_root.translated:
614
+ if failure_path is not None:
615
+ self._save_to_file(translated_root, failure_path)
593
616
 
594
617
  return translated_root
595
618
 
@@ -624,11 +647,19 @@ class Converter:
624
647
  # TODO: If non-OpenAI models with prices are added, this will need
625
648
  # to be updated.
626
649
  with get_model_callback() as cb:
627
- t0 = time.time()
628
- block.text = self._run_chain(block)
629
- block.processing_time = time.time() - t0
630
- block.cost = cb.total_cost
631
- block.retries = max(0, cb.successful_requests - 1)
650
+ try:
651
+ t0 = time.time()
652
+ block.text = self._run_chain(block)
653
+ except JanusParserException as e:
654
+ block.text = e.unparsed_output
655
+ block.tokens = self._llm.get_num_tokens(block.text)
656
+ raise e
657
+ finally:
658
+ block.processing_time = time.time() - t0
659
+ block.cost = cb.total_cost
660
+ block.request_input_tokens = cb.prompt_tokens
661
+ block.request_output_tokens = cb.completion_tokens
662
+ block.num_requests = cb.successful_requests
632
663
 
633
664
  block.tokens = self._llm.get_num_tokens(block.text)
634
665
  block.translated = True
@@ -652,20 +683,25 @@ class Converter:
652
683
  def _get_output_obj(
653
684
  self, block: TranslatedCodeBlock
654
685
  ) -> dict[str, int | float | str | dict[str, str] | dict[str, float]]:
655
- output_str = self._parser.parse_combined_output(block.complete_text)
656
-
657
686
  output_obj: str | dict[str, str]
658
- try:
659
- output_obj = json.loads(output_str)
660
- except json.JSONDecodeError:
661
- output_obj = output_str
687
+ if not block.translation_completed:
688
+ # translation wasn't completed, so combined parsing will likely fail
689
+ output_obj = block.complete_text
690
+ else:
691
+ output_str = self._parser.parse_combined_output(block.complete_text)
692
+ try:
693
+ output_obj = json.loads(output_str)
694
+ except json.JSONDecodeError:
695
+ output_obj = output_str
662
696
 
663
697
  return dict(
664
698
  input=block.original.text or "",
665
699
  metadata=dict(
666
- retries=block.total_retries,
667
700
  cost=block.total_cost,
668
701
  processing_time=block.processing_time,
702
+ num_requests=block.total_num_requests,
703
+ input_tokens=block.total_request_input_tokens,
704
+ output_tokens=block.total_request_output_tokens,
669
705
  ),
670
706
  output=output_obj,
671
707
  )
@@ -5,10 +5,8 @@ from copy import deepcopy
5
5
  from janus.converter.converter import Converter
6
6
  from janus.language.block import TranslatedCodeBlock
7
7
  from janus.language.combine import JsonCombiner
8
- from janus.parsers.doc_parser import (
9
- MadlibsDocumentationParser,
10
- MultiDocumentationParser,
11
- )
8
+ from janus.parsers.doc_parser import ClozeDocumentationParser, MultiDocumentationParser
9
+ from janus.parsers.parser import JanusParserException
12
10
  from janus.utils.enums import LANGUAGES
13
11
  from janus.utils.logger import create_logger
14
12
 
@@ -40,7 +38,7 @@ class MultiDocumenter(Documenter):
40
38
  self._parser = MultiDocumentationParser()
41
39
 
42
40
 
43
- class MadLibsDocumenter(Documenter):
41
+ class ClozeDocumenter(Documenter):
44
42
  def __init__(
45
43
  self,
46
44
  comments_per_request: int | None = None,
@@ -48,9 +46,9 @@ class MadLibsDocumenter(Documenter):
48
46
  ) -> None:
49
47
  kwargs.update(drop_comments=False)
50
48
  super().__init__(**kwargs)
51
- self.set_prompt("document_madlibs")
49
+ self.set_prompt("document_cloze")
52
50
  self._combiner = JsonCombiner()
53
- self._parser = MadlibsDocumentationParser()
51
+ self._parser = ClozeDocumentationParser()
54
52
 
55
53
  self.comments_per_request = comments_per_request
56
54
 
@@ -92,7 +90,6 @@ class MadLibsDocumenter(Documenter):
92
90
 
93
91
  block.processing_time = 0
94
92
  block.cost = 0
95
- block.retries = 0
96
93
  obj = {}
97
94
  for i in range(0, len(comments), self.comments_per_request):
98
95
  # Split the text into the section containing comments of interest,
@@ -114,16 +111,26 @@ class MadLibsDocumenter(Documenter):
114
111
  working_block = TranslatedCodeBlock(working_copy, self._target_language)
115
112
 
116
113
  # Run the LLM on the working text
117
- super()._add_translation(working_block)
118
-
119
- # Update metadata to include for all runs
120
- block.retries += working_block.retries
121
- block.cost += working_block.cost
122
- block.processing_time += working_block.processing_time
114
+ try:
115
+ super()._add_translation(working_block)
116
+ except JanusParserException as e:
117
+ block.text += "\n===============\n" + working_block.text
118
+ block.tokens = self._llm.get_num_tokens(block.text)
119
+ raise e
120
+ finally:
121
+ # Update metadata to include for all runs
122
+ block.num_requests += working_block.num_requests
123
+ block.cost += working_block.cost
124
+ block.processing_time += working_block.processing_time
125
+ block.request_input_tokens += working_block.request_input_tokens
126
+ block.request_output_tokens += working_block.request_output_tokens
123
127
 
124
128
  # Update the output text to merge this section's output in
125
129
  out_text = self._parser.parse(working_block.text)
126
130
  obj.update(json.loads(out_text))
131
+ # Set intermediate text, will be overwritten if file
132
+ # successfully completes
133
+ block.text = json.dumps(obj)
127
134
 
128
135
  self._parser.parse_input(block.original)
129
136
  block.text = self._parser.parse(json.dumps(obj))