PyPI - janus-llm - Versions diffs - 4.3.1__py3-none-any.whl → 4.3.5__py3-none-any.whl - Mend

janus-llm 4.3.1__py3-none-any.whl → 4.3.5__py3-none-any.whl

Files changed (128) hide show

janus/__init__.py +1 -1
janus/__main__.py +1 -1
janus/_tests/evaluator_tests/EvalReadMe.md +85 -0
janus/_tests/evaluator_tests/incose_tests/incose_large_test.json +39 -0
janus/_tests/evaluator_tests/incose_tests/incose_small_test.json +17 -0
janus/_tests/evaluator_tests/inline_comment_tests/mumps_inline_comment_test.m +71 -0
janus/_tests/test_cli.py +3 -2
janus/cli/aggregate.py +135 -0
janus/cli/cli.py +111 -0
janus/cli/constants.py +43 -0
janus/cli/database.py +289 -0
janus/cli/diagram.py +178 -0
janus/cli/document.py +174 -0
janus/cli/embedding.py +122 -0
janus/cli/llm.py +187 -0
janus/cli/partition.py +125 -0
janus/cli/self_eval.py +149 -0
janus/cli/translate.py +183 -0
janus/converter/__init__.py +1 -1
janus/converter/_tests/test_translate.py +2 -0
janus/converter/converter.py +129 -93
janus/converter/document.py +21 -14
janus/converter/evaluate.py +20 -13
janus/converter/translate.py +3 -3
janus/embedding/collections.py +1 -1
janus/language/alc/_tests/alc.asm +3779 -0
janus/language/binary/_tests/hello.bin +0 -0
janus/language/block.py +47 -12
janus/language/file.py +1 -1
janus/language/mumps/_tests/mumps.m +235 -0
janus/language/treesitter/_tests/languages/fortran.f90 +416 -0
janus/language/treesitter/_tests/languages/ibmhlasm.asm +16 -0
janus/language/treesitter/_tests/languages/matlab.m +225 -0
janus/llm/models_info.py +9 -1
janus/metrics/_tests/asm_test_file.asm +10 -0
janus/metrics/_tests/mumps_test_file.m +6 -0
janus/metrics/_tests/test_treesitter_metrics.py +1 -1
janus/metrics/prompts/clarity.txt +8 -0
janus/metrics/prompts/completeness.txt +16 -0
janus/metrics/prompts/faithfulness.txt +10 -0
janus/metrics/prompts/hallucination.txt +16 -0
janus/metrics/prompts/quality.txt +8 -0
janus/metrics/prompts/readability.txt +16 -0
janus/metrics/prompts/usefulness.txt +16 -0
janus/parsers/code_parser.py +4 -4
janus/parsers/doc_parser.py +12 -9
janus/parsers/parser.py +7 -0
janus/parsers/partition_parser.py +6 -4
janus/parsers/reqs_parser.py +8 -5
janus/parsers/uml.py +5 -4
janus/prompts/prompt.py +2 -2
janus/prompts/templates/README.md +30 -0
janus/prompts/templates/basic_aggregation/human.txt +6 -0
janus/prompts/templates/basic_aggregation/system.txt +1 -0
janus/prompts/templates/basic_refinement/human.txt +14 -0
janus/prompts/templates/basic_refinement/system.txt +1 -0
janus/prompts/templates/diagram/human.txt +9 -0
janus/prompts/templates/diagram/system.txt +1 -0
janus/prompts/templates/diagram_with_documentation/human.txt +15 -0
janus/prompts/templates/diagram_with_documentation/system.txt +1 -0
janus/prompts/templates/document/human.txt +10 -0
janus/prompts/templates/document/system.txt +1 -0
janus/prompts/templates/document_cloze/human.txt +11 -0
janus/prompts/templates/document_cloze/system.txt +1 -0
janus/prompts/templates/document_cloze/variables.json +4 -0
janus/prompts/templates/document_cloze/variables_asm.json +4 -0
janus/prompts/templates/document_inline/human.txt +13 -0
janus/prompts/templates/eval_prompts/incose/human.txt +32 -0
janus/prompts/templates/eval_prompts/incose/system.txt +1 -0
janus/prompts/templates/eval_prompts/incose/variables.json +3 -0
janus/prompts/templates/eval_prompts/inline_comments/human.txt +49 -0
janus/prompts/templates/eval_prompts/inline_comments/system.txt +1 -0
janus/prompts/templates/eval_prompts/inline_comments/variables.json +3 -0
janus/prompts/templates/micromanaged_mumps_v1.0/human.txt +23 -0
janus/prompts/templates/micromanaged_mumps_v1.0/system.txt +3 -0
janus/prompts/templates/micromanaged_mumps_v2.0/human.txt +28 -0
janus/prompts/templates/micromanaged_mumps_v2.0/system.txt +3 -0
janus/prompts/templates/micromanaged_mumps_v2.1/human.txt +29 -0
janus/prompts/templates/micromanaged_mumps_v2.1/system.txt +3 -0
janus/prompts/templates/multidocument/human.txt +15 -0
janus/prompts/templates/multidocument/system.txt +1 -0
janus/prompts/templates/partition/human.txt +22 -0
janus/prompts/templates/partition/system.txt +1 -0
janus/prompts/templates/partition/variables.json +4 -0
janus/prompts/templates/pseudocode/human.txt +7 -0
janus/prompts/templates/pseudocode/system.txt +7 -0
janus/prompts/templates/refinement/fix_exceptions/human.txt +19 -0
janus/prompts/templates/refinement/fix_exceptions/system.txt +1 -0
janus/prompts/templates/refinement/format/code_format/human.txt +12 -0
janus/prompts/templates/refinement/format/code_format/system.txt +1 -0
janus/prompts/templates/refinement/format/requirements_format/human.txt +14 -0
janus/prompts/templates/refinement/format/requirements_format/system.txt +1 -0
janus/prompts/templates/refinement/hallucination/human.txt +13 -0
janus/prompts/templates/refinement/hallucination/system.txt +1 -0
janus/prompts/templates/refinement/reflection/human.txt +15 -0
janus/prompts/templates/refinement/reflection/incose/human.txt +26 -0
janus/prompts/templates/refinement/reflection/incose/system.txt +1 -0
janus/prompts/templates/refinement/reflection/incose_deduplicate/human.txt +16 -0
janus/prompts/templates/refinement/reflection/incose_deduplicate/system.txt +1 -0
janus/prompts/templates/refinement/reflection/system.txt +1 -0
janus/prompts/templates/refinement/revision/human.txt +16 -0
janus/prompts/templates/refinement/revision/incose/human.txt +16 -0
janus/prompts/templates/refinement/revision/incose/system.txt +1 -0
janus/prompts/templates/refinement/revision/incose_deduplicate/human.txt +17 -0
janus/prompts/templates/refinement/revision/incose_deduplicate/system.txt +1 -0
janus/prompts/templates/refinement/revision/system.txt +1 -0
janus/prompts/templates/refinement/uml/alc_fix_variables/human.txt +15 -0
janus/prompts/templates/refinement/uml/alc_fix_variables/system.txt +2 -0
janus/prompts/templates/refinement/uml/fix_connections/human.txt +15 -0
janus/prompts/templates/refinement/uml/fix_connections/system.txt +2 -0
janus/prompts/templates/requirements/human.txt +13 -0
janus/prompts/templates/requirements/system.txt +2 -0
janus/prompts/templates/retrieval/language_docs/human.txt +10 -0
janus/prompts/templates/retrieval/language_docs/system.txt +1 -0
janus/prompts/templates/simple/human.txt +16 -0
janus/prompts/templates/simple/system.txt +3 -0
janus/refiners/format.py +49 -0
janus/refiners/refiner.py +113 -4
janus/utils/enums.py +127 -112
janus/utils/logger.py +2 -0
{janus_llm-4.3.1.dist-info → janus_llm-4.3.5.dist-info}/METADATA +7 -7
janus_llm-4.3.5.dist-info/RECORD +210 -0
{janus_llm-4.3.1.dist-info → janus_llm-4.3.5.dist-info}/WHEEL +1 -1
janus_llm-4.3.5.dist-info/entry_points.txt +3 -0
janus/cli.py +0 -1488
janus_llm-4.3.1.dist-info/RECORD +0 -115
janus_llm-4.3.1.dist-info/entry_points.txt +0 -3
{janus_llm-4.3.1.dist-info → janus_llm-4.3.5.dist-info}/LICENSE +0 -0

janus/cli/translate.py ADDED Viewed

@@ -0,0 +1,183 @@
+from pathlib import Path
+from typing import Optional
+import click
+import typer
+from typing_extensions import Annotated
+from janus.cli.constants import REFINERS
+from janus.language.naive.registry import CUSTOM_SPLITTERS
+from janus.utils.enums import LANGUAGES
+from janus.utils.logger import create_logger
+log = create_logger(__name__)
+def translate(
+    input_dir: Annotated[
+        Path,
+        typer.Option(
+            "--input",
+            "-i",
+            help="The directory containing the source code to be translated. "
+            "The files should all be in one flat directory.",
+        ),
+    ],
+    source_lang: Annotated[
+        str,
+        typer.Option(
+            "--source-language",
+            "-s",
+            help="The language of the source code.",
+            click_type=click.Choice(sorted(LANGUAGES)),
+        ),
+    ],
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            "--output", "-o", help="The directory to store the translated code in."
+        ),
+    ],
+    target_lang: Annotated[
+        str,
+        typer.Option(
+            "--target-language",
+            "-t",
+            help="The desired output language to translate the source code to. The "
+            "format can follow a 'language-version' syntax.  Use 'text' to get plaintext "
+            "results as returned by the LLM. Examples: `python-3.10`, `mumps`, `java-10`, "
+            "text.",
+        ),
+    ],
+    llm_name: Annotated[
+        str,
+        typer.Option(
+            "--llm",
+            "-L",
+            help="The custom name of the model set with 'janus llm add'.",
+        ),
+    ],
+    failure_dir: Annotated[
+        Optional[Path],
+        typer.Option(
+            "--failure-directory",
+            "-f",
+            help="The directory to store failure files during translation",
+        ),
+    ] = None,
+    max_prompts: Annotated[
+        int,
+        typer.Option(
+            "--max-prompts",
+            "-m",
+            help="The maximum number of times to prompt a model on one functional block "
+            "before exiting the application. This is to prevent wasting too much money.",
+        ),
+    ] = 10,
+    overwrite: Annotated[
+        bool,
+        typer.Option(
+            "--overwrite/--preserve",
+            help="Whether to overwrite existing files in the output directory",
+        ),
+    ] = False,
+    skip_context: Annotated[
+        bool,
+        typer.Option(
+            "--skip-context",
+            help="Prompts will include any context information associated with source"
+            " code blocks, unless this option is specified",
+        ),
+    ] = False,
+    temp: Annotated[
+        float,
+        typer.Option("--temperature", "-T", help="Sampling temperature.", min=0, max=2),
+    ] = 0.7,
+    prompt_template: Annotated[
+        str,
+        typer.Option(
+            "--prompt-template",
+            "-p",
+            help="Name of the Janus prompt template directory or "
+            "path to a directory containing those template files.",
+        ),
+    ] = "simple",
+    collection: Annotated[
+        Optional[str],
+        typer.Option(
+            "--collection",
+            "-c",
+            help="If set, will put the translated result into a Chroma DB "
+            "collection with the name provided.",
+        ),
+    ] = None,
+    splitter_type: Annotated[
+        str,
+        typer.Option(
+            "-S",
+            "--splitter",
+            help="Name of custom splitter to use",
+            click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
+        ),
+    ] = "file",
+    refiner_types: Annotated[
+        list[str],
+        typer.Option(
+            "-r",
+            "--refiner",
+            help="List of refiner types to use. Add -r for each refiner to use in "
+            "refinement chain",
+            click_type=click.Choice(list(REFINERS.keys())),
+        ),
+    ] = ["JanusRefiner"],
+    retriever_type: Annotated[
+        Optional[str],
+        typer.Option(
+            "-R",
+            "--retriever",
+            help="Name of custom retriever to use",
+            click_type=click.Choice(["active_usings", "language_docs"]),
+        ),
+    ] = None,
+    max_tokens: Annotated[
+        Optional[int],
+        typer.Option(
+            "--max-tokens",
+            "-M",
+            help="The maximum number of tokens the model will take in. "
+            "If unspecified, model's default max will be used.",
+        ),
+    ] = None,
+) -> None:
+    """Translate the source files in a directory into another language using an LLM."""
+    # Project imports are kept function-local, exactly as in the original command.
+    from janus.cli.constants import db_loc, get_collections_config
+    from janus.converter.translate import Translator
+    # Map the refiner names chosen on the command line to their registered
+    # entries. A new list is built, so the list default above is never mutated.
+    refiner_classes = [REFINERS[name] for name in refiner_types]
+    # Split 'language-version'. A target that is itself a key of LANGUAGES
+    # (or has no dash) is taken whole; otherwise the text after the LAST dash
+    # is the version, so a dashed language name can still carry a version.
+    if "-" not in target_lang or target_lang in LANGUAGES:
+        target_language = target_lang
+        target_version = None
+    else:
+        target_language, _, version = target_lang.rpartition("-")
+        target_version = version or None
+    # NOTE(review): `skip_context` is accepted but not forwarded to Translator
+    # here - confirm whether it should be passed through.
+    # Make sure not overwriting input. Paths are resolved so that different
+    # spellings of the same directory (relative vs. absolute) are detected.
+    same_language = source_lang.lower() == target_language.lower()
+    if same_language and input_dir.resolve() == output_dir.resolve():
+        message = "Output files would overwrite input! Aborting..."
+        log.error(message)
+        raise ValueError(message)
+    model_arguments = dict(temperature=temp)
+    collections_config = get_collections_config()
+    translator = Translator(
+        model=llm_name,
+        model_arguments=model_arguments,
+        source_language=source_lang,
+        target_language=target_language,
+        target_version=target_version,
+        max_prompts=max_prompts,
+        max_tokens=max_tokens,
+        prompt_template=prompt_template,
+        db_path=db_loc,
+        db_config=collections_config,
+        splitter_type=splitter_type,
+        refiner_types=refiner_classes,
+        retriever_type=retriever_type,
+    )
+    translator.translate(input_dir, output_dir, failure_dir, overwrite, collection)

janus/converter/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from janus.converter.converter import Converter
 from janus.converter.diagram import DiagramGenerator
-from janus.converter.document import Documenter, MadLibsDocumenter, MultiDocumenter
+from janus.converter.document import ClozeDocumenter, Documenter, MultiDocumenter
 from janus.converter.evaluate import Evaluator
 from janus.converter.partition import Partitioner
 from janus.converter.requirements import RequirementsDocumenter

janus/converter/_tests/test_translate.py CHANGED Viewed

@@ -11,6 +11,7 @@ from janus.converter.diagram import DiagramGenerator
 from janus.converter.requirements import RequirementsDocumenter
 from janus.converter.translate import Translator
 from janus.language.block import CodeBlock, TranslatedCodeBlock
+from janus.refiners.format import CodeFormatRefiner
 class MockCollection(VectorStore):
@@ -50,6 +51,7 @@ class TestTranslator(unittest.TestCase):
             target_language="python",
             target_version="3.10",
             splitter_type="ast-flex",
+            refiner_types=[CodeFormatRefiner],
         )
         self.test_file = Path("janus/language/treesitter/_tests/languages/fortran.f90")
         self.TEST_FILE_EMBEDDING_COUNT = 14

janus/converter/converter.py CHANGED Viewed

@@ -27,7 +27,7 @@ from janus.language.splitter import (
 )
 from janus.llm.model_callbacks import get_model_callback
 from janus.llm.models_info import MODEL_PROMPT_ENGINES, JanusModel, load_model
-from janus.parsers.parser import GenericParser, JanusParser
+from janus.parsers.parser import GenericParser, JanusParser, JanusParserException
 from janus.refiners.refiner import JanusRefiner
 # from janus.refiners.refiner import BasicRefiner, Refiner
@@ -122,7 +122,7 @@ class Converter:
         self._custom_model_arguments: dict[str, Any]
         self._source_language: str
-        self._source_suffix: str
+        self._source_suffixes: list[str]
         self._target_language = "json"
         self._target_suffix = ".json"
@@ -245,8 +245,10 @@ class Converter:
                 "Valid source languages are found in `janus.utils.enums.LANGUAGES`."
             )
-        ext = LANGUAGES[source_language]["suffix"]
-        self._source_suffix = f".{ext}"
+        self._source_suffixes = [
+            f".{ext}" for ext in LANGUAGES[source_language]["suffixes"]
+        ]
         self._source_language = source_language
     def set_protected_node_types(self, protected_node_types: tuple[str, ...]) -> None:
@@ -324,7 +326,7 @@ class Converter:
         # tokens at output
         # Only modify max_tokens if it is not specified by user
         if not self.override_token_limit:
-            self._max_tokens = int(token_limit // 2.5)
+            self._max_tokens = int(token_limit * self._llm.input_token_proportion)
     @run_if_changed(
         "_prompt_template_name",
@@ -406,6 +408,7 @@ class Converter:
         self,
         input_directory: str | Path,
         output_directory: str | Path | None = None,
+        failure_directory: str | Path | None = None,
         overwrite: bool = False,
         collection_name: str | None = None,
     ) -> None:
@@ -423,16 +426,22 @@ class Converter:
             input_directory = Path(input_directory)
         if isinstance(output_directory, str):
             output_directory = Path(output_directory)
+        if isinstance(failure_directory, str):
+            failure_directory = Path(failure_directory)
         # Make sure the output directory exists
         if output_directory is not None and not output_directory.exists():
             output_directory.mkdir(parents=True)
+        if failure_directory is not None and not failure_directory.exists():
+            failure_directory.mkdir(parents=True)
-        input_paths = [p for p in input_directory.rglob(f"**/*{self._source_suffix}")]
+        input_paths = []
+        for ext in self._source_suffixes:
+            input_paths.extend(input_directory.rglob(f"**/*{ext}"))
         log.info(f"Input directory: {input_directory.absolute()}")
         log.info(
-            f"{self._source_language} '*{self._source_suffix}' files: "
+            f"{self._source_language} {self._source_suffixes} files: "
             f"{len(input_paths)}"
         )
         log.info(
@@ -445,67 +454,39 @@ class Converter:
                 / p.relative_to(input_directory).with_suffix(self._target_suffix)
                 for p in input_paths
             ]
-            in_out_pairs = list(zip(input_paths, output_paths))
-            if not overwrite:
-                n_files = len(in_out_pairs)
-                in_out_pairs = [
-                    (inp, outp) for inp, outp in in_out_pairs if not outp.exists()
-                ]
-                log.info(
-                    f"Skipping {n_files - len(in_out_pairs)} existing "
-                    f"'*{self._source_suffix}' files"
-                )
         else:
-            in_out_pairs = [(f, None) for f in input_paths]
-        log.info(f"Translating {len(in_out_pairs)} '*{self._source_suffix}' files")
+            output_paths = [None for _ in input_paths]
+        if failure_directory is not None:
+            failure_paths = [
+                failure_directory
+                / p.relative_to(input_directory).with_suffix(self._target_suffix)
+                for p in input_paths
+            ]
+        else:
+            failure_paths = [None for _ in input_paths]
+        in_out_pairs = list(zip(input_paths, output_paths, failure_paths))
+        if not overwrite:
+            n_files = len(in_out_pairs)
+            in_out_pairs = [
+                (inp, outp, failp)
+                for inp, outp, failp in in_out_pairs
+                if outp is None or not outp.exists()
+            ]
+            log.info(
+                f"Skipping {n_files - len(in_out_pairs)} existing "
+                f"{self._source_suffixes} files"
+            )
+        log.info(f"Translating {len(in_out_pairs)} {self._source_suffixes} files")
         # Loop through each input file, convert and save it
         total_cost = 0.0
-        for in_path, out_path in in_out_pairs:
+        for in_path, out_path, fail_path in in_out_pairs:
             # Translate the file, skip it if there's a rate limit error
-            try:
-                log.info(f"Processing {in_path.relative_to(input_directory)}")
-                out_block = self.translate_file(in_path)
-                total_cost += out_block.total_cost
-            except RateLimitError:
-                continue
-            except OutputParserException as e:
-                log.error(f"Skipping {in_path.name}, failed to parse output: {e}.")
-                continue
-            except BadRequestError as e:
-                if str(e).startswith("Detected an error in the prompt"):
-                    log.warning("Malformed input, skipping")
-                    continue
-                raise e
-            except ValidationError as e:
-                # Only allow ValidationError to pass if token limit is manually set
-                if self.override_token_limit:
-                    log.warning(
-                        "Current file and manually set token "
-                        "limit is too large for this model, skipping"
-                    )
-                    continue
-                raise e
-            except TokenLimitError:
-                log.warning("Ran into irreducible node too large for context, skipping")
-                continue
-            except EmptyTreeError:
-                log.warning(
-                    f'Input file "{in_path.name}" has no nodes of interest, skipping'
-                )
-                continue
-            except FileSizeError:
-                log.warning("Current tile is too large for basic splitter, skipping")
-                continue
-            except ValueError as e:
-                if str(e).startswith(
-                    "Error raised by bedrock service"
-                ) and "maximum context length" in str(e):
-                    log.warning(
-                        "Input is too large for this model's context length, skipping"
-                    )
-                    continue
-                raise e
+            log.info(f"Processing {in_path.relative_to(input_directory)}")
+            out_block = self.translate_file(in_path, fail_path)
+            total_cost += out_block.total_cost
+            log.info(f"Current Running Cost: {total_cost}")
             # Don't attempt to write files for which translation failed
             if not out_block.translated:
@@ -526,11 +507,14 @@ class Converter:
         log.info(f"Total cost: ${total_cost:,.2f}")
-    def translate_file(self, file: Path) -> TranslatedCodeBlock:
+    def translate_file(
+        self, file: Path, failure_path: Path | None = None
+    ) -> TranslatedCodeBlock:
         """Translate a single file.
         Arguments:
             file: Input path to file
+            failure_path: path to directory to store failure summaries`
         Returns:
             A `TranslatedCodeBlock` object. This block does not have a path set, and its
@@ -542,7 +526,7 @@ class Converter:
         input_block = self._split_file(file)
         t0 = time.time()
-        output_block = self._iterative_translate(input_block)
+        output_block = self._iterative_translate(input_block, failure_path)
         output_block.processing_time = time.time() - t0
         if output_block.translated:
             completeness = output_block.translation_completeness
@@ -550,7 +534,6 @@ class Converter:
                 f"[{filename}] Translation complete\n"
                 f"  {completeness:.2%} of input successfully translated\n"
                 f"  Total cost: ${output_block.total_cost:,.2f}\n"
-                f"  Total retries: {output_block.total_retries:,d}\n"
                 f"  Output CodeBlock Structure:\n{input_block.tree_str()}\n"
             )
@@ -558,15 +541,17 @@ class Converter:
             log.error(
                 f"[{filename}] Translation failed\n"
                 f"  Total cost: ${output_block.total_cost:,.2f}\n"
-                f"  Total retries: {output_block.total_retries:,d}\n"
             )
         return output_block
-    def _iterative_translate(self, root: CodeBlock) -> TranslatedCodeBlock:
+    def _iterative_translate(
+        self, root: CodeBlock, failure_path: Path | None = None
+    ) -> TranslatedCodeBlock:
         """Translate the passed CodeBlock representing a full file.
         Arguments:
             root: A root block representing the top-level block of a file
+            failure_path: path to store data files for failed translations
         Returns:
             A `TranslatedCodeBlock`
@@ -574,22 +559,60 @@ class Converter:
         translated_root = TranslatedCodeBlock(root, self._target_language)
         last_prog, prog_delta = 0, 0.1
         stack = [translated_root]
-        while stack:
-            translated_block = stack.pop()
-            self._add_translation(translated_block)
+        try:
+            while stack:
+                translated_block = stack.pop()
-            # If translating this block was unsuccessful, don't bother with its
-            #  children (they wouldn't show up in the final text anyway)
-            if not translated_block.translated:
-                continue
+                self._add_translation(translated_block)
-            stack.extend(translated_block.children)
+                # If translating this block was unsuccessful, don't bother with its
+                #  children (they wouldn't show up in the final text anyway)
+                if not translated_block.translated:
+                    continue
-            progress = translated_root.translation_completeness
-            if progress - last_prog > prog_delta:
-                last_prog = int(progress / prog_delta) * prog_delta
-                log.info(f"[{root.name}] progress: {progress:.2%}")
+                stack.extend(translated_block.children)
+                progress = translated_root.translation_completeness
+                if progress - last_prog > prog_delta:
+                    last_prog = int(progress / prog_delta) * prog_delta
+                    log.info(f"[{root.name}] progress: {progress:.2%}")
+        except RateLimitError:
+            pass
+        except OutputParserException as e:
+            log.error(f"Skipping file, failed to parse output: {e}.")
+        except BadRequestError as e:
+            if str(e).startswith("Detected an error in the prompt"):
+                log.warning("Malformed input, skipping")
+            raise e
+        except ValidationError as e:
+            # Only allow ValidationError to pass if token limit is manually set
+            if self.override_token_limit:
+                log.warning(
+                    "Current file and manually set token "
+                    "limit is too large for this model, skipping"
+                )
+            raise e
+        except TokenLimitError:
+            log.warning("Ran into irreducible node too large for context, skipping")
+        except EmptyTreeError:
+            log.warning("Input file has no nodes of interest, skipping")
+        except FileSizeError:
+            log.warning("Current tile is too large for basic splitter, skipping")
+        except ValueError as e:
+            if str(e).startswith(
+                "Error raised by bedrock service"
+            ) and "maximum context length" in str(e):
+                log.warning(
+                    "Input is too large for this model's context length, skipping"
+                )
+            raise e
+        finally:
+            log.debug(
+                f"Resulting Block: {json.dumps(self._get_output_obj(translated_root))}"
+            )
+            if not translated_root.translated:
+                if failure_path is not None:
+                    self._save_to_file(translated_root, failure_path)
         return translated_root
@@ -624,11 +647,19 @@ class Converter:
         #  TODO: If non-OpenAI models with prices are added, this will need
         #   to be updated.
         with get_model_callback() as cb:
-            t0 = time.time()
-            block.text = self._run_chain(block)
-            block.processing_time = time.time() - t0
-            block.cost = cb.total_cost
-            block.retries = max(0, cb.successful_requests - 1)
+            try:
+                t0 = time.time()
+                block.text = self._run_chain(block)
+            except JanusParserException as e:
+                block.text = e.unparsed_output
+                block.tokens = self._llm.get_num_tokens(block.text)
+                raise e
+            finally:
+                block.processing_time = time.time() - t0
+                block.cost = cb.total_cost
+                block.request_input_tokens = cb.prompt_tokens
+                block.request_output_tokens = cb.completion_tokens
+                block.num_requests = cb.successful_requests
         block.tokens = self._llm.get_num_tokens(block.text)
         block.translated = True
@@ -652,20 +683,25 @@ class Converter:
     def _get_output_obj(
         self, block: TranslatedCodeBlock
     ) -> dict[str, int | float | str | dict[str, str] | dict[str, float]]:
-        output_str = self._parser.parse_combined_output(block.complete_text)
         output_obj: str | dict[str, str]
-        try:
-            output_obj = json.loads(output_str)
-        except json.JSONDecodeError:
-            output_obj = output_str
+        if not block.translation_completed:
+            # translation wasn't completed, so combined parsing will likely fail
+            output_obj = block.complete_text
+        else:
+            output_str = self._parser.parse_combined_output(block.complete_text)
+            try:
+                output_obj = json.loads(output_str)
+            except json.JSONDecodeError:
+                output_obj = output_str
         return dict(
             input=block.original.text or "",
             metadata=dict(
-                retries=block.total_retries,
                 cost=block.total_cost,
                 processing_time=block.processing_time,
+                num_requests=block.total_num_requests,
+                input_tokens=block.total_request_input_tokens,
+                output_tokens=block.total_request_output_tokens,
             ),
             output=output_obj,
         )

janus/converter/document.py CHANGED Viewed

@@ -5,10 +5,8 @@ from copy import deepcopy
 from janus.converter.converter import Converter
 from janus.language.block import TranslatedCodeBlock
 from janus.language.combine import JsonCombiner
-from janus.parsers.doc_parser import (
-    MadlibsDocumentationParser,
-    MultiDocumentationParser,
-)
+from janus.parsers.doc_parser import ClozeDocumentationParser, MultiDocumentationParser
+from janus.parsers.parser import JanusParserException
 from janus.utils.enums import LANGUAGES
 from janus.utils.logger import create_logger
@@ -40,7 +38,7 @@ class MultiDocumenter(Documenter):
         self._parser = MultiDocumentationParser()
-class MadLibsDocumenter(Documenter):
+class ClozeDocumenter(Documenter):
     def __init__(
         self,
         comments_per_request: int | None = None,
@@ -48,9 +46,9 @@ class MadLibsDocumenter(Documenter):
     ) -> None:
         kwargs.update(drop_comments=False)
         super().__init__(**kwargs)
-        self.set_prompt("document_madlibs")
+        self.set_prompt("document_cloze")
         self._combiner = JsonCombiner()
-        self._parser = MadlibsDocumentationParser()
+        self._parser = ClozeDocumentationParser()
         self.comments_per_request = comments_per_request
@@ -92,7 +90,6 @@ class MadLibsDocumenter(Documenter):
         block.processing_time = 0
         block.cost = 0
-        block.retries = 0
         obj = {}
         for i in range(0, len(comments), self.comments_per_request):
             # Split the text into the section containing comments of interest,
@@ -114,16 +111,26 @@ class MadLibsDocumenter(Documenter):
             working_block = TranslatedCodeBlock(working_copy, self._target_language)
             # Run the LLM on the working text
-            super()._add_translation(working_block)
-            # Update metadata to include for all runs
-            block.retries += working_block.retries
-            block.cost += working_block.cost
-            block.processing_time += working_block.processing_time
+            try:
+                super()._add_translation(working_block)
+            except JanusParserException as e:
+                block.text += "\n===============\n" + working_block.text
+                block.tokens = self._llm.get_num_tokens(block.text)
+                raise e
+            finally:
+                # Update metadata to include for all runs
+                block.num_requests += working_block.num_requests
+                block.cost += working_block.cost
+                block.processing_time += working_block.processing_time
+                block.request_input_tokens += working_block.request_input_tokens
+                block.request_output_tokens += working_block.request_output_tokens
             # Update the output text to merge this section's output in
             out_text = self._parser.parse(working_block.text)
             obj.update(json.loads(out_text))
+            # Set intermediate text, will be overwritten if file
+            # successfully completes
+            block.text = json.dumps(obj)
         self._parser.parse_input(block.original)
         block.text = self._parser.parse(json.dumps(obj))

janus-llm 4.3.1__py3-none-any.whl → 4.3.5__py3-none-any.whl

janus-llm 4.3.1__py3-none-any.whl → 4.3.5__py3-none-any.whl