PyPI - janus-llm - Versions diffs - 3.5.3__tar.gz → 4.1.0__tar.gz - Mend

janus-llm 3.5.3__tar.gz → 4.1.0__tar.gz

Files changed (112) hide show

{janus_llm-3.5.3 → janus_llm-4.1.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: janus-llm
-Version: 3.5.3
+Version: 4.1.0
 Summary: A transcoding library using LLMs.
 Home-page: https://github.com/janus-llm/janus-llm
 License: Apache 2.0

{janus_llm-3.5.3 → janus_llm-4.1.0}/janus/__init__.py RENAMED Viewed

@@ -5,7 +5,7 @@ from langchain_core._api.deprecation import LangChainDeprecationWarning
 from janus.converter.translate import Translator
 from janus.metrics import *  # noqa: F403
-__version__ = "3.5.3"
+__version__ = "4.1.0"
 # Ignoring a deprecation warning from langchain_core that I can't seem to hunt down
 warnings.filterwarnings("ignore", category=LangChainDeprecationWarning)

{janus_llm-3.5.3 → janus_llm-4.1.0}/janus/cli.py RENAMED Viewed

@@ -39,10 +39,12 @@ from janus.llm.models_info import (
     MODEL_TYPE_CONSTRUCTORS,
     MODEL_TYPES,
     TOKEN_LIMITS,
+    azure_models,
     bedrock_models,
     openai_models,
 )
 from janus.metrics.cli import evaluate
+from janus.refiners.refiner import REFINERS
 from janus.utils.enums import LANGUAGES
 from janus.utils.logger import create_logger
@@ -242,6 +244,24 @@ def translate(
             click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
         ),
     ] = "file",
+    refiner_type: Annotated[
+        str,
+        typer.Option(
+            "-r",
+            "--refiner",
+            help="Name of custom refiner to use",
+            click_type=click.Choice(list(REFINERS.keys())),
+        ),
+    ] = "none",
+    retriever_type: Annotated[
+        str,
+        typer.Option(
+            "-R",
+            "--retriever",
+            help="Name of custom retriever to use",
+            click_type=click.Choice(["active_usings"]),
+        ),
+    ] = None,
     max_tokens: Annotated[
         int,
         typer.Option(
@@ -251,13 +271,6 @@ def translate(
             "If unspecificed, model's default max will be used.",
         ),
     ] = None,
-    skip_refiner: Annotated[
-        bool,
-        typer.Option(
-            "--skip-refiner",
-            help="Whether to skip the refiner for generating output",
-        ),
-    ] = True,
 ):
     try:
         target_language, target_version = target_lang.split("-")
@@ -283,8 +296,8 @@ def translate(
         db_path=db_loc,
         db_config=collections_config,
         splitter_type=splitter_type,
-        skip_context=skip_context,
-        skip_refiner=skip_refiner,
+        refiner_type=refiner_type,
+        retriever_type=retriever_type,
     )
     translator.translate(input_dir, output_dir, overwrite, collection)
@@ -342,14 +355,6 @@ def document(
             help="Whether to overwrite existing files in the output directory",
         ),
     ] = False,
-    skip_context: Annotated[
-        bool,
-        typer.Option(
-            "--skip-context",
-            help="Prompts will include any context information associated with source"
-            " code blocks, unless this option is specified",
-        ),
-    ] = False,
     doc_mode: Annotated[
         str,
         typer.Option(
@@ -397,6 +402,24 @@ def document(
             click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
         ),
     ] = "file",
+    refiner_type: Annotated[
+        str,
+        typer.Option(
+            "-r",
+            "--refiner",
+            help="Name of custom refiner to use",
+            click_type=click.Choice(list(REFINERS.keys())),
+        ),
+    ] = "none",
+    retriever_type: Annotated[
+        str,
+        typer.Option(
+            "-R",
+            "--retriever",
+            help="Name of custom retriever to use",
+            click_type=click.Choice(["active_usings"]),
+        ),
+    ] = None,
     max_tokens: Annotated[
         int,
         typer.Option(
@@ -406,13 +429,6 @@ def document(
             "If unspecificed, model's default max will be used.",
         ),
     ] = None,
-    skip_refiner: Annotated[
-        bool,
-        typer.Option(
-            "--skip-refiner",
-            help="Whether to skip the refiner for generating output",
-        ),
-    ] = True,
 ):
     model_arguments = dict(temperature=temperature)
     collections_config = get_collections_config()
@@ -425,8 +441,8 @@ def document(
         db_path=db_loc,
         db_config=collections_config,
         splitter_type=splitter_type,
-        skip_refiner=skip_refiner,
-        skip_context=skip_context,
+        refiner_type=refiner_type,
+        retriever_type=retriever_type,
     )
     if doc_mode == "madlibs":
         documenter = MadLibsDocumenter(
@@ -615,14 +631,6 @@ def diagram(
             help="Whether to overwrite existing files in the output directory",
         ),
     ] = False,
-    skip_context: Annotated[
-        bool,
-        typer.Option(
-            "--skip-context",
-            help="Prompts will include any context information associated with source"
-            " code blocks, unless this option is specified",
-        ),
-    ] = False,
     temperature: Annotated[
         float,
         typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
@@ -659,13 +667,24 @@ def diagram(
             click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
         ),
     ] = "file",
-    skip_refiner: Annotated[
-        bool,
+    refiner_type: Annotated[
+        str,
         typer.Option(
-            "--skip-refiner",
-            help="Whether to skip the refiner for generating output",
+            "-r",
+            "--refiner",
+            help="Name of custom refiner to use",
+            click_type=click.Choice(list(REFINERS.keys())),
         ),
-    ] = True,
+    ] = "none",
+    retriever_type: Annotated[
+        str,
+        typer.Option(
+            "-R",
+            "--retriever",
+            help="Name of custom retriever to use",
+            click_type=click.Choice(["active_usings"]),
+        ),
+    ] = None,
 ):
     model_arguments = dict(temperature=temperature)
     collections_config = get_collections_config()
@@ -676,11 +695,11 @@ def diagram(
         max_prompts=max_prompts,
         db_path=db_loc,
         db_config=collections_config,
+        splitter_type=splitter_type,
+        refiner_type=refiner_type,
+        retriever_type=retriever_type,
         diagram_type=diagram_type,
         add_documentation=add_documentation,
-        splitter_type=splitter_type,
-        skip_refiner=skip_refiner,
-        skip_context=skip_context,
     )
     diagram_generator.translate(input_dir, output_dir, overwrite, collection)
@@ -934,7 +953,7 @@ def llm_add(
             help="The type of the model",
             click_type=click.Choice(sorted(list(MODEL_TYPE_CONSTRUCTORS.keys()))),
         ),
-    ] = "OpenAI",
+    ] = "Azure",
 ):
     if not MODEL_CONFIG_DIR.exists():
         MODEL_CONFIG_DIR.mkdir(parents=True)
@@ -978,6 +997,7 @@ def llm_add(
             "model_cost": {"input": in_cost, "output": out_cost},
         }
     elif model_type == "OpenAI":
+        print("DEPRECATED: Use 'Azure' instead. CTRL+C to exit.")
         model_id = typer.prompt(
             "Enter the model ID (list model IDs with `janus llm ls -a`)",
             default="gpt-4o",
@@ -999,6 +1019,28 @@ def llm_add(
             "token_limit": max_tokens,
             "model_cost": model_cost,
         }
+    elif model_type == "Azure":
+        model_id = typer.prompt(
+            "Enter the model ID (list model IDs with `janus llm ls -a`)",
+            default="gpt-4o",
+            type=click.Choice(azure_models),
+            show_choices=False,
+        )
+        params = dict(
+            # Azure uses the "azure_deployment" key for what we're calling "long_model_id"
+            azure_deployment=MODEL_ID_TO_LONG_ID[model_id],
+            temperature=0.7,
+            n=1,
+        )
+        max_tokens = TOKEN_LIMITS[MODEL_ID_TO_LONG_ID[model_id]]
+        model_cost = COST_PER_1K_TOKENS[MODEL_ID_TO_LONG_ID[model_id]]
+        cfg = {
+            "model_type": model_type,
+            "model_id": model_id,
+            "model_args": params,
+            "token_limit": max_tokens,
+            "model_cost": model_cost,
+        }
     elif model_type == "BedrockChat" or model_type == "Bedrock":
         model_id = typer.prompt(
             "Enter the model ID (list model IDs with `janus llm ls -a`)",
@@ -1173,13 +1215,14 @@ def render(
     for input_file in input_dir.rglob("*.json"):
         with open(input_file, "r") as f:
             data = json.load(f)
-        input_tail = input_file.relative_to(input_dir)
-        output_file = output_dir / input_tail
-        output_file = output_file.with_suffix(".txt")
+        output_file = output_dir / input_file.relative_to(input_dir).with_suffix(".txt")
         if not output_file.parent.exists():
             output_file.parent.mkdir()
-        with open(output_file, "w") as f:
-            f.write(data["output"])
+        text = data["output"].replace("\\n", "\n").strip()
+        output_file.write_text(text)
         jar_path = homedir / ".janus/lib/plantuml.jar"
         subprocess.run(["java", "-jar", jar_path, output_file])  # nosec
         output_file.unlink()

{janus_llm-3.5.3 → janus_llm-4.1.0}/janus/converter/_tests/test_translate.py RENAMED Viewed

@@ -90,14 +90,14 @@ class TestDiagramGenerator(unittest.TestCase):
     def setUp(self):
         """Set up the tests."""
         self.diagram_generator = DiagramGenerator(
-            model="gpt-4o",
+            model="gpt-4o-mini",
             source_language="fortran",
             diagram_type="Activity",
         )
     def test_init(self):
         """Test __init__ method."""
-        self.assertEqual(self.diagram_generator._model_name, "gpt-4o")
+        self.assertEqual(self.diagram_generator._model_name, "gpt-4o-mini")
         self.assertEqual(self.diagram_generator._source_language, "fortran")
         self.assertEqual(self.diagram_generator._diagram_type, "Activity")

{janus_llm-3.5.3 → janus_llm-4.1.0}/janus/converter/converter.py RENAMED Viewed

@@ -2,13 +2,11 @@ import functools
 import json
 import time
 from pathlib import Path
-from typing import Any, List, Optional, Tuple
+from typing import Any
-from langchain.output_parsers import RetryWithErrorOutputParser
 from langchain_core.exceptions import OutputParserException
-from langchain_core.language_models import BaseLanguageModel
-from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate
-from langchain_core.runnables import RunnableLambda, RunnableParallel
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import Runnable, RunnableParallel, RunnablePassthrough
 from openai import BadRequestError, RateLimitError
 from pydantic import ValidationError
@@ -22,12 +20,18 @@ from janus.language.splitter import (
     Splitter,
     TokenLimitError,
 )
-from janus.llm import load_model
 from janus.llm.model_callbacks import get_model_callback
-from janus.llm.models_info import MODEL_PROMPT_ENGINES
+from janus.llm.models_info import MODEL_PROMPT_ENGINES, JanusModel, load_model
 from janus.parsers.parser import GenericParser, JanusParser
-from janus.parsers.refiner_parser import RefinerParser
-from janus.refiners.refiner import BasicRefiner, Refiner
+from janus.refiners.refiner import (
+    FixParserExceptions,
+    HallucinationRefiner,
+    JanusRefiner,
+    ReflectionRefiner,
+)
+# from janus.refiners.refiner import BasicRefiner, Refiner
+from janus.retrievers.retriever import ActiveUsingsRetriever, JanusRetriever
 from janus.utils.enums import LANGUAGES
 from janus.utils.logger import create_logger
@@ -74,9 +78,8 @@ class Converter:
         protected_node_types: tuple[str, ...] = (),
         prune_node_types: tuple[str, ...] = (),
         splitter_type: str = "file",
-        refiner_type: str = "basic",
-        skip_refiner: bool = True,
-        skip_context: bool = False,
+        refiner_type: str | None = None,
+        retriever_type: str | None = None,
     ) -> None:
         """Initialize a Converter instance.
@@ -96,9 +99,13 @@ class Converter:
             prune_node_types: A set of node types which should be pruned.
             splitter_type: The type of splitter to use. Valid values are `"file"`,
                 `"tag"`, `"chunk"`, `"ast-strict"`, and `"ast-flex"`.
-            refiner_type: The type of refiner to use. Valid values are `"basic"`.
-            skip_refiner: Whether to skip the refiner.
-            skip_context: Whether to skip adding context to the prompt.
+            refiner_type: The type of refiner to use. Valid values:
+                - "parser"
+                - "reflection"
+                - None
+            retriever_type: The type of retriever to use. Valid values:
+                - "active_usings"
+                - None
         """
         self._changed_attrs: set = set()
@@ -107,7 +114,6 @@ class Converter:
         self.override_token_limit: bool = max_tokens is not None
         self._model_name: str
-        self._model_id: str
         self._custom_model_arguments: dict[str, Any]
         self._source_language: str
@@ -120,24 +126,26 @@ class Converter:
         self._prune_node_types: tuple[str, ...] = ()
         self._max_tokens: int | None = max_tokens
         self._prompt_template_name: str
-        self._splitter_type: str
         self._db_path: str | None
         self._db_config: dict[str, Any] | None
-        self._splitter: Splitter
-        self._llm: BaseLanguageModel
+        self._llm: JanusModel
         self._prompt: ChatPromptTemplate
         self._parser: JanusParser = GenericParser()
         self._combiner: Combiner = Combiner()
-        self._refiner_type: str
-        self._refiner: Refiner
+        self._splitter_type: str
+        self._refiner_type: str | None
+        self._retriever_type: str | None
-        self.skip_refiner = skip_refiner
+        self._splitter: Splitter
+        self._refiner: JanusRefiner
+        self._retriever: JanusRetriever
         self.set_splitter(splitter_type=splitter_type)
         self.set_refiner(refiner_type=refiner_type)
+        self.set_retriever(retriever_type=retriever_type)
         self.set_model(model_name=model, **model_arguments)
         self.set_prompt(prompt_template=prompt_template)
         self.set_source_language(source_language)
@@ -146,8 +154,6 @@ class Converter:
         self.set_db_path(db_path=db_path)
         self.set_db_config(db_config=db_config)
-        self.skip_context = skip_context
         # Child class must call this. Should we enforce somehow?
         # self._load_parameters()
@@ -163,9 +169,11 @@ class Converter:
     def _load_parameters(self) -> None:
         self._load_model()
         self._load_prompt()
+        self._load_retriever()
+        self._load_refiner()
         self._load_splitter()
         self._load_vectorizer()
-        self._load_refiner()
+        self._load_chain()
         self._changed_attrs.clear()
     def set_model(self, model_name: str, **custom_arguments: dict[str, Any]):
@@ -184,8 +192,6 @@ class Converter:
     def set_prompt(self, prompt_template: str) -> None:
         """Validate and set the prompt template name.
-        The affected objects will not be updated until translate() is called.
         Arguments:
             prompt_template: name of prompt template directory
                 (see janus/prompts/templates) or path to a directory.
@@ -195,29 +201,34 @@ class Converter:
     def set_splitter(self, splitter_type: str) -> None:
         """Validate and set the prompt template name.
-        The affected objects will not be updated until translate() is called.
         Arguments:
             prompt_template: name of prompt template directory
                 (see janus/prompts/templates) or path to a directory.
         """
-        self._splitter_type = splitter_type
+        if splitter_type not in CUSTOM_SPLITTERS:
+            raise ValueError(f'Splitter type "{splitter_type}" does not exist.')
-    def set_refiner(self, refiner_type: str) -> None:
-        """Validate and set the refiner name
+        self._splitter_type = splitter_type
-        The affected objects will not be updated until translate is called
+    def set_refiner(self, refiner_type: str | None) -> None:
+        """Validate and set the refiner type
         Arguments:
-            refiner_type: the name of the refiner to use
+            refiner_type: the type of refiner to use
         """
         self._refiner_type = refiner_type
+    def set_retriever(self, retriever_type: str | None) -> None:
+        """Validate and set the retriever type
+        Arguments:
+            retriever_type: the type of retriever to use
+        """
+        self._retriever_type = retriever_type
     def set_source_language(self, source_language: str) -> None:
         """Validate and set the source language.
-        The affected objects will not be updated until _load_parameters() is called.
         Arguments:
             source_language: The source programming language.
         """
@@ -287,20 +298,6 @@ class Converter:
         self._splitter = CUSTOM_SPLITTERS[self._splitter_type](**kwargs)
-    @run_if_changed("_refiner_type", "_model_name")
-    def _load_refiner(self) -> None:
-        """Load the refiner according to this instance's attributes.
-        If the relevant fields have not been changed since the last time this method was
-        called, nothing happens.
-        """
-        if self._refiner_type == "basic":
-            self._refiner = BasicRefiner(
-                "basic_refinement", self._model_id, self._source_language
-            )
-        else:
-            raise ValueError(f"Error: unknown refiner type {self._refiner_type}")
     @run_if_changed("_model_name", "_custom_model_arguments")
     def _load_model(self) -> None:
         """Load the model according to this instance's attributes.
@@ -314,9 +311,9 @@ class Converter:
         # model_arguments.update(self._custom_model_arguments)
         # Load the model
-        self._llm, self._model_id, token_limit, self.model_cost = load_model(
-            self._model_name
-        )
+        self._llm = load_model(self._model_name)
+        token_limit = self._llm.token_limit
         # Set the max_tokens to less than half the model's limit to allow for enough
         # tokens at output
         # Only modify max_tokens if it is not specified by user
@@ -335,7 +332,7 @@ class Converter:
         If the relevant fields have not been changed since the last time this
         method was called, nothing happens.
         """
-        prompt_engine = MODEL_PROMPT_ENGINES[self._model_id](
+        prompt_engine = MODEL_PROMPT_ENGINES[self._llm.short_model_id](
             source_language=self._source_language,
             prompt_template=self._prompt_template_name,
         )
@@ -354,6 +351,59 @@ class Converter:
             self._db_path, self._db_config
         )
+    @run_if_changed("_retriever_type")
+    def _load_retriever(self):
+        if self._retriever_type == "active_usings":
+            self._retriever = ActiveUsingsRetriever()
+        else:
+            self._retriever = JanusRetriever()
+    @run_if_changed("_refiner_type", "_model_name", "max_prompts", "_parser", "_llm")
+    def _load_refiner(self) -> None:
+        """Load the refiner according to this instance's attributes.
+        If the relevant fields have not been changed since the last time this method was
+        called, nothing happens.
+        """
+        if self._refiner_type == "parser":
+            self._refiner = FixParserExceptions(
+                llm=self._llm,
+                parser=self._parser,
+                max_retries=self.max_prompts,
+            )
+        elif self._refiner_type == "reflection":
+            self._refiner = ReflectionRefiner(
+                llm=self._llm,
+                parser=self._parser,
+                max_retries=self.max_prompts,
+            )
+        elif self._refiner_type == "hallucination":
+            self._refiner = HallucinationRefiner(
+                llm=self._llm,
+                parser=self._parser,
+                max_retries=self.max_prompts,
+            )
+        else:
+            self._refiner = JanusRefiner(parser=self._parser)
+    @run_if_changed("_parser", "_retriever", "_prompt", "_llm", "_refiner")
+    def _load_chain(self):
+        self.chain = (
+            self._input_runnable()
+            | self._prompt
+            | RunnableParallel(
+                completion=self._llm,
+                prompt_value=RunnablePassthrough(),
+            )
+            | self._refiner.parse_runnable
+        )
+    def _input_runnable(self) -> Runnable:
+        return RunnableParallel(
+            SOURCE_CODE=self._parser.parse_input,
+            context=self._retriever,
+        )
     def translate(
         self,
         input_directory: str | Path,
@@ -598,110 +648,29 @@ class Converter:
         return root
     def _run_chain(self, block: TranslatedCodeBlock) -> str:
-        """Run the model with three nested error fixing schemes.
-        First, try to fix simple formatting errors by giving the model just
-        the output and the parsing error. After a number of attempts, try
-        giving the model the output, the parsing error, and the original
-        input. Again check/retry this output to solve for formatting errors.
-        If we still haven't succeeded after several attempts, the model may
-        be getting thrown off by a bad initial output; start from scratch
-        and try again.
-        The number of tries for each layer of this scheme is roughly equal
-        to the cube root of self.max_retries, so the total calls to the
-        LLM will be roughly as expected (up to sqrt(self.max_retries) over)
-        """
-        input = self._parser.parse_input(block.original)
-        # Retries with just the output and the error
-        n1 = round(self.max_prompts ** (1 / 2))
-        # Retries with the input, output, and error
-        n2 = round(self.max_prompts // n1)
-        if not self.skip_context:
-            self._make_prompt_additions(block)
-        if not self.skip_refiner:  # Make replacements in the prompt
-            refine_output = RefinerParser(
-                parser=self._parser,
-                initial_prompt=self._prompt.format(**{"SOURCE_CODE": input}),
-                refiner=self._refiner,
-                max_retries=n1,
-                llm=self._llm,
-            )
-        else:
-            refine_output = RetryWithErrorOutputParser.from_llm(
-                llm=self._llm,
-                parser=self._parser,
-                max_retries=n1,
-            )
-        completion_chain = self._prompt | self._llm
-        chain = RunnableParallel(
-            completion=completion_chain, prompt_value=self._prompt
-        ) | RunnableLambda(lambda x: refine_output.parse_with_prompt(**x))
-        for _ in range(n2):
-            try:
-                return chain.invoke({"SOURCE_CODE": input})
-            except OutputParserException:
-                pass
-        raise OutputParserException(f"Failed to parse after {n1*n2} retries")
+        return self.chain.invoke(block.original)
     def _get_output_obj(
         self, block: TranslatedCodeBlock
-    ) -> dict[str, int | float | str | dict[str, str]]:
+    ) -> dict[str, int | float | str | dict[str, str] | dict[str, float]]:
         output_str = self._parser.parse_combined_output(block.complete_text)
-        output: str | dict[str, str]
+        output_obj: str | dict[str, str]
         try:
-            output = json.loads(output_str)
+            output_obj = json.loads(output_str)
         except json.JSONDecodeError:
-            output = output_str
+            output_obj = output_str
         return dict(
-            input=block.original.text,
+            input=block.original.text or "",
             metadata=dict(
                 retries=block.total_retries,
                 cost=block.total_cost,
                 processing_time=block.processing_time,
             ),
-            output=output,
-        )
-    @staticmethod
-    def _get_prompt_additions(block) -> Optional[List[Tuple[str, str]]]:
-        """Get a list of strings to append to the prompt.
-        Arguments:
-            block: The `TranslatedCodeBlock` to save to a file.
-        """
-        return [(key, item) for key, item in block.context_tags.items()]
-    def _make_prompt_additions(self, block: CodeBlock):
-        # Prepare the additional context to prepend
-        additional_context = "".join(
-            [
-                f"{context_tag}: {context}\n"
-                for context_tag, context in self._get_prompt_additions(block)
-            ]
+            output=output_obj,
         )
-        if not hasattr(self._prompt, "messages"):
-            log.debug("Skipping additions to prompt, no messages found on prompt object!")
-            return
-        # Iterate through existing messages to find and update the system message
-        for i, message in enumerate(self._prompt.messages):
-            if isinstance(message, SystemMessagePromptTemplate):
-                # Prepend the additional context to the system message
-                updated_system_message = SystemMessagePromptTemplate.from_template(
-                    additional_context + message.prompt.template
-                )
-                # Directly modify the message in the list
-                self._prompt.messages[i] = updated_system_message
-                break  # Assuming there's only one system message to update
     def _save_to_file(self, block: TranslatedCodeBlock, out_path: Path) -> None:
         """Save a file to disk.

janus-llm 3.5.3__tar.gz → 4.1.0__tar.gz

janus-llm 3.5.3__tar.gz → 4.1.0__tar.gz