PyPI - janus-llm - Versions diffs - 3.5.2__py3-none-any.whl → 4.0.0__py3-none-any.whl - Mend

janus-llm 3.5.2__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

janus/__init__.py +1 -1
janus/cli.py +90 -42
janus/converter/converter.py +111 -142
janus/converter/diagram.py +21 -109
janus/converter/translate.py +1 -1
janus/language/alc/_tests/test_alc.py +1 -1
janus/language/alc/alc.py +16 -11
janus/language/binary/_tests/test_binary.py +1 -1
janus/language/binary/binary.py +2 -2
janus/language/mumps/_tests/test_mumps.py +1 -1
janus/language/mumps/mumps.py +2 -3
janus/language/naive/simple_ast.py +3 -2
janus/language/splitter.py +7 -4
janus/language/treesitter/_tests/test_treesitter.py +1 -1
janus/language/treesitter/treesitter.py +2 -2
janus/llm/model_callbacks.py +13 -0
janus/llm/models_info.py +118 -71
janus/metrics/metric.py +15 -14
janus/parsers/uml.py +60 -23
janus/refiners/refiner.py +106 -64
janus/retrievers/retriever.py +42 -0
{janus_llm-3.5.2.dist-info → janus_llm-4.0.0.dist-info}/METADATA +1 -1
{janus_llm-3.5.2.dist-info → janus_llm-4.0.0.dist-info}/RECORD +26 -26
janus/parsers/refiner_parser.py +0 -46
{janus_llm-3.5.2.dist-info → janus_llm-4.0.0.dist-info}/LICENSE +0 -0
{janus_llm-3.5.2.dist-info → janus_llm-4.0.0.dist-info}/WHEEL +0 -0
{janus_llm-3.5.2.dist-info → janus_llm-4.0.0.dist-info}/entry_points.txt +0 -0

janus/__init__.py CHANGED Viewed

@@ -5,7 +5,7 @@ from langchain_core._api.deprecation import LangChainDeprecationWarning
 from janus.converter.translate import Translator
 from janus.metrics import *  # noqa: F403
-__version__ = "3.5.2"
+__version__ = "4.0.0"
 # Ignoring a deprecation warning from langchain_core that I can't seem to hunt down
 warnings.filterwarnings("ignore", category=LangChainDeprecationWarning)

janus/cli.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import json
 import logging
 import os
+import subprocess  # nosec
 from pathlib import Path
 from typing import List, Optional
@@ -42,6 +43,7 @@ from janus.llm.models_info import (
     openai_models,
 )
 from janus.metrics.cli import evaluate
+from janus.refiners.refiner import REFINERS
 from janus.utils.enums import LANGUAGES
 from janus.utils.logger import create_logger
@@ -241,6 +243,24 @@ def translate(
             click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
         ),
     ] = "file",
+    refiner_type: Annotated[
+        str,
+        typer.Option(
+            "-r",
+            "--refiner",
+            help="Name of custom refiner to use",
+            click_type=click.Choice(list(REFINERS.keys())),
+        ),
+    ] = "none",
+    retriever_type: Annotated[
+        str,
+        typer.Option(
+            "-R",
+            "--retriever",
+            help="Name of custom retriever to use",
+            click_type=click.Choice(["active_usings"]),
+        ),
+    ] = None,
     max_tokens: Annotated[
         int,
         typer.Option(
@@ -250,13 +270,6 @@ def translate(
             "If unspecificed, model's default max will be used.",
         ),
     ] = None,
-    skip_refiner: Annotated[
-        bool,
-        typer.Option(
-            "--skip-refiner",
-            help="Whether to skip the refiner for generating output",
-        ),
-    ] = True,
 ):
     try:
         target_language, target_version = target_lang.split("-")
@@ -282,8 +295,8 @@ def translate(
         db_path=db_loc,
         db_config=collections_config,
         splitter_type=splitter_type,
-        skip_context=skip_context,
-        skip_refiner=skip_refiner,
+        refiner_type=refiner_type,
+        retriever_type=retriever_type,
     )
     translator.translate(input_dir, output_dir, overwrite, collection)
@@ -341,14 +354,6 @@ def document(
             help="Whether to overwrite existing files in the output directory",
         ),
     ] = False,
-    skip_context: Annotated[
-        bool,
-        typer.Option(
-            "--skip-context",
-            help="Prompts will include any context information associated with source"
-            " code blocks, unless this option is specified",
-        ),
-    ] = False,
     doc_mode: Annotated[
         str,
         typer.Option(
@@ -396,6 +401,24 @@ def document(
             click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
         ),
     ] = "file",
+    refiner_type: Annotated[
+        str,
+        typer.Option(
+            "-r",
+            "--refiner",
+            help="Name of custom refiner to use",
+            click_type=click.Choice(list(REFINERS.keys())),
+        ),
+    ] = "none",
+    retriever_type: Annotated[
+        str,
+        typer.Option(
+            "-R",
+            "--retriever",
+            help="Name of custom retriever to use",
+            click_type=click.Choice(["active_usings"]),
+        ),
+    ] = None,
     max_tokens: Annotated[
         int,
         typer.Option(
@@ -405,13 +428,6 @@ def document(
             "If unspecificed, model's default max will be used.",
         ),
     ] = None,
-    skip_refiner: Annotated[
-        bool,
-        typer.Option(
-            "--skip-refiner",
-            help="Whether to skip the refiner for generating output",
-        ),
-    ] = True,
 ):
     model_arguments = dict(temperature=temperature)
     collections_config = get_collections_config()
@@ -424,8 +440,8 @@ def document(
         db_path=db_loc,
         db_config=collections_config,
         splitter_type=splitter_type,
-        skip_refiner=skip_refiner,
-        skip_context=skip_context,
+        refiner_type=refiner_type,
+        retriever_type=retriever_type,
     )
     if doc_mode == "madlibs":
         documenter = MadLibsDocumenter(
@@ -614,14 +630,6 @@ def diagram(
             help="Whether to overwrite existing files in the output directory",
         ),
     ] = False,
-    skip_context: Annotated[
-        bool,
-        typer.Option(
-            "--skip-context",
-            help="Prompts will include any context information associated with source"
-            " code blocks, unless this option is specified",
-        ),
-    ] = False,
     temperature: Annotated[
         float,
         typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
@@ -658,13 +666,24 @@ def diagram(
             click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
         ),
     ] = "file",
-    skip_refiner: Annotated[
-        bool,
+    refiner_type: Annotated[
+        str,
         typer.Option(
-            "--skip-refiner",
-            help="Whether to skip the refiner for generating output",
+            "-r",
+            "--refiner",
+            help="Name of custom refiner to use",
+            click_type=click.Choice(list(REFINERS.keys())),
         ),
-    ] = True,
+    ] = "none",
+    retriever_type: Annotated[
+        str,
+        typer.Option(
+            "-R",
+            "--retriever",
+            help="Name of custom retriever to use",
+            click_type=click.Choice(["active_usings"]),
+        ),
+    ] = None,
 ):
     model_arguments = dict(temperature=temperature)
     collections_config = get_collections_config()
@@ -675,11 +694,11 @@ def diagram(
         max_prompts=max_prompts,
         db_path=db_loc,
         db_config=collections_config,
+        splitter_type=splitter_type,
+        refiner_type=refiner_type,
+        retriever_type=retriever_type,
         diagram_type=diagram_type,
         add_documentation=add_documentation,
-        splitter_type=splitter_type,
-        skip_refiner=skip_refiner,
-        skip_context=skip_context,
     )
     diagram_generator.translate(input_dir, output_dir, overwrite, collection)
@@ -1156,5 +1175,34 @@ app.add_typer(evaluate, name="evaluate")
 app.add_typer(embedding, name="embedding")
+@app.command()
+def render(
+    input_dir: Annotated[
+        str,
+        typer.Option(
+            "--input",
+            "-i",
+        ),
+    ],
+    output_dir: Annotated[str, typer.Option("--output", "-o")],
+):
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
+    for input_file in input_dir.rglob("*.json"):
+        with open(input_file, "r") as f:
+            data = json.load(f)
+        output_file = output_dir / input_file.relative_to(input_dir).with_suffix(".txt")
+        if not output_file.parent.exists():
+            output_file.parent.mkdir()
+        text = data["output"].replace("\\n", "\n").strip()
+        output_file.write_text(text)
+        jar_path = homedir / ".janus/lib/plantuml.jar"
+        subprocess.run(["java", "-jar", jar_path, output_file])  # nosec
+        output_file.unlink()
 if __name__ == "__main__":
     app()

janus/converter/converter.py CHANGED Viewed

@@ -2,13 +2,11 @@ import functools
 import json
 import time
 from pathlib import Path
-from typing import Any, List, Optional, Tuple
+from typing import Any
-from langchain.output_parsers import RetryWithErrorOutputParser
 from langchain_core.exceptions import OutputParserException
-from langchain_core.language_models import BaseLanguageModel
-from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate
-from langchain_core.runnables import RunnableLambda, RunnableParallel
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import Runnable, RunnableParallel, RunnablePassthrough
 from openai import BadRequestError, RateLimitError
 from pydantic import ValidationError
@@ -22,12 +20,18 @@ from janus.language.splitter import (
     Splitter,
     TokenLimitError,
 )
-from janus.llm import load_model
 from janus.llm.model_callbacks import get_model_callback
-from janus.llm.models_info import MODEL_PROMPT_ENGINES
+from janus.llm.models_info import MODEL_PROMPT_ENGINES, JanusModel, load_model
 from janus.parsers.parser import GenericParser, JanusParser
-from janus.parsers.refiner_parser import RefinerParser
-from janus.refiners.refiner import BasicRefiner, Refiner
+from janus.refiners.refiner import (
+    FixParserExceptions,
+    HallucinationRefiner,
+    JanusRefiner,
+    ReflectionRefiner,
+)
+# from janus.refiners.refiner import BasicRefiner, Refiner
+from janus.retrievers.retriever import ActiveUsingsRetriever, JanusRetriever
 from janus.utils.enums import LANGUAGES
 from janus.utils.logger import create_logger
@@ -74,9 +78,8 @@ class Converter:
         protected_node_types: tuple[str, ...] = (),
         prune_node_types: tuple[str, ...] = (),
         splitter_type: str = "file",
-        refiner_type: str = "basic",
-        skip_refiner: bool = True,
-        skip_context: bool = False,
+        refiner_type: str | None = None,
+        retriever_type: str | None = None,
     ) -> None:
         """Initialize a Converter instance.
@@ -96,9 +99,13 @@ class Converter:
             prune_node_types: A set of node types which should be pruned.
             splitter_type: The type of splitter to use. Valid values are `"file"`,
                 `"tag"`, `"chunk"`, `"ast-strict"`, and `"ast-flex"`.
-            refiner_type: The type of refiner to use. Valid values are `"basic"`.
-            skip_refiner: Whether to skip the refiner.
-            skip_context: Whether to skip adding context to the prompt.
+            refiner_type: The type of refiner to use. Valid values:
+                - "parser"
+                - "reflection"
+                - None
+            retriever_type: The type of retriever to use. Valid values:
+                - "active_usings"
+                - None
         """
         self._changed_attrs: set = set()
@@ -107,7 +114,6 @@ class Converter:
         self.override_token_limit: bool = max_tokens is not None
         self._model_name: str
-        self._model_id: str
         self._custom_model_arguments: dict[str, Any]
         self._source_language: str
@@ -120,24 +126,26 @@ class Converter:
         self._prune_node_types: tuple[str, ...] = ()
         self._max_tokens: int | None = max_tokens
         self._prompt_template_name: str
-        self._splitter_type: str
         self._db_path: str | None
         self._db_config: dict[str, Any] | None
-        self._splitter: Splitter
-        self._llm: BaseLanguageModel
+        self._llm: JanusModel
         self._prompt: ChatPromptTemplate
         self._parser: JanusParser = GenericParser()
         self._combiner: Combiner = Combiner()
-        self._refiner_type: str
-        self._refiner: Refiner
+        self._splitter_type: str
+        self._refiner_type: str | None
+        self._retriever_type: str | None
-        self.skip_refiner = skip_refiner
+        self._splitter: Splitter
+        self._refiner: JanusRefiner
+        self._retriever: JanusRetriever
         self.set_splitter(splitter_type=splitter_type)
         self.set_refiner(refiner_type=refiner_type)
+        self.set_retriever(retriever_type=retriever_type)
         self.set_model(model_name=model, **model_arguments)
         self.set_prompt(prompt_template=prompt_template)
         self.set_source_language(source_language)
@@ -146,8 +154,6 @@ class Converter:
         self.set_db_path(db_path=db_path)
         self.set_db_config(db_config=db_config)
-        self.skip_context = skip_context
         # Child class must call this. Should we enforce somehow?
         # self._load_parameters()
@@ -163,9 +169,11 @@ class Converter:
     def _load_parameters(self) -> None:
         self._load_model()
         self._load_prompt()
+        self._load_retriever()
+        self._load_refiner()
         self._load_splitter()
         self._load_vectorizer()
-        self._load_refiner()
+        self._load_chain()
         self._changed_attrs.clear()
     def set_model(self, model_name: str, **custom_arguments: dict[str, Any]):
@@ -184,8 +192,6 @@ class Converter:
     def set_prompt(self, prompt_template: str) -> None:
         """Validate and set the prompt template name.
-        The affected objects will not be updated until translate() is called.
         Arguments:
             prompt_template: name of prompt template directory
                 (see janus/prompts/templates) or path to a directory.
@@ -195,29 +201,34 @@ class Converter:
     def set_splitter(self, splitter_type: str) -> None:
         """Validate and set the prompt template name.
-        The affected objects will not be updated until translate() is called.
         Arguments:
             prompt_template: name of prompt template directory
                 (see janus/prompts/templates) or path to a directory.
         """
-        self._splitter_type = splitter_type
+        if splitter_type not in CUSTOM_SPLITTERS:
+            raise ValueError(f'Splitter type "{splitter_type}" does not exist.')
-    def set_refiner(self, refiner_type: str) -> None:
-        """Validate and set the refiner name
+        self._splitter_type = splitter_type
-        The affected objects will not be updated until translate is called
+    def set_refiner(self, refiner_type: str | None) -> None:
+        """Validate and set the refiner type
         Arguments:
-            refiner_type: the name of the refiner to use
+            refiner_type: the type of refiner to use
         """
         self._refiner_type = refiner_type
+    def set_retriever(self, retriever_type: str | None) -> None:
+        """Validate and set the retriever type
+        Arguments:
+            retriever_type: the type of retriever to use
+        """
+        self._retriever_type = retriever_type
     def set_source_language(self, source_language: str) -> None:
         """Validate and set the source language.
-        The affected objects will not be updated until _load_parameters() is called.
         Arguments:
             source_language: The source programming language.
         """
@@ -287,20 +298,6 @@ class Converter:
         self._splitter = CUSTOM_SPLITTERS[self._splitter_type](**kwargs)
-    @run_if_changed("_refiner_type", "_model_name")
-    def _load_refiner(self) -> None:
-        """Load the refiner according to this instance's attributes.
-        If the relevant fields have not been changed since the last time this method was
-        called, nothing happens.
-        """
-        if self._refiner_type == "basic":
-            self._refiner = BasicRefiner(
-                "basic_refinement", self._model_id, self._source_language
-            )
-        else:
-            raise ValueError(f"Error: unknown refiner type {self._refiner_type}")
     @run_if_changed("_model_name", "_custom_model_arguments")
     def _load_model(self) -> None:
         """Load the model according to this instance's attributes.
@@ -314,9 +311,9 @@ class Converter:
         # model_arguments.update(self._custom_model_arguments)
         # Load the model
-        self._llm, self._model_id, token_limit, self.model_cost = load_model(
-            self._model_name
-        )
+        self._llm = load_model(self._model_name)
+        token_limit = self._llm.token_limit
         # Set the max_tokens to less than half the model's limit to allow for enough
         # tokens at output
         # Only modify max_tokens if it is not specified by user
@@ -335,7 +332,7 @@ class Converter:
         If the relevant fields have not been changed since the last time this
         method was called, nothing happens.
         """
-        prompt_engine = MODEL_PROMPT_ENGINES[self._model_id](
+        prompt_engine = MODEL_PROMPT_ENGINES[self._llm.short_model_id](
             source_language=self._source_language,
             prompt_template=self._prompt_template_name,
         )
@@ -354,6 +351,59 @@ class Converter:
             self._db_path, self._db_config
         )
+    @run_if_changed("_retriever_type")
+    def _load_retriever(self):
+        if self._retriever_type == "active_usings":
+            self._retriever = ActiveUsingsRetriever()
+        else:
+            self._retriever = JanusRetriever()
+    @run_if_changed("_refiner_type", "_model_name", "max_prompts", "_parser", "_llm")
+    def _load_refiner(self) -> None:
+        """Load the refiner according to this instance's attributes.
+        If the relevant fields have not been changed since the last time this method was
+        called, nothing happens.
+        """
+        if self._refiner_type == "parser":
+            self._refiner = FixParserExceptions(
+                llm=self._llm,
+                parser=self._parser,
+                max_retries=self.max_prompts,
+            )
+        elif self._refiner_type == "reflection":
+            self._refiner = ReflectionRefiner(
+                llm=self._llm,
+                parser=self._parser,
+                max_retries=self.max_prompts,
+            )
+        elif self._refiner_type == "hallucination":
+            self._refiner = HallucinationRefiner(
+                llm=self._llm,
+                parser=self._parser,
+                max_retries=self.max_prompts,
+            )
+        else:
+            self._refiner = JanusRefiner(parser=self._parser)
+    @run_if_changed("_parser", "_retriever", "_prompt", "_llm", "_refiner")
+    def _load_chain(self):
+        self.chain = (
+            self._input_runnable()
+            | self._prompt
+            | RunnableParallel(
+                completion=self._llm,
+                prompt_value=RunnablePassthrough(),
+            )
+            | self._refiner.parse_runnable
+        )
+    def _input_runnable(self) -> Runnable:
+        return RunnableParallel(
+            SOURCE_CODE=self._parser.parse_input,
+            context=self._retriever,
+        )
     def translate(
         self,
         input_directory: str | Path,
@@ -598,110 +648,29 @@ class Converter:
         return root
     def _run_chain(self, block: TranslatedCodeBlock) -> str:
-        """Run the model with three nested error fixing schemes.
-        First, try to fix simple formatting errors by giving the model just
-        the output and the parsing error. After a number of attempts, try
-        giving the model the output, the parsing error, and the original
-        input. Again check/retry this output to solve for formatting errors.
-        If we still haven't succeeded after several attempts, the model may
-        be getting thrown off by a bad initial output; start from scratch
-        and try again.
-        The number of tries for each layer of this scheme is roughly equal
-        to the cube root of self.max_retries, so the total calls to the
-        LLM will be roughly as expected (up to sqrt(self.max_retries) over)
-        """
-        input = self._parser.parse_input(block.original)
-        # Retries with just the output and the error
-        n1 = round(self.max_prompts ** (1 / 2))
-        # Retries with the input, output, and error
-        n2 = round(self.max_prompts // n1)
-        if not self.skip_context:
-            self._make_prompt_additions(block)
-        if not self.skip_refiner:  # Make replacements in the prompt
-            refine_output = RefinerParser(
-                parser=self._parser,
-                initial_prompt=self._prompt.format(**{"SOURCE_CODE": input}),
-                refiner=self._refiner,
-                max_retries=n1,
-                llm=self._llm,
-            )
-        else:
-            refine_output = RetryWithErrorOutputParser.from_llm(
-                llm=self._llm,
-                parser=self._parser,
-                max_retries=n1,
-            )
-        completion_chain = self._prompt | self._llm
-        chain = RunnableParallel(
-            completion=completion_chain, prompt_value=self._prompt
-        ) | RunnableLambda(lambda x: refine_output.parse_with_prompt(**x))
-        for _ in range(n2):
-            try:
-                return chain.invoke({"SOURCE_CODE": input})
-            except OutputParserException:
-                pass
-        raise OutputParserException(f"Failed to parse after {n1*n2} retries")
+        return self.chain.invoke(block.original)
     def _get_output_obj(
         self, block: TranslatedCodeBlock
-    ) -> dict[str, int | float | str | dict[str, str]]:
+    ) -> dict[str, int | float | str | dict[str, str] | dict[str, float]]:
         output_str = self._parser.parse_combined_output(block.complete_text)
-        output: str | dict[str, str]
+        output_obj: str | dict[str, str]
         try:
-            output = json.loads(output_str)
+            output_obj = json.loads(output_str)
         except json.JSONDecodeError:
-            output = output_str
+            output_obj = output_str
         return dict(
-            input=block.original.text,
+            input=block.original.text or "",
             metadata=dict(
                 retries=block.total_retries,
                 cost=block.total_cost,
                 processing_time=block.processing_time,
             ),
-            output=output,
-        )
-    @staticmethod
-    def _get_prompt_additions(block) -> Optional[List[Tuple[str, str]]]:
-        """Get a list of strings to append to the prompt.
-        Arguments:
-            block: The `TranslatedCodeBlock` to save to a file.
-        """
-        return [(key, item) for key, item in block.context_tags.items()]
-    def _make_prompt_additions(self, block: CodeBlock):
-        # Prepare the additional context to prepend
-        additional_context = "".join(
-            [
-                f"{context_tag}: {context}\n"
-                for context_tag, context in self._get_prompt_additions(block)
-            ]
+            output=output_obj,
         )
-        if not hasattr(self._prompt, "messages"):
-            log.debug("Skipping additions to prompt, no messages found on prompt object!")
-            return
-        # Iterate through existing messages to find and update the system message
-        for i, message in enumerate(self._prompt.messages):
-            if isinstance(message, SystemMessagePromptTemplate):
-                # Prepend the additional context to the system message
-                updated_system_message = SystemMessagePromptTemplate.from_template(
-                    additional_context + message.prompt.template
-                )
-                # Directly modify the message in the list
-                self._prompt.messages[i] = updated_system_message
-                break  # Assuming there's only one system message to update
     def _save_to_file(self, block: TranslatedCodeBlock, out_path: Path) -> None:
         """Save a file to disk.

janus-llm 3.5.2__py3-none-any.whl → 4.0.0__py3-none-any.whl

janus-llm 3.5.2__py3-none-any.whl → 4.0.0__py3-none-any.whl