PyPI - janus-llm - Versions diffs - 3.3.2__tar.gz → 3.4.0__tar.gz - Mend

janus-llm 3.3.2tar.gz → 3.4.0tar.gz

Files changed (108) hide show

{janus_llm-3.3.2 → janus_llm-3.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: janus-llm
-Version: 3.3.2
+Version: 3.4.0
 Summary: A transcoding library using LLMs.
 Home-page: https://github.com/janus-llm/janus-llm
 License: Apache 2.0

{janus_llm-3.3.2 → janus_llm-3.4.0}/janus/__init__.py RENAMED Viewed

@@ -5,7 +5,7 @@ from langchain_core._api.deprecation import LangChainDeprecationWarning
 from janus.converter.translate import Translator
 from janus.metrics import *  # noqa: F403
-__version__ = "3.3.2"
+__version__ = "3.4.0"
 # Ignoring a deprecation warning from langchain_core that I can't seem to hunt down
 warnings.filterwarnings("ignore", category=LangChainDeprecationWarning)

{janus_llm-3.3.2 → janus_llm-3.4.0}/janus/cli.py RENAMED Viewed

@@ -200,6 +200,14 @@ def translate(
             help="Whether to overwrite existing files in the output directory",
         ),
     ] = False,
+    skip_context: Annotated[
+        bool,
+        typer.Option(
+            "--skip-context",
+            help="Prompts will include any context information associated with source"
+            " code blocks, unless this option is specified",
+        ),
+    ] = False,
     temp: Annotated[
         float,
         typer.Option("--temperature", "-T", help="Sampling temperature.", min=0, max=2),
@@ -265,6 +273,7 @@ def translate(
         db_path=db_loc,
         db_config=collections_config,
         splitter_type=splitter_type,
+        skip_context=skip_context,
     )
     translator.translate(input_dir, output_dir, overwrite, collection)
@@ -322,6 +331,14 @@ def document(
             help="Whether to overwrite existing files in the output directory",
         ),
     ] = False,
+    skip_context: Annotated[
+        bool,
+        typer.Option(
+            "--skip-context",
+            help="Prompts will include any context information associated with source"
+            " code blocks, unless this option is specified",
+        ),
+    ] = False,
     doc_mode: Annotated[
         str,
         typer.Option(
@@ -390,6 +407,7 @@ def document(
         db_path=db_loc,
         db_config=collections_config,
         splitter_type=splitter_type,
+        skip_context=skip_context,
     )
     if doc_mode == "madlibs":
         documenter = MadLibsDocumenter(
@@ -458,6 +476,14 @@ def diagram(
             help="Whether to overwrite existing files in the output directory",
         ),
     ] = False,
+    skip_context: Annotated[
+        bool,
+        typer.Option(
+            "--skip-context",
+            help="Prompts will include any context information associated with source"
+            " code blocks, unless this option is specified",
+        ),
+    ] = False,
     temperature: Annotated[
         float,
         typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
@@ -507,6 +533,7 @@ def diagram(
         diagram_type=diagram_type,
         add_documentation=add_documentation,
         splitter_type=splitter_type,
+        skip_context=skip_context,
     )
     diagram_generator.translate(input_dir, output_dir, overwrite, collection)

{janus_llm-3.3.2 → janus_llm-3.4.0}/janus/converter/converter.py RENAMED Viewed

@@ -3,13 +3,13 @@ import json
 import math
 import time
 from pathlib import Path
-from typing import Any
+from typing import Any, List, Optional, Tuple
 from langchain.output_parsers import RetryWithErrorOutputParser
 from langchain_core.exceptions import OutputParserException
 from langchain_core.language_models import BaseLanguageModel
 from langchain_core.output_parsers import BaseOutputParser
-from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate
 from langchain_core.runnables import RunnableLambda, RunnableParallel
 from openai import BadRequestError, RateLimitError
 from pydantic import ValidationError
@@ -77,6 +77,7 @@ class Converter:
         prune_node_types: tuple[str, ...] = (),
         splitter_type: str = "file",
         refiner_type: str = "basic",
+        skip_context: bool = False,
     ) -> None:
         """Initialize a Converter instance.
@@ -142,6 +143,8 @@ class Converter:
         self.set_db_path(db_path=db_path)
         self.set_db_config(db_config=db_config)
+        self.skip_context = skip_context
         # Child class must call this. Should we enforce somehow?
         # self._load_parameters()
@@ -602,6 +605,9 @@ class Converter:
         # Retries with just the input
         n3 = math.ceil(self.max_prompts / (n1 * n2))
+        # Make replacements in the prompt
+        if not self.skip_context:
+            self._make_prompt_additions(block)
         refine_output = RefinerParser(
             parser=self._parser,
@@ -648,6 +654,35 @@ class Converter:
             output=output,
         )
+    @staticmethod
+    def _get_prompt_additions(block) -> Optional[List[Tuple[str, str]]]:
+        """Get the context entries to add to the prompt for a block.
+        Arguments:
+            block: The code block whose `context_tags` mapping is read.
+        Returns:
+            A list of `(tag, context)` pairs, one per entry of
+            `block.context_tags`, in the mapping's iteration order.
+        """
+        return list(block.context_tags.items())
+    def _make_prompt_additions(self, block: CodeBlock):
+        """Prepend the block's context tags to the prompt's system message.
+        The context is rendered as one `tag: value` line per entry returned by
+        `_get_prompt_additions` and placed in front of the template of the first
+        system message in `self._prompt`. Repeated calls replace the context of
+        the previous call instead of stacking on top of it.
+        Arguments:
+            block: The `CodeBlock` whose context tags supply the context.
+        """
+        # Prepare the additional context to prepend
+        additional_context = "".join(
+            f"{context_tag}: {context}\n"
+            for context_tag, context in self._get_prompt_additions(block) or ()
+        )
+        # The combined text is parsed as a prompt template, so literal braces in
+        # the context are doubled to keep them from becoming template variables
+        additional_context = additional_context.replace("{", "{{")
+        additional_context = additional_context.replace("}", "}}")
+        # System message installed by the previous call, if there was one
+        installed = getattr(self, "_context_system_message", None)
+        # Iterate through existing messages to find and update the system message
+        for i, message in enumerate(self._prompt.messages):
+            if not isinstance(message, SystemMessagePromptTemplate):
+                continue
+            # Start from the context-free template so that context from earlier
+            # blocks does not pile up across calls
+            if message is installed:
+                base_template = self._context_free_template
+            else:
+                base_template = message.prompt.template
+            updated_system_message = SystemMessagePromptTemplate.from_template(
+                additional_context + base_template
+            )
+            # Directly modify the message in the list
+            self._prompt.messages[i] = updated_system_message
+            # Remember what was installed and the template it was built from
+            self._context_system_message = updated_system_message
+            self._context_free_template = base_template
+            break  # Assuming there's only one system message to update
     def _save_to_file(self, block: TranslatedCodeBlock, out_path: Path) -> None:
         """Save a file to disk.

{janus_llm-3.3.2 → janus_llm-3.4.0}/janus/converter/requirements.py RENAMED Viewed

@@ -22,6 +22,11 @@ class RequirementsDocumenter(Documenter):
         self._combiner = ChunkCombiner()
         self._parser = RequirementsParser()
+    @staticmethod
+    def get_prompt_replacements(block) -> dict[str, str]:
+        """Map the `SOURCE_CODE` prompt key to the block's original text.
+        Arguments:
+            block: The block whose `original.text` is read.
+        Returns:
+            A single-entry dict holding the text under the key `SOURCE_CODE`.
+        """
+        return {"SOURCE_CODE": block.original.text}
     def _save_to_file(self, block: TranslatedCodeBlock, out_path: Path) -> None:
         """Save a file to disk.

janus_llm-3.4.0/janus/language/alc/alc.py ADDED Viewed

@@ -0,0 +1,185 @@
+import re
+from collections import deque
+from typing import Optional
+from langchain.schema.language_model import BaseLanguageModel
+from janus.language.block import CodeBlock
+from janus.language.combine import Combiner
+from janus.language.node import NodeType
+from janus.language.treesitter import TreeSitterSplitter
+from janus.utils.logger import create_logger
+log = create_logger(__name__)
+class AlcCombiner(Combiner):
+    """Combiner for putting code blocks back together into ALC files."""
+    def __init__(self) -> None:
+        """Initialize an AlcCombiner, passing "ibmhlasm" to the parent Combiner."""
+        super().__init__("ibmhlasm")
+class AlcSplitter(TreeSitterSplitter):
+    """A class for splitting ALC code into functional blocks to prompt
+    with for transcoding.
+    """
+    def __init__(
+        self,
+        model: None | BaseLanguageModel = None,
+        max_tokens: int = 4096,
+        protected_node_types: tuple[str, ...] = (),
+        prune_node_types: tuple[str, ...] = (),
+        prune_unprotected: bool = False,
+    ):
+        """Initialize an AlcSplitter instance.
+        Arguments:
+            model: Forwarded unchanged to `TreeSitterSplitter`
+            max_tokens: The maximum number of tokens supported by the model
+            protected_node_types: Forwarded unchanged to `TreeSitterSplitter`
+            prune_node_types: Forwarded unchanged to `TreeSitterSplitter`
+            prune_unprotected: Forwarded unchanged to `TreeSitterSplitter`
+        """
+        super().__init__(
+            language="ibmhlasm",
+            model=model,
+            max_tokens=max_tokens,
+            protected_node_types=protected_node_types,
+            prune_node_types=prune_node_types,
+            prune_unprotected=prune_unprotected,
+        )
+    def _get_ast(self, code: str) -> CodeBlock:
+        """Parse the code and nest each section's nodes under its CSECT/DSECT.
+        Arguments:
+            code: The ALC source text to parse.
+        Returns:
+            The root block of the parent splitter's tree, restructured so that
+            every csect/dsect instruction and the nodes following it (up to the
+            next csect/dsect instruction) are grouped under one merged node.
+        """
+        root = super()._get_ast(code)
+        # Current treesitter implementation does not nest csects and dsects
+        # The loop below nests nodes following csect/dsect instructions into
+        #  the children of that instruction
+        section_labels = {
+            "csect_instruction": "CSECT",
+            "dsect_instruction": "DSECT",
+        }
+        # Breadth-first walk; a deque pops from the left in constant time
+        queue: deque[CodeBlock] = deque([root])
+        while queue:
+            block = queue.popleft()
+            # Search this block's children for csects and dsects. Create a list
+            #  of groups where each group is a csect or dsect, starting with the
+            #  csect/dsect instruction and containing all the subsequent nodes
+            #  up until the next csect or dsect instruction
+            sects: list[list[CodeBlock]] = [[]]
+            for c in block.children:
+                label = section_labels.get(c.node_type)
+                if label is None:
+                    sects[-1].append(c)
+                    continue
+                # NOTE(review): CodeBlock.__init__ declares `context_tags` with a
+                #  `{}` default; if `c` was built with that default this write
+                #  lands in a dict shared by other blocks - confirm intent
+                c.context_tags["alc_section"] = label
+                sects.append([c])
+            sects = [s for s in sects if s]
+            # Restructure the tree, making the head of each group the parent
+            #  of all the remaining nodes in that group
+            if len(sects) > 1:
+                block.children = []
+                for sect in sects:
+                    if sect[0].node_type in section_labels:
+                        sect_node = self.merge_nodes(sect)
+                        sect_node.children = sect
+                        # First five characters of the type: "csect" or "dsect"
+                        sect_node.node_type = NodeType(str(sect[0].node_type)[:5])
+                        block.children.append(sect_node)
+                    else:
+                        block.children.extend(sect)
+            # Push the children onto the queue
+            queue.extend(block.children)
+        return root
+class AlcListingSplitter(AlcSplitter):
+    """A class for splitting ALC listing code into functional blocks to
+    prompt with for transcoding.
+    """
+    def __init__(
+        self,
+        model: None | BaseLanguageModel = None,
+        max_tokens: int = 4096,
+        protected_node_types: tuple[str, ...] = (),
+        prune_node_types: tuple[str, ...] = (),
+        prune_unprotected: bool = False,
+    ):
+        """Initialize an AlcListingSplitter instance.
+        Arguments:
+            model: Forwarded unchanged to `AlcSplitter`
+            max_tokens: The maximum number of tokens supported by the model
+            protected_node_types: Forwarded unchanged to `AlcSplitter`
+            prune_node_types: Forwarded unchanged to `AlcSplitter`
+            prune_unprotected: Forwarded unchanged to `AlcSplitter`
+        """
+        # The string to mark the end of the listing header
+        self.header_indicator_str: str = (
+            "Loc  Object Code    Addr1 Addr2  Stmt   Source Statement"
+        )
+        # How many characters to trim from the right side to remove the address column
+        self.address_column_chars: int = 10
+        # The string to mark the end of the left margin
+        self.left_margin_indicator_str: str = "Stmt"
+        super().__init__(
+            model=model,
+            max_tokens=max_tokens,
+            protected_node_types=protected_node_types,
+            prune_node_types=prune_node_types,
+            prune_unprotected=prune_unprotected,
+        )
+    def _get_ast(self, code: str) -> CodeBlock:
+        """Build the AST of a listing and tag its root with the active usings.
+        Arguments:
+            code: The full text of the ALC listing.
+        Returns:
+            The root block produced by `AlcSplitter._get_ast` for the
+            preprocessed listing, with `context_tags["active_usings"]` set.
+        """
+        # Read the usings first: preprocessing drops every line up to and
+        #  including the column-header line
+        active_usings = self.get_active_usings(code)
+        code = self.preproccess_assembly(code)
+        ast: CodeBlock = super()._get_ast(code)
+        # NOTE(review): the value is None when no "Active Usings:" line exists,
+        #  although `context_tags` is annotated dict[str, str] - confirm intent
+        ast.context_tags["active_usings"] = active_usings
+        return ast
+    def preproccess_assembly(self, code: str) -> str:
+        """Remove non-essential lines from an assembly snippet
+        Arguments:
+            code: The full text of the ALC listing.
+        Returns:
+            The listing without its header, left panel and address column, one
+            statement per line.
+        """
+        lines = code.splitlines()
+        lines = self.strip_header_and_left(lines)
+        lines = self.strip_addresses(lines)
+        # splitlines() dropped the line terminators, so they are restored here;
+        #  joining on "" would run every statement together on a single line
+        return "\n".join(lines)
+    def get_active_usings(self, code: str) -> Optional[str]:
+        """Look for 'active usings' in the ALC listing
+        Arguments:
+            code: The full text of the ALC listing.
+        Returns:
+            The text following "Active Usings:" on the first line containing
+            that marker, or None when no line contains it.
+        """
+        marker = "Active Usings:"
+        for line in code.splitlines():
+            if marker in line:
+                return line.split(marker)[1]
+        return None
+    def strip_header_and_left(
+        self,
+        lines: list[str],
+    ) -> list[str]:
+        """Remove the header and the left panel from the assembly sample
+        Arguments:
+            lines: The lines of the ALC listing.
+        Returns:
+            The lines after the header line, each cut off on the left.
+        Raises:
+            IndexError: If no line matches `header_indicator_str`.
+        """
+        header_regex = re.compile(f".*{self.header_indicator_str}.*")
+        # Stop at the first matching line instead of scanning the whole listing
+        header_end_index = next(
+            (i for i, item in enumerate(lines) if header_regex.search(item)),
+            None,
+        )
+        if header_end_index is None:
+            raise IndexError(
+                f"Listing header not found: {self.header_indicator_str!r}"
+            )
+        left_content_end_column = lines[header_end_index].find(
+            self.left_margin_indicator_str
+        )
+        # Presumably 5 = len("Stmt") + 1, i.e. the marker plus one separator
+        #  column - TODO confirm if `left_margin_indicator_str` is changed
+        first_kept_column = left_content_end_column + 5
+        return [line[first_kept_column:] for line in lines[header_end_index + 1 :]]
+    def strip_addresses(self, lines: list[str]) -> list[str]:
+        """Strip the addresses which run down the right side of the assembly snippet
+        Arguments:
+            lines: The lines to trim.
+        Returns:
+            The lines without their last `address_column_chars` characters.
+        """
+        width = self.address_column_chars
+        if not width:
+            # line[:-0] is line[:0], which would empty every line
+            return list(lines)
+        return [line[:-width] for line in lines]
+    def strip_footer(self, lines: list[str]):
+        """Strip the footer from the assembly snippet
+        Raises:
+            NotImplementedError: Always; footer stripping is not implemented.
+        """
+        raise NotImplementedError

{janus_llm-3.3.2 → janus_llm-3.4.0}/janus/language/block.py RENAMED Viewed

@@ -45,6 +45,7 @@ class CodeBlock:
         children: list[ForwardRef("CodeBlock")],
         embedding_id: Optional[str] = None,
         affixes: Tuple[str, str] = ("", ""),
+        context_tags: dict[str, str] = {},
     ) -> None:
         self.id: Hashable = id
         self.name: Optional[str] = name
@@ -59,6 +60,7 @@ class CodeBlock:
         self.children: list[ForwardRef("CodeBlock")] = sorted(children)
         self.embedding_id: Optional[str] = embedding_id
         self.affixes: Tuple[str, str] = affixes
+        self.context_tags: dict[str, str] = context_tags
         self.complete = True
         self.omit_prefix = True

janus_llm-3.4.0/janus/language/naive/simple_ast.py ADDED Viewed

@@ -0,0 +1,93 @@
+from janus.language.alc.alc import AlcListingSplitter, AlcSplitter
+from janus.language.mumps.mumps import MumpsSplitter
+from janus.language.naive.registry import register_splitter
+from janus.language.splitter import Splitter
+from janus.language.treesitter import TreeSitterSplitter
+from janus.utils.enums import LANGUAGES
+from janus.utils.logger import create_logger
+log = create_logger(__name__)
+@register_splitter("ast-flex")
+def get_flexible_ast(language: str, **kwargs) -> Splitter:
+    """Build the flexible AST splitter for a language.
+    Arguments:
+        language: The language to get the splitter for.
+        **kwargs: Passed through to the splitter's constructor.
+    Returns:
+        An `AlcSplitter` for "ibmhlasm", a `MumpsSplitter` for "mumps" and a
+        `TreeSitterSplitter` for any other language.
+    """
+    if language == "ibmhlasm":
+        return AlcSplitter(**kwargs)
+    if language == "mumps":
+        return MumpsSplitter(**kwargs)
+    return TreeSitterSplitter(language=language, **kwargs)
+@register_splitter("ast-strict")
+def get_strict_ast(language: str, **kwargs) -> Splitter:
+    """Build the strict AST splitter for a language.
+    The language's `functional_node_types` from `LANGUAGES` are set as the
+    protected node types and pruning of unprotected nodes is switched on.
+    Arguments:
+        language: The language to get the splitter for.
+        **kwargs: Passed through to the splitter's constructor.
+    Returns:
+        An `AlcSplitter` for "ibmhlasm", a `MumpsSplitter` for "mumps" and a
+        `TreeSitterSplitter` for any other language.
+    """
+    # These two settings override whatever the caller passed for them
+    kwargs["protected_node_types"] = LANGUAGES[language]["functional_node_types"]
+    kwargs["prune_unprotected"] = True
+    if language == "ibmhlasm":
+        return AlcSplitter(**kwargs)
+    if language == "mumps":
+        return MumpsSplitter(**kwargs)
+    return TreeSitterSplitter(language=language, **kwargs)
+@register_splitter("ast-strict-listing")
+def get_strict_listing_ast(language: str, **kwargs) -> Splitter:
+    """Build the strict AST splitter for listings. This splitter is intended
+    for use with IBM HLASM.
+    The language's `functional_node_types` from `LANGUAGES` are set as the
+    protected node types and pruning of unprotected nodes is switched on.
+    Arguments:
+        language: The language to get the splitter for.
+        **kwargs: Passed through to the splitter's constructor.
+    Returns:
+        An `AlcListingSplitter` for "ibmhlasm"; for any other language a
+        warning is logged and a `TreeSitterSplitter` is returned.
+    """
+    # These two settings override whatever the caller passed for them
+    kwargs["protected_node_types"] = LANGUAGES[language]["functional_node_types"]
+    kwargs["prune_unprotected"] = True
+    if language != "ibmhlasm":
+        log.warning("Listing splitter is only intended for use with IBMHLASM!")
+        return TreeSitterSplitter(language=language, **kwargs)
+    return AlcListingSplitter(**kwargs)
+@register_splitter("ast-flex-listing")
+def get_flexible_listing_ast(language: str, **kwargs) -> Splitter:
+    """Build the flexible AST splitter for listings. This splitter is intended
+    for use with IBM HLASM.
+    Arguments:
+        language: The language to get the splitter for.
+        **kwargs: Passed through to the splitter's constructor.
+    Returns:
+        An `AlcListingSplitter` for "ibmhlasm"; for any other language a
+        warning is logged and a `TreeSitterSplitter` is returned.
+    """
+    if language != "ibmhlasm":
+        log.warning("Listing splitter is only intended for use with IBMHLASM!")
+        return TreeSitterSplitter(language=language, **kwargs)
+    return AlcListingSplitter(**kwargs)

{janus_llm-3.3.2 → janus_llm-3.4.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "janus-llm"
-version = "3.3.2"
+version = "3.4.0"
 description = "A transcoding library using LLMs."
 authors = ["Michael Doyle <mdoyle@mitre.org>", "Chris Glasz <cglasz@mitre.org>",
            "Chris Tohline <ctohline@mitre.org>", "William Macke <wmacke@mitre.org>",

janus_llm-3.3.2/janus/language/alc/alc.py DELETED Viewed

@@ -1,87 +0,0 @@
-from langchain.schema.language_model import BaseLanguageModel
-from janus.language.block import CodeBlock
-from janus.language.combine import Combiner
-from janus.language.node import NodeType
-from janus.language.treesitter import TreeSitterSplitter
-from janus.utils.logger import create_logger
-log = create_logger(__name__)
-class AlcCombiner(Combiner):
-    """A class that combines code blocks into ALC files."""
-    def __init__(self) -> None:
-        """Initialize a AlcCombiner instance."""
-        super().__init__("ibmhlasm")
-class AlcSplitter(TreeSitterSplitter):
-    """A class for splitting ALC code into functional blocks to prompt
-    with for transcoding.
-    """
-    def __init__(
-        self,
-        model: None | BaseLanguageModel = None,
-        max_tokens: int = 4096,
-        protected_node_types: tuple[str, ...] = (),
-        prune_node_types: tuple[str, ...] = (),
-        prune_unprotected: bool = False,
-    ):
-        """Initialize a AlcSplitter instance.
-        Arguments:
-            max_tokens: The maximum number of tokens supported by the model
-        """
-        super().__init__(
-            language="ibmhlasm",
-            model=model,
-            max_tokens=max_tokens,
-            protected_node_types=protected_node_types,
-            prune_node_types=prune_node_types,
-            prune_unprotected=prune_unprotected,
-        )
-    def _get_ast(self, code: str) -> CodeBlock:
-        root = super()._get_ast(code)
-        # Current treesitter implementation does not nest csects and dsects
-        # The loop below nests nodes following csect/dsect instructions into
-        #  the children of that instruction
-        sect_types = {"csect_instruction", "dsect_instruction"}
-        queue: list[CodeBlock] = [root]
-        while queue:
-            block = queue.pop(0)
-            # Search this children for csects and dsects. Create a list of groups
-            #  where each group is a csect or dsect, starting with the csect/dsect
-            #  instruction and containing all the subsequent nodes up until the
-            #  next csect or dsect instruction
-            sects: list[list[CodeBlock]] = [[]]
-            for c in block.children:
-                if c.node_type in sect_types:
-                    sects.append([c])
-                else:
-                    sects[-1].append(c)
-            sects = [s for s in sects if s]
-            # Restructure the tree, making the head of each group the parent
-            #  of all the remaining nodes in that group
-            if len(sects) > 1:
-                block.children = []
-                for sect in sects:
-                    if sect[0].node_type in sect_types:
-                        sect_node = self.merge_nodes(sect)
-                        sect_node.children = sect
-                        sect_node.node_type = NodeType(str(sect[0].node_type)[:5])
-                        block.children.append(sect_node)
-                    else:
-                        block.children.extend(sect)
-            # Push the children onto the queue
-            queue.extend(block.children)
-        return root

janus_llm-3.3.2/janus/language/naive/simple_ast.py DELETED Viewed

@@ -1,29 +0,0 @@
-from janus.language.alc.alc import AlcSplitter
-from janus.language.mumps.mumps import MumpsSplitter
-from janus.language.naive.registry import register_splitter
-from janus.language.treesitter import TreeSitterSplitter
-from janus.utils.enums import LANGUAGES
-@register_splitter("ast-flex")
-def get_flexible_ast(language: str, **kwargs):
-    if language == "ibmhlasm":
-        return AlcSplitter(**kwargs)
-    elif language == "mumps":
-        return MumpsSplitter(**kwargs)
-    else:
-        return TreeSitterSplitter(language=language, **kwargs)
-@register_splitter("ast-strict")
-def get_strict_ast(language: str, **kwargs):
-    kwargs.update(
-        protected_node_types=LANGUAGES[language]["functional_node_types"],
-        prune_unprotected=True,
-    )
-    if language == "ibmhlasm":
-        return AlcSplitter(**kwargs)
-    elif language == "mumps":
-        return MumpsSplitter(**kwargs)
-    else:
-        return TreeSitterSplitter(language=language, **kwargs)