janus-llm 3.3.2__tar.gz → 3.4.1__tar.gz

Files changed (108)
  1. {janus_llm-3.3.2 → janus_llm-3.4.1}/PKG-INFO +1 -1
  2. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/__init__.py +1 -1
  3. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/cli.py +51 -0
  4. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/converter/converter.py +63 -23
  5. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/converter/requirements.py +5 -0
  6. janus_llm-3.4.1/janus/language/alc/alc.py +185 -0
  7. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/block.py +2 -0
  8. janus_llm-3.4.1/janus/language/naive/simple_ast.py +93 -0
  9. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/parsers/refiner_parser.py +3 -1
  10. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/refiners/refiner.py +17 -7
  11. {janus_llm-3.3.2 → janus_llm-3.4.1}/pyproject.toml +1 -1
  12. janus_llm-3.3.2/janus/language/alc/alc.py +0 -87
  13. janus_llm-3.3.2/janus/language/naive/simple_ast.py +0 -29
  14. {janus_llm-3.3.2 → janus_llm-3.4.1}/LICENSE +0 -0
  15. {janus_llm-3.3.2 → janus_llm-3.4.1}/README.md +0 -0
  16. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/__main__.py +0 -0
  17. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/_tests/__init__.py +0 -0
  18. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/_tests/conftest.py +0 -0
  19. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/_tests/test_cli.py +0 -0
  20. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/converter/__init__.py +0 -0
  21. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/converter/_tests/__init__.py +0 -0
  22. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/converter/_tests/test_translate.py +0 -0
  23. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/converter/diagram.py +0 -0
  24. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/converter/document.py +0 -0
  25. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/converter/evaluate.py +0 -0
  26. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/converter/translate.py +0 -0
  27. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/embedding/__init__.py +0 -0
  28. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/embedding/_tests/__init__.py +0 -0
  29. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/embedding/_tests/test_collections.py +0 -0
  30. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/embedding/_tests/test_database.py +0 -0
  31. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/embedding/_tests/test_vectorize.py +0 -0
  32. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/embedding/collections.py +0 -0
  33. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/embedding/database.py +0 -0
  34. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/embedding/embedding_models_info.py +0 -0
  35. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/embedding/vectorize.py +0 -0
  36. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/__init__.py +0 -0
  37. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/_tests/__init__.py +0 -0
  38. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/_tests/test_combine.py +0 -0
  39. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/_tests/test_splitter.py +0 -0
  40. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/alc/__init__.py +0 -0
  41. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/alc/_tests/__init__.py +0 -0
  42. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/alc/_tests/test_alc.py +0 -0
  43. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/binary/__init__.py +0 -0
  44. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/binary/_tests/__init__.py +0 -0
  45. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/binary/_tests/test_binary.py +0 -0
  46. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/binary/binary.py +0 -0
  47. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/binary/reveng/decompile_script.py +0 -0
  48. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/combine.py +0 -0
  49. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/file.py +0 -0
  50. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/mumps/__init__.py +0 -0
  51. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/mumps/_tests/__init__.py +0 -0
  52. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/mumps/_tests/test_mumps.py +0 -0
  53. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/mumps/mumps.py +0 -0
  54. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/mumps/patterns.py +0 -0
  55. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/naive/__init__.py +0 -0
  56. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/naive/basic_splitter.py +0 -0
  57. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/naive/chunk_splitter.py +0 -0
  58. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/naive/registry.py +0 -0
  59. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/naive/tag_splitter.py +0 -0
  60. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/node.py +0 -0
  61. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/splitter.py +0 -0
  62. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/treesitter/__init__.py +0 -0
  63. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/treesitter/_tests/__init__.py +0 -0
  64. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/treesitter/_tests/test_treesitter.py +0 -0
  65. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/treesitter/treesitter.py +0 -0
  66. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/llm/__init__.py +0 -0
  67. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/llm/model_callbacks.py +0 -0
  68. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/llm/models_info.py +0 -0
  69. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/__init__.py +0 -0
  70. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/_tests/__init__.py +0 -0
  71. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/_tests/reference.py +0 -0
  72. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/_tests/target.py +0 -0
  73. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/_tests/test_bleu.py +0 -0
  74. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/_tests/test_chrf.py +0 -0
  75. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/_tests/test_file_pairing.py +0 -0
  76. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/_tests/test_llm.py +0 -0
  77. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/_tests/test_reading.py +0 -0
  78. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/_tests/test_rouge_score.py +0 -0
  79. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/_tests/test_similarity_score.py +0 -0
  80. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/_tests/test_treesitter_metrics.py +0 -0
  81. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/bleu.py +0 -0
  82. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/chrf.py +0 -0
  83. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/cli.py +0 -0
  84. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/complexity_metrics.py +0 -0
  85. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/file_pairing.py +0 -0
  86. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/llm_metrics.py +0 -0
  87. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/metric.py +0 -0
  88. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/reading.py +0 -0
  89. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/rouge_score.py +0 -0
  90. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/similarity.py +0 -0
  91. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/metrics/splitting.py +0 -0
  92. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/parsers/__init__.py +0 -0
  93. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/parsers/_tests/__init__.py +0 -0
  94. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/parsers/_tests/test_code_parser.py +0 -0
  95. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/parsers/code_parser.py +0 -0
  96. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/parsers/doc_parser.py +0 -0
  97. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/parsers/eval_parser.py +0 -0
  98. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/parsers/reqs_parser.py +0 -0
  99. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/parsers/uml.py +0 -0
  100. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/prompts/__init__.py +0 -0
  101. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/prompts/prompt.py +0 -0
  102. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/utils/__init__.py +0 -0
  103. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/utils/_tests/__init__.py +0 -0
  104. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/utils/_tests/test_logger.py +0 -0
  105. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/utils/_tests/test_progress.py +0 -0
  106. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/utils/enums.py +0 -0
  107. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/utils/logger.py +0 -0
  108. {janus_llm-3.3.2 → janus_llm-3.4.1}/janus/utils/progress.py +0 -0

{janus_llm-3.3.2 → janus_llm-3.4.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: janus-llm
-Version: 3.3.2
+Version: 3.4.1
 Summary: A transcoding library using LLMs.
 Home-page: https://github.com/janus-llm/janus-llm
 License: Apache 2.0

{janus_llm-3.3.2 → janus_llm-3.4.1}/janus/__init__.py
@@ -5,7 +5,7 @@ from langchain_core._api.deprecation import LangChainDeprecationWarning
 from janus.converter.translate import Translator
 from janus.metrics import *  # noqa: F403
 
-__version__ = "3.3.2"
+__version__ = "3.4.1"
 
 # Ignoring a deprecation warning from langchain_core that I can't seem to hunt down
 warnings.filterwarnings("ignore", category=LangChainDeprecationWarning)

{janus_llm-3.3.2 → janus_llm-3.4.1}/janus/cli.py
@@ -200,6 +200,14 @@ def translate(
             help="Whether to overwrite existing files in the output directory",
         ),
     ] = False,
+    skip_context: Annotated[
+        bool,
+        typer.Option(
+            "--skip-context",
+            help="Prompts will include any context information associated with source"
+            " code blocks, unless this option is specified",
+        ),
+    ] = False,
     temp: Annotated[
         float,
         typer.Option("--temperature", "-T", help="Sampling temperature.", min=0, max=2),
@@ -240,6 +248,13 @@ def translate(
             "If unspecificed, model's default max will be used.",
         ),
     ] = None,
+    skip_refiner: Annotated[
+        bool,
+        typer.Option(
+            "--skip-refiner",
+            help="Whether to skip the refiner for generating output",
+        ),
+    ] = True,
 ):
     try:
         target_language, target_version = target_lang.split("-")
@@ -265,6 +280,8 @@ def translate(
         db_path=db_loc,
         db_config=collections_config,
         splitter_type=splitter_type,
+        skip_context=skip_context,
+        skip_refiner=skip_refiner,
     )
     translator.translate(input_dir, output_dir, overwrite, collection)
 
@@ -322,6 +339,14 @@ def document(
             help="Whether to overwrite existing files in the output directory",
         ),
     ] = False,
+    skip_context: Annotated[
+        bool,
+        typer.Option(
+            "--skip-context",
+            help="Prompts will include any context information associated with source"
+            " code blocks, unless this option is specified",
+        ),
+    ] = False,
     doc_mode: Annotated[
         str,
         typer.Option(
@@ -378,6 +403,13 @@ def document(
             "If unspecificed, model's default max will be used.",
         ),
     ] = None,
+    skip_refiner: Annotated[
+        bool,
+        typer.Option(
+            "--skip-refiner",
+            help="Whether to skip the refiner for generating output",
+        ),
+    ] = True,
 ):
     model_arguments = dict(temperature=temperature)
     collections_config = get_collections_config()
@@ -390,6 +422,8 @@ def document(
         db_path=db_loc,
         db_config=collections_config,
         splitter_type=splitter_type,
+        skip_refiner=skip_refiner,
+        skip_context=skip_context,
     )
     if doc_mode == "madlibs":
         documenter = MadLibsDocumenter(
@@ -458,6 +492,14 @@ def diagram(
             help="Whether to overwrite existing files in the output directory",
         ),
     ] = False,
+    skip_context: Annotated[
+        bool,
+        typer.Option(
+            "--skip-context",
+            help="Prompts will include any context information associated with source"
+            " code blocks, unless this option is specified",
+        ),
+    ] = False,
     temperature: Annotated[
         float,
         typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
@@ -494,6 +536,13 @@ def diagram(
             click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
         ),
     ] = "file",
+    skip_refiner: Annotated[
+        bool,
+        typer.Option(
+            "--skip-refiner",
+            help="Whether to skip the refiner for generating output",
+        ),
+    ] = True,
 ):
     model_arguments = dict(temperature=temperature)
     collections_config = get_collections_config()
@@ -507,6 +556,8 @@ def diagram(
         diagram_type=diagram_type,
         add_documentation=add_documentation,
         splitter_type=splitter_type,
+        skip_refiner=skip_refiner,
+        skip_context=skip_context,
     )
     diagram_generator.translate(input_dir, output_dir, overwrite, collection)
 
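
Both new options are wired identically through `translate`, `document`, and `diagram` and forwarded to the converter constructors. A hedged library-level sketch of the same call follows; only `splitter_type`, `skip_context`, `skip_refiner`, and the `translate(...)` call are taken from this diff, while the remaining keyword names and all values are illustrative assumptions:

    from pathlib import Path

    from janus.converter.translate import Translator

    # Illustrative values; "model" and "source_language" are assumed keyword names.
    translator = Translator(
        model="gpt-4o",
        source_language="ibmhlasm",
        splitter_type="ast-flex-listing",  # new splitter registered in 3.4.1
        skip_context=False,                # keep splitter context tags in the prompt
        skip_refiner=True,                 # 3.4.1 default: plain retry parser, no refiner
    )
    # Arguments mirror the CLI call: input_dir, output_dir, overwrite, collection
    translator.translate(Path("legacy_src"), Path("translated"), False, None)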

{janus_llm-3.3.2 → janus_llm-3.4.1}/janus/converter/converter.py
@@ -1,15 +1,14 @@
 import functools
 import json
-import math
 import time
 from pathlib import Path
-from typing import Any
+from typing import Any, List, Optional, Tuple
 
 from langchain.output_parsers import RetryWithErrorOutputParser
 from langchain_core.exceptions import OutputParserException
 from langchain_core.language_models import BaseLanguageModel
 from langchain_core.output_parsers import BaseOutputParser
-from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate
 from langchain_core.runnables import RunnableLambda, RunnableParallel
 from openai import BadRequestError, RateLimitError
 from pydantic import ValidationError
@@ -77,6 +76,8 @@ class Converter:
         prune_node_types: tuple[str, ...] = (),
         splitter_type: str = "file",
         refiner_type: str = "basic",
+        skip_refiner: bool = True,
+        skip_context: bool = False,
     ) -> None:
         """Initialize a Converter instance.
 
@@ -97,6 +98,8 @@ class Converter:
             splitter_type: The type of splitter to use. Valid values are `"file"`,
                 `"tag"`, `"chunk"`, `"ast-strict"`, and `"ast-flex"`.
             refiner_type: The type of refiner to use. Valid values are `"basic"`.
+            skip_refiner: Whether to skip the refiner.
+            skip_context: Whether to skip adding context to the prompt.
         """
         self._changed_attrs: set = set()
 
@@ -132,6 +135,8 @@ class Converter:
         self._refiner_type: str
         self._refiner: Refiner
 
+        self.skip_refiner = skip_refiner
+
         self.set_splitter(splitter_type=splitter_type)
         self.set_refiner(refiner_type=refiner_type)
         self.set_model(model_name=model, **model_arguments)
@@ -142,6 +147,8 @@ class Converter:
         self.set_db_path(db_path=db_path)
         self.set_db_config(db_config=db_config)
 
+        self.skip_context = skip_context
+
         # Child class must call this. Should we enforce somehow?
         # self._load_parameters()
 
@@ -290,7 +297,7 @@ class Converter:
         """
         if self._refiner_type == "basic":
             self._refiner = BasicRefiner(
-                "basic_refinement", self._model_name, self._source_language
+                "basic_refinement", self._model_id, self._source_language
             )
         else:
             raise ValueError(f"Error: unknown refiner type {self._refiner_type}")
@@ -595,37 +602,41 @@ class Converter:
         self._parser.set_reference(block.original)
 
         # Retries with just the output and the error
-        n1 = round(self.max_prompts ** (1 / 3))
+        n1 = round(self.max_prompts ** (1 / 2))
 
         # Retries with the input, output, and error
-        n2 = round((self.max_prompts // n1) ** (1 / 2))
+        n2 = round(self.max_prompts // n1)
 
         # Retries with just the input
-        n3 = math.ceil(self.max_prompts / (n1 * n2))
-
-        refine_output = RefinerParser(
-            parser=self._parser,
-            initial_prompt=self._prompt.format(**{"SOURCE_CODE": block.original.text}),
-            refiner=self._refiner,
-            max_retries=n1,
-            llm=self._llm,
-        )
-        retry = RetryWithErrorOutputParser.from_llm(
-            llm=self._llm,
-            parser=refine_output,
-            max_retries=n2,
-        )
+        if not self.skip_context:
+            self._make_prompt_additions(block)
+        if not self.skip_refiner:  # Make replacements in the prompt
+            refine_output = RefinerParser(
+                parser=self._parser,
+                initial_prompt=self._prompt.format(
+                    **{"SOURCE_CODE": block.original.text}
+                ),
+                refiner=self._refiner,
+                max_retries=n1,
+                llm=self._llm,
+            )
+        else:
+            refine_output = RetryWithErrorOutputParser.from_llm(
+                llm=self._llm,
+                parser=self._parser,
+                max_retries=n1,
+            )
         completion_chain = self._prompt | self._llm
         chain = RunnableParallel(
             completion=completion_chain, prompt_value=self._prompt
-        ) | RunnableLambda(lambda x: retry.parse_with_prompt(**x))
-        for _ in range(n3):
+        ) | RunnableLambda(lambda x: refine_output.parse_with_prompt(**x))
+        for _ in range(n2):
             try:
                 return chain.invoke({"SOURCE_CODE": block.original.text})
             except OutputParserException:
                 pass
 
-        raise OutputParserException(f"Failed to parse after {n1*n2*n3} retries")
+        raise OutputParserException(f"Failed to parse after {n1*n2} retries")
 
     def _get_output_obj(
         self, block: TranslatedCodeBlock
@@ -648,6 +659,35 @@ class Converter:
             output=output,
         )
 
+    @staticmethod
+    def _get_prompt_additions(block) -> Optional[List[Tuple[str, str]]]:
+        """Get a list of strings to append to the prompt.
+
+        Arguments:
+            block: The `TranslatedCodeBlock` to save to a file.
+        """
+        return [(key, item) for key, item in block.context_tags.items()]
+
+    def _make_prompt_additions(self, block: CodeBlock):
+        # Prepare the additional context to prepend
+        additional_context = "".join(
+            [
+                f"{context_tag}: {context}\n"
+                for context_tag, context in self._get_prompt_additions(block)
+            ]
+        )
+
+        # Iterate through existing messages to find and update the system message
+        for i, message in enumerate(self._prompt.messages):
+            if isinstance(message, SystemMessagePromptTemplate):
+                # Prepend the additional context to the system message
+                updated_system_message = SystemMessagePromptTemplate.from_template(
+                    additional_context + message.prompt.template
+                )
+                # Directly modify the message in the list
+                self._prompt.messages[i] = updated_system_message
+                break  # Assuming there's only one system message to update
+
     def _save_to_file(self, block: TranslatedCodeBlock, out_path: Path) -> None:
         """Save a file to disk.
 
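
The retry budget in the chain above is now split two ways instead of three: `n1` retries happen inside the refiner parser (or the plain `RetryWithErrorOutputParser` when the refiner is skipped), and the outer loop re-invokes the chain `n2` times, keeping the total near `max_prompts`. A small arithmetic sketch with an illustrative budget:

    # Illustrative budget only; the package derives n1/n2 from self.max_prompts.
    max_prompts = 10
    n1 = round(max_prompts ** (1 / 2))  # 3 retries inside the parser
    n2 = round(max_prompts // n1)       # 3 outer chain invocations
    print(n1 * n2)                      # 9 total attempts, roughly max_prompts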

{janus_llm-3.3.2 → janus_llm-3.4.1}/janus/converter/requirements.py
@@ -22,6 +22,11 @@ class RequirementsDocumenter(Documenter):
         self._combiner = ChunkCombiner()
         self._parser = RequirementsParser()
 
+    @staticmethod
+    def get_prompt_replacements(block) -> dict[str, str]:
+        prompt_replacements: dict[str, str] = {"SOURCE_CODE": block.original.text}
+        return prompt_replacements
+
     def _save_to_file(self, block: TranslatedCodeBlock, out_path: Path) -> None:
         """Save a file to disk.
 

janus_llm-3.4.1/janus/language/alc/alc.py (new file)
@@ -0,0 +1,185 @@
+import re
+from typing import Optional
+
+from langchain.schema.language_model import BaseLanguageModel
+
+from janus.language.block import CodeBlock
+from janus.language.combine import Combiner
+from janus.language.node import NodeType
+from janus.language.treesitter import TreeSitterSplitter
+from janus.utils.logger import create_logger
+
+log = create_logger(__name__)
+
+
+class AlcCombiner(Combiner):
+    """A class that combines code blocks into ALC files."""
+
+    def __init__(self) -> None:
+        """Initialize a AlcCombiner instance."""
+        super().__init__("ibmhlasm")
+
+
+class AlcSplitter(TreeSitterSplitter):
+    """A class for splitting ALC code into functional blocks to prompt
+    with for transcoding.
+    """
+
+    def __init__(
+        self,
+        model: None | BaseLanguageModel = None,
+        max_tokens: int = 4096,
+        protected_node_types: tuple[str, ...] = (),
+        prune_node_types: tuple[str, ...] = (),
+        prune_unprotected: bool = False,
+    ):
+        """Initialize a AlcSplitter instance.
+
+        Arguments:
+            max_tokens: The maximum number of tokens supported by the model
+        """
+        super().__init__(
+            language="ibmhlasm",
+            model=model,
+            max_tokens=max_tokens,
+            protected_node_types=protected_node_types,
+            prune_node_types=prune_node_types,
+            prune_unprotected=prune_unprotected,
+        )
+
+    def _get_ast(self, code: str) -> CodeBlock:
+        root = super()._get_ast(code)
+
+        # Current treesitter implementation does not nest csects and dsects
+        # The loop below nests nodes following csect/dsect instructions into
+        # the children of that instruction
+        sect_types = {"csect_instruction", "dsect_instruction"}
+        queue: list[CodeBlock] = [root]
+        while queue:
+            block = queue.pop(0)
+
+            # Search this children for csects and dsects. Create a list of groups
+            # where each group is a csect or dsect, starting with the csect/dsect
+            # instruction and containing all the subsequent nodes up until the
+            # next csect or dsect instruction
+            sects: list[list[CodeBlock]] = [[]]
+            for c in block.children:
+                if c.node_type == "csect_instruction":
+                    c.context_tags["alc_section"] = "CSECT"
+                    sects.append([c])
+                elif c.node_type == "dsect_instruction":
+                    c.context_tags["alc_section"] = "DSECT"
+                    sects.append([c])
+                else:
+                    sects[-1].append(c)
+
+            sects = [s for s in sects if s]
+
+            # Restructure the tree, making the head of each group the parent
+            # of all the remaining nodes in that group
+            if len(sects) > 1:
+                block.children = []
+                for sect in sects:
+                    if sect[0].node_type in sect_types:
+                        sect_node = self.merge_nodes(sect)
+                        sect_node.children = sect
+                        sect_node.node_type = NodeType(str(sect[0].node_type)[:5])
+                        block.children.append(sect_node)
+                    else:
+                        block.children.extend(sect)
+
+            # Push the children onto the queue
+            queue.extend(block.children)
+
+        return root
+
+
+class AlcListingSplitter(AlcSplitter):
+    """A class for splitting ALC listing code into functional blocks to
+    prompt with for transcoding.
+    """
+
+    def __init__(
+        self,
+        model: None | BaseLanguageModel = None,
+        max_tokens: int = 4096,
+        protected_node_types: tuple[str, ...] = (),
+        prune_node_types: tuple[str, ...] = (),
+        prune_unprotected: bool = False,
+    ):
+        """Initialize a AlcSplitter instance.
+
+
+        Arguments:
+            max_tokens: The maximum number of tokens supported by the model
+        """
+        # The string to mark the end of the listing header
+        self.header_indicator_str: str = (
+            "Loc Object Code Addr1 Addr2 Stmt Source Statement"
+        )
+        # How many characters to trim from the right side to remove the address column
+        self.address_column_chars: int = 10
+        # The string to mark the end of the left margin
+        self.left_margin_indicator_str: str = "Stmt"
+        super().__init__(
+            model=model,
+            max_tokens=max_tokens,
+            protected_node_types=protected_node_types,
+            prune_node_types=prune_node_types,
+            prune_unprotected=prune_unprotected,
+        )
+
+    def _get_ast(self, code: str) -> CodeBlock:
+        active_usings = self.get_active_usings(code)
+        code = self.preproccess_assembly(code)
+        ast: CodeBlock = super()._get_ast(code)
+        ast.context_tags["active_usings"] = active_usings
+        return ast
+
+    def preproccess_assembly(self, code: str) -> str:
+        """Remove non-essential lines from an assembly snippet"""
+
+        lines = code.splitlines()
+        lines = self.strip_header_and_left(lines)
+        lines = self.strip_addresses(lines)
+        return "".join(str(line) for line in lines)
+
+    def get_active_usings(self, code: str) -> Optional[str]:
+        """Look for 'active usings' in the ALC listing header"""
+        lines = code.splitlines()
+        for line in lines:
+            if "Active Usings:" in line:
+                return line.split("Active Usings:")[1]
+        return None
+
+    def strip_header_and_left(
+        self,
+        lines: list[str],
+    ) -> list[str]:
+        """Remove the header and the left panel from the assembly sample"""
+
+        esd_regex = re.compile(f".*{self.header_indicator_str}.*")
+
+        header_end_index: int = [
+            i for i, item in enumerate(lines) if re.search(esd_regex, item)
+        ][0]
+
+        left_content_end_column = lines[header_end_index].find(
+            self.left_margin_indicator_str
+        )
+        hori_output_lines = lines[(header_end_index + 1) :]
+
+        left_output_lines = [
+            line[left_content_end_column + 5 :] for line in hori_output_lines
+        ]
+        return left_output_lines
+
+    def strip_addresses(self, lines: list[str]) -> list[str]:
+        """Strip the addresses which run down the right side of the assembly snippet"""
+
+        stripped_lines = [line[: -self.address_column_chars] for line in lines]
+        return stripped_lines
+
+    def strip_footer(self, lines: list[str]):
+        """Strip the footer from the assembly snippet"""
+        return NotImplementedError
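
For readers unfamiliar with ALC listings, the preprocessing in `AlcListingSplitter` trims the assembler's bookkeeping columns before tree-sitter sees the code. A standalone sketch of that logic with invented listing lines (the real class does this via `strip_header_and_left` and `strip_addresses`):

    # Invented sample lines; real listings are wider and paginated.
    header_indicator = "Loc Object Code Addr1 Addr2 Stmt Source Statement"
    lines = [
        " Active Usings: None",
        "  " + header_indicator,
        "000000                1 MYPROG   CSECT            00001000",
        "000000 18CF           2          LR    12,15      00002000",
    ]

    # strip_header_and_left: drop everything through the header row, then cut
    # the left panel (everything up to 5 characters past the "Stmt" column).
    header_end = next(i for i, line in enumerate(lines) if header_indicator in line)
    left_end = lines[header_end].find("Stmt")
    body = [line[left_end + 5:] for line in lines[header_end + 1:]]

    # strip_addresses: trim the 10-character address column on the right.
    body = [line[:-10] for line in body]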

{janus_llm-3.3.2 → janus_llm-3.4.1}/janus/language/block.py
@@ -45,6 +45,7 @@ class CodeBlock:
         children: list[ForwardRef("CodeBlock")],
         embedding_id: Optional[str] = None,
         affixes: Tuple[str, str] = ("", ""),
+        context_tags: dict[str, str] = {},
     ) -> None:
         self.id: Hashable = id
         self.name: Optional[str] = name
@@ -59,6 +60,7 @@ class CodeBlock:
         self.children: list[ForwardRef("CodeBlock")] = sorted(children)
         self.embedding_id: Optional[str] = embedding_id
         self.affixes: Tuple[str, str] = affixes
+        self.context_tags: dict[str, str] = context_tags
 
         self.complete = True
         self.omit_prefix = True
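
`context_tags` is the hand-off between the splitters and the converter: the ALC splitters stamp blocks with `alc_section` and `active_usings`, and `Converter._make_prompt_additions` prepends each tag to the system message. A short sketch with invented tag values:

    # Invented values; in practice the splitters populate CodeBlock.context_tags.
    context_tags = {"alc_section": "CSECT", "active_usings": " R12->MYPROG"}

    # Mirrors Converter._make_prompt_additions: one "tag: value" line per entry,
    # prepended to the existing system prompt template.
    additional_context = "".join(f"{tag}: {value}\n" for tag, value in context_tags.items())
    print(additional_context)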

janus_llm-3.4.1/janus/language/naive/simple_ast.py (new file)
@@ -0,0 +1,93 @@
+from janus.language.alc.alc import AlcListingSplitter, AlcSplitter
+from janus.language.mumps.mumps import MumpsSplitter
+from janus.language.naive.registry import register_splitter
+from janus.language.splitter import Splitter
+from janus.language.treesitter import TreeSitterSplitter
+from janus.utils.enums import LANGUAGES
+from janus.utils.logger import create_logger
+
+log = create_logger(__name__)
+
+
+@register_splitter("ast-flex")
+def get_flexible_ast(language: str, **kwargs) -> Splitter:
+    """Get a flexible AST splitter for the given language.
+
+    Arguments:
+        language: The language to get the splitter for.
+
+    Returns:
+        A flexible AST splitter for the given language.
+    """
+    if language == "ibmhlasm":
+        return AlcSplitter(**kwargs)
+    elif language == "mumps":
+        return MumpsSplitter(**kwargs)
+    else:
+        return TreeSitterSplitter(language=language, **kwargs)
+
+
+@register_splitter("ast-strict")
+def get_strict_ast(language: str, **kwargs) -> Splitter:
+    """Get a strict AST splitter for the given language.
+
+    The strict splitter will only return nodes that are of a functional type.
+
+    Arguments:
+        language: The language to get the splitter for.
+
+    Returns:
+        A strict AST splitter for the given language.
+    """
+    kwargs.update(
+        protected_node_types=LANGUAGES[language]["functional_node_types"],
+        prune_unprotected=True,
+    )
+    if language == "ibmhlasm":
+        return AlcSplitter(**kwargs)
+    elif language == "mumps":
+        return MumpsSplitter(**kwargs)
+    else:
+        return TreeSitterSplitter(language=language, **kwargs)
+
+
+@register_splitter("ast-strict-listing")
+def get_strict_listing_ast(language: str, **kwargs) -> Splitter:
+    """Get a strict AST splitter for the given language. This splitter is intended for
+    use with IBM HLASM.
+
+    The strict splitter will only return nodes that are of a functional type.
+
+    Arguments:
+        language: The language to get the splitter for.
+
+    Returns:
+        A strict AST splitter for the given language.
+    """
+    kwargs.update(
+        protected_node_types=LANGUAGES[language]["functional_node_types"],
+        prune_unprotected=True,
+    )
+    if language == "ibmhlasm":
+        return AlcListingSplitter(**kwargs)
+    else:
+        log.warning("Listing splitter is only intended for use with IBMHLASM!")
+        return TreeSitterSplitter(language=language, **kwargs)
+
+
+@register_splitter("ast-flex-listing")
+def get_flexible_listing_ast(language: str, **kwargs) -> Splitter:
+    """Get a flexible AST splitter for the given language. This splitter is intended for
+    use with IBM HLASM.
+
+    Arguments:
+        language: The language to get the splitter for.
+
+    Returns:
+        A flexible AST splitter for the given language.
+    """
+    if language == "ibmhlasm":
+        return AlcListingSplitter(**kwargs)
+    else:
+        log.warning("Listing splitter is only intended for use with IBMHLASM!")
+        return TreeSitterSplitter(language=language, **kwargs)
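
Both listing variants are exposed through the same registry mechanism as the existing splitters, so they become selectable `splitter_type` values (the CLI builds its choices from the `CUSTOM_SPLITTERS` mapping). A hedged lookup sketch, assuming `CUSTOM_SPLITTERS` in `janus.language.naive.registry` is the dict that `register_splitter` populates:

    from janus.language.naive.registry import CUSTOM_SPLITTERS  # assumed registry location

    # Assumed usage: the key is the name passed to register_splitter, the value
    # is the factory function defined above.
    factory = CUSTOM_SPLITTERS["ast-flex-listing"]
    splitter = factory("ibmhlasm", max_tokens=4096)  # builds an AlcListingSplitter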

{janus_llm-3.3.2 → janus_llm-3.4.1}/janus/parsers/refiner_parser.py
@@ -40,7 +40,9 @@ class RefinerParser(BaseOutputParser):
                 return self.parser.parse(text)
             except OutputParserException as oe:
                 err = str(oe)
-                new_prompt, prompt_arguments = self.refiner.refine(last_prompt, text, err)
+                new_prompt, prompt_arguments = self.refiner.refine(
+                    self.initial_prompt, last_prompt, text, err
+                )
                 new_chain = new_prompt | self.llm
                 text = new_chain.invoke(prompt_arguments)
                 last_prompt = new_prompt.format(**prompt_arguments)

{janus_llm-3.3.2 → janus_llm-3.4.1}/janus/refiners/refiner.py
@@ -5,7 +5,12 @@ from janus.llm.models_info import MODEL_PROMPT_ENGINES
 
 class Refiner:
     def refine(
-        self, original_prompt: str, original_output: str, errors: str, **kwargs
+        self,
+        original_prompt: str,
+        previous_prompt: str,
+        previous_output: str,
+        errors: str,
+        **kwargs,
     ) -> tuple[ChatPromptTemplate, dict[str, str]]:
         """Creates a new prompt based on feedback from original results
 
@@ -24,22 +29,27 @@ class BasicRefiner(Refiner):
     def __init__(
         self,
         prompt_name: str,
-        model_name: str,
+        model_id: str,
         source_language: str,
     ) -> None:
         """Basic refiner, asks llm to fix output of previous prompt given errors
 
         Arguments:
             prompt_name: refinement prompt name to use
-            model_name: name of llm to use
+            model_id: ID of the llm to use. Found in models_info.py
            source_language: source_langauge to use
         """
         self._prompt_name = prompt_name
-        self._model_name = model_name
+        self._model_id = model_id
         self._source_language = source_language
 
     def refine(
-        self, original_prompt: str, original_output: str, errors: str, **kwargs
+        self,
+        original_prompt: str,
+        previous_prompt: str,
+        previous_output: str,
+        errors: str,
+        **kwargs,
     ) -> tuple[ChatPromptTemplate, dict[str, str]]:
         """Creates a new prompt based on feedback from original results
 
@@ -51,13 +61,13 @@ class BasicRefiner(Refiner):
         Returns:
             Tuple of new prompt and prompt arguments
         """
-        prompt_engine = MODEL_PROMPT_ENGINES[self._model_name](
+        prompt_engine = MODEL_PROMPT_ENGINES[self._model_id](
            prompt_template=self._prompt_name,
            source_language=self._source_language,
        )
         prompt_arguments = {
             "ORIGINAL_PROMPT": original_prompt,
-            "OUTPUT": original_output,
+            "OUTPUT": previous_output,
             "ERRORS": errors,
         }
         return prompt_engine.prompt, prompt_arguments
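
Any refiner implementation now has to accept the wider signature (original prompt, previous prompt, previous output, errors). A minimal conforming subclass as a sketch; only the signature and the `ORIGINAL_PROMPT`/`OUTPUT`/`ERRORS` argument keys are taken from this diff, and the prompt text itself is invented:

    from langchain_core.prompts import ChatPromptTemplate

    from janus.refiners.refiner import Refiner


    class EchoRefiner(Refiner):
        """Toy refiner (not part of janus) matching the 3.4.1 refine() signature."""

        def refine(
            self,
            original_prompt: str,
            previous_prompt: str,
            previous_output: str,
            errors: str,
            **kwargs,
        ) -> tuple[ChatPromptTemplate, dict[str, str]]:
            # previous_prompt is available but unused here, mirroring BasicRefiner.
            # Feed the failing output and the parser errors back to the model.
            prompt = ChatPromptTemplate.from_template(
                "The previous attempt could not be parsed.\n"
                "Original request:\n{ORIGINAL_PROMPT}\n"
                "Previous output:\n{OUTPUT}\n"
                "Errors:\n{ERRORS}\n"
                "Return a corrected answer."
            )
            return prompt, {
                "ORIGINAL_PROMPT": original_prompt,
                "OUTPUT": previous_output,
                "ERRORS": errors,
            }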

{janus_llm-3.3.2 → janus_llm-3.4.1}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "janus-llm"
-version = "3.3.2"
+version = "3.4.1"
 description = "A transcoding library using LLMs."
 authors = ["Michael Doyle <mdoyle@mitre.org>", "Chris Glasz <cglasz@mitre.org>",
     "Chris Tohline <ctohline@mitre.org>", "William Macke <wmacke@mitre.org>",

janus_llm-3.3.2/janus/language/alc/alc.py (removed)
@@ -1,87 +0,0 @@
-from langchain.schema.language_model import BaseLanguageModel
-
-from janus.language.block import CodeBlock
-from janus.language.combine import Combiner
-from janus.language.node import NodeType
-from janus.language.treesitter import TreeSitterSplitter
-from janus.utils.logger import create_logger
-
-log = create_logger(__name__)
-
-
-class AlcCombiner(Combiner):
-    """A class that combines code blocks into ALC files."""
-
-    def __init__(self) -> None:
-        """Initialize a AlcCombiner instance."""
-        super().__init__("ibmhlasm")
-
-
-class AlcSplitter(TreeSitterSplitter):
-    """A class for splitting ALC code into functional blocks to prompt
-    with for transcoding.
-    """
-
-    def __init__(
-        self,
-        model: None | BaseLanguageModel = None,
-        max_tokens: int = 4096,
-        protected_node_types: tuple[str, ...] = (),
-        prune_node_types: tuple[str, ...] = (),
-        prune_unprotected: bool = False,
-    ):
-        """Initialize a AlcSplitter instance.
-
-        Arguments:
-            max_tokens: The maximum number of tokens supported by the model
-        """
-        super().__init__(
-            language="ibmhlasm",
-            model=model,
-            max_tokens=max_tokens,
-            protected_node_types=protected_node_types,
-            prune_node_types=prune_node_types,
-            prune_unprotected=prune_unprotected,
-        )
-
-    def _get_ast(self, code: str) -> CodeBlock:
-        root = super()._get_ast(code)
-
-        # Current treesitter implementation does not nest csects and dsects
-        # The loop below nests nodes following csect/dsect instructions into
-        # the children of that instruction
-        sect_types = {"csect_instruction", "dsect_instruction"}
-        queue: list[CodeBlock] = [root]
-        while queue:
-            block = queue.pop(0)
-
-            # Search this children for csects and dsects. Create a list of groups
-            # where each group is a csect or dsect, starting with the csect/dsect
-            # instruction and containing all the subsequent nodes up until the
-            # next csect or dsect instruction
-            sects: list[list[CodeBlock]] = [[]]
-            for c in block.children:
-                if c.node_type in sect_types:
-                    sects.append([c])
-                else:
-                    sects[-1].append(c)
-
-            sects = [s for s in sects if s]
-
-            # Restructure the tree, making the head of each group the parent
-            # of all the remaining nodes in that group
-            if len(sects) > 1:
-                block.children = []
-                for sect in sects:
-                    if sect[0].node_type in sect_types:
-                        sect_node = self.merge_nodes(sect)
-                        sect_node.children = sect
-                        sect_node.node_type = NodeType(str(sect[0].node_type)[:5])
-                        block.children.append(sect_node)
-                    else:
-                        block.children.extend(sect)
-
-            # Push the children onto the queue
-            queue.extend(block.children)
-
-        return root

janus_llm-3.3.2/janus/language/naive/simple_ast.py (removed)
@@ -1,29 +0,0 @@
-from janus.language.alc.alc import AlcSplitter
-from janus.language.mumps.mumps import MumpsSplitter
-from janus.language.naive.registry import register_splitter
-from janus.language.treesitter import TreeSitterSplitter
-from janus.utils.enums import LANGUAGES
-
-
-@register_splitter("ast-flex")
-def get_flexible_ast(language: str, **kwargs):
-    if language == "ibmhlasm":
-        return AlcSplitter(**kwargs)
-    elif language == "mumps":
-        return MumpsSplitter(**kwargs)
-    else:
-        return TreeSitterSplitter(language=language, **kwargs)
-
-
-@register_splitter("ast-strict")
-def get_strict_ast(language: str, **kwargs):
-    kwargs.update(
-        protected_node_types=LANGUAGES[language]["functional_node_types"],
-        prune_unprotected=True,
-    )
-    if language == "ibmhlasm":
-        return AlcSplitter(**kwargs)
-    elif language == "mumps":
-        return MumpsSplitter(**kwargs)
-    else:
-        return TreeSitterSplitter(language=language, **kwargs)