PyPI - janus-llm - Versions diffs - 1.0.0__py3-none-any.whl → 2.0.0__py3-none-any.whl - Mend

janus-llm 1.0.0py3-none-any.whl → 2.0.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

janus/__init__.py +9 -1
janus/__main__.py +4 -0
janus/_tests/test_cli.py +128 -0
janus/_tests/test_translate.py +49 -7
janus/cli.py +530 -46
janus/converter.py +50 -19
janus/embedding/_tests/test_collections.py +2 -8
janus/embedding/_tests/test_database.py +32 -0
janus/embedding/_tests/test_vectorize.py +9 -4
janus/embedding/collections.py +49 -6
janus/embedding/embedding_models_info.py +120 -0
janus/embedding/vectorize.py +53 -62
janus/language/_tests/__init__.py +0 -0
janus/language/_tests/test_combine.py +62 -0
janus/language/_tests/test_splitter.py +16 -0
janus/language/binary/_tests/test_binary.py +16 -1
janus/language/binary/binary.py +10 -3
janus/language/block.py +31 -30
janus/language/combine.py +26 -34
janus/language/mumps/_tests/test_mumps.py +2 -2
janus/language/mumps/mumps.py +93 -9
janus/language/naive/__init__.py +4 -0
janus/language/naive/basic_splitter.py +14 -0
janus/language/naive/chunk_splitter.py +26 -0
janus/language/naive/registry.py +13 -0
janus/language/naive/simple_ast.py +18 -0
janus/language/naive/tag_splitter.py +61 -0
janus/language/splitter.py +168 -74
janus/language/treesitter/_tests/test_treesitter.py +9 -6
janus/language/treesitter/treesitter.py +37 -13
janus/llm/model_callbacks.py +177 -0
janus/llm/models_info.py +134 -70
janus/metrics/__init__.py +8 -0
janus/metrics/_tests/__init__.py +0 -0
janus/metrics/_tests/reference.py +2 -0
janus/metrics/_tests/target.py +2 -0
janus/metrics/_tests/test_bleu.py +56 -0
janus/metrics/_tests/test_chrf.py +67 -0
janus/metrics/_tests/test_file_pairing.py +59 -0
janus/metrics/_tests/test_llm.py +91 -0
janus/metrics/_tests/test_reading.py +28 -0
janus/metrics/_tests/test_rouge_score.py +65 -0
janus/metrics/_tests/test_similarity_score.py +23 -0
janus/metrics/_tests/test_treesitter_metrics.py +110 -0
janus/metrics/bleu.py +66 -0
janus/metrics/chrf.py +55 -0
janus/metrics/cli.py +7 -0
janus/metrics/complexity_metrics.py +208 -0
janus/metrics/file_pairing.py +113 -0
janus/metrics/llm_metrics.py +202 -0
janus/metrics/metric.py +466 -0
janus/metrics/reading.py +70 -0
janus/metrics/rouge_score.py +96 -0
janus/metrics/similarity.py +53 -0
janus/metrics/splitting.py +38 -0
janus/parsers/_tests/__init__.py +0 -0
janus/parsers/_tests/test_code_parser.py +32 -0
janus/parsers/code_parser.py +24 -253
janus/parsers/doc_parser.py +169 -0
janus/parsers/eval_parser.py +80 -0
janus/parsers/reqs_parser.py +72 -0
janus/prompts/prompt.py +103 -30
janus/translate.py +636 -111
janus/utils/_tests/__init__.py +0 -0
janus/utils/_tests/test_logger.py +67 -0
janus/utils/_tests/test_progress.py +20 -0
janus/utils/enums.py +56 -3
janus/utils/progress.py +56 -0
{janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/METADATA +23 -10
janus_llm-2.0.0.dist-info/RECORD +94 -0
{janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/WHEEL +1 -1
janus_llm-1.0.0.dist-info/RECORD +0 -48
{janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/LICENSE +0 -0
{janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/entry_points.txt +0 -0

janus/metrics/splitting.py ADDED Viewed

@@ -0,0 +1,38 @@
+from typing import Callable
+# Registry of splitting methods, keyed by the name they were registered under.
+SPLITTING_METHODS: dict[str, Callable[..., list[str]]] = {}
+def register_splitting_method(name: None | str = None) -> Callable[[Callable], Callable]:
+    """Registers a splitting method for splitting strings in files
+    Arguments:
+        name: The name of the splitting method. If None, the function name is used.
+    Returns:
+        The decorator function, which stores the decorated callable in
+        `SPLITTING_METHODS` and returns it unchanged.
+    """
+    def decorator(f: Callable[..., list[str]]) -> Callable[..., list[str]]:
+        # Fall back to the function's own name when no explicit name is given
+        splitting_name = f.__name__ if name is None else name
+        SPLITTING_METHODS[splitting_name] = f
+        return f
+    return decorator
+@register_splitting_method(name="file")
+def split_by_file(src: str, **kwargs) -> list[str]:
+    """Treat the whole source text as a single split
+    Arguments:
+        src: The source text.
+        kwargs: Accepted but not used by this method.
+    Returns:
+        A one-element list containing `src` unchanged.
+    """
+    whole_file = [src]
+    return whole_file

janus/parsers/_tests/__init__.py ADDED Viewed

File without changes

janus/parsers/_tests/test_code_parser.py ADDED Viewed

@@ -0,0 +1,32 @@
+import unittest
+from ..code_parser import CodeParser, JanusParser
+class TestJanusParser(unittest.TestCase):
+    # Checks the pass-through behavior of the base `JanusParser`
+    def setUp(self):
+        # A fresh parser for every test case
+        self.parser = JanusParser()
+    def test_parse_combined_output(self):
+        # Plain string input is expected to come back unchanged
+        sample = "test text"
+        parsed = self.parser.parse_combined_output(sample)
+        self.assertEqual(parsed, sample)
+class TestCodeParser(unittest.TestCase):
+    # Exercises `CodeParser` configured for Python
+    def setUp(self):
+        self.parser = CodeParser(language="python")
+    def test_parse(self):
+        self.parser.language = "python"
+        fenced = "```\n# test text\n```"
+        # Expected result: the input with the backtick fence and the
+        # surrounding newlines stripped off
+        expected = fenced.strip("```").strip("\n")
+        self.assertEqual(self.parser.parse(fenced), expected)
+    def test_get_format_instructions(self):
+        # The expected text mirrors the parser's own wording verbatim
+        instructions = self.parser.get_format_instructions()
+        self.assertEqual(
+            instructions,
+            "Output must contain text contained within triple square brackets (```)",
+        )
+# Allow running this test module directly as a script
+if __name__ == "__main__":
+    unittest.main()

janus/parsers/code_parser.py CHANGED Viewed

@@ -1,32 +1,17 @@
-import json
 import re
-from collections import defaultdict
-from typing import Any, Set
 from langchain.schema.output_parser import BaseOutputParser
+from langchain_core.exceptions import OutputParserException
+from langchain_core.messages import BaseMessage
+from langchain_core.output_parsers import StrOutputParser
 from ..language.block import CodeBlock
-from ..language.combine import Combiner
 from ..utils.logger import create_logger
 log = create_logger(__name__)
-PARSER_TYPES: Set[str] = {"code", "text", "eval"}
-class JanusParser(BaseOutputParser):
-    def parse(self, text: str) -> str:
-        """Parse the output text from the LLM.
-        Arguments:
-            text: The output text from the LLM
-        Returns:
-            A parsed version of the text
-        """
-        return text
+class JanusParser:
     def parse_combined_output(self, text: str) -> str:
         """Parse the output text from the LLM when multiple inputs are combined
@@ -36,253 +21,39 @@ class JanusParser(BaseOutputParser):
         Returns:
             A parsed version of the text
         """
+        if isinstance(text, BaseMessage):
+            text = text.content
         return text
-    def score(self, input_block: CodeBlock, output_text: str) -> float:
-        """Validate and score the output text based upon the input CodeBlock.
-        Output is a score between 0 and 1.
+    def parse_into_block(self, text: str, block: CodeBlock):
+        if isinstance(text, BaseMessage):
+            text = text.content
+        block.text = text
-        Arguments:
-            input_block: A `CodeBlock` representing the input to the LLM
-            output_text: The parsed text returned by the LLM
+    def set_reference(self, block: CodeBlock):
+        pass
-        Returns:
-            A score between 0 and 1 (inclusive). A score of 1.0 indicates that
-            the given text is fully acceptable, and no further attempts
-            should be made.
-        """
-        return 1.0
-    def get_format_instructions(self) -> str:
-        return "No format requirements"
-    @property
-    def _type(self) -> str:
-        return type(self).__name__
+class GenericParser(StrOutputParser, JanusParser):
+    # String parser that also accepts message objects
+    def parse(self, text: str) -> str:
+        # Unwrap a message to its content; plain text passes through as-is
+        return text.content if isinstance(text, BaseMessage) else text
-class CodeParser(JanusParser):
+class CodeParser(BaseOutputParser[str], JanusParser):
     language: str
     def parse(self, text: str) -> str:
-        """Parse the output text from the LLM.
-        Arguments:
-            text: The output text from the LLM
-        Returns:
-            A parsed version of the text
-        """
+        if isinstance(text, BaseMessage):
+            text = text.content
         pattern = rf"```[^\S\r\n]*(?:{self.language}[^\S\r\n]*)?\n?(.*?)\n*```"
         code = re.search(pattern, text, re.DOTALL)
         if code is None:
-            raise ValueError("Code not find code between triple backticks")
-        return code.group(1)
-    def score(self, input_block: CodeBlock, output_text: str) -> float:
-        """The score for translated code is the percentage of this block's
-        children which are present in the output
-        Arguments:
-            input_block: A `CodeBlock` representing the input to the LLM
-            output_text: The parsed text returned by the LLM
-        Returns:
-            A score between 0 and 1 (inclusive). A score of 1.0 indicates that
-            the given text is fully acceptable, and no further attempts
-            should be made.
-        """
-        if not input_block.children:
-            return 1.0
-        missing_children = []
-        for child in input_block.children:
-            if not Combiner.contains_child(output_text, child):
-                missing_children.append(child.id)
-        if missing_children:
-            log.warning(
-                f"[{input_block.name}] Child placeholders not present in text: "
-                f"{missing_children}"
+            raise OutputParserException(
+                "Code not find code between triple square brackets"
             )
-            log.debug(f"Code:\n{output_text}")
-        return 1.0 - len(missing_children) / len(input_block.children)
-    def get_format_instructions(self) -> str:
-        return "Output must contain text contained within triple backticks."
-class JsonLinesParser(JanusParser):
-    def parse(self, text: str) -> str:
-        """Parse the output text from the LLM.
-        Arguments:
-            text: The output text from the LLM.
-        Returns:
-            A parsed version of the text.
-        """
-        string = r"\"\w+\""
-        number = r"-?\d+(?:\.\d*)?"
-        json_value = rf"(?:{string}|{number})"
-        json_line = rf"\s*{string} *: *{json_value},?\s*"
-        pattern = "({" + rf"(?:{json_line})+" + "})"
-        matches = list(re.finditer(pattern, text, re.DOTALL))
-        if not matches:
-            raise ValueError("Could not find JSON output")
-        output_strings = [json.dumps(json.loads(match.group(1))) for match in matches]
-        return "\n".join(output_strings)
-    def parse_combined_output(self, text: str) -> str:
-        """Parse the output text from the LLM when multiple inputs are combined.
-        Arguments:
-            text: The output text from the LLM.
-        Returns:
-            A parsed version of the text.
-        """
-        return self.parse(text)
-    def get_format_instructions(self) -> str:
-        """Get the format instructions for the parser.
-        Returns:
-            The format instructions for the LLM.
-        """
-        return "Output must contain one or more JSON-formatted blocks."
-class JsonParser(JsonLinesParser):
-    def parse(self, text: str) -> str:
-        """Parse the output text from the LLM.
-        Arguments:
-            text: The output text from the LLM.
-        Returns:
-            A parsed version of the text.
-        """
-        jsonl_text = super().parse(text)
-        if len(jsonl_text.split("\n")) > 1:
-            raise ValueError("Multiple JSON objects found")
-        return jsonl_text
-    def parse_combined_output(self, text: str) -> str:
-        """Parse the output text from the LLM when multiple inputs are combined.
-        Arguments:
-            text: The output text from the LLM.
-        Returns:
-            A parsed version of the text.
-        """
-        jsonl_text = JsonLinesParser.parse(self, text)
-        json_lines = jsonl_text.split("\n")
-        output_obj = {i: json.loads(t) for i, t in enumerate(json_lines)}
-        return json.dumps(output_obj)
+        return str(code.group(1))
     def get_format_instructions(self) -> str:
-        """Get the format instructions for the parser.
-        Returns:
-            The format instructions for the LLM.
-        """
-        return "Output must contain exactly one JSON-formatted block."
-class EvaluationParser(JsonParser):
-    expected_keys: Set[str]
-    def __init__(self, expected_keys: Set[str], **kwargs: Any):
-        """Create a new EvaluationParser.
-        Arguments:
-            expected_keys: The set of keys that should be present in the JSON
-                object
-            kwargs: Additional arguments to pass to the parent class
-        """
-        super().__init__(expected_keys=expected_keys, **kwargs)
-        self.expected_keys = {k.lower() for k in expected_keys}
-    def parse(self, text: str) -> str:
-        """Parse the JSON object, convert keys to lowercase, filter out
-        unexpected keys
-        Arguments:
-            text: The output text from the LLM.
-        Returns:
-            A parsed version of the text.
-        """
-        json_text = super().parse(text)
-        obj = json.loads(json_text)
-        obj = {k.lower(): v for k, v in obj.items()}
-        obj = {k: v for k, v in obj.items() if k in self.expected_keys}
-        return json.dumps(obj)
-    def parse_combined_output(self, text: str) -> str:
-        """Parse the JSON object, convert keys to lowercase, filter out
-        unexpected keys, and average the values
-        Arguments:
-            text: The output text from the LLM.
-        Returns:
-            A parsed version of the text.
-        """
-        json_text = super().parse_combined_output(text)
-        multi_obj = json.loads(json_text)
-        n_evals = len(multi_obj)
-        output_obj = defaultdict(float)
-        for obj in multi_obj.values():
-            for k, v in obj.items():
-                output_obj[k] += v / n_evals
-        return json.dumps(output_obj)
-    def score(self, input_block: CodeBlock, output_text: str) -> float:
-        """The score for the output text is the percentage of expected keys
-        that are present in the json object. Non-numeric values count for
-        half.
-        Arguments:
-            input_block: A `CodeBlock` representing the input to the LLM
-            output_text: The parsed text returned by the LLM
-        Returns:
-            A score between 0 and 1 (inclusive). A score of 1.0 indicates that
-            the given text is fully acceptable, and no further attempts
-            should be made.
-        """
-        obj = json.loads(output_text)
-        expected_keys = self.expected_keys.intersection(obj.keys())
-        missing_keys = self.expected_keys.difference(obj.keys())
-        if missing_keys:
-            log.warning(f"[{input_block.name}] Expected keys missing: {missing_keys}")
-        non_numerics = {k: v for k, v in obj.items() if not isinstance(v, (int, float))}
-        if non_numerics:
-            log.warning(f"[{input_block.name}] Non-numeric values: {non_numerics}")
-        if missing_keys or non_numerics:
-            log.debug(f"Text:\n{output_text}")
-        return (len(expected_keys) - len(non_numerics) * 0.5) / len(self.expected_keys)
-    def get_format_instructions(self) -> str:
-        """Get the format instructions for the parser.
-        Returns:
-            The format instructions for the LLM.
-        """
-        return (
-            "Output must contain exactly one JSON-formatted block. The JSON "
-            "object should contain only the keys contained in the provided "
-            "expected_keys set (if any), and values should be numeric."
-        )
+        return "Output must contain text contained within triple square brackets (```)"

janus/parsers/doc_parser.py ADDED Viewed

@@ -0,0 +1,169 @@
+import json
+import re
+from langchain.output_parsers import PydanticOutputParser
+from langchain.output_parsers.json import parse_json_markdown
+from langchain.schema.output_parser import BaseOutputParser
+from langchain_core.exceptions import OutputParserException
+from langchain_core.messages import AIMessage
+from langchain_core.pydantic_v1 import BaseModel, Field
+from ..language.block import CodeBlock
+from ..utils.logger import create_logger
+from .code_parser import JanusParser
+log = create_logger(__name__)
+class MultiDoc(BaseModel):
+    # Schema of the three documentation artifacts requested for a piece of
+    # code; each field carries a natural-language description
+    docstring: str = Field(
+        description="A Sphinx-style docstring for the code, including a summary "
+        "of its functionality; the name, type, and description of "
+        "any parameters or returns; and any potential exceptions "
+        "that might arise in its execution"
+    )
+    example_usage: str = Field(
+        description="A well-commented minimal example utilizing the given "
+        "code's functionality"
+    )
+    pseudocode: str = Field(
+        description="A Python-style pseudocode implementation of the module or "
+        "function's behavior"
+    )
+class MultiDocumentationParser(PydanticOutputParser, JanusParser):
+    # Parses LLM output into a `MultiDoc` and tags it with the name of the
+    # block most recently passed to `set_reference`
+    block_name: str = ""
+    def __init__(self):
+        PydanticOutputParser.__init__(self, pydantic_object=MultiDoc)
+    def set_reference(self, block: CodeBlock):
+        # Remember the block's name so `parse` can attach it to the output
+        self.block_name = block.name
+    def parse(self, text: str) -> str:
+        """Parse a single LLM response into a JSON string.
+        Arguments:
+            text: The output text from the LLM (an `AIMessage` is unwrapped).
+        Returns:
+            A JSON string of the parsed `MultiDoc` fields plus a "name" key
+            holding the current block name.
+        Raises:
+            OutputParserException, json.JSONDecodeError: Re-raised after the
+                offending output has been logged.
+        """
+        if isinstance(text, AIMessage):
+            text = text.content
+        try:
+            docs = json.loads(super().parse(text).json())
+        except (OutputParserException, json.JSONDecodeError):
+            log.debug(f"Invalid JSON object. Output:\n{text}")
+            raise
+        docs["name"] = self.block_name
+        return json.dumps(docs)
+    def parse_combined_output(self, text: str) -> str:
+        """Parse the output text from the LLM when multiple inputs are combined.
+        Arguments:
+            text: The combined output, one JSON object per non-blank line.
+        Returns:
+            A JSON string mapping each object's "name" to its remaining fields.
+        """
+        objs = [
+            parse_json_markdown(line.strip()) for line in text.split("\n") if line.strip()
+        ]
+        output_obj = {d.pop("name"): d for d in objs}
+        return json.dumps(output_obj)
+    def get_format_instructions(self) -> str:
+        """Get the format instructions for the parser.
+        Returns:
+            The format instructions for the LLM.
+        """
+        return (
+            "Output must contain a sphinx-style docstring, example usage, and "
+            "pseudocode, all in a json-formatted string with the following fields: "
+            '"docstring", "example_usage", and "pseudocode".'
+        )
+    @property
+    def _type(self) -> str:
+        # The class name is the type identifier; `__class__.name` (without the
+        # dunder) is not the class name
+        return self.__class__.__name__
+class MadlibsDocumentationParser(BaseOutputParser[str], JanusParser):
+    # Comment IDs the LLM response must provide; refreshed by `set_reference`
+    expected_keys: set[str]
+    def __init__(self):
+        super().__init__(expected_keys=[])
+    def set_reference(self, block: CodeBlock):
+        # Collect the 8-character IDs of every comment placeholder in the block
+        comment_ids = re.findall(r"<(?:BLOCK|INLINE)_COMMENT (\w{8})>", block.text)
+        self.expected_keys = set(comment_ids)
+    def parse(self, text: str) -> str:
+        """Validate a single LLM response against the expected comment IDs.
+        Arguments:
+            text: The output text from the LLM (an `AIMessage` is unwrapped).
+        Returns:
+            A JSON string holding only the expected keys, all with string values.
+        Raises:
+            OutputParserException: If the output is not valid JSON, is not a
+                dictionary, lacks an expected key, or has a non-string value.
+        """
+        if isinstance(text, AIMessage):
+            text = text.content
+        try:
+            obj = parse_json_markdown(text)
+        except json.JSONDecodeError as e:
+            log.debug(f"Invalid JSON object. Output:\n{text}")
+            # Chain the decode error so the root cause stays in the traceback
+            raise OutputParserException(f"Got invalid JSON object. Error: {e}") from e
+        if not isinstance(obj, dict):
+            raise OutputParserException(
+                f"Got invalid return object. Expected a dictionary, but got {type(obj)}"
+            )
+        seen_keys = set(obj.keys())
+        missing_keys = self.expected_keys.difference(obj.keys())
+        invalid_keys = seen_keys.difference(self.expected_keys)
+        if missing_keys:
+            log.debug(f"Missing keys: {missing_keys}")
+            if invalid_keys:
+                log.debug(f"Invalid keys: {invalid_keys}")
+            raise OutputParserException(
+                f"Got invalid return object. Missing the following expected "
+                f"keys: {missing_keys}"
+            )
+        # Unexpected keys are tolerated but dropped from the result
+        for key in invalid_keys:
+            del obj[key]
+        for value in obj.values():
+            if not isinstance(value, str):
+                raise OutputParserException(
+                    f"Got invalid return object. Expected all string values,"
+                    f' but got type "{type(value)}"'
+                )
+        return json.dumps(obj)
+    def parse_combined_output(self, text: str) -> str:
+        """Parse the output text from the LLM when multiple inputs are combined.
+        Arguments:
+            text: The combined output, one JSON object per non-blank line.
+        Returns:
+            A JSON string of all objects merged into one dictionary; later
+            lines overwrite earlier ones on duplicate keys.
+        """
+        if not text.strip():
+            return json.dumps({})
+        objs = [
+            parse_json_markdown(line.strip()) for line in text.split("\n") if line.strip()
+        ]
+        output_obj = {}
+        for obj in objs:
+            output_obj.update(obj)
+        return json.dumps(output_obj)
+    def get_format_instructions(self) -> str:
+        """Get the format instructions for the parser.
+        Returns:
+            The format instructions for the LLM.
+        """
+        return (
+            "Output must contain exactly one JSON-formatted block. The JSON "
+            "object should contain only (and all of) the comment IDs present "
+            "in the input code."
+        )
+    @property
+    def _type(self) -> str:
+        # The class name is the type identifier; `__class__.name` (without the
+        # dunder) is not the class name
+        return self.__class__.__name__

janus/parsers/eval_parser.py ADDED Viewed

@@ -0,0 +1,80 @@
+import json
+from langchain.output_parsers import PydanticOutputParser
+from langchain_core.pydantic_v1 import BaseModel, Field, validator
+from ..utils.logger import create_logger
+from .code_parser import JanusParser
+log = create_logger(__name__)
+class Eval(BaseModel):
+    # Four numeric scores; the validator below restricts each to [0, 4]
+    syntax: float = Field(description="A numeric score (0-4) for code syntax")
+    style: float = Field(description="A numeric score (0-4) for code style")
+    completeness: float = Field(description="A numeric score (0-4) for code completeness")
+    correctness: float = Field(description="A numeric score (0-4) for code correctness")
+    @validator("*")
+    def score_is_valid(cls, v: float | int):
+        # `float()` raises TypeError (e.g. for None) as well as ValueError
+        try:
+            v = float(v)
+        except (TypeError, ValueError) as e:
+            raise ValueError("must be a number") from e
+        if not 0 <= v <= 4:
+            raise ValueError("must be a value between 0 and 4 inclusive")
+        return v
+    def __add__(self, other):
+        """Add two evaluations field by field.
+        Adding the integer 0 returns a copy, which lets `sum()` start from
+        its default initial value. The result is built with `construct`, so
+        summed scores are not re-checked against the 0-4 range.
+        """
+        if isinstance(other, int) and other == 0:
+            return self.copy()
+        return Eval.construct(
+            syntax=self.syntax + other.syntax,
+            style=self.style + other.style,
+            correctness=self.correctness + other.correctness,
+            completeness=self.completeness + other.completeness,
+        )
+    def __radd__(self, other):
+        return self.__add__(other)
+    def __truediv__(self, other):
+        """Divide every score by a number, or field by field by another Eval."""
+        # Floats are scalars too; previously only ints took this branch
+        if isinstance(other, (int, float)):
+            return Eval.construct(
+                syntax=self.syntax / other,
+                style=self.style / other,
+                correctness=self.correctness / other,
+                completeness=self.completeness / other,
+            )
+        return Eval.construct(
+            syntax=self.syntax / other.syntax,
+            style=self.style / other.style,
+            correctness=self.correctness / other.correctness,
+            completeness=self.completeness / other.completeness,
+        )
+class EvaluationParser(PydanticOutputParser, JanusParser):
+    # Parses LLM output into `Eval` score objects
+    def __init__(self):
+        PydanticOutputParser.__init__(self, pydantic_object=Eval)
+    def parse(self, text: str) -> str:
+        # NOTE(review): `.json()` already yields a JSON string, so the return
+        # value is that string JSON-encoded once more; kept as-is because
+        # callers may rely on it - confirm
+        parsed = super().parse(text)
+        return json.dumps(parsed.json())
+    def parse_combined_output(self, text: str) -> str:
+        """Parse one evaluation per non-blank line and average the scores
+        Arguments:
+            text: The output text from the LLM.
+        Returns:
+            The averaged evaluation, encoded the same way as `parse` output.
+        Raises:
+            ZeroDivisionError: If `text` contains no non-blank lines.
+        """
+        # Bind the parent method first: zero-argument `super()` inside a
+        # comprehension fails on Python versions before 3.12
+        parse_line = super().parse
+        objs = [
+            parse_line(line.strip()) for line in text.split("\n") if line.strip()
+        ]
+        avg_obj = sum(objs) / len(objs)
+        return json.dumps(avg_obj.json())

janus-llm 1.0.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

janus-llm 1.0.0py3-none-any.whl → 2.0.0py3-none-any.whl