PyPI - janus-llm - Versions diffs - 4.1.0__py3-none-any.whl → 4.3.1__py3-none-any.whl - Mend

janus-llm 4.1.0__py3-none-any.whl → 4.3.1__py3-none-any.whl

Files changed (25) hide show

janus/__init__.py +1 -1
janus/cli.py +286 -30
janus/converter/__init__.py +1 -0
janus/converter/converter.py +46 -47
janus/converter/evaluate.py +230 -4
janus/converter/partition.py +27 -0
janus/language/alc/_tests/test_alc.py +1 -1
janus/language/alc/alc.py +9 -4
janus/language/combine.py +22 -0
janus/language/splitter.py +31 -23
janus/language/treesitter/treesitter.py +9 -1
janus/llm/models_info.py +20 -12
janus/parsers/eval_parsers/incose_parser.py +134 -0
janus/parsers/eval_parsers/inline_comment_parser.py +112 -0
janus/parsers/partition_parser.py +168 -0
janus/refiners/refiner.py +38 -12
janus/refiners/uml.py +33 -0
janus/retrievers/retriever.py +60 -0
janus/utils/enums.py +14 -0
janus/utils/pdf_docs_reader.py +134 -0
{janus_llm-4.1.0.dist-info → janus_llm-4.3.1.dist-info}/METADATA +9 -1
{janus_llm-4.1.0.dist-info → janus_llm-4.3.1.dist-info}/RECORD +25 -19
{janus_llm-4.1.0.dist-info → janus_llm-4.3.1.dist-info}/WHEEL +1 -1
{janus_llm-4.1.0.dist-info → janus_llm-4.3.1.dist-info}/LICENSE +0 -0
{janus_llm-4.1.0.dist-info → janus_llm-4.3.1.dist-info}/entry_points.txt +0 -0

janus/converter/evaluate.py CHANGED Viewed

@@ -1,15 +1,241 @@
+import json
+import re
+from copy import deepcopy
+from langchain_core.runnables import Runnable, RunnableLambda, RunnableParallel
 from janus.converter.converter import Converter
+from janus.language.block import TranslatedCodeBlock
 from janus.language.combine import JsonCombiner
-from janus.parsers.eval_parser import EvaluationParser
+from janus.parsers.eval_parsers.incose_parser import IncoseParser
+from janus.parsers.eval_parsers.inline_comment_parser import InlineCommentParser
 from janus.utils.logger import create_logger
 log = create_logger(__name__)
 class Evaluator(Converter):
-    def __init__(self, **kwargs):
+    """Evaluator
+    Base class for converters that perform an LLM self evaluation
+    on an input target, with an associated prompt.
+    Current valid evaluation types:
+    ['incose', 'comments']
+    """
+    def __init__(self, **kwargs) -> None:
+        """Initialize the Evaluator class
+        All keyword arguments are forwarded unchanged to `Converter.__init__`.
+        Arguments:
+            model: The LLM to use for translation. If an OpenAI model, the
+                `OPENAI_API_KEY` environment variable must be set and the
+                `OPENAI_ORG_ID` environment variable should be set if needed.
+            model_arguments: Additional arguments to pass to the LLM constructor.
+            max_prompts: The maximum number of prompts to try before giving up.
+        """
+        super().__init__(**kwargs)
+        # Replace whatever combiner the base class configured with the JSON one
+        self._combiner = JsonCombiner()
+        # NOTE(review): presumably re-applies the converter settings so the new
+        #  combiner takes effect; `_load_parameters` is defined on Converter,
+        #  which is not visible here - confirm
+        self._load_parameters()
+class RequirementEvaluator(Evaluator):
+    """INCOSE Requirement Evaluator
+    A class that performs an LLM self evaluation on an input target,
+    with an associated prompt.
+    The evaluation prompts are for Incose Evaluations
+    """
+    def __init__(self, eval_items_per_request: int | None = None, **kwargs) -> None:
+        """Initialize the RequirementEvaluator class
+        Arguments:
+            eval_items_per_request: The maximum number of requirements to send
+                to the LLM in a single request. If None, a block is always
+                evaluated in one request, however many requirements it holds.
+            model: The LLM to use for translation. If an OpenAI model, the
+                `OPENAI_API_KEY` environment variable must be set and the
+                `OPENAI_ORG_ID` environment variable should be set if needed.
+            model_arguments: Additional arguments to pass to the LLM constructor.
+            max_prompts: The maximum number of prompts to try before giving up.
+        """
+        super().__init__(**kwargs)
+        self.eval_items_per_request = eval_items_per_request
+        self._parser = IncoseParser()
+        self.set_prompt("eval_prompts/incose")
+    def _input_runnable(self) -> Runnable:
+        """Build the runnable that maps a block onto the prompt's input variables"""
+        def _get_code(json_text: str) -> str:
+            return json.loads(json_text)["code"]
+        def _get_reqs(json_text: str) -> str:
+            return json.dumps(json.loads(json_text)["requirements"])
+        # The parser's JSON output is handed to each of the three branches
+        return RunnableLambda(self._parser.parse_input) | RunnableParallel(
+            SOURCE_CODE=_get_code,
+            REQUIREMENTS=_get_reqs,
+            context=self._retriever,
+        )
+    @staticmethod
+    def _flatten_requirements(requirements):
+        """Flatten arbitrarily nested lists of requirements into one flat list
+        Non-list inputs (e.g. a JSON null) are returned unchanged so the caller's
+        emptiness check still applies to them.
+        """
+        if not isinstance(requirements, list):
+            return requirements
+        # Inspect every element, not just the first: an empty inner list or a
+        #  mix of nested and un-nested items must not break the flattening
+        while any(isinstance(item, list) for item in requirements):
+            requirements = [
+                req
+                for item in requirements
+                for req in (item if isinstance(item, list) else [item])
+            ]
+        return requirements
+    def _add_translation(self, block: TranslatedCodeBlock):
+        """Evaluate the requirements of a block, in several requests if needed
+        When `eval_items_per_request` is set and the block holds more
+        requirements than that, the requirements are evaluated in groups and
+        the JSON outputs of all groups are merged into `block.text`.
+        Arguments:
+            block: The block to evaluate; it is updated in place.
+        """
+        if block.translated:
+            return
+        if block.original.text is None:
+            block.translated = True
+            return
+        if self.eval_items_per_request is None:
+            return super()._add_translation(block)
+        input_obj = json.loads(block.original.text)
+        # For some reason requirements objects are in nested lists?
+        requirements = self._flatten_requirements(input_obj.get("requirements", []))
+        # Checked after flattening so that e.g. `[[]]` is skipped too
+        if not requirements:
+            log.debug(f"[{block.name}] Skipping empty block")
+            block.translated = True
+            block.text = None
+            block.complete = True
+            return
+        if len(requirements) <= self.eval_items_per_request:
+            input_obj["requirements"] = requirements
+            block.original.text = json.dumps(input_obj)
+            return super()._add_translation(block)
+        block.processing_time = 0
+        block.cost = 0
+        block.retries = 0
+        obj = {}
+        for i in range(0, len(requirements), self.eval_items_per_request):
+            # Build a new TranslatedBlock using the new working text
+            working_requirements = requirements[i : i + self.eval_items_per_request]
+            working_copy = deepcopy(block.original)
+            # Same object as the input, with only this group's requirements
+            working_copy.text = json.dumps(
+                {**input_obj, "requirements": working_requirements}
+            )
+            working_block = TranslatedCodeBlock(working_copy, self._target_language)
+            # Run the LLM on the working text
+            super()._add_translation(working_block)
+            # Update metadata to include for all runs
+            block.retries += working_block.retries
+            block.cost += working_block.cost
+            block.processing_time += working_block.processing_time
+            # Update the output text to merge this section's output in
+            # NOTE(review): assumes the base class always leaves JSON text on
+            #  the working block; a None text would raise here - confirm
+            obj.update(json.loads(working_block.text))
+        block.text = json.dumps(obj)
+        block.tokens = self._llm.get_num_tokens(block.text)
+        block.translated = True
+        log.debug(
+            f"[{block.name}] Output code:\n{json.dumps(json.loads(block.text), indent=2)}"
+        )
+class InlineCommentEvaluator(Evaluator):
+    """Inline Comment Evaluator
+    A class that performs an LLM self evaluation on inline comments,
+    with an associated prompt.
+    """
+    def __init__(self, eval_items_per_request: int | None = None, **kwargs) -> None:
+        """Initialize the InlineCommentEvaluator class
+        Arguments:
+            eval_items_per_request: The maximum number of comments to evaluate
+                in a single LLM request. If None, blocks are never split.
+            model: The LLM to use for translation. If an OpenAI model, the
+                `OPENAI_API_KEY` environment variable must be set and the
+                `OPENAI_ORG_ID` environment variable should be set if needed.
+            model_arguments: Additional arguments to pass to the LLM constructor.
+            max_prompts: The maximum number of prompts to try before giving up.
+        """
         super().__init__(**kwargs)
-        self.set_prompt("evaluate")
+        # NOTE(review): Evaluator.__init__ already assigns a JsonCombiner and
+        #  calls _load_parameters(), so the next two statements repeat that work
         self._combiner = JsonCombiner()
-        self._parser = EvaluationParser()
         self._load_parameters()
+        self._parser = InlineCommentParser()
+        self.set_prompt("eval_prompts/inline_comments")
+        self.eval_items_per_request = eval_items_per_request
+    def _add_translation(self, block: TranslatedCodeBlock):
+        """Evaluate the comments of a block, in several requests if needed
+        When `eval_items_per_request` is set and the block holds more comment
+        placeholders than that, the block is evaluated once per group of
+        comments, with every placeholder outside the current group removed, and
+        the JSON outputs of all groups are merged into `block.text`.
+        Arguments:
+            block: The block to evaluate; it is updated in place.
+        """
+        if block.translated:
+            return
+        if block.original.text is None:
+            block.translated = True
+            return
+        if self.eval_items_per_request is None:
+            return super()._add_translation(block)
+        # A placeholder tag (INLINE_COMMENT or BLOCK_COMMENT followed by an
+        #  8-character ID) plus everything after it up to the end of its line
+        comment_pattern = r"<(?:INLINE|BLOCK)_COMMENT \w{8}>.*$"
+        comments = list(
+            re.finditer(comment_pattern, block.original.text, flags=re.MULTILINE)
+        )
+        if not comments:
+            log.info(f"[{block.name}] Skipping commentless block")
+            block.translated = True
+            block.text = None
+            block.complete = True
+            return
+        # Few enough comments to evaluate the whole block in one request
+        if len(comments) <= self.eval_items_per_request:
+            return super()._add_translation(block)
+        # Only used to report the number of groups in the log message below
+        comment_group_indices = list(range(0, len(comments), self.eval_items_per_request))
+        log.debug(
+            f"[{block.name}] Block contains more than {self.eval_items_per_request}"
+            f" comments, splitting {len(comments)} comments into"
+            f" {len(comment_group_indices)} groups"
+        )
+        # Metadata is accumulated over all of the per-group requests
+        block.processing_time = 0
+        block.cost = 0
+        block.retries = 0
+        obj = {}
+        for i in range(0, len(comments), self.eval_items_per_request):
+            # Split the text into the section containing comments of interest,
+            #  all the text prior to those comments, and all the text after them
+            working_comments = comments[i : i + self.eval_items_per_request]
+            start_idx = working_comments[0].start()
+            end_idx = working_comments[-1].end()
+            prefix = block.original.text[:start_idx]
+            keeper = block.original.text[start_idx:end_idx]
+            suffix = block.original.text[end_idx:]
+            # Strip all comment placeholders outside of the section of interest
+            prefix = re.sub(comment_pattern, "", prefix, flags=re.MULTILINE)
+            suffix = re.sub(comment_pattern, "", suffix, flags=re.MULTILINE)
+            # Build a new TranslatedBlock using the new working text
+            working_copy = deepcopy(block.original)
+            working_copy.text = prefix + keeper + suffix
+            working_block = TranslatedCodeBlock(working_copy, self._target_language)
+            # Run the LLM on the working text
+            super()._add_translation(working_block)
+            # Update metadata to include for all runs
+            block.retries += working_block.retries
+            block.cost += working_block.cost
+            block.processing_time += working_block.processing_time
+            # Update the output text to merge this section's output in
+            # NOTE(review): assumes the base class always leaves JSON text on
+            #  the working block; a None text would raise here - confirm
+            obj.update(json.loads(working_block.text))
+        block.text = json.dumps(obj)
+        block.tokens = self._llm.get_num_tokens(block.text)
+        block.translated = True
+        log.debug(
+            f"[{block.name}] Output code:\n{json.dumps(json.loads(block.text), indent=2)}"
+        )

janus/converter/partition.py ADDED Viewed

@@ -0,0 +1,27 @@
+from pathlib import Path
+from janus.converter.converter import Converter
+from janus.language.block import TranslatedCodeBlock
+from janus.parsers.partition_parser import PartitionParser
+from janus.utils.logger import create_logger
+log = create_logger(__name__)
+class Partitioner(Converter):
+    """Converter that runs the "partition" prompt and parses its output with a
+    PartitionParser. The target language and suffix are set to those of the
+    source.
+    """
+    def __init__(self, partition_token_limit: int, **kwargs):
+        """Initialize the Partitioner class
+        Arguments:
+            partition_token_limit: Passed to the PartitionParser as its
+                `token_limit`.
+            **kwargs: Forwarded unchanged to `Converter.__init__`.
+        """
+        super().__init__(**kwargs)
+        self.set_prompt("partition")
+        # The parser is handed `self._llm`, so the model is loaded first
+        #  (presumably `_load_model` is what populates `self._llm` - confirm)
+        self._load_model()
+        self._parser = PartitionParser(
+            token_limit=partition_token_limit,
+            model=self._llm,
+        )
+        # Output stays in the source language, with the source file suffix
+        self._target_language = self._source_language
+        self._target_suffix = self._source_suffix
+        self._load_parameters()
+    def _save_to_file(self, block: TranslatedCodeBlock, out_path: Path) -> None:
+        """Write the parser's combined output for a block to a file
+        Arguments:
+            block: The block whose `complete_text` is passed to the parser.
+            out_path: The file to write; missing parent directories are created.
+        """
+        output_str = self._parser.parse_combined_output(block.complete_text)
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        out_path.write_text(output_str, encoding="utf-8")

janus/language/alc/_tests/test_alc.py CHANGED Viewed

@@ -20,7 +20,7 @@ class TestAlcSplitter(unittest.TestCase):
     def test_split(self):
         """Test the split method."""
         tree_root = self.splitter.split(self.test_file)
-        self.assertAlmostEqual(tree_root.n_descendents, 32, delta=5)
+        self.assertAlmostEqual(tree_root.n_descendents, 16, delta=2)
         self.assertLessEqual(tree_root.max_tokens, self.splitter.max_tokens)
         self.assertFalse(tree_root.complete)
         self.combiner.combine_children(tree_root)

janus/language/alc/alc.py CHANGED Viewed

@@ -79,10 +79,15 @@ class AlcSplitter(TreeSitterSplitter):
             if len(sects) > 1:
                 block.children = []
                 for sect in sects:
-                    if sect[0].node_type in sect_types:
-                        sect_node = self.merge_nodes(sect)
-                        sect_node.children = sect
-                        sect_node.node_type = NodeType(str(sect[0].node_type)[:5])
+                    node_type = sect[0].node_type
+                    if node_type in sect_types:
+                        if len(sect) == 1:
+                            # Don't make a node its own child
+                            sect_node = sect[0]
+                        else:
+                            sect_node = self.merge_nodes(sect)
+                            sect_node.children = sect
+                        sect_node.node_type = NodeType(str(node_type)[:5])
                         block.children.append(sect_node)
                     else:
                         block.children.extend(sect)

janus/language/combine.py CHANGED Viewed

@@ -1,3 +1,5 @@
+import re
 from janus.language.block import CodeBlock, TranslatedCodeBlock
 from janus.language.file import FileManager
 from janus.utils.logger import create_logger
@@ -90,3 +92,23 @@ class ChunkCombiner(Combiner):
             root: The functional code block to combine with its children.
         """
         return root
+class PartitionCombiner(Combiner):
+    @staticmethod
+    def combine(root: CodeBlock) -> None:
+        """A combiner which inserts partition tags between code blocks
+        A `<JANUS_PARTITION>` line is appended to the suffix of every leaf
+        block, the parent class's `combine` is run, and the tags left at the
+        very end of the root's text and suffix are removed.
+        Arguments:
+            root: The root of the block tree to combine; modified in place.
+        """
+        # Walk the whole tree; only childless blocks receive a tag
+        queue = [root]
+        while queue:
+            block = queue.pop(0)
+            if block.children:
+                queue.extend(block.children)
+            else:
+                block.affixes = (block.prefix, block.suffix + "\n<JANUS_PARTITION>\n")
+        # Zero-argument super() cannot be used in a staticmethod (there is no
+        #  self/cls), hence the explicit two-argument form
+        super(PartitionCombiner, PartitionCombiner).combine(root)
+        # Drop the tag(s) that ended up after the final block
+        root.text = re.sub(r"(?:\n<JANUS_PARTITION>\n)+$", "", root.text)
+        root.affixes = (
+            root.prefix,
+            re.sub(r"(?:\n<JANUS_PARTITION>\n)+$", "", root.suffix),
+        )

janus/language/splitter.py CHANGED Viewed

@@ -275,42 +275,50 @@ class Splitter(FileManager):
         groups = [[n] for n in nodes]
         while len(groups) > 1 and min(adj_sums) <= self.max_tokens and any(merge_allowed):
-            # Get the indices of the adjacent nodes that would result in the
-            #  smallest possible merged snippet. Ignore protected nodes.
+            # Get the index of the node that would result in the smallest
+            #  merged snippet when merged with the node that follows it.
+            #  Ignore protected nodes.
             mergeable_indices = compress(range(len(adj_sums)), merge_allowed)
-            i0 = int(min(mergeable_indices, key=adj_sums.__getitem__))
-            i1 = i0 + 1
+            C = int(min(mergeable_indices, key=adj_sums.__getitem__))
+            # C: Central index
+            # L: Index to the left
+            # R: Index to the right (to be merged in to C)
+            # N: Next index (to the right of R, the "new R")
+            L, R, N = C - 1, C + 1, C + 2
             # Recalculate the length. We can't simply use the adj_sum, because
             #  it is an underestimate due to the adjoining suffix/prefix.
-            central_node = groups[i0][-1]
-            merged_text = "".join([text_chunks[i0], central_node.suffix, text_chunks[i1]])
+            central_node = groups[C][-1]
+            merged_text = "".join([text_chunks[C], central_node.suffix, text_chunks[R]])
             merged_text_length = self._count_tokens(merged_text)
             # If the true length of the merged pair is too long, don't merge them
             #  Instead, correct the estimate, since shorter pairs may yet exist
             if merged_text_length > self.max_tokens:
-                adj_sums[i0] = merged_text_length
+                adj_sums[C] = merged_text_length
                 continue
             # Update adjacent sum estimates
-            if i0 > 0:
-                adj_sums[i0 - 1] += merged_text_length
-            if i1 < len(adj_sums) - 1:
-                adj_sums[i1 + 1] += merged_text_length
-            if i0 > 0 and i1 < len(merge_allowed) - 1:
-                if not (merge_allowed[i0 - 1] and merge_allowed[i1 + 1]):
-                    merge_allowed[i0 - 1] = merge_allowed[i1 + 1] = False
+            if L >= 0:
+                adj_sums[L] = lengths[L] + merged_text_length
+            if N < len(adj_sums):
+                adj_sums[R] = lengths[N] + merged_text_length
             # The potential merge length for this pair is removed
-            adj_sums.pop(i0)
-            merge_allowed.pop(i0)
+            adj_sums.pop(C)
+            # The merged-in node is removed from the protected list
+            #  The merge_allowed list need not be updated - if the node now to
+            #  its right is protected, the merge_allowed element corresponding
+            #  to the merged neighbor will have been True, and now corresponds
+            #  to the merged node.
+            merge_allowed.pop(C)
             # Merge the pair of node groups
-            groups[i0 : i1 + 1] = [groups[i0] + groups[i1]]
-            text_chunks[i0 : i1 + 1] = [merged_text]
-            lengths[i0 : i1 + 1] = [merged_text_length]
+            groups[C:N] = [groups[C] + groups[R]]
+            text_chunks[C:N] = [merged_text]
+            lengths[C:N] = [merged_text_length]
         return groups
@@ -403,13 +411,13 @@ class Splitter(FileManager):
         self._split_into_lines(node)
     def _split_into_lines(self, node: CodeBlock):
-        split_text = re.split(r"(\n+)", node.text)
+        split_text = list(re.split(r"(\n+)", node.text))
         # If the string didn't start/end with newlines, make sure to include
         #  empty strings for the prefix/suffixes
-        if split_text[0].strip("\n"):
+        if not re.match(r"^\n+$", split_text[0]):
             split_text = [""] + split_text
-        if split_text[-1].strip("\n"):
+        if not re.match(r"^\n+$", split_text[-1]):
             split_text.append("")
         betweens = split_text[::2]
         lines = split_text[1::2]

janus/language/treesitter/treesitter.py CHANGED Viewed

@@ -154,7 +154,15 @@ class TreeSitterSplitter(Splitter):
             The pointer to the language.
         """
         lib = cdll.LoadLibrary(os.fspath(so_file))
-        language_function = getattr(lib, f"tree_sitter_{self.language}")
+        # Added this try-except block to handle the case where the language is not
+        # supported in lowercase by the creator of the grammar. Ex: COBOL
+        # https://github.com/yutaro-sakamoto/tree-sitter-cobol/blob/main/grammar.js#L13
+        try:
+            language_function = getattr(lib, f"tree_sitter_{self.language}")
+        except AttributeError:
+            language = self.language.upper()
+            language_function = getattr(lib, f"tree_sitter_{language}")
         language_function.restype = c_void_p
         pointer = language_function()
         return pointer

janus/llm/models_info.py CHANGED Viewed

@@ -6,9 +6,13 @@ from typing import Callable, Protocol, TypeVar
 from dotenv import load_dotenv
 from langchain_community.llms import HuggingFaceTextGenInference
 from langchain_core.runnables import Runnable
-from langchain_openai import AzureChatOpenAI
+from langchain_openai import AzureChatOpenAI, ChatOpenAI
-from janus.llm.model_callbacks import COST_PER_1K_TOKENS, azure_model_reroutes
+from janus.llm.model_callbacks import (
+    COST_PER_1K_TOKENS,
+    azure_model_reroutes,
+    openai_model_reroutes,
+)
 from janus.prompts.prompt import (
     ChatGptPromptEngine,
     ClaudePromptEngine,
@@ -90,6 +94,7 @@ claude_models = [
     "bedrock-claude-instant-v1",
     "bedrock-claude-haiku",
     "bedrock-claude-sonnet",
+    "bedrock-claude-sonnet-3.5",
 ]
 llama2_models = [
     "bedrock-llama2-70b",
@@ -126,7 +131,7 @@ bedrock_models = [
 all_models = [*azure_models, *bedrock_models]
 MODEL_TYPE_CONSTRUCTORS: dict[str, ModelType] = {
-    # "OpenAI": ChatOpenAI,
+    "OpenAI": ChatOpenAI,
     "HuggingFace": HuggingFaceTextGenInference,
     "Azure": AzureChatOpenAI,
     "Bedrock": Bedrock,
@@ -136,7 +141,7 @@ MODEL_TYPE_CONSTRUCTORS: dict[str, ModelType] = {
 MODEL_PROMPT_ENGINES: dict[str, Callable[..., PromptEngine]] = {
-    # **{m: ChatGptPromptEngine for m in openai_models},
+    **{m: ChatGptPromptEngine for m in openai_models},
     **{m: ChatGptPromptEngine for m in azure_models},
     **{m: ClaudePromptEngine for m in claude_models},
     **{m: Llama2PromptEngine for m in llama2_models},
@@ -147,12 +152,13 @@ MODEL_PROMPT_ENGINES: dict[str, Callable[..., PromptEngine]] = {
 }
 MODEL_ID_TO_LONG_ID = {
-    # **{m: mr for m, mr in openai_model_reroutes.items()},
+    **{m: mr for m, mr in openai_model_reroutes.items()},
     **{m: mr for m, mr in azure_model_reroutes.items()},
     "bedrock-claude-v2": "anthropic.claude-v2",
     "bedrock-claude-instant-v1": "anthropic.claude-instant-v1",
     "bedrock-claude-haiku": "anthropic.claude-3-haiku-20240307-v1:0",
     "bedrock-claude-sonnet": "anthropic.claude-3-sonnet-20240229-v1:0",
+    "bedrock-claude-sonnet-3.5": "anthropic.claude-3-5-sonnet-20240620-v1:0",
     "bedrock-llama2-70b": "meta.llama2-70b-v1",
     "bedrock-llama2-70b-chat": "meta.llama2-70b-chat-v1",
     "bedrock-llama2-13b": "meta.llama2-13b-chat-v1",
@@ -179,7 +185,7 @@ DEFAULT_MODELS = list(MODEL_DEFAULT_ARGUMENTS.keys())
 MODEL_CONFIG_DIR = Path.home().expanduser() / ".janus" / "llm"
 MODEL_TYPES: dict[str, PromptEngine] = {
-    # **{m: "OpenAI" for m in openai_models},
+    **{m: "OpenAI" for m in openai_models},
     **{m: "Azure" for m in azure_models},
     **{m: "BedrockChat" for m in bedrock_models},
 }
@@ -200,6 +206,7 @@ TOKEN_LIMITS: dict[str, int] = {
     "anthropic.claude-instant-v1": 100_000,
     "anthropic.claude-3-haiku-20240307-v1:0": 248_000,
     "anthropic.claude-3-sonnet-20240229-v1:0": 248_000,
+    "anthropic.claude-3-5-sonnet-20240620-v1:0": 200_000,
     "meta.llama2-70b-v1": 4096,
     "meta.llama2-70b-chat-v1": 4096,
     "meta.llama2-13b-chat-v1": 4096,
@@ -286,15 +293,16 @@ def load_model(model_id) -> JanusModel:
         # log.warning("Waiting 10 seconds...")
         # Give enough time for the user to read the warnings and cancel
         # time.sleep(10)
-        raise DeprecationWarning("OpenAI models are no longer supported.")
+        # raise DeprecationWarning("OpenAI models are no longer supported.")
     elif model_type_name == "Azure":
         model_args.update(
-            {
-                "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
-                "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
-                "api_version": os.getenv("OPENAI_API_VERSION", "2024-02-01"),
-            }
+            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
+            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+            api_version=os.getenv("OPENAI_API_VERSION", "2024-02-01"),
+            azure_deployment=model_id,
+            request_timeout=3600,
+            max_tokens=4096,
         )
     model_type = MODEL_TYPE_CONSTRUCTORS[model_type_name]

janus/parsers/eval_parsers/incose_parser.py ADDED Viewed

@@ -0,0 +1,134 @@
+import json
+import random
+import uuid
+from typing import List
+from langchain.output_parsers import PydanticOutputParser
+from langchain_core.exceptions import OutputParserException
+from langchain_core.messages import BaseMessage
+from langchain_core.pydantic_v1 import BaseModel, Field, validator
+from janus.language.block import CodeBlock
+from janus.parsers.parser import JanusParser
+from janus.utils.logger import create_logger
+log = create_logger(__name__)
+RNG = random.Random()
+class Criteria(BaseModel):
+    # The assessment of one requirement against a single criterion.
+    #  (Kept as a comment: a model docstring would be added to the JSON schema)
+    reasoning: str = Field(description="A short explanation for the given assessment")
+    # `description` must be given by keyword - the first positional argument
+    #  of Field is the default value, which would make `score` optional and
+    #  let a missing score through with the description text as its value
+    score: str = Field(description="A simple `pass` or `fail`")
+    @validator("score")
+    def score_is_valid(cls, v: str):
+        # Normalize case and surrounding whitespace before checking the value
+        v = v.lower().strip()
+        if v not in {"pass", "fail"}:
+            raise OutputParserException("Score must be either 'pass' or 'fail'")
+        return v
+class Requirement(BaseModel):
+    # The evaluation of one requirement: its ID, its text, and one Criteria
+    #  result for each of the nine criteria C1-C9.
+    #  (Kept as a comment: a model docstring would be added to the JSON schema)
+    # The ID is a requirement ID, as the RequirementList description also says
+    requirement_id: str = Field(description="The 8-character requirement ID")
+    requirement: str = Field(description="The original requirement being evaluated")
+    C1: Criteria
+    C2: Criteria
+    C3: Criteria
+    C4: Criteria
+    C5: Criteria
+    C6: Criteria
+    C7: Criteria
+    C8: Criteria
+    C9: Criteria
+class RequirementList(BaseModel):
+    # Custom root type: the model is a bare list of Requirement objects
+    #  rather than an object with named fields.
+    #  (Kept as a comment: a model docstring would be added to the JSON schema)
+    __root__: List[Requirement] = Field(
+        description=(
+            "A list of requirement evaluations. Each element should include"
+            " the requirement's 8-character ID in the `requirement_id` field,"
+            " the original requirement in the 'requirement' field, "
+            " and nine score objects corresponding to each criterion."
+        )
+    )
+class IncoseParser(JanusParser, PydanticOutputParser):
+    """Parser for INCOSE requirement evaluations
+    `parse_input` assigns a short ID to every requirement and remembers the
+    mapping; `parse` validates the LLM output against RequirementList and
+    checks that every remembered ID was evaluated.
+    """
+    # Maps each generated 8-character ID to its requirement (set by parse_input)
+    requirements: dict[str, str]
+    def __init__(self):
+        PydanticOutputParser.__init__(
+            self,
+            pydantic_object=RequirementList,
+            requirements={},
+        )
+    def parse_input(self, block: CodeBlock) -> str:
+        """Label every requirement of the input with a generated ID
+        Arguments:
+            block: The block whose parsed text is a JSON object with a
+                `requirements` list.
+        Returns:
+            The JSON object as a string, with `requirements` replaced by a
+            single string of "Requirement <id> : <text>" entries.
+        """
+        # TODO: Perform comment stripping/placeholding here rather than in script
+        text = super().parse_input(block)
+        # Seeding with the input makes the generated IDs reproducible
+        RNG.seed(text)
+        obj = json.loads(text)
+        reqs = obj["requirements"]
+        # Generate a unique ID for each requirement (ensure they are unique).
+        #  A dict keeps the IDs in generation order; a set would pair IDs with
+        #  requirements in hash order, which differs between interpreter runs
+        req_ids: dict[str, None] = {}
+        while len(req_ids) < len(reqs):
+            req_ids[str(uuid.UUID(int=RNG.getrandbits(128), version=4))[:8]] = None
+        self.requirements = dict(zip(req_ids, reqs))
+        reqs_str = "\n\n".join(
+            f"Requirement {rid} : {req}" for rid, req in self.requirements.items()
+        )
+        obj["requirements"] = reqs_str
+        return json.dumps(obj)
+    def parse(self, text: str | BaseMessage) -> str:
+        """Validate the LLM output and key the evaluations by requirement ID
+        Arguments:
+            text: The raw LLM output, expected to contain a JSON array.
+        Returns:
+            A JSON object string mapping each requirement ID to its evaluation,
+            with the original requirement text filled in.
+        Raises:
+            OutputParserException: If no JSON array is found, the JSON is
+                invalid, or an expected requirement ID is missing.
+        """
+        if isinstance(text, BaseMessage):
+            text = str(text.content)
+        # Strip everything outside the outermost JSON array
+        begin, end = text.find("["), text.rfind("]")
+        if begin == -1 or end < begin:
+            log.debug(f"No JSON array found. Output:\n{text}")
+            raise OutputParserException("Got invalid return object. No JSON array found")
+        text = text[begin : end + 1]
+        try:
+            out: RequirementList = super().parse(text)
+        except json.JSONDecodeError as e:
+            log.debug(f"Invalid JSON object. Output:\n{text}")
+            raise OutputParserException(f"Got invalid JSON object. Error: {e}") from e
+        evals: dict[str, dict] = {c.requirement_id: c.dict() for c in out.__root__}
+        seen_keys = set(evals.keys())
+        expected_keys = set(self.requirements.keys())
+        missing_keys = expected_keys.difference(seen_keys)
+        invalid_keys = seen_keys.difference(expected_keys)
+        # Unknown IDs are tolerated (and dropped below); missing ones are not
+        if invalid_keys:
+            log.debug(f"Invalid keys: {invalid_keys}")
+        if missing_keys:
+            log.debug(f"Missing keys: {missing_keys}")
+            raise OutputParserException(
+                f"Got invalid return object. Missing the following expected "
+                f"keys: {missing_keys}"
+            )
+        for key in invalid_keys:
+            del evals[key]
+        for rid, evaluation in evals.items():
+            # Use the stored requirement text rather than the LLM's copy of it
+            evaluation["requirement"] = self.requirements[rid]
+            evaluation.pop("requirement_id")
+        return json.dumps(evals)
+    def parse_combined_output(self, text: str) -> str:
+        """Merge newline-separated JSON objects into a single JSON object
+        Arguments:
+            text: Zero or more JSON objects, one per line.
+        Returns:
+            One JSON object string; for duplicate keys the last line wins.
+        """
+        if not text.strip():
+            return json.dumps({})
+        objs = [json.loads(line.strip()) for line in text.split("\n") if line.strip()]
+        output_obj = {}
+        for obj in objs:
+            output_obj.update(obj)
+        return json.dumps(output_obj)

janus-llm 4.1.0__py3-none-any.whl → 4.3.1__py3-none-any.whl

janus-llm 4.1.0__py3-none-any.whl → 4.3.1__py3-none-any.whl