PyPI - janus-llm - Versions diffs - 1.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl - Mend

janus-llm 1.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

janus/__init__.py +9 -1
janus/__main__.py +4 -0
janus/_tests/test_cli.py +128 -0
janus/_tests/test_translate.py +49 -7
janus/cli.py +530 -46
janus/converter.py +50 -19
janus/embedding/_tests/test_collections.py +2 -8
janus/embedding/_tests/test_database.py +32 -0
janus/embedding/_tests/test_vectorize.py +9 -4
janus/embedding/collections.py +49 -6
janus/embedding/embedding_models_info.py +130 -0
janus/embedding/vectorize.py +53 -62
janus/language/_tests/__init__.py +0 -0
janus/language/_tests/test_combine.py +62 -0
janus/language/_tests/test_splitter.py +16 -0
janus/language/binary/_tests/test_binary.py +16 -1
janus/language/binary/binary.py +10 -3
janus/language/block.py +31 -30
janus/language/combine.py +26 -34
janus/language/mumps/_tests/test_mumps.py +2 -2
janus/language/mumps/mumps.py +93 -9
janus/language/naive/__init__.py +4 -0
janus/language/naive/basic_splitter.py +14 -0
janus/language/naive/chunk_splitter.py +26 -0
janus/language/naive/registry.py +13 -0
janus/language/naive/simple_ast.py +18 -0
janus/language/naive/tag_splitter.py +61 -0
janus/language/splitter.py +168 -74
janus/language/treesitter/_tests/test_treesitter.py +19 -14
janus/language/treesitter/treesitter.py +37 -13
janus/llm/model_callbacks.py +177 -0
janus/llm/models_info.py +165 -72
janus/metrics/__init__.py +8 -0
janus/metrics/_tests/__init__.py +0 -0
janus/metrics/_tests/reference.py +2 -0
janus/metrics/_tests/target.py +2 -0
janus/metrics/_tests/test_bleu.py +56 -0
janus/metrics/_tests/test_chrf.py +67 -0
janus/metrics/_tests/test_file_pairing.py +59 -0
janus/metrics/_tests/test_llm.py +91 -0
janus/metrics/_tests/test_reading.py +28 -0
janus/metrics/_tests/test_rouge_score.py +65 -0
janus/metrics/_tests/test_similarity_score.py +23 -0
janus/metrics/_tests/test_treesitter_metrics.py +110 -0
janus/metrics/bleu.py +66 -0
janus/metrics/chrf.py +55 -0
janus/metrics/cli.py +7 -0
janus/metrics/complexity_metrics.py +208 -0
janus/metrics/file_pairing.py +113 -0
janus/metrics/llm_metrics.py +202 -0
janus/metrics/metric.py +466 -0
janus/metrics/reading.py +70 -0
janus/metrics/rouge_score.py +96 -0
janus/metrics/similarity.py +53 -0
janus/metrics/splitting.py +38 -0
janus/parsers/_tests/__init__.py +0 -0
janus/parsers/_tests/test_code_parser.py +32 -0
janus/parsers/code_parser.py +24 -253
janus/parsers/doc_parser.py +169 -0
janus/parsers/eval_parser.py +80 -0
janus/parsers/reqs_parser.py +72 -0
janus/prompts/prompt.py +103 -30
janus/translate.py +636 -111
janus/utils/_tests/__init__.py +0 -0
janus/utils/_tests/test_logger.py +67 -0
janus/utils/_tests/test_progress.py +20 -0
janus/utils/enums.py +56 -3
janus/utils/progress.py +56 -0
{janus_llm-1.0.0.dist-info → janus_llm-2.0.1.dist-info}/METADATA +27 -11
janus_llm-2.0.1.dist-info/RECORD +94 -0
{janus_llm-1.0.0.dist-info → janus_llm-2.0.1.dist-info}/WHEEL +1 -1
janus_llm-1.0.0.dist-info/RECORD +0 -48
{janus_llm-1.0.0.dist-info → janus_llm-2.0.1.dist-info}/LICENSE +0 -0
{janus_llm-1.0.0.dist-info → janus_llm-2.0.1.dist-info}/entry_points.txt +0 -0

janus/language/_tests/test_combine.py ADDED Viewed

@@ -0,0 +1,62 @@
+import unittest
+from ..combine import CodeBlock, Combiner, TranslatedCodeBlock
+class TestCombiner(unittest.TestCase):
+    def setUp(self):
+        self.combiner = Combiner()
+        self.block = CodeBlock(
+            id=1,
+            name="test",
+            node_type="test",
+            language="python",
+            text="# test",
+            start_point=(0, 0),
+            end_point=(0, 0),
+            start_byte=0,
+            end_byte=0,
+            tokens=[],
+            children=[
+                CodeBlock(
+                    id=2,
+                    name="child",
+                    node_type="test",
+                    language="python",
+                    text="test",
+                    start_point=(0, 0),
+                    end_point=(0, 0),
+                    start_byte=0,
+                    end_byte=0,
+                    tokens=[],
+                    children=[],
+                )
+            ],
+        )
+        self.translated_block = TranslatedCodeBlock(
+            self.block,
+            language="python",
+        )
+    def test_combine(self):
+        self.combiner.combine(self.block)
+        self.assertFalse(self.block.omit_prefix)
+    def test_combine_children(self):
+        self.block.complete = False
+        self.combiner.combine_children(self.block)
+        self.assertTrue(self.block.complete)
+    def test_combine_children_with_translated_block(self):
+        self.translated_block.complete = False
+        self.combiner.combine_children(self.translated_block)
+        self.assertFalse(self.translated_block.complete)
+    def test_combine_children_with_text_none(self):
+        self.combiner.combine_children(self.block)
+        self.assertEqual(self.block.text, "# test")
+        self.assertTrue(self.block.complete)
+if __name__ == "__main__":
+    unittest.main()

janus/language/_tests/test_splitter.py ADDED Viewed

@@ -0,0 +1,16 @@
+import unittest
+from janus.language.splitter import Splitter
+class TestSplitter(unittest.TestCase):
+    def setUp(self):
+        self.splitter = Splitter(language="python")
+    def test_split(self):
+        input_data = "janus/__main__.py"
+        self.assertRaises(NotImplementedError, self.splitter.split, input_data)
+if __name__ == "__main__":
+    unittest.main()

janus/language/binary/_tests/test_binary.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import unittest
 from pathlib import Path
+from unittest.mock import patch
 import pytest
@@ -12,12 +13,26 @@ class TestBinarySplitter(unittest.TestCase):
     """Tests for the BinarySplitter class."""
     def setUp(self):
-        model_name = "gpt-3.5-turbo"
+        model_name = "gpt-3.5-turbo-0125"
         self.binary_file = Path("janus/language/binary/_tests/hello")
         self.llm, _, _ = load_model(model_name)
         self.splitter = BinarySplitter(model=self.llm)
         os.environ["GHIDRA_INSTALL_PATH"] = "~/programs/ghidra_10.4_PUBLIC"
+    def test_setup(self):
+        """Test that the setup sets the environment variable correctly."""
+        with patch("os.getenv") as mock_getenv:
+            mock_getenv.return_value = "~/programs/ghidra_10.4_PUBLIC"
+            self.assertEqual(
+                os.getenv("GHIDRA_INSTALL_PATH"), "~/programs/ghidra_10.4_PUBLIC"
+            )
+            mock_getenv.assert_called_once_with("GHIDRA_INSTALL_PATH")
+    def test_initialization(self):
+        """Test that BinarySplitter is initialized correctly."""
+        self.assertIsInstance(self.splitter, BinarySplitter)
+        self.assertEqual(self.splitter.model, self.llm)
     @pytest.mark.ghidra(
         reason=(
             "No way to test this in CI w/o installing Ghidra, but want to keep here to "

janus/language/binary/binary.py CHANGED Viewed

@@ -29,7 +29,13 @@ class BinarySplitter(TreeSitterSplitter):
     with for transcoding.
     """
-    def __init__(self, model: None | BaseLanguageModel = None, max_tokens: int = 4096):
+    def __init__(
+        self,
+        model: None | BaseLanguageModel = None,
+        max_tokens: int = 4096,
+        protected_node_types: tuple[str] = (),
+        prune_node_types: tuple[str] = (),
+    ):
         """Initialize a BinarySplitter instance.
         Arguments:
@@ -40,7 +46,8 @@ class BinarySplitter(TreeSitterSplitter):
             language="binary",
             model=model,
             max_tokens=max_tokens,
-            use_placeholders=False,
+            protected_node_types=protected_node_types,
+            prune_node_types=prune_node_types,
         )
     def _execute_ghidra_script(self, cmd: list[str]) -> str:
@@ -131,7 +138,7 @@ class BinarySplitter(TreeSitterSplitter):
         code = self._get_decompilation(file)
         root = self._get_ast(code)
-        self._set_identifiers(root, path)
+        self._set_identifiers(root, path.name)
         self._segment_leaves(root)
         self._merge_tree(root)

janus/language/block.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from functools import total_ordering
-from typing import ForwardRef, Hashable
+from typing import ForwardRef, Hashable, Optional, Tuple
 from ..utils.logger import create_logger
 from .node import NodeType
@@ -14,7 +14,7 @@ class CodeBlock:
     Attributes:
         id: The id of the code block in the AST
         name: Descriptive name of node
-        type: The type of the code block ('function', 'module', etc.). Defined in the
+        node_type: The type of the code block ('function', 'module', etc.). Defined in the
             language-specific modules.
         language: The language of the code block.
         text: The code block.
@@ -33,32 +33,32 @@ class CodeBlock:
     def __init__(
         self,
         id: Hashable,
-        name: None | str,
-        type: NodeType,
+        name: Optional[str],
+        node_type: NodeType,
         language: str,
-        text: None | str,
-        start_point: None | tuple[int, int],
-        end_point: None | tuple[int, int],
-        start_byte: None | int,
-        end_byte: None | int,
+        text: Optional[str],
+        start_point: Optional[Tuple[int, int]],
+        end_point: Optional[Tuple[int, int]],
+        start_byte: Optional[int],
+        end_byte: Optional[int],
         tokens: int,
         children: list[ForwardRef("CodeBlock")],
-        embedding_id: None | str = None,
-        affixes: tuple[str, str] = ("", ""),
+        embedding_id: Optional[str] = None,
+        affixes: Tuple[str, str] = ("", ""),
     ) -> None:
         self.id: Hashable = id
-        self.name: None | str = name
-        self.type: NodeType = type
+        self.name: Optional[str] = name
+        self.node_type: NodeType = node_type
         self.language: str = language
-        self.text: None | str = text
-        self.start_point: None | tuple[int, int] = start_point
-        self.end_point: None | tuple[int, int] = end_point
-        self.start_byte: None | [int] = start_byte
-        self.end_byte: None | [int] = end_byte
+        self.text: Optional[str] = text
+        self.start_point: Optional[Tuple[int, int]] = start_point
+        self.end_point: Optional[Tuple[int, int]] = end_point
+        self.start_byte: Optional[int] = start_byte
+        self.end_byte: Optional[int] = end_byte
         self.tokens: int = tokens
         self.children: list[ForwardRef("CodeBlock")] = sorted(children)
-        self.embedding_id: None | [str] = embedding_id
-        self.affixes: tuple[str, str] = affixes
+        self.embedding_id: Optional[str] = embedding_id
+        self.affixes: Tuple[str, str] = affixes
         self.complete = True
         self.omit_prefix = True
@@ -83,15 +83,7 @@ class CodeBlock:
     @property
     def complete_text(self) -> str:
-        return f"{self.prefix}{self.text}{self.suffix}"
-    @property
-    def placeholder(self) -> str:
-        return f"<<<{self.id}>>>"
-    @property
-    def complete_placeholder(self) -> str:
-        return f"{self.prefix}<<<{self.id}>>>{self.suffix}"
+        return f"{self.prefix}{self.text or ''}{self.suffix}"
     @property
     def n_descendents(self) -> int:
@@ -146,6 +138,14 @@ class CodeBlock:
         self.affixes = (self.affixes[0], "")
         return suffix
+    def rebuild_text_from_children(self):
+        if self.children:
+            prefix = self.affixes[0] + self.children[0].pop_prefix()
+            suffix = self.children[-1].pop_suffix() + self.affixes[1]
+            self.text = "".join(c.complete_text for c in self.children)
+            self.affixes = (prefix, suffix)
+            self.tokens = sum(c.tokens for c in self.children)
     def tree_str(self, depth: int = 0) -> str:
         """A string representation of the tree with this block as the root
@@ -195,7 +195,7 @@ class TranslatedCodeBlock(CodeBlock):
         super().__init__(
             id=original.id,
             name=original.name,
-            type=original.type,
+            node_type=original.node_type,
             language=language,
             text=None,
             start_point=original.start_point,
@@ -214,6 +214,7 @@ class TranslatedCodeBlock(CodeBlock):
         self.translated = False
         self.cost = 0.0
         self.retries = 0
+        self.processing_time = 0
     @property
     def total_cost(self) -> float:

janus/language/combine.py CHANGED Viewed

@@ -11,14 +11,14 @@ class Combiner(FileManager):
     """
     @staticmethod
-    def combine(block: CodeBlock) -> None:
+    def combine(root: CodeBlock) -> None:
         """Combine the given block with its children.
         Arguments:
-            block: The functional code block to combine with its children.
+            root: The functional code block to combine with its children.
         """
-        Combiner.combine_children(block)
-        block.omit_prefix = False
+        Combiner.combine_children(root)
+        root.omit_prefix = False
     @staticmethod
     def combine_children(block: CodeBlock) -> None:
@@ -48,16 +48,11 @@ class Combiner(FileManager):
             block.complete = children_complete
             return
-        # Replace all placeholders
         missing_children = []
         for child in block.children:
             if isinstance(block, TranslatedCodeBlock) and not child.translated:
                 missing_children.append(child)
                 continue
-            if not Combiner.contains_child(block.text, child):
-                missing_children.append(child)
-                continue
-            block.text = block.text.replace(child.placeholder, child.text)
         if missing_children:
             missing_ids = [c.id for c in missing_children]
@@ -66,36 +61,33 @@ class Combiner(FileManager):
         block.children = missing_children
         block.complete = children_complete and not missing_children
+class JsonCombiner(Combiner):
     @staticmethod
-    def contains_child(code: str, child: CodeBlock) -> bool:
-        """Determine whether the given code contains a placeholder for the given
-        child block.
+    def combine(root: CodeBlock) -> None:
+        """Combine the given block with its children.
         Arguments:
-            code: The code to check for the placeholder
-            child: The child block to check for
-        Returns:
-            Whether the given code contains a placeholder for the given child
-            block.
+            root: The functional code block to combine with its children.
         """
-        return code is None or child.placeholder in code
+        stack = [root]
+        while stack:
+            block = stack.pop()
+            if block.children:
+                stack.extend(block.children)
+                block.affixes = ("", "")
+            else:
+                block.affixes = ("\n", "\n")
+        super(JsonCombiner, JsonCombiner).combine(root)
+class ChunkCombiner(Combiner):
     @staticmethod
-    def count_missing(input_block: CodeBlock, output_code: str) -> int:
-        """Return the number of children of input_block who are not represented
-        in output_code with a placeholder
+    def combine(root: CodeBlock) -> None:
+        """A combiner which doesn't actually combine the code blocks,
+        instead preserving children
         Arguments:
-            input_block: The block to check for missing children
-            output_code: The code to check for placeholders
-        Returns:
-            The number of children of input_block who are not represented in
-            output_code with a placeholder
+            root: The functional code block to combine with its children.
         """
-        missing_children = 0
-        for child in input_block.children:
-            if not Combiner.contains_child(output_code, child):
-                missing_children += 1
-        return missing_children
+        return root

janus/language/mumps/_tests/test_mumps.py CHANGED Viewed

@@ -11,7 +11,7 @@ class TestMumpsSplitter(unittest.TestCase):
     def setUp(self):
         """Set up the tests."""
-        model_name = "gpt-3.5-turbo"
+        model_name = "gpt-3.5-turbo-0125"
         llm, _, _ = load_model(model_name)
         self.splitter = MumpsSplitter(model=llm)
         self.combiner = Combiner(language="mumps")
@@ -20,7 +20,7 @@ class TestMumpsSplitter(unittest.TestCase):
     def test_split(self):
         """Test the split method."""
         tree_root = self.splitter.split(self.test_file)
-        self.assertEqual(len(tree_root.children), 6)
+        self.assertEqual(len(tree_root.children), 22)
         self.assertLessEqual(tree_root.max_tokens, self.splitter.max_tokens)
         self.assertFalse(tree_root.complete)
         self.combiner.combine_children(tree_root)

janus/language/mumps/mumps.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import re
-from pathlib import Path
 from langchain.schema.language_model import BaseLanguageModel
@@ -43,7 +42,13 @@ class MumpsSplitter(Splitter):
         re.VERBOSE | re.DOTALL,
     )
-    def __init__(self, model: None | BaseLanguageModel = None, max_tokens: int = 4096):
+    def __init__(
+        self,
+        model: None | BaseLanguageModel = None,
+        max_tokens: int = 4096,
+        protected_node_types: tuple[str] = ("routine_definition",),
+        prune_node_types: tuple[str] = (),
+    ):
         """Initialize a MumpsSplitter instance.
         Arguments:
@@ -53,17 +58,18 @@ class MumpsSplitter(Splitter):
             language="mumps",
             model=model,
             max_tokens=max_tokens,
-            use_placeholders=False,
+            protected_node_types=protected_node_types,
+            prune_node_types=prune_node_types,
         )
         # MUMPS code tends to take about 2/3 the space of Python
         self.max_tokens: int = int(max_tokens * 2 / 5)
-    def _set_identifiers(self, root: CodeBlock, path: Path):
+    def _set_identifiers(self, root: CodeBlock, name: str):
         stack = [root]
         while stack:
             node = stack.pop()
-            node.name = f"{path.name}:{node.id}"
+            node.name = f"{name}:{node.id}"
             stack.extend(node.children)
     def _get_ast(self, code: str) -> CodeBlock:
@@ -104,15 +110,19 @@ class MumpsSplitter(Splitter):
                 start_byte=start_byte,
                 end_byte=end_byte,
                 affixes=(prefix, suffix),
-                type=NodeType("subroutine"),
+                node_type=NodeType("routine_definition"),
                 children=[],
                 language=self.language,
                 tokens=self._count_tokens(chunk),
             )
+            self._split_into_lines(node)
+            for line_node in node.children:
+                self._split_comment(line_node)
             children.append(node)
-            start_byte = end_byte + len(bytes(suffix, "utf-8"))
-            start_line = end_line + suffix.count("\n")
+            start_byte = end_byte
+            start_line = end_line
         return CodeBlock(
             text=code,
@@ -122,8 +132,82 @@ class MumpsSplitter(Splitter):
             end_point=(code.count("\n"), 0),
             start_byte=0,
             end_byte=len(bytes(code, "utf-8")),
-            type=NodeType("routine"),
+            node_type=NodeType("routine"),
             children=children,
             language=self.language,
             tokens=self._count_tokens(code),
         )
+    @staticmethod
+    def comment_start(line: str) -> int:
+        first_semicolon = line.find(";")
+        if first_semicolon < 0:
+            return first_semicolon
+        # In mumps, quotes are escaped by doubling them (""). Single quote
+        #  characters are logical not operators, not quotes
+        n_quotes = line[:first_semicolon].replace('""', "").count('"')
+        # If the number of quotes prior to the first semicolon is even, then
+        #  that semicolon is not part of a quote (and therefore starts a comment)
+        if n_quotes % 2 == 0:
+            return first_semicolon
+        last_semicolon = first_semicolon
+        while (next_semicolon := line.find(";", last_semicolon + 1)) > 0:
+            n_quotes = line[last_semicolon:next_semicolon].replace('""', "").count('"')
+            # If the number of quotes in this chunk is odd, the total number
+            #  of them up to this point is even, and the next semicolon begins
+            #  the comment
+            if n_quotes % 2:
+                return next_semicolon
+            last_semicolon = next_semicolon
+        return -1
+    def _split_comment(self, line_node: CodeBlock):
+        comment_start = self.comment_start(line_node.text)
+        if comment_start < 0:
+            line_node.node_type = NodeType("code_line")
+            return
+        code = line_node.text[:comment_start]
+        if not code.strip():
+            line_node.node_type = NodeType("comment")
+            return
+        comment = line_node.text[comment_start:]
+        (l0, c0), (l1, c1) = line_node.start_point, line_node.end_point
+        prefix, suffix = line_node.affixes
+        code_bytes = len(bytes(code, "utf-8"))
+        line_node.children = [
+            CodeBlock(
+                text=code,
+                name=f"{line_node.name}-code",
+                id=f"{line_node.name}-code",
+                start_point=(l0, c0),
+                end_point=(l1, comment_start),
+                start_byte=line_node.start_byte,
+                end_byte=line_node.start_byte + code_bytes,
+                node_type=NodeType("code_line"),
+                children=[],
+                language=line_node.language,
+                tokens=self._count_tokens(code),
+            ),
+            CodeBlock(
+                text=comment,
+                name=f"{line_node.name}-comment",
+                id=f"{line_node.name}-comment",
+                start_point=(l0, c0 + comment_start),
+                end_point=(l1, c1),
+                start_byte=line_node.start_byte + code_bytes,
+                end_byte=line_node.end_byte,
+                node_type=NodeType("comment"),
+                children=[],
+                language=self.language,
+                tokens=self._count_tokens(comment),
+            ),
+        ]

janus/language/naive/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .basic_splitter import FileSplitter
+from .chunk_splitter import ChunkSplitter
+from .simple_ast import FlexibleTreeSitterSplitter, StrictTreeSitterSplitter
+from .tag_splitter import TagSplitter

janus/language/naive/basic_splitter.py ADDED Viewed

@@ -0,0 +1,14 @@
+from janus.language.block import CodeBlock
+from janus.language.naive.chunk_splitter import ChunkSplitter
+from janus.language.naive.registry import register_splitter
+from janus.language.splitter import FileSizeError
+@register_splitter("file")
+class FileSplitter(ChunkSplitter):
+    """
+    Splits based on the entire file of the code
+    """
+    def _split_into_lines(self, node: CodeBlock):
+        raise FileSizeError("File too large for basic splitter")

janus/language/naive/chunk_splitter.py ADDED Viewed

@@ -0,0 +1,26 @@
+from janus.language.block import CodeBlock
+from janus.language.naive.registry import register_splitter
+from janus.language.node import NodeType
+from janus.language.splitter import Splitter
+@register_splitter("chunk")
+class ChunkSplitter(Splitter):
+    """
+    Splits into fixed chunk sizes without parsing
+    """
+    def _get_ast(self, code: str) -> CodeBlock:
+        return CodeBlock(
+            text=code,
+            name="root",
+            id="root",
+            start_point=(0, 0),
+            end_point=(code.count("\n"), 0),
+            start_byte=0,
+            end_byte=len(bytes(code, "utf-8")),
+            node_type=NodeType("program"),
+            children=[],
+            language=self.language,
+            tokens=self._count_tokens(code),
+        )

janus/language/naive/registry.py ADDED Viewed

@@ -0,0 +1,13 @@
+from typing import Callable, Dict
+from janus.language.splitter import Splitter
+CUSTOM_SPLITTERS: Dict[str, Callable[..., Splitter]] = dict()
+def register_splitter(name: str):
+    def callback(splitter):
+        CUSTOM_SPLITTERS[name] = splitter
+        return splitter
+    return callback

janus/language/naive/simple_ast.py ADDED Viewed

@@ -0,0 +1,18 @@
+from janus.language.naive.registry import register_splitter
+from janus.language.treesitter import TreeSitterSplitter
+from janus.utils.enums import LANGUAGES
+@register_splitter("ast-flex")
+class FlexibleTreeSitterSplitter(TreeSitterSplitter):
+    pass
+@register_splitter("ast-strict")
+class StrictTreeSitterSplitter(TreeSitterSplitter):
+    def __init__(self, language: str, **kwargs):
+        kwargs.update(
+            protected_node_types=(LANGUAGES[language]["functional_node_type"],),
+            prune_unprotected=True,
+        )
+        super().__init__(language=language, **kwargs)

janus/language/naive/tag_splitter.py ADDED Viewed

@@ -0,0 +1,61 @@
+from janus.language.block import CodeBlock
+from janus.language.naive.registry import register_splitter
+from janus.language.node import NodeType
+from janus.language.splitter import Splitter
+@register_splitter("tag")
+class TagSplitter(Splitter):
+    """
+    Splits code by tags inserted into code
+    """
+    def __init__(self, tag: str, *args, **kwargs):
+        kwargs.update(protected_node_types=("chunk",))
+        super().__init__(*args, **kwargs)
+        self._tag = f"\n{tag}\n"
+    def _get_ast(self, code: str) -> CodeBlock:
+        chunks = code.split(self._tag)
+        children = []
+        start_line = 0
+        start_byte = 0
+        for i, chunk in enumerate(chunks):
+            prefix = suffix = self._tag
+            if i == 0:
+                prefix = ""
+            if i == len(chunks) - 1:
+                suffix = ""
+            end_byte = start_byte + len(bytes(chunk, "utf-8"))
+            end_line = start_line + chunk.count("\n")
+            end_char = len(chunk) - chunk.rfind("\n") - 1
+            node = CodeBlock(
+                text=chunk,
+                name=f"Chunk {i}",
+                id=f"Chunk {i}",
+                start_point=(start_line, 0),
+                end_point=(end_line, end_char),
+                start_byte=start_byte,
+                end_byte=end_byte,
+                affixes=(prefix, suffix),
+                node_type=NodeType("chunk"),
+                children=[],
+                language=self.language,
+                tokens=self._count_tokens(chunk),
+            )
+            children.append(node)
+            start_line = end_line
+            start_byte = end_byte
+        return CodeBlock(
+            text=code,
+            name="root",
+            id="root",
+            start_point=(0, 0),
+            end_point=(code.count("\n"), 0),
+            start_byte=0,
+            end_byte=len(bytes(code, "utf-8")),
+            node_type=NodeType("program"),
+            children=children,
+            language=self.language,
+            tokens=self._count_tokens(code),
+        )

janus-llm 1.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

janus-llm 1.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl