janus-llm 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
janus/__init__.py CHANGED
@@ -5,7 +5,7 @@ from langchain_core._api.deprecation import LangChainDeprecationWarning
5
5
  from .metrics import * # noqa: F403
6
6
  from .translate import Translator
7
7
 
8
- __version__ = "2.0.2"
8
+ __version__ = "2.1.0"
9
9
 
10
10
  # Ignoring a deprecation warning from langchain_core that I can't seem to hunt down
11
11
  warnings.filterwarnings("ignore", category=LangChainDeprecationWarning)
janus/cli.py CHANGED
@@ -12,8 +12,6 @@ from rich.console import Console
12
12
  from rich.prompt import Confirm
13
13
  from typing_extensions import Annotated
14
14
 
15
- from janus.language.naive.registry import CUSTOM_SPLITTERS
16
-
17
15
  from .embedding.collections import Collections
18
16
  from .embedding.database import ChromaEmbeddingDatabase
19
17
  from .embedding.embedding_models_info import (
@@ -25,6 +23,7 @@ from .embedding.embedding_models_info import (
25
23
  from .embedding.vectorize import ChromaDBVectorizer
26
24
  from .language.binary import BinarySplitter
27
25
  from .language.mumps import MumpsSplitter
26
+ from .language.naive.registry import CUSTOM_SPLITTERS
28
27
  from .language.treesitter import TreeSitterSplitter
29
28
  from .llm.model_callbacks import COST_PER_1K_TOKENS
30
29
  from .llm.models_info import MODEL_CONFIG_DIR, MODEL_TYPE_CONSTRUCTORS, TOKEN_LIMITS
janus/converter.py CHANGED
@@ -3,6 +3,7 @@ from typing import Any
3
3
 
4
4
  from langchain.schema.language_model import BaseLanguageModel
5
5
 
6
+ from .language.alc.alc import AlcSplitter
6
7
  from .language.binary import BinarySplitter
7
8
  from .language.mumps import MumpsSplitter
8
9
  from .language.splitter import Splitter
@@ -152,6 +153,8 @@ class Converter:
152
153
  if self._source_language in CUSTOM_SPLITTERS:
153
154
  if self._source_language == "mumps":
154
155
  self._splitter = MumpsSplitter(**kwargs)
156
+ elif self._source_language == "ibmhlasm":
157
+ self._splitter = AlcSplitter(**kwargs)
155
158
  elif self._source_language == "binary":
156
159
  self._splitter = BinarySplitter(**kwargs)
157
160
  else:
@@ -4,8 +4,8 @@ from unittest.mock import MagicMock
4
4
 
5
5
  import pytest
6
6
 
7
- from janus.embedding.collections import Collections
8
- from janus.utils.enums import EmbeddingType
7
+ from ...utils.enums import EmbeddingType
8
+ from ..collections import Collections
9
9
 
10
10
 
11
11
  class TestCollections(unittest.TestCase):
@@ -1,6 +1,6 @@
1
1
  import unittest
2
2
 
3
- from janus.language.splitter import Splitter
3
+ from ..splitter import Splitter
4
4
 
5
5
 
6
6
  class TestSplitter(unittest.TestCase):
@@ -0,0 +1 @@
1
+ from .alc import AlcCombiner, AlcSplitter
File without changes
@@ -0,0 +1,28 @@
1
+ import unittest
2
+ from pathlib import Path
3
+
4
+ from ....llm import load_model
5
+ from ...combine import Combiner
6
+ from ..alc import AlcSplitter
7
+
8
+
9
class TestAlcSplitter(unittest.TestCase):
    """Tests for the AlcSplitter class."""

    def setUp(self):
        """Create the splitter, the combiner and the sample-file path."""
        model, _, _ = load_model("gpt-3.5-turbo-0125")
        self.splitter = AlcSplitter(model=model)
        self.combiner = Combiner(language="ibmhlasm")
        # Relative path: resolved against the directory the tests run from.
        self.test_file = Path("janus/language/alc/_tests/alc.asm")

    def test_split(self):
        """Split the sample file, recombine it, and compare with the source."""
        root = self.splitter.split(self.test_file)

        # State of the tree straight after splitting.
        self.assertEqual(root.n_descendents, 34)
        self.assertLessEqual(root.max_tokens, self.splitter.max_tokens)
        self.assertFalse(root.complete)

        # After combining the children, the root must be complete and its
        # text must match the file on disk exactly.
        self.combiner.combine_children(root)
        self.assertTrue(root.complete)
        self.assertEqual(root.complete_text, self.test_file.read_text())
@@ -0,0 +1,87 @@
1
+ from langchain.schema.language_model import BaseLanguageModel
2
+
3
+ from ...utils.logger import create_logger
4
+ from ..block import CodeBlock
5
+ from ..combine import Combiner
6
+ from ..node import NodeType
7
+ from ..treesitter import TreeSitterSplitter
8
+
9
+ log = create_logger(__name__)
10
+
11
+
12
class AlcCombiner(Combiner):
    """Combiner specialised for ALC files (language key ``ibmhlasm``)."""

    def __init__(self) -> None:
        """Initialize an AlcCombiner instance with the language fixed."""
        language = "ibmhlasm"
        super().__init__(language)
18
+
19
+
20
class AlcSplitter(TreeSitterSplitter):
    """A class for splitting ALC code into functional blocks to prompt
    with for transcoding.
    """

    def __init__(
        self,
        model: None | BaseLanguageModel = None,
        max_tokens: int = 4096,
        protected_node_types: tuple[str, ...] = (),
        prune_node_types: tuple[str, ...] = (),
        prune_unprotected: bool = False,
    ):
        """Initialize an AlcSplitter instance.

        The language is fixed to "ibmhlasm"; every argument is forwarded
        unchanged to `TreeSitterSplitter.__init__`.

        Arguments:
            model: Optional language model handed to the parent splitter.
            max_tokens: The maximum number of tokens supported by the model
            protected_node_types: Forwarded to the parent splitter.
            prune_node_types: Forwarded to the parent splitter.
            prune_unprotected: Forwarded to the parent splitter.
        """
        super().__init__(
            language="ibmhlasm",
            model=model,
            max_tokens=max_tokens,
            protected_node_types=protected_node_types,
            prune_node_types=prune_node_types,
            prune_unprotected=prune_unprotected,
        )

    def _get_ast(self, code: str) -> CodeBlock:
        """Parse `code` and nest each csect/dsect under its own section node.

        Arguments:
            code: The source text to parse.

        Returns:
            The root block produced by the parent `_get_ast`, with its tree
            restructured in place.
        """
        # Imported locally so the method stays self-contained.
        from collections import deque

        root = super()._get_ast(code)

        # Current treesitter implementation does not nest csects and dsects
        # The loop below nests nodes following csect/dsect instructions into
        # the children of that instruction
        sect_types = {"csect_instruction", "dsect_instruction"}

        # Breadth-first traversal; a deque gives O(1) pops from the front,
        # where list.pop(0) would shift the whole list on every iteration.
        queue: deque[CodeBlock] = deque([root])
        while queue:
            block = queue.popleft()

            # Search this block's children for csects and dsects. Create a
            # list of groups where each group is a csect or dsect, starting
            # with the csect/dsect instruction and containing all the
            # subsequent nodes up until the next csect or dsect instruction.
            # The first group collects any nodes preceding the first section.
            sects: list[list[CodeBlock]] = [[]]
            for c in block.children:
                if c.node_type in sect_types:
                    sects.append([c])
                else:
                    sects[-1].append(c)

            # Drop the leading group if nothing preceded the first section
            sects = [s for s in sects if s]

            # Restructure the tree, making the head of each group the parent
            # of all the remaining nodes in that group. A single group is
            # left as-is, which also stops a freshly created section node
            # from being regrouped when it is visited later.
            if len(sects) > 1:
                block.children = []
                for sect in sects:
                    if sect[0].node_type in sect_types:
                        # NOTE(review): merge_nodes presumably returns one
                        # block covering the whole group - confirm in Splitter
                        sect_node = self.merge_nodes(sect)
                        sect_node.children = sect
                        # First five characters of "csect_instruction" /
                        # "dsect_instruction" give "csect" / "dsect"
                        sect_node.node_type = NodeType(str(sect[0].node_type)[:5])
                        block.children.append(sect_node)
                    else:
                        block.children.extend(sect)

            # Push the children onto the queue
            queue.extend(block.children)

        return root
janus/language/block.py CHANGED
@@ -152,9 +152,11 @@ class CodeBlock:
152
152
  Returns:
153
153
  A string representation of the tree with this block as the root
154
154
  """
155
+ tokens = self.tokens
155
156
  identifier = self.id
156
157
  if self.text is None:
157
158
  identifier = f"({identifier})"
159
+ tokens = self.total_tokens
158
160
  elif not self.complete:
159
161
  identifier += "*"
160
162
  if self.start_point is not None and self.end_point is not None:
@@ -165,7 +167,7 @@ class CodeBlock:
165
167
  seg = ""
166
168
  return "\n".join(
167
169
  [
168
- f"{'| '*depth}{identifier}{seg}",
170
+ f"{'| '*depth}{identifier}{seg} ({tokens:,d} tokens)",
169
171
  *[c.tree_str(depth + 1) for c in self.children],
170
172
  ]
171
173
  )
@@ -48,6 +48,7 @@ class MumpsSplitter(Splitter):
48
48
  max_tokens: int = 4096,
49
49
  protected_node_types: tuple[str] = ("routine_definition",),
50
50
  prune_node_types: tuple[str] = (),
51
+ prune_unprotected: bool = False,
51
52
  ):
52
53
  """Initialize a MumpsSplitter instance.
53
54
 
@@ -60,11 +61,9 @@ class MumpsSplitter(Splitter):
60
61
  max_tokens=max_tokens,
61
62
  protected_node_types=protected_node_types,
62
63
  prune_node_types=prune_node_types,
64
+ prune_unprotected=prune_unprotected,
63
65
  )
64
66
 
65
- # MUMPS code tends to take about 2/3 the space of Python
66
- self.max_tokens: int = int(max_tokens * 2 / 5)
67
-
68
67
  def _set_identifiers(self, root: CodeBlock, name: str):
69
68
  stack = [root]
70
69
  while stack:
@@ -1,4 +1,4 @@
1
1
  from .basic_splitter import FileSplitter
2
2
  from .chunk_splitter import ChunkSplitter
3
- from .simple_ast import FlexibleTreeSitterSplitter, StrictTreeSitterSplitter
3
+ from .simple_ast import get_flexible_ast, get_strict_ast
4
4
  from .tag_splitter import TagSplitter
@@ -1,7 +1,7 @@
1
- from janus.language.block import CodeBlock
2
- from janus.language.naive.chunk_splitter import ChunkSplitter
3
- from janus.language.naive.registry import register_splitter
4
- from janus.language.splitter import FileSizeError
1
+ from ..block import CodeBlock
2
+ from ..naive.chunk_splitter import ChunkSplitter
3
+ from ..naive.registry import register_splitter
4
+ from ..splitter import FileSizeError
5
5
 
6
6
 
7
7
  @register_splitter("file")
@@ -1,7 +1,7 @@
1
- from janus.language.block import CodeBlock
2
- from janus.language.naive.registry import register_splitter
3
- from janus.language.node import NodeType
4
- from janus.language.splitter import Splitter
1
+ from ..block import CodeBlock
2
+ from ..node import NodeType
3
+ from ..splitter import Splitter
4
+ from .registry import register_splitter
5
5
 
6
6
 
7
7
  @register_splitter("chunk")
@@ -1,6 +1,6 @@
1
1
  from typing import Callable, Dict
2
2
 
3
- from janus.language.splitter import Splitter
3
+ from ..splitter import Splitter
4
4
 
5
5
  CUSTOM_SPLITTERS: Dict[str, Callable[..., Splitter]] = dict()
6
6
 
@@ -1,18 +1,29 @@
1
- from janus.language.naive.registry import register_splitter
2
- from janus.language.treesitter import TreeSitterSplitter
3
- from janus.utils.enums import LANGUAGES
1
+ from ...utils.enums import LANGUAGES
2
+ from ..alc.alc import AlcSplitter
3
+ from ..mumps.mumps import MumpsSplitter
4
+ from ..treesitter import TreeSitterSplitter
5
+ from .registry import register_splitter
4
6
 
5
7
 
6
8
  @register_splitter("ast-flex")
7
- class FlexibleTreeSitterSplitter(TreeSitterSplitter):
8
- pass
9
def get_flexible_ast(language: str, **kwargs):
    """Return the AST splitter for `language`, built from `kwargs`.

    "ibmhlasm" and "mumps" get their dedicated splitter classes, which are
    constructed without a `language` argument. Any other language gets a
    `TreeSitterSplitter` constructed with `language` passed through.
    """
    if language == "ibmhlasm":
        return AlcSplitter(**kwargs)
    if language == "mumps":
        return MumpsSplitter(**kwargs)
    return TreeSitterSplitter(language=language, **kwargs)
9
16
 
10
17
 
11
18
  @register_splitter("ast-strict")
12
- class StrictTreeSitterSplitter(TreeSitterSplitter):
13
- def __init__(self, language: str, **kwargs):
14
- kwargs.update(
15
- protected_node_types=(LANGUAGES[language]["functional_node_type"],),
16
- prune_unprotected=True,
17
- )
18
- super().__init__(language=language, **kwargs)
19
def get_strict_ast(language: str, **kwargs):
    """Return a splitter for `language` that protects its functional nodes.

    `protected_node_types` and `prune_unprotected` in `kwargs` are always
    overridden: the former with the language's "functional_node_types"
    entry from `LANGUAGES`, the latter with True.

    Arguments:
        language: Key into `LANGUAGES`; also selects the splitter class.
        **kwargs: Remaining keyword arguments for the splitter constructor.

    Raises:
        KeyError: If `language` is not in `LANGUAGES` or its entry has no
            "functional_node_types" key.
    """
    # LANGUAGES stores the node types as a list, while the splitters declare
    # `protected_node_types` as tuple[str, ...]; convert so the value matches
    # the declared type.
    kwargs.update(
        protected_node_types=tuple(LANGUAGES[language]["functional_node_types"]),
        prune_unprotected=True,
    )
    if language == "ibmhlasm":
        return AlcSplitter(**kwargs)
    elif language == "mumps":
        return MumpsSplitter(**kwargs)
    else:
        return TreeSitterSplitter(language=language, **kwargs)
@@ -1,7 +1,7 @@
1
- from janus.language.block import CodeBlock
2
- from janus.language.naive.registry import register_splitter
3
- from janus.language.node import NodeType
4
- from janus.language.splitter import Splitter
1
+ from ..block import CodeBlock
2
+ from ..node import NodeType
3
+ from ..splitter import Splitter
4
+ from .registry import register_splitter
5
5
 
6
6
 
7
7
  @register_splitter("tag")
@@ -47,8 +47,8 @@ class Splitter(FileManager):
47
47
  model: None | BaseLanguageModel = None,
48
48
  max_tokens: int = 4096,
49
49
  skip_merge: bool = False,
50
- protected_node_types: tuple[str] = (),
51
- prune_node_types: tuple[str] = (),
50
+ protected_node_types: tuple[str, ...] = (),
51
+ prune_node_types: tuple[str, ...] = (),
52
52
  prune_unprotected: bool = False,
53
53
  ):
54
54
  """
@@ -340,7 +340,10 @@ class Splitter(FileManager):
340
340
  # Double check length (in theory this should never be an issue)
341
341
  tokens = self._count_tokens(text)
342
342
  if tokens > self.max_tokens:
343
- log.error(f"Merged node ({name}) too long for context!")
343
+ log.error(
344
+ f"Merged node ({name}) too long for context!"
345
+ f" ({tokens} > {self.max_tokens})"
346
+ )
344
347
 
345
348
  return CodeBlock(
346
349
  text=text,
@@ -420,7 +423,10 @@ class Splitter(FileManager):
420
423
  name = f"{node.name}-L#{node_line}"
421
424
  tokens = self._count_tokens(line)
422
425
  if tokens > self.max_tokens:
423
- raise TokenLimitError(r"Irreducible node too large for context!")
426
+ raise TokenLimitError(
427
+ "Irreducible node too large for context!"
428
+ f" ({tokens} > {self.max_tokens})"
429
+ )
424
430
 
425
431
  node.children.append(
426
432
  CodeBlock(
@@ -26,8 +26,8 @@ class TreeSitterSplitter(Splitter):
26
26
  language: str,
27
27
  model: None | BaseLanguageModel = None,
28
28
  max_tokens: int = 4096,
29
- protected_node_types: tuple[str] = (),
30
- prune_node_types: tuple[str] = (),
29
+ protected_node_types: tuple[str, ...] = (),
30
+ prune_node_types: tuple[str, ...] = (),
31
31
  prune_unprotected: bool = False,
32
32
  ) -> None:
33
33
  """Initialize a TreeSitterSplitter instance.
@@ -48,10 +48,10 @@ class TreeSitterSplitter(Splitter):
48
48
  self._load_parser()
49
49
 
50
50
  def _get_ast(self, code: str) -> CodeBlock:
51
- code = bytes(code, "utf-8")
52
- tree = self.parser.parse(code)
51
+ code_bytes = bytes(code, "utf-8")
52
+ tree = self.parser.parse(code_bytes)
53
53
  root = tree.walk().node
54
- root = self._node_to_block(root, code)
54
+ root = self._node_to_block(root, code_bytes)
55
55
  return root
56
56
 
57
57
  # Recursively print tree to view parsed output (dev helper function)
@@ -98,7 +98,7 @@ class TreeSitterSplitter(Splitter):
98
98
 
99
99
  text = node.text.decode()
100
100
  children = [self._node_to_block(child, original_text) for child in node.children]
101
- node = CodeBlock(
101
+ return CodeBlock(
102
102
  id=node.id,
103
103
  name=str(node.id),
104
104
  text=text,
@@ -112,7 +112,6 @@ class TreeSitterSplitter(Splitter):
112
112
  language=self.language,
113
113
  tokens=self._count_tokens(text),
114
114
  )
115
- return node
116
115
 
117
116
  def _load_parser(self) -> None:
118
117
  """Load the parser for the given language.
@@ -8,7 +8,7 @@ from langchain_core.messages import AIMessage
8
8
  from langchain_core.outputs import ChatGeneration, LLMResult
9
9
  from langchain_core.tracers.context import register_configure_hook
10
10
 
11
- from janus.utils.logger import create_logger
11
+ from ..utils.logger import create_logger
12
12
 
13
13
  log = create_logger(__name__)
14
14
 
janus/llm/models_info.py CHANGED
@@ -8,8 +8,7 @@ from langchain_community.llms import HuggingFaceTextGenInference
8
8
  from langchain_core.language_models import BaseLanguageModel
9
9
  from langchain_openai import ChatOpenAI
10
10
 
11
- from janus.llm.model_callbacks import COST_PER_1K_TOKENS
12
- from janus.prompts.prompt import (
11
+ from ..prompts.prompt import (
13
12
  ChatGptPromptEngine,
14
13
  ClaudePromptEngine,
15
14
  CoherePromptEngine,
@@ -18,8 +17,8 @@ from janus.prompts.prompt import (
18
17
  PromptEngine,
19
18
  TitanPromptEngine,
20
19
  )
21
-
22
20
  from ..utils.logger import create_logger
21
+ from .model_callbacks import COST_PER_1K_TOKENS
23
22
 
24
23
  log = create_logger(__name__)
25
24
 
@@ -3,8 +3,7 @@ from unittest.mock import patch
3
3
 
4
4
  import pytest
5
5
 
6
- from janus.llm.models_info import load_model
7
-
6
+ from ...llm.models_info import load_model
8
7
  from ..llm_metrics import llm_evaluate_option, llm_evaluate_ref_option
9
8
 
10
9
 
@@ -40,7 +39,7 @@ class TestLLMMetrics(unittest.TestCase):
40
39
  print("'Hello, world!")
41
40
  """
42
41
 
43
- @patch("janus.llm.models_info.load_model")
42
+ @patch(".llm.models_info.load_model")
44
43
  @patch("janus.metrics.llm_metrics.llm_evaluate")
45
44
  @pytest.mark.llm_eval
46
45
  def test_llm_self_eval_quality(self, mock_llm_evaluate, mock_load_model):
@@ -1,6 +1,6 @@
1
1
  import unittest
2
2
 
3
- from janus.metrics.rouge_score import rouge
3
+ from ..rouge_score import rouge
4
4
 
5
5
 
6
6
  class TestRouge(unittest.TestCase):
@@ -1,6 +1,6 @@
1
1
  import unittest
2
2
 
3
- from janus.metrics.similarity import similarity_score
3
+ from ..similarity import similarity_score
4
4
 
5
5
 
6
6
  class TestSimilarityScore(unittest.TestCase):
@@ -1,10 +1,9 @@
1
1
  import math
2
2
  from typing import List, Optional
3
3
 
4
- from janus.language.block import CodeBlock
5
- from janus.language.treesitter.treesitter import TreeSitterSplitter
6
- from janus.utils.enums import LANGUAGES
7
-
4
+ from ..language.block import CodeBlock
5
+ from ..language.treesitter.treesitter import TreeSitterSplitter
6
+ from ..utils.enums import LANGUAGES
8
7
  from .metric import metric
9
8
 
10
9
 
janus/metrics/metric.py CHANGED
@@ -7,10 +7,9 @@ import click
7
7
  import typer
8
8
  from typing_extensions import Annotated
9
9
 
10
- from janus.llm import load_model
11
- from janus.utils.enums import LANGUAGES
12
- from janus.utils.logger import create_logger
13
-
10
+ from ..llm import load_model
11
+ from ..utils.enums import LANGUAGES
12
+ from ..utils.logger import create_logger
14
13
  from ..utils.progress import track
15
14
  from .cli import evaluate
16
15
  from .file_pairing import FILE_PAIRING_METHODS
janus/prompts/prompt.py CHANGED
@@ -34,6 +34,40 @@ HUMAN_PROMPT_TEMPLATE_FILENAME = "human.txt"
34
34
  PROMPT_VARIABLES_FILENAME = "variables.json"
35
35
 
36
36
 
37
+ retry_with_output_prompt_text = """Instructions:
38
+ --------------
39
+ {instructions}
40
+ --------------
41
+ Completion:
42
+ --------------
43
+ {input}
44
+ --------------
45
+
46
+ Above, the Completion did not satisfy the constraints given in the Instructions.
47
+ Error:
48
+ --------------
49
+ {error}
50
+ --------------
51
+
52
+ Please try again. Please only respond with an answer that satisfies the
53
+ constraints laid out in the Instructions:"""
54
+
55
+
56
+ retry_with_error_and_output_prompt_text = """Prompt:
57
+ {prompt}
58
+ Completion:
59
+ {input}
60
+
61
+ Above, the Completion did not satisfy the constraints given in the Prompt.
62
+ Details: {error}
63
+ Please try again:"""
64
+
65
+ retry_with_output_prompt = PromptTemplate.from_template(retry_with_output_prompt_text)
66
+ retry_with_error_and_output_prompt = PromptTemplate.from_template(
67
+ retry_with_error_and_output_prompt_text
68
+ )
69
+
70
+
37
71
  class PromptEngine(ABC):
38
72
  """A class defining prompting schemes for the LLM."""
39
73
 
janus/translate.py CHANGED
@@ -16,12 +16,11 @@ from langchain_core.runnables import RunnableLambda, RunnableParallel
16
16
  from openai import BadRequestError, RateLimitError
17
17
  from text_generation.errors import ValidationError
18
18
 
19
- from janus.language.naive.registry import CUSTOM_SPLITTERS
20
-
21
19
  from .converter import Converter, run_if_changed
22
20
  from .embedding.vectorize import ChromaDBVectorizer
23
21
  from .language.block import CodeBlock, TranslatedCodeBlock
24
22
  from .language.combine import ChunkCombiner, Combiner, JsonCombiner
23
+ from .language.naive.registry import CUSTOM_SPLITTERS
25
24
  from .language.splitter import EmptyTreeError, FileSizeError, TokenLimitError
26
25
  from .llm import load_model
27
26
  from .llm.model_callbacks import get_model_callback
@@ -30,7 +29,12 @@ from .parsers.code_parser import CodeParser, GenericParser
30
29
  from .parsers.doc_parser import MadlibsDocumentationParser, MultiDocumentationParser
31
30
  from .parsers.eval_parser import EvaluationParser
32
31
  from .parsers.reqs_parser import RequirementsParser
33
- from .prompts.prompt import SAME_OUTPUT, TEXT_OUTPUT
32
+ from .prompts.prompt import (
33
+ SAME_OUTPUT,
34
+ TEXT_OUTPUT,
35
+ retry_with_error_and_output_prompt,
36
+ retry_with_output_prompt,
37
+ )
34
38
  from .utils.enums import LANGUAGES
35
39
  from .utils.logger import create_logger
36
40
 
@@ -407,10 +411,10 @@ class Translator(Converter):
407
411
  """
408
412
  self._parser.set_reference(block.original)
409
413
 
410
- # Retries with just the output and the error
414
+ # Retries with just the format instructions, the output, and the error
411
415
  n1 = round(self.max_prompts ** (1 / 3))
412
416
 
413
- # Retries with the input, output, and error
417
+ # Retries with the input, the output, and the error
414
418
  n2 = round((self.max_prompts // n1) ** (1 / 2))
415
419
 
416
420
  # Retries with just the input
@@ -420,11 +424,13 @@ class Translator(Converter):
420
424
  llm=self._llm,
421
425
  parser=self._parser,
422
426
  max_retries=n1,
427
+ prompt=retry_with_output_prompt,
423
428
  )
424
429
  retry = RetryWithErrorOutputParser.from_llm(
425
430
  llm=self._llm,
426
431
  parser=fix_format,
427
432
  max_retries=n2,
433
+ prompt=retry_with_error_and_output_prompt,
428
434
  )
429
435
 
430
436
  completion_chain = self._prompt | self._llm
janus/utils/enums.py CHANGED
@@ -10,7 +10,7 @@ class EmbeddingType(Enum):
10
10
  TARGET = 5 # placeholder embeddings, are these useful for analysis?
11
11
 
12
12
 
13
- CUSTOM_SPLITTERS: Set[str] = {"mumps", "binary"}
13
+ CUSTOM_SPLITTERS: Set[str] = {"mumps", "binary", "ibmhlasm"}
14
14
 
15
15
  LANGUAGES: Dict[str, Dict[str, Any]] = {
16
16
  "ada": {
@@ -63,7 +63,7 @@ LANGUAGES: Dict[str, Dict[str, Any]] = {
63
63
  '#include <stdio.h>\n\nint main() {\n printf("Hello, World!\\n");\n'
64
64
  " return 0;\n}\n"
65
65
  ),
66
- "functional_node_type": "function_definition",
66
+ "functional_node_types": ["function_definition"],
67
67
  "comment_node_type": "comment",
68
68
  },
69
69
  "capnp": {
@@ -206,7 +206,7 @@ LANGUAGES: Dict[str, Dict[str, Any]] = {
206
206
  "example": (
207
207
  "program HelloWorld\n print *, 'Hello, World!'\nend program HelloWorld\n"
208
208
  ),
209
- "functional_node_type": "function",
209
+ "functional_node_types": ["function"],
210
210
  "comment_node_type": "comment",
211
211
  },
212
212
  "gitattributes": {
@@ -300,6 +300,7 @@ LANGUAGES: Dict[str, Dict[str, Any]] = {
300
300
  END HELLO
301
301
  """
302
302
  ),
303
+ "functional_node_types": ["csect", "dsect"],
303
304
  "branch_node_types": ["branch_instruction"],
304
305
  "operation_node_types": ["operation", "branch_operation"],
305
306
  "operand_node_types": ["operands"],
@@ -420,7 +421,7 @@ LANGUAGES: Dict[str, Dict[str, Any]] = {
420
421
  "suffix": "m",
421
422
  "url": "https://github.com/janus-llm/tree-sitter-mumps",
422
423
  "example": 'WRITE "Hello, World!"',
423
- "functional_node_type": "routine_definition",
424
+ "functional_node_types": ["routine_definition"],
424
425
  "comment_node_type": "comment",
425
426
  "branch_node_types": ["if_statement"],
426
427
  "operation_node_types": [
@@ -512,7 +513,7 @@ LANGUAGES: Dict[str, Dict[str, Any]] = {
512
513
  "suffix": "py",
513
514
  "url": "https://github.com/tree-sitter/tree-sitter-python",
514
515
  "example": "# Hello, World!\nprint('Hello, World!')\n",
515
- "functional_node_type": "function_definition",
516
+ "functional_node_types": ["function_definition"],
516
517
  "comment_node_type": "comment",
517
518
  },
518
519
  "qmljs": {
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: janus-llm
3
- Version: 2.0.2
3
+ Version: 2.1.0
4
4
  Summary: A transcoding library using LLMs.
5
5
  Home-page: https://github.com/janus-llm/janus-llm
6
6
  License: Apache 2.0
@@ -1,14 +1,14 @@
1
- janus/__init__.py,sha256=GWW38p6MV8wH_SpAjgMKsoncBmX5J-7qBjCjgyoO8TY,341
1
+ janus/__init__.py,sha256=BDely1z7CW0_cUz6BOjQbE3ZTFvMNAfp2WD5ZkVDF4U,341
2
2
  janus/__main__.py,sha256=Qd-f8z2Q2vpiEP2x6PBFsJrpACWDVxFKQk820MhFmHo,59
3
3
  janus/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  janus/_tests/conftest.py,sha256=V7uW-oq3YbFiRPvrq15YoVVrA1n_83pjgiyTZ-IUGW8,963
5
5
  janus/_tests/test_cli.py,sha256=oP-WOM-ai4jZLDGqjLzI6kCtfXWpoeUR8TynP5p6cVg,4254
6
6
  janus/_tests/test_translate.py,sha256=71oRTTfdSVFOBvUhuOavgl3TuimTz1K6eG-04pUGpfE,16439
7
- janus/cli.py,sha256=X_7HXQGeQEYUkIF3AHbbc04lUEUjKegZ7tUFwmKjf10,29549
8
- janus/converter.py,sha256=ge7nJA1DlOrHW_uB9P0dguc48Au8g7bqCERNqMIqfxs,5941
7
+ janus/cli.py,sha256=ASM9OtdQ2S-KZa-23prrympPbMzKBjSNIclv_w81MSs,29543
8
+ janus/converter.py,sha256=Kxd8eq03XNHbZZI-fU8NUNJR6N2yDxPHTc6QENC8Bqo,6092
9
9
  janus/embedding/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  janus/embedding/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- janus/embedding/_tests/test_collections.py,sha256=llg-JSuRRFhKkHFiWWSHEWV3iaT6Lwue0lp2tEml9io,2668
11
+ janus/embedding/_tests/test_collections.py,sha256=eT0cYv-qmPrHJRjDZqWPFTkqVzFDRoPrRKR__FPiz58,2651
12
12
  janus/embedding/_tests/test_database.py,sha256=uqI2Jgj8DEIlciqiwiZx_n0osjcspIPrHOSSN1NRZSk,1019
13
13
  janus/embedding/_tests/test_vectorize.py,sha256=NnJLHBwgMVycAProRJxuLVSByxrpJ35eaZCFca52gNY,1964
14
14
  janus/embedding/collections.py,sha256=ZE8QGYQ82DCLqhV0m1y7PiqpuHjEfxHPcS5SCKU0LAw,5411
@@ -18,35 +18,39 @@ janus/embedding/vectorize.py,sha256=ap3e6ZMai8U3M5vdpLc_st4Sw31xyqoaqEno0IJlVOU,
18
18
  janus/language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  janus/language/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  janus/language/_tests/test_combine.py,sha256=ydCYNbTxvaxT-5axiEBzPQLn6s4arSyZ5Tx2SYKLpJY,1830
21
- janus/language/_tests/test_splitter.py,sha256=Hqexa39LLEXlK3ZUw7Zot4PUIACvye2vkq0Jaox0T10,373
21
+ janus/language/_tests/test_splitter.py,sha256=VK48eqp5PYJfjdhD_x7IkeAjbF1KC3AyNnICfK8XnUQ,360
22
+ janus/language/alc/__init__.py,sha256=j7vOMGhT1Vri6p8dsjSaY-fkO5uFn0sJ0nrNGGvcizM,42
23
+ janus/language/alc/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
+ janus/language/alc/_tests/test_alc.py,sha256=DttXpouP9Vkdlf23_a0qFalKdGtadGv6oXTsmN1pk8Q,994
25
+ janus/language/alc/alc.py,sha256=n8KVHTb6FFILw50N8UM3gfT60gLVvkTjk37easwluWs,3061
22
26
  janus/language/binary/__init__.py,sha256=AlNAe12ZA366kcGSrQ1FJyOdbwxFqGBFkYR2K6yL818,51
23
27
  janus/language/binary/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
28
  janus/language/binary/_tests/test_binary.py,sha256=a-8RSfKA23UrJC9c1xPQK792XZCz8npCHI7isN2dAP8,1727
25
29
  janus/language/binary/binary.py,sha256=CS1RAieN8klSsCeXQEFYKUWioatUX-sOPXKQr5S6NzE,6534
26
30
  janus/language/binary/reveng/decompile_script.py,sha256=veW51oJzuO-4UD3Er062jXZ_FYtTFo9OCkl82Z2xr6A,2182
27
- janus/language/block.py,sha256=4f8e3YYSS2p-0fXjjl2erbbXDOHcBxiLzDHALKlPTg4,9188
31
+ janus/language/block.py,sha256=DVbnnthFWLQ2JjvS09kFqNmeCx7SGtYkBnFjtPdW8mM,9278
28
32
  janus/language/combine.py,sha256=hSEc1dHLcOELks-ZGsRKHOgYBXAronKus6BTmb1u42k,2940
29
33
  janus/language/file.py,sha256=X2MYcAMlCABK77uhMdI_J2foXLrqEdinapYRfLPyKB8,563
30
34
  janus/language/mumps/__init__.py,sha256=-Ou_wJ-JgHezfp1dub2_qCYNiK9wO-zo2MlqxM9qiwE,48
31
35
  janus/language/mumps/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
36
  janus/language/mumps/_tests/test_mumps.py,sha256=6l7q14lPnKf231iWwMdRbf-dg9QuHa26YMS7-K7yC4A,1001
33
- janus/language/mumps/mumps.py,sha256=J8ewuLDh7y9GeyCcGNYqCGJ9HOwrEWypc2HNxClZ8is,7382
37
+ janus/language/mumps/mumps.py,sha256=MkF_TZB1SOIj3JQfGKYow1Hh2Bja0EglUlpd4aAY5Iw,7351
34
38
  janus/language/mumps/patterns.py,sha256=FW5T6Nt5kBO2UKgSL1KLVDbYRgMaJAzDvEmvBkxHppA,2310
35
- janus/language/naive/__init__.py,sha256=6P5rDAZtoHTObkFbZkiEdd-PVRA_9VTQogUjwvRMKK8,198
36
- janus/language/naive/basic_splitter.py,sha256=RM9pJK2YkHfb6_EFEV-dh_rLqkjS6v0cn3ASPf8A6Fg,459
37
- janus/language/naive/chunk_splitter.py,sha256=ebRSbaJhDW-Hyr5__ukbdmAl6kQ1WWFqrq_SfCgHo6k,772
38
- janus/language/naive/registry.py,sha256=8YQX1q0IdAm7t69-oC_00I-vfkdRnHuX-OD3KEjEIuU,294
39
- janus/language/naive/simple_ast.py,sha256=gix_fh864sHZ5KeXoOZIVdKdQeCN_4Qwq4Ox-haZ6sY,593
40
- janus/language/naive/tag_splitter.py,sha256=IXWMn9tBVUGAtzvQi89GhoZ6g7fPXk5MzO0kMCr2mb0,2045
39
+ janus/language/naive/__init__.py,sha256=gsdC543qsIX8y_RxblCBIgyW0tfucljFms6v2WTrEz0,178
40
+ janus/language/naive/basic_splitter.py,sha256=NFW3TvMFQwEmcj5r4jvQXBJCzgNcSZI-3Arjb191gAo,407
41
+ janus/language/naive/chunk_splitter.py,sha256=g1nqbhvaOZ31SjO-smIwAg6lHGTy2rPOOnQ-m6fIKAA,713
42
+ janus/language/naive/registry.py,sha256=CDUkMIgscdPBV_qu49u9TGnOIgr9mRasinPRwViTWz8,281
43
+ janus/language/naive/simple_ast.py,sha256=boX_pJ8x52_MxiM6hJ-0oa6MR75Fu4pyebBEtYJJZUc,907
44
+ janus/language/naive/tag_splitter.py,sha256=6DHBJdM3IllcVV-MrAyj8KPg5zXTiRdeD42CYrmEFHk,1986
41
45
  janus/language/node.py,sha256=-ymv--oILEYLVO2KSOrzOlzL2cZHNQpQJYwE1cKA-pY,200
42
- janus/language/splitter.py,sha256=Ep8RxWrnuih3MAcdkkbtAsSLrPmyQcjnk0IzbRC-460,16741
46
+ janus/language/splitter.py,sha256=4XAe0hXka7njS30UHGCngJzDgHxn3lygUjikSHuV7Xo,16924
43
47
  janus/language/treesitter/__init__.py,sha256=mUliw7ZJLZ8NkJKyUQMSoUV82hYXE0HvLHrEdGPJF4Q,43
44
48
  janus/language/treesitter/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
49
  janus/language/treesitter/_tests/test_treesitter.py,sha256=nsavUV0aI6cpT9FkQve58eTTehLyQG6qJJBGlNa_bIw,2170
46
- janus/language/treesitter/treesitter.py,sha256=9hbP7eBuSEKSZm6OD4C9q2tbjzrEidaCAKw74aO4lEM,6855
50
+ janus/language/treesitter/treesitter.py,sha256=bWimG5yNxweb-W6xGnl4Bbpsd9lJn9WGuYZla_lWdxQ,6863
47
51
  janus/llm/__init__.py,sha256=8Pzn3Jdx867PzDc4xmwm8wvJDGzWSIhpN0NCEYFe0LQ,36
48
- janus/llm/model_callbacks.py,sha256=zMCbMgniKrzKf-sU9SxOcfoOvc3xz7y0VxIxfdlS5tA,6766
49
- janus/llm/models_info.py,sha256=jNTp7mg7MVSS-Anp9Z-wMTz8odiE-1xXeyi8ngpJi1E,7151
52
+ janus/llm/model_callbacks.py,sha256=cQb0Gmy__Mh27b1UdTZkdCN3747bEi3DN_Wxb6gnI7Q,6762
53
+ janus/llm/models_info.py,sha256=_roSyd9oQX6QMshkyW880qgWt2pEm5Jby-U4XhkMjSg,7137
50
54
  janus/metrics/__init__.py,sha256=AsxtZJUzZiXJPr2ehPPltuYP-ddechjg6X85WZUO7mA,241
51
55
  janus/metrics/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
56
  janus/metrics/_tests/reference.py,sha256=hiaJPP9CXkvFBV_wL-gOe_BzELTw0nvB6uCxhxtIiE8,13
@@ -54,18 +58,18 @@ janus/metrics/_tests/target.py,sha256=hiaJPP9CXkvFBV_wL-gOe_BzELTw0nvB6uCxhxtIiE
54
58
  janus/metrics/_tests/test_bleu.py,sha256=TcSnNGpMh00Nkkk1zq5wDfdCANMUq9eXscU_hcBRU8A,1640
55
59
  janus/metrics/_tests/test_chrf.py,sha256=O4v1Cj513H8NYffJILpSI7CuR_dnm7F8CeB3C7sZYr0,2202
56
60
  janus/metrics/_tests/test_file_pairing.py,sha256=A4Qy6JIesFXUcaig45Ze6LiViuHQS7MFSQzDHQP3j9w,1880
57
- janus/metrics/_tests/test_llm.py,sha256=IYsLwX5zC2WcaaPeSlHuQVmU2sB55-dsOXnmQPhcKps,3007
61
+ janus/metrics/_tests/test_llm.py,sha256=dGXrdd79v-ix_560t6Q8RJEx-6mgZ-pkzJgm-O2ZBwA,2998
58
62
  janus/metrics/_tests/test_reading.py,sha256=NDLFyjmOpM5gWf1LLTjGIw3aUR8Qf22zTt9hwe7NABs,840
59
- janus/metrics/_tests/test_rouge_score.py,sha256=dnP99nry-U5wyE-CiC0eQwm78IyScnmGQH3BeIEdmLY,2032
60
- janus/metrics/_tests/test_similarity_score.py,sha256=tdzH_8hYb2h7fKxpd_a75di-GPnU_frZ0zn2aeYzkso,811
63
+ janus/metrics/_tests/test_rouge_score.py,sha256=rcHmrpy55cW507PnTnGQnp9Tsn5rk7JEyXmusY7la3Q,2020
64
+ janus/metrics/_tests/test_similarity_score.py,sha256=jc3r0lWW5Iqm6AMKc36ewz5rboKwVw29fliBHClkzIg,799
61
65
  janus/metrics/_tests/test_treesitter_metrics.py,sha256=tqpAg9LY811gfQ3n2ypRqBJesAFQodMf6Gz7dvOsqp4,4337
62
66
  janus/metrics/bleu.py,sha256=eRoHIQulPp5mezJzHCNkwUB_89tAj4PqV2pF9eV9HfI,1746
63
67
  janus/metrics/chrf.py,sha256=zNGWZ40CPMgj8rctnmwkbf25_PvSOLPbOjv-iN2cGXM,1472
64
68
  janus/metrics/cli.py,sha256=Duuw2RF47Z-t1pal0cg3L_-N_91rx29krirqtIwjYLY,157
65
- janus/metrics/complexity_metrics.py,sha256=kJh_TyZttMP716MXGfl-WbaS4beR_DaQWGYVg6MurSU,6573
69
+ janus/metrics/complexity_metrics.py,sha256=1Z9n0o_CrILqayk40wRkjR1f7yvHIsJG38DxAbqj614,6560
66
70
  janus/metrics/file_pairing.py,sha256=WNHRV1D8GOJMq8Pla5SPkTDAT7yVaS4-UU0XIGKvEVs,3729
67
71
  janus/metrics/llm_metrics.py,sha256=3677S6GYcoVcokpmAN-fwvNu-lYWAKd7M5mebiE6RZc,5687
68
- janus/metrics/metric.py,sha256=QRzLCkHY2g4pWDIiVLNaxNiEMF4gXw_eUrjecH5CdwA,16987
72
+ janus/metrics/metric.py,sha256=Lgdtq87oJ-kWC_6jdPQ6-d1MqoeTnhkRszo6IZJV6c0,16974
69
73
  janus/metrics/reading.py,sha256=KYuWjKnk0ALRU5S7mCNNZtaNgK02l0fdIGsaNvxLMO4,1690
70
74
  janus/metrics/rouge_score.py,sha256=HfUJwUWI-yq5pOjML2ee4QTOMl0NQahnqEY2Mt8Dtnw,2865
71
75
  janus/metrics/similarity.py,sha256=9pjWWpLKCsk0QfFfSgQNdPXiisqi7WJYOOHaiT8S0iY,1613
@@ -78,17 +82,17 @@ janus/parsers/doc_parser.py,sha256=X8eCb1QXbL6sVWLEFGjsPyxrpJ9XnOPg7G4KZSo9A9E,5
78
82
  janus/parsers/eval_parser.py,sha256=HB5-zY_Jpmkj6FDbuNCCVCRxwmzhViSAjPKbyyC0Ebc,2723
79
83
  janus/parsers/reqs_parser.py,sha256=MFBvtR3otpyPZlkZxu0dVH1YeEJhvhNzhaGKGHaQVHA,2359
80
84
  janus/prompts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
81
- janus/prompts/prompt.py,sha256=GMQ9EqwtIhB_x8MgfoeTeb4OkLaySYdGd3wVbpDMZXA,8911
82
- janus/translate.py,sha256=bsQ1YvjCPrVrL3y-rAA2PrCv2-x3ObCF7a6LuPWQPuE,38747
85
+ janus/prompts/prompt.py,sha256=FPjqCtWla9BW4AdgrcuY8JXo5rYBSJ4b0PDDw6i1AKs,9701
86
+ janus/translate.py,sha256=bIrvyFBXUH1Cf8M-h-qSybFe0NQwuCA38heiV2toP8w,38958
83
87
  janus/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
84
88
  janus/utils/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
85
89
  janus/utils/_tests/test_logger.py,sha256=4jZFm8LX828Dt9lOjiFHZIPbxYy_hHaswyrMPkscgdM,2199
86
90
  janus/utils/_tests/test_progress.py,sha256=Yh5NDNq-24n2nhHHbJm39pENAH70PYnh9ymwdcn0_UU,481
87
- janus/utils/enums.py,sha256=SlZKHojLPYOSjuekQGirSHem5Etcgy57txCtVCej2Ag,27533
91
+ janus/utils/enums.py,sha256=AoilbdiYyMvY2Mp0AM4xlbLSELfut2XMwhIM1S_msP4,27610
88
92
  janus/utils/logger.py,sha256=KZeuaMAnlSZCsj4yL0P6N-JzZwpxXygzACWfdZFeuek,2337
89
93
  janus/utils/progress.py,sha256=pKcCzO9JOU9fSD7qTmLWcqY5smc8mujqQMXoPgqNysE,1458
90
- janus_llm-2.0.2.dist-info/LICENSE,sha256=_j0st0a-HB6MRbP3_BW3PUqpS16v54luyy-1zVyl8NU,10789
91
- janus_llm-2.0.2.dist-info/METADATA,sha256=sUtUMkjBZEmov6apazRphlqnBKG02dLkzWmq6-qIrqc,4184
92
- janus_llm-2.0.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
93
- janus_llm-2.0.2.dist-info/entry_points.txt,sha256=OGhQwzj6pvXp79B0SaBD5apGekCu7Dwe9fZZT_TZ544,39
94
- janus_llm-2.0.2.dist-info/RECORD,,
94
+ janus_llm-2.1.0.dist-info/LICENSE,sha256=_j0st0a-HB6MRbP3_BW3PUqpS16v54luyy-1zVyl8NU,10789
95
+ janus_llm-2.1.0.dist-info/METADATA,sha256=a7PmWYKoHGa6ynFGQJvOVLMtBYkVa8foj6zpJgN1Sfs,4184
96
+ janus_llm-2.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
97
+ janus_llm-2.1.0.dist-info/entry_points.txt,sha256=OGhQwzj6pvXp79B0SaBD5apGekCu7Dwe9fZZT_TZ544,39
98
+ janus_llm-2.1.0.dist-info/RECORD,,