PyPI - janus-llm - Versions diffs - 1.0.0__py3-none-any.whl → 2.0.0__py3-none-any.whl - Mend

janus-llm 1.0.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74)

janus/__init__.py +9 -1
janus/__main__.py +4 -0
janus/_tests/test_cli.py +128 -0
janus/_tests/test_translate.py +49 -7
janus/cli.py +530 -46
janus/converter.py +50 -19
janus/embedding/_tests/test_collections.py +2 -8
janus/embedding/_tests/test_database.py +32 -0
janus/embedding/_tests/test_vectorize.py +9 -4
janus/embedding/collections.py +49 -6
janus/embedding/embedding_models_info.py +120 -0
janus/embedding/vectorize.py +53 -62
janus/language/_tests/__init__.py +0 -0
janus/language/_tests/test_combine.py +62 -0
janus/language/_tests/test_splitter.py +16 -0
janus/language/binary/_tests/test_binary.py +16 -1
janus/language/binary/binary.py +10 -3
janus/language/block.py +31 -30
janus/language/combine.py +26 -34
janus/language/mumps/_tests/test_mumps.py +2 -2
janus/language/mumps/mumps.py +93 -9
janus/language/naive/__init__.py +4 -0
janus/language/naive/basic_splitter.py +14 -0
janus/language/naive/chunk_splitter.py +26 -0
janus/language/naive/registry.py +13 -0
janus/language/naive/simple_ast.py +18 -0
janus/language/naive/tag_splitter.py +61 -0
janus/language/splitter.py +168 -74
janus/language/treesitter/_tests/test_treesitter.py +9 -6
janus/language/treesitter/treesitter.py +37 -13
janus/llm/model_callbacks.py +177 -0
janus/llm/models_info.py +134 -70
janus/metrics/__init__.py +8 -0
janus/metrics/_tests/__init__.py +0 -0
janus/metrics/_tests/reference.py +2 -0
janus/metrics/_tests/target.py +2 -0
janus/metrics/_tests/test_bleu.py +56 -0
janus/metrics/_tests/test_chrf.py +67 -0
janus/metrics/_tests/test_file_pairing.py +59 -0
janus/metrics/_tests/test_llm.py +91 -0
janus/metrics/_tests/test_reading.py +28 -0
janus/metrics/_tests/test_rouge_score.py +65 -0
janus/metrics/_tests/test_similarity_score.py +23 -0
janus/metrics/_tests/test_treesitter_metrics.py +110 -0
janus/metrics/bleu.py +66 -0
janus/metrics/chrf.py +55 -0
janus/metrics/cli.py +7 -0
janus/metrics/complexity_metrics.py +208 -0
janus/metrics/file_pairing.py +113 -0
janus/metrics/llm_metrics.py +202 -0
janus/metrics/metric.py +466 -0
janus/metrics/reading.py +70 -0
janus/metrics/rouge_score.py +96 -0
janus/metrics/similarity.py +53 -0
janus/metrics/splitting.py +38 -0
janus/parsers/_tests/__init__.py +0 -0
janus/parsers/_tests/test_code_parser.py +32 -0
janus/parsers/code_parser.py +24 -253
janus/parsers/doc_parser.py +169 -0
janus/parsers/eval_parser.py +80 -0
janus/parsers/reqs_parser.py +72 -0
janus/prompts/prompt.py +103 -30
janus/translate.py +636 -111
janus/utils/_tests/__init__.py +0 -0
janus/utils/_tests/test_logger.py +67 -0
janus/utils/_tests/test_progress.py +20 -0
janus/utils/enums.py +56 -3
janus/utils/progress.py +56 -0
{janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/METADATA +23 -10
janus_llm-2.0.0.dist-info/RECORD +94 -0
{janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/WHEEL +1 -1
janus_llm-1.0.0.dist-info/RECORD +0 -48
{janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/LICENSE +0 -0
{janus_llm-1.0.0.dist-info → janus_llm-2.0.0.dist-info}/entry_points.txt +0 -0

janus/cli.py CHANGED Viewed

@@ -1,34 +1,50 @@
 import json
+import logging
 import os
 from pathlib import Path
 from typing import Optional
 import click
 import typer
+from pydantic import AnyHttpUrl
 from rich import print
 from rich.console import Console
 from rich.prompt import Confirm
 from typing_extensions import Annotated
+from janus.language.naive.registry import CUSTOM_SPLITTERS
 from .embedding.collections import Collections
 from .embedding.database import ChromaEmbeddingDatabase
+from .embedding.embedding_models_info import (
+    EMBEDDING_COST_PER_MODEL,
+    EMBEDDING_MODEL_CONFIG_DIR,
+    EMBEDDING_TOKEN_LIMITS,
+    EmbeddingModelType,
+)
 from .embedding.vectorize import ChromaDBVectorizer
 from .language.binary import BinarySplitter
 from .language.mumps import MumpsSplitter
 from .language.treesitter import TreeSitterSplitter
-from .llm.models_info import (
-    COST_PER_MODEL,
-    MODEL_CONFIG_DIR,
-    MODEL_TYPE_CONSTRUCTORS,
-    TOKEN_LIMITS,
+from .llm.model_callbacks import COST_PER_1K_TOKENS
+from .llm.models_info import MODEL_CONFIG_DIR, MODEL_TYPE_CONSTRUCTORS, TOKEN_LIMITS
+from .metrics.cli import evaluate
+from .translate import (
+    PARSER_TYPES,
+    DiagramGenerator,
+    Documenter,
+    MadLibsDocumenter,
+    MultiDocumenter,
+    RequirementsDocumenter,
+    Translator,
 )
-from .parsers.code_parser import PARSER_TYPES
-from .translate import Translator
-from .utils.enums import CUSTOM_SPLITTERS, LANGUAGES
+from .utils.enums import LANGUAGES
 from .utils.logger import create_logger
-log = create_logger(__name__)
+httpx_logger = logging.getLogger("httpx")
+httpx_logger.setLevel(logging.WARNING)
+log = create_logger(__name__)
 homedir = Path.home().expanduser()
 janus_dir = homedir / ".janus"
@@ -43,6 +59,17 @@ if not db_file.exists():
 with open(db_file, "r") as f:
     db_loc = f.read()
+collections_config_file = Path(db_loc) / "collections.json"
+def get_collections_config():
+    if collections_config_file.exists():
+        with open(collections_config_file, "r") as f:
+            config = json.load(f)
+    else:
+        config = {}
+    return config
 app = typer.Typer(
     help="Choose a command",
@@ -51,6 +78,7 @@ app = typer.Typer(
     context_settings={"help_option_names": ["-h", "--help"]},
 )
 db = typer.Typer(
     help="Database commands",
     add_completion=False,
@@ -64,6 +92,43 @@ llm = typer.Typer(
     context_settings={"help_option_names": ["-h", "--help"]},
 )
+embedding = typer.Typer(
+    help="Embedding model commands",
+    add_completion=False,
+    no_args_is_help=True,
+    context_settings={"help_option_names": ["-h", "--help"]},
+)
+def version_callback(value: bool) -> None:
+    if value:
+        from . import __version__ as version
+        print(f"Janus CLI [blue]v{version}[/blue]")
+        raise typer.Exit()
+@app.callback()
+def common(
+    ctx: typer.Context,
+    version: bool = typer.Option(
+        None,
+        "--version",
+        "-v",
+        callback=version_callback,
+        help="Print the version and exit.",
+    ),
+) -> None:
+    """A function for getting the app version
+    This will call the version_callback function to print the version and exit.
+    Arguments:
+        ctx: The typer context
+        version: A boolean flag for the version
+    """
+    pass
 @app.command(
     help="Translate code from one language to another using an LLM.",
@@ -73,41 +138,53 @@ def translate(
     input_dir: Annotated[
         Path,
         typer.Option(
+            "--input",
+            "-i",
             help="The directory containing the source code to be translated. "
-            "The files should all be in one flat directory."
+            "The files should all be in one flat directory.",
         ),
     ],
     source_lang: Annotated[
         str,
         typer.Option(
+            "--source-language",
+            "-s",
             help="The language of the source code.",
             click_type=click.Choice(sorted(LANGUAGES)),
         ),
     ],
     output_dir: Annotated[
         Path,
-        typer.Option(help="The directory to store the translated code in."),
+        typer.Option(
+            "--output", "-o", help="The directory to store the translated code in."
+        ),
     ],
     target_lang: Annotated[
         str,
         typer.Option(
+            "--target-language",
+            "-t",
             help="The desired output language to translate the source code to. The "
             "format can follow a 'language-version' syntax.  Use 'text' to get plaintext"
             "results as returned by the LLM. Examples: `python-3.10`, `mumps`, `java-10`,"
-            "text."
+            "text.",
         ),
     ],
     llm_name: Annotated[
         str,
         typer.Option(
+            "--llm",
+            "-L",
             help="The custom name of the model set with 'janus llm add'.",
         ),
-    ] = "gpt-3.5-turbo",
+    ] = "gpt-3.5-turbo-0125",
     max_prompts: Annotated[
         int,
         typer.Option(
+            "--max-prompts",
+            "-m",
             help="The maximum number of times to prompt a model on one functional block "
-            "before exiting the application. This is to prevent wasting too much money."
+            "before exiting the application. This is to prevent wasting too much money.",
         ),
     ] = 10,
     overwrite: Annotated[
@@ -119,18 +196,22 @@ def translate(
     ] = False,
     temp: Annotated[
         float,
-        typer.Option(help="Sampling temperature.", min=0, max=2),
+        typer.Option("--temperature", "-T", help="Sampling temperature.", min=0, max=2),
     ] = 0.7,
     prompt_template: Annotated[
         str,
         typer.Option(
+            "--prompt-template",
+            "-p",
             help="Name of the Janus prompt template directory or "
-            "path to a directory containing those template files."
+            "path to a directory containing those template files.",
         ),
     ] = "simple",
     parser_type: Annotated[
         str,
         typer.Option(
+            "--parser",
+            "-P",
             click_type=click.Choice(sorted(PARSER_TYPES)),
             help="The type of parser to use.",
         ),
@@ -144,6 +225,24 @@ def translate(
             "collection with the name provided.",
         ),
     ] = None,
+    custom_splitter: Annotated[
+        Optional[str],
+        typer.Option(
+            "-cs",
+            "--custom-splitter",
+            help="Name of custom splitter to use",
+            click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
+        ),
+    ] = None,
+    max_tokens: Annotated[
+        int,
+        typer.Option(
+            "--max-tokens",
+            "-M",
+            help="The maximum number of tokens the model will take in. "
+            "If unspecificed, model's default max will be used.",
+        ),
+    ] = None,
 ):
     try:
         target_language, target_version = target_lang.split("-")
@@ -156,12 +255,7 @@ def translate(
         raise ValueError
     model_arguments = dict(temperature=temp)
-    output_collection = None
-    if collection is not None:
-        _check_collection(collection, input_dir)
-        db = ChromaEmbeddingDatabase(db_loc)
-        collections = Collections(db)
-        output_collection = collections.get_or_create(collection)
+    collections_config = get_collections_config()
     translator = Translator(
         model=llm_name,
         model_arguments=model_arguments,
@@ -169,21 +263,269 @@ def translate(
         target_language=target_language,
         target_version=target_version,
         max_prompts=max_prompts,
+        max_tokens=max_tokens,
         prompt_template=prompt_template,
         parser_type=parser_type,
+        db_path=db_loc,
+        db_config=collections_config,
+        custom_splitter=custom_splitter,
     )
-    translator.translate(input_dir, output_dir, overwrite, output_collection)
+    translator.translate(input_dir, output_dir, overwrite, collection)
+@app.command(
+    help="Document input code using an LLM.",
+    no_args_is_help=True,
+)
+def document(
+    input_dir: Annotated[
+        Path,
+        typer.Option(
+            "--input",
+            "-i",
+            help="The directory containing the source code to be translated. "
+            "The files should all be in one flat directory.",
+        ),
+    ],
+    language: Annotated[
+        str,
+        typer.Option(
+            "--language",
+            "-l",
+            help="The language of the source code.",
+            click_type=click.Choice(sorted(LANGUAGES)),
+        ),
+    ],
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            "--output-dir", "-o", help="The directory to store the translated code in."
+        ),
+    ],
+    llm_name: Annotated[
+        str,
+        typer.Option(
+            "--llm",
+            "-L",
+            help="The custom name of the model set with 'janus llm add'.",
+        ),
+    ] = "gpt-3.5-turbo-0125",
+    max_prompts: Annotated[
+        int,
+        typer.Option(
+            "--max-prompts",
+            "-m",
+            help="The maximum number of times to prompt a model on one functional block "
+            "before exiting the application. This is to prevent wasting too much money.",
+        ),
+    ] = 10,
+    overwrite: Annotated[
+        bool,
+        typer.Option(
+            "--overwrite/--preserve",
+            help="Whether to overwrite existing files in the output directory",
+        ),
+    ] = False,
+    doc_mode: Annotated[
+        str,
+        typer.Option(
+            "--doc-mode",
+            "-d",
+            help="The documentation mode.",
+            click_type=click.Choice(["madlibs", "summary", "multidoc", "requirements"]),
+        ),
+    ] = "madlibs",
+    comments_per_request: Annotated[
+        int,
+        typer.Option(
+            "--comments-per-request",
+            "-rc",
+            help="The maximum number of comments to generate per request when using "
+            "MadLibs documentation mode.",
+        ),
+    ] = None,
+    drop_comments: Annotated[
+        bool,
+        typer.Option(
+            "--drop-comments/--keep-comments",
+            help="Whether to drop or keep comments in the code sent to the LLM",
+        ),
+    ] = False,
+    temperature: Annotated[
+        float,
+        typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
+    ] = 0.7,
+    collection: Annotated[
+        str,
+        typer.Option(
+            "--collection",
+            "-c",
+            help="If set, will put the translated result into a Chroma DB "
+            "collection with the name provided.",
+        ),
+    ] = None,
+    custom_splitter: Annotated[
+        Optional[str],
+        typer.Option(
+            "-cs",
+            "--custom-splitter",
+            help="Name of custom splitter to use",
+            click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
+        ),
+    ] = None,
+    max_tokens: Annotated[
+        int,
+        typer.Option(
+            "--max-tokens",
+            "-M",
+            help="The maximum number of tokens the model will take in. "
+            "If unspecificed, model's default max will be used.",
+        ),
+    ] = None,
+):
+    model_arguments = dict(temperature=temperature)
+    collections_config = get_collections_config()
+    kwargs = dict(
+        model=llm_name,
+        model_arguments=model_arguments,
+        source_language=language,
+        max_prompts=max_prompts,
+        max_tokens=max_tokens,
+        db_path=db_loc,
+        db_config=collections_config,
+        custom_splitter=custom_splitter,
+    )
+    if doc_mode == "madlibs":
+        documenter = MadLibsDocumenter(
+            comments_per_request=comments_per_request, **kwargs
+        )
+    elif doc_mode == "multidoc":
+        documenter = MultiDocumenter(drop_comments=drop_comments, **kwargs)
+    elif doc_mode == "requirements":
+        documenter = RequirementsDocumenter(drop_comments=drop_comments, **kwargs)
+    else:
+        documenter = Documenter(drop_comments=drop_comments, **kwargs)
+    documenter.translate(input_dir, output_dir, overwrite, collection)
+@app.command(
+    help="Diagram input code using an LLM.",
+    no_args_is_help=True,
+)
+def diagram(
+    input_dir: Annotated[
+        Path,
+        typer.Option(
+            "--input",
+            "-i",
+            help="The directory containing the source code to be translated. "
+            "The files should all be in one flat directory.",
+        ),
+    ],
+    language: Annotated[
+        str,
+        typer.Option(
+            "--language",
+            "-l",
+            help="The language of the source code.",
+            click_type=click.Choice(sorted(LANGUAGES)),
+        ),
+    ],
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            "--output-dir", "-o", help="The directory to store the translated code in."
+        ),
+    ],
+    llm_name: Annotated[
+        str,
+        typer.Option(
+            "--llm",
+            "-L",
+            help="The custom name of the model set with 'janus llm add'.",
+        ),
+    ] = "gpt-3.5-turbo-0125",
+    max_prompts: Annotated[
+        int,
+        typer.Option(
+            "--max-prompts",
+            "-m",
+            help="The maximum number of times to prompt a model on one functional block "
+            "before exiting the application. This is to prevent wasting too much money.",
+        ),
+    ] = 10,
+    overwrite: Annotated[
+        bool,
+        typer.Option(
+            "--overwrite/--preserve",
+            help="Whether to overwrite existing files in the output directory",
+        ),
+    ] = False,
+    temperature: Annotated[
+        float,
+        typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
+    ] = 0.7,
+    collection: Annotated[
+        str,
+        typer.Option(
+            "--collection",
+            "-c",
+            help="If set, will put the translated result into a Chroma DB "
+            "collection with the name provided.",
+        ),
+    ] = None,
+    diagram_type: Annotated[
+        str,
+        typer.Option(
+            "--diagram-type", "-dg", help="Diagram type to generate in PLANTUML"
+        ),
+    ] = "Activity",
+    add_documentation: Annotated[
+        bool,
+        typer.Option(
+            "--add-documentation/--no-documentation",
+            "-ad",
+            help="Whether to use documentation in generation",
+        ),
+    ] = False,
+    custom_splitter: Annotated[
+        Optional[str],
+        typer.Option(
+            "-cs",
+            "--custom-splitter",
+            help="Name of custom splitter to use",
+            click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
+        ),
+    ] = None,
+):
+    model_arguments = dict(temperature=temperature)
+    collections_config = get_collections_config()
+    diagram_generator = DiagramGenerator(
+        model=llm_name,
+        model_arguments=model_arguments,
+        source_language=language,
+        max_prompts=max_prompts,
+        db_path=db_loc,
+        db_config=collections_config,
+        diagram_type=diagram_type,
+        add_documentation=add_documentation,
+        custom_splitter=custom_splitter,
+    )
+    diagram_generator.translate(input_dir, output_dir, overwrite, collection)
 @db.command("init", help="Connect to or create a database.")
 def db_init(
-    path: Annotated[str, typer.Option(help="The path to the database file.")] = str(
-        janus_dir / "chroma.db"
-    ),
+    path: Annotated[
+        str, typer.Option("--path", "-p", help="The path to the database file.")
+    ] = str(janus_dir / "chroma.db"),
     url: Annotated[
         str,
         typer.Option(
-            help="The URL of the database if the database is running externally."
+            "--url",
+            "-u",
+            help="The URL of the database if the database is running externally.",
         ),
     ] = "",
 ) -> None:
@@ -219,7 +561,7 @@ def db_ls(
     ] = None,
     peek: Annotated[
         Optional[int],
-        typer.Option(help="Peek at N entries for a specific collection."),
+        typer.Option("--peek", "-p", help="Peek at N entries for a specific collection."),
     ] = None,
 ) -> None:
     """List the current database's collections"""
@@ -256,17 +598,24 @@ def db_ls(
 @db.command("add", help="Add a collection to the current database.")
 def db_add(
     collection_name: Annotated[str, typer.Argument(help="The name of the collection.")],
+    model_name: Annotated[str, typer.Argument(help="The name of the embedding model.")],
     input_dir: Annotated[
         str,
-        typer.Option(help="The directory containing the source code to be added."),
+        typer.Option(
+            "--input",
+            "-i",
+            help="The directory containing the source code to be added.",
+        ),
     ] = "./",
     input_lang: Annotated[
-        str, typer.Option(help="The language of the source code.")
+        str, typer.Option("--language", "-l", help="The language of the source code.")
     ] = "python",
     max_tokens: Annotated[
         int,
         typer.Option(
-            help="The maximum number of tokens for each chunk of input source code."
+            "--max-tokens",
+            "-m",
+            help="The maximum number of tokens for each chunk of input source code.",
         ),
     ] = 4096,
 ) -> None:
@@ -274,13 +623,16 @@ def db_add(
     Arguments:
         collection_name: The name of the collection to add
+        model_name: The name of the embedding model to use
         input_dir: The directory containing the source code to be added
         input_lang: The language of the source code
+        max_tokens: The maximum number of tokens for each chunk of input source code
     """
     # TODO: import factory
     console = Console()
     added_to = _check_collection(collection_name, input_dir)
+    collections_config = get_collections_config()
     with console.status(
         f"Adding collection: [bold salmon]{collection_name}[/bold salmon]",
@@ -288,13 +640,13 @@ def db_add(
     ):
         vectorizer_factory = ChromaDBVectorizer()
         vectorizer = vectorizer_factory.create_vectorizer(
-            source_language=input_lang,
-            path=db_loc,
-            max_tokens=max_tokens,
+            path=db_loc, config=collections_config
         )
+        vectorizer.get_or_create_collection(collection_name, model_name=model_name)
         input_dir = Path(input_dir)
-        source_glob = f"**/*.{LANGUAGES[input_lang]['suffix']}"
-        input_paths = input_dir.rglob(source_glob)
+        suffix = LANGUAGES[input_lang]["suffix"]
+        source_glob = f"**/*.{suffix}"
+        input_paths = [p for p in input_dir.rglob(source_glob)]
         if input_lang in CUSTOM_SPLITTERS:
             if input_lang == "mumps":
                 splitter = MumpsSplitter(
@@ -311,15 +663,35 @@ def db_add(
             )
         for input_path in input_paths:
             input_block = splitter.split(input_path)
-            vectorizer._add_nodes_recursively(
+            vectorizer.add_nodes_recursively(
                 input_block,
                 collection_name,
                 input_path.name,
             )
+    total_files = len([p for p in Path.glob(input_dir, "**/*") if not p.is_dir()])
     if added_to:
-        print(f"Added to collection [bold salmon1]{collection_name}[/bold salmon1]")
+        print(
+            f"\nAdded to [bold salmon1]{collection_name}[/bold salmon1]:\n"
+            f"  Embedding Model: [green]{model_name}[/green]\n"
+            f"  Input Directory: {input_dir.absolute()}\n"
+            f"  {input_lang.capitalize()} [green]*.{suffix}[/green] Files: "
+            f"{len(input_paths)}\n"
+            "  Other Files (skipped): "
+            f"{total_files - len(input_paths)}\n"
+        )
+        [p for p in Path.glob(input_dir, f"**/*.{suffix}") if not p.is_dir()]
     else:
-        print(f"Created collection [bold salmon1]{collection_name}[/bold salmon1]")
+        print(
+            f"\nCreated [bold salmon1]{collection_name}[/bold salmon1]:\n"
+            f"  Embedding Model: '{model_name}'\n"
+            f"  Input Directory: {input_dir.absolute()}\n"
+            f"  {input_lang.capitalize()} [green]*.{suffix}[/green] Files: "
+            f"{len(input_paths)}\n"
+            "  Other Files (skipped): "
+            f"{total_files - len(input_paths)}\n"
+        )
+    with open(collections_config_file, "w") as f:
+        json.dump(vectorizer.config, f, indent=2)
 @db.command(
@@ -327,17 +699,28 @@ def db_add(
     help="Remove a collection from the database.",
 )
 def db_rm(
-    collection_name: Annotated[str, typer.Argument(help="The name of the collection.")]
+    collection_name: Annotated[str, typer.Argument(help="The name of the collection.")],
+    confirm: Annotated[
+        bool,
+        typer.Option(
+            "--yes",
+            "-y",
+            help="Confirm the removal of the collection.",
+        ),
+    ],
 ) -> None:
     """Remove a collection from the database
     Arguments:
         collection_name: The name of the collection to remove
     """
-    delete = Confirm.ask(
-        f"\nAre you sure you want to [bold red]remove[/bold red] "
-        f"[bold salmon1]{collection_name}[/bold salmon1]?",
-    )
+    if not confirm:
+        delete = Confirm.ask(
+            f"\nAre you sure you want to [bold red]remove[/bold red] "
+            f"[bold salmon1]{collection_name}[/bold salmon1]?",
+        )
+    else:
+        delete = True
     if not delete:
         raise typer.Abort()
     db = ChromaEmbeddingDatabase(db_loc)
@@ -425,16 +808,115 @@ def llm_add(
             "model_cost": {"input": in_cost, "output": out_cost},
         }
     elif model_type == "OpenAI":
-        model_name = typer.prompt("Enter the model name", default="gpt-3.5-turbo")
+        model_name = typer.prompt("Enter the model name", default="gpt-3.5-turbo-0125")
         params = dict(
             model_name=model_name,
             temperature=0.7,
             n=1,
         )
         max_tokens = TOKEN_LIMITS[model_name]
-        model_cost = COST_PER_MODEL[model_name]
+        model_cost = COST_PER_1K_TOKENS[model_name]
+        cfg = {
+            "model_type": model_type,
+            "model_args": params,
+            "token_limit": max_tokens,
+            "model_cost": model_cost,
+        }
+    else:
+        raise ValueError(f"Unknown model type {model_type}")
+    with open(model_cfg, "w") as f:
+        json.dump(cfg, f, indent=2)
+    print(f"Model config written to {model_cfg}")
+@embedding.command("add", help="Add an embedding model config to janus")
+def embedding_add(
+    model_name: Annotated[
+        str, typer.Argument(help="The user's custom name for the model")
+    ],
+    model_type: Annotated[
+        str,
+        typer.Option(
+            "--type",
+            "-t",
+            help="The type of the model",
+            click_type=click.Choice(list(val.value for val in EmbeddingModelType)),
+        ),
+    ] = "OpenAI",
+):
+    if not EMBEDDING_MODEL_CONFIG_DIR.exists():
+        EMBEDDING_MODEL_CONFIG_DIR.mkdir(parents=True)
+    model_cfg = EMBEDDING_MODEL_CONFIG_DIR / f"{model_name}.json"
+    if model_type in EmbeddingModelType.HuggingFaceInferenceAPI.values:
+        hf = typer.style("HuggingFaceInferenceAPI", fg="yellow")
+        url = typer.prompt(f"Enter the {hf} model's URL", type=str, value_proc=AnyHttpUrl)
+        api_model_name = typer.prompt("Enter the model's name", type=str, default="")
+        api_key = typer.prompt("Enter the API key", type=str, default="")
+        max_tokens = typer.prompt(
+            "Enter the model's maximum tokens", default=8191, type=int
+        )
+        in_cost = typer.prompt("Enter the cost per input token", default=0, type=float)
+        out_cost = typer.prompt("Enter the cost per output token", default=0, type=float)
+        params = dict(
+            model_name=api_model_name,
+            api_key=api_key,
+        )
+        cfg = {
+            "model_type": model_type,
+            "model_identifier": str(url),
+            "model_args": params,
+            "token_limit": max_tokens,
+            "model_cost": {"input": in_cost, "output": out_cost},
+        }
+    elif model_type in EmbeddingModelType.HuggingFaceLocal.values:
+        hf = typer.style("HuggingFace", fg="yellow")
+        model_id = typer.prompt(
+            f"Enter the {hf} model ID",
+            default="sentence-transformers/all-MiniLM-L6-v2",
+            type=str,
+        )
+        cache_folder = str(
+            Path(
+                typer.prompt(
+                    "Enter the model's cache folder",
+                    default=EMBEDDING_MODEL_CONFIG_DIR / "cache",
+                    type=str,
+                )
+            )
+        )
+        max_tokens = typer.prompt(
+            "Enter the model's maximum tokens", default=8191, type=int
+        )
+        params = dict(
+            cache_folder=str(cache_folder),
+        )
+        cfg = {
+            "model_type": model_type,
+            "model_identifier": model_id,
+            "model_args": params,
+            "token_limit": max_tokens,
+            "model_cost": {"input": 0, "output": 0},
+        }
+    elif model_type in EmbeddingModelType.OpenAI.values:
+        available_models = list(EMBEDDING_COST_PER_MODEL.keys())
+        open_ai = typer.style("OpenAI", fg="green")
+        prompt = f"Enter the {open_ai} model name"
+        model_name = typer.prompt(
+            prompt,
+            default="text-embedding-3-small",
+            type=click.types.Choice(available_models),
+            show_choices=False,
+        )
+        params = dict(
+            model=model_name,
+        )
+        max_tokens = EMBEDDING_TOKEN_LIMITS[model_name]
+        model_cost = EMBEDDING_COST_PER_MODEL[model_name]
         cfg = {
             "model_type": model_type,
+            "model_identifier": model_name,
             "model_args": params,
             "token_limit": max_tokens,
             "model_cost": model_cost,
@@ -448,6 +930,8 @@ def llm_add(
 app.add_typer(db, name="db")
 app.add_typer(llm, name="llm")
+app.add_typer(evaluate, name="evaluate")
+app.add_typer(embedding, name="embedding")
 if __name__ == "__main__":

janus-llm 1.0.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

janus-llm 1.0.0__py3-none-any.whl → 2.0.0__py3-none-any.whl