janus-llm 4.3.1__py3-none-any.whl → 4.3.5__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (128) hide show
  1. janus/__init__.py +1 -1
  2. janus/__main__.py +1 -1
  3. janus/_tests/evaluator_tests/EvalReadMe.md +85 -0
  4. janus/_tests/evaluator_tests/incose_tests/incose_large_test.json +39 -0
  5. janus/_tests/evaluator_tests/incose_tests/incose_small_test.json +17 -0
  6. janus/_tests/evaluator_tests/inline_comment_tests/mumps_inline_comment_test.m +71 -0
  7. janus/_tests/test_cli.py +3 -2
  8. janus/cli/aggregate.py +135 -0
  9. janus/cli/cli.py +111 -0
  10. janus/cli/constants.py +43 -0
  11. janus/cli/database.py +289 -0
  12. janus/cli/diagram.py +178 -0
  13. janus/cli/document.py +174 -0
  14. janus/cli/embedding.py +122 -0
  15. janus/cli/llm.py +187 -0
  16. janus/cli/partition.py +125 -0
  17. janus/cli/self_eval.py +149 -0
  18. janus/cli/translate.py +183 -0
  19. janus/converter/__init__.py +1 -1
  20. janus/converter/_tests/test_translate.py +2 -0
  21. janus/converter/converter.py +129 -93
  22. janus/converter/document.py +21 -14
  23. janus/converter/evaluate.py +20 -13
  24. janus/converter/translate.py +3 -3
  25. janus/embedding/collections.py +1 -1
  26. janus/language/alc/_tests/alc.asm +3779 -0
  27. janus/language/binary/_tests/hello.bin +0 -0
  28. janus/language/block.py +47 -12
  29. janus/language/file.py +1 -1
  30. janus/language/mumps/_tests/mumps.m +235 -0
  31. janus/language/treesitter/_tests/languages/fortran.f90 +416 -0
  32. janus/language/treesitter/_tests/languages/ibmhlasm.asm +16 -0
  33. janus/language/treesitter/_tests/languages/matlab.m +225 -0
  34. janus/llm/models_info.py +9 -1
  35. janus/metrics/_tests/asm_test_file.asm +10 -0
  36. janus/metrics/_tests/mumps_test_file.m +6 -0
  37. janus/metrics/_tests/test_treesitter_metrics.py +1 -1
  38. janus/metrics/prompts/clarity.txt +8 -0
  39. janus/metrics/prompts/completeness.txt +16 -0
  40. janus/metrics/prompts/faithfulness.txt +10 -0
  41. janus/metrics/prompts/hallucination.txt +16 -0
  42. janus/metrics/prompts/quality.txt +8 -0
  43. janus/metrics/prompts/readability.txt +16 -0
  44. janus/metrics/prompts/usefulness.txt +16 -0
  45. janus/parsers/code_parser.py +4 -4
  46. janus/parsers/doc_parser.py +12 -9
  47. janus/parsers/parser.py +7 -0
  48. janus/parsers/partition_parser.py +6 -4
  49. janus/parsers/reqs_parser.py +8 -5
  50. janus/parsers/uml.py +5 -4
  51. janus/prompts/prompt.py +2 -2
  52. janus/prompts/templates/README.md +30 -0
  53. janus/prompts/templates/basic_aggregation/human.txt +6 -0
  54. janus/prompts/templates/basic_aggregation/system.txt +1 -0
  55. janus/prompts/templates/basic_refinement/human.txt +14 -0
  56. janus/prompts/templates/basic_refinement/system.txt +1 -0
  57. janus/prompts/templates/diagram/human.txt +9 -0
  58. janus/prompts/templates/diagram/system.txt +1 -0
  59. janus/prompts/templates/diagram_with_documentation/human.txt +15 -0
  60. janus/prompts/templates/diagram_with_documentation/system.txt +1 -0
  61. janus/prompts/templates/document/human.txt +10 -0
  62. janus/prompts/templates/document/system.txt +1 -0
  63. janus/prompts/templates/document_cloze/human.txt +11 -0
  64. janus/prompts/templates/document_cloze/system.txt +1 -0
  65. janus/prompts/templates/document_cloze/variables.json +4 -0
  66. janus/prompts/templates/document_cloze/variables_asm.json +4 -0
  67. janus/prompts/templates/document_inline/human.txt +13 -0
  68. janus/prompts/templates/eval_prompts/incose/human.txt +32 -0
  69. janus/prompts/templates/eval_prompts/incose/system.txt +1 -0
  70. janus/prompts/templates/eval_prompts/incose/variables.json +3 -0
  71. janus/prompts/templates/eval_prompts/inline_comments/human.txt +49 -0
  72. janus/prompts/templates/eval_prompts/inline_comments/system.txt +1 -0
  73. janus/prompts/templates/eval_prompts/inline_comments/variables.json +3 -0
  74. janus/prompts/templates/micromanaged_mumps_v1.0/human.txt +23 -0
  75. janus/prompts/templates/micromanaged_mumps_v1.0/system.txt +3 -0
  76. janus/prompts/templates/micromanaged_mumps_v2.0/human.txt +28 -0
  77. janus/prompts/templates/micromanaged_mumps_v2.0/system.txt +3 -0
  78. janus/prompts/templates/micromanaged_mumps_v2.1/human.txt +29 -0
  79. janus/prompts/templates/micromanaged_mumps_v2.1/system.txt +3 -0
  80. janus/prompts/templates/multidocument/human.txt +15 -0
  81. janus/prompts/templates/multidocument/system.txt +1 -0
  82. janus/prompts/templates/partition/human.txt +22 -0
  83. janus/prompts/templates/partition/system.txt +1 -0
  84. janus/prompts/templates/partition/variables.json +4 -0
  85. janus/prompts/templates/pseudocode/human.txt +7 -0
  86. janus/prompts/templates/pseudocode/system.txt +7 -0
  87. janus/prompts/templates/refinement/fix_exceptions/human.txt +19 -0
  88. janus/prompts/templates/refinement/fix_exceptions/system.txt +1 -0
  89. janus/prompts/templates/refinement/format/code_format/human.txt +12 -0
  90. janus/prompts/templates/refinement/format/code_format/system.txt +1 -0
  91. janus/prompts/templates/refinement/format/requirements_format/human.txt +14 -0
  92. janus/prompts/templates/refinement/format/requirements_format/system.txt +1 -0
  93. janus/prompts/templates/refinement/hallucination/human.txt +13 -0
  94. janus/prompts/templates/refinement/hallucination/system.txt +1 -0
  95. janus/prompts/templates/refinement/reflection/human.txt +15 -0
  96. janus/prompts/templates/refinement/reflection/incose/human.txt +26 -0
  97. janus/prompts/templates/refinement/reflection/incose/system.txt +1 -0
  98. janus/prompts/templates/refinement/reflection/incose_deduplicate/human.txt +16 -0
  99. janus/prompts/templates/refinement/reflection/incose_deduplicate/system.txt +1 -0
  100. janus/prompts/templates/refinement/reflection/system.txt +1 -0
  101. janus/prompts/templates/refinement/revision/human.txt +16 -0
  102. janus/prompts/templates/refinement/revision/incose/human.txt +16 -0
  103. janus/prompts/templates/refinement/revision/incose/system.txt +1 -0
  104. janus/prompts/templates/refinement/revision/incose_deduplicate/human.txt +17 -0
  105. janus/prompts/templates/refinement/revision/incose_deduplicate/system.txt +1 -0
  106. janus/prompts/templates/refinement/revision/system.txt +1 -0
  107. janus/prompts/templates/refinement/uml/alc_fix_variables/human.txt +15 -0
  108. janus/prompts/templates/refinement/uml/alc_fix_variables/system.txt +2 -0
  109. janus/prompts/templates/refinement/uml/fix_connections/human.txt +15 -0
  110. janus/prompts/templates/refinement/uml/fix_connections/system.txt +2 -0
  111. janus/prompts/templates/requirements/human.txt +13 -0
  112. janus/prompts/templates/requirements/system.txt +2 -0
  113. janus/prompts/templates/retrieval/language_docs/human.txt +10 -0
  114. janus/prompts/templates/retrieval/language_docs/system.txt +1 -0
  115. janus/prompts/templates/simple/human.txt +16 -0
  116. janus/prompts/templates/simple/system.txt +3 -0
  117. janus/refiners/format.py +49 -0
  118. janus/refiners/refiner.py +113 -4
  119. janus/utils/enums.py +127 -112
  120. janus/utils/logger.py +2 -0
  121. {janus_llm-4.3.1.dist-info → janus_llm-4.3.5.dist-info}/METADATA +7 -7
  122. janus_llm-4.3.5.dist-info/RECORD +210 -0
  123. {janus_llm-4.3.1.dist-info → janus_llm-4.3.5.dist-info}/WHEEL +1 -1
  124. janus_llm-4.3.5.dist-info/entry_points.txt +3 -0
  125. janus/cli.py +0 -1488
  126. janus_llm-4.3.1.dist-info/RECORD +0 -115
  127. janus_llm-4.3.1.dist-info/entry_points.txt +0 -3
  128. {janus_llm-4.3.1.dist-info → janus_llm-4.3.5.dist-info}/LICENSE +0 -0
janus/cli.py DELETED
@@ -1,1488 +0,0 @@
1
- import json
2
- import logging
3
- import os
4
- import subprocess # nosec
5
- from pathlib import Path
6
- from typing import List, Optional
7
-
8
- import click
9
- import typer
10
- from pydantic import AnyHttpUrl
11
- from rich import print
12
- from rich.console import Console
13
- from rich.prompt import Confirm
14
- from typing_extensions import Annotated
15
-
16
- import janus.refiners.refiner
17
- import janus.refiners.uml
18
- from janus.converter.aggregator import Aggregator
19
- from janus.converter.converter import Converter
20
- from janus.converter.diagram import DiagramGenerator
21
- from janus.converter.document import Documenter, MadLibsDocumenter, MultiDocumenter
22
- from janus.converter.evaluate import InlineCommentEvaluator, RequirementEvaluator
23
- from janus.converter.partition import Partitioner
24
- from janus.converter.requirements import RequirementsDocumenter
25
- from janus.converter.translate import Translator
26
- from janus.embedding.collections import Collections
27
- from janus.embedding.database import ChromaEmbeddingDatabase
28
- from janus.embedding.embedding_models_info import (
29
- EMBEDDING_COST_PER_MODEL,
30
- EMBEDDING_MODEL_CONFIG_DIR,
31
- EMBEDDING_TOKEN_LIMITS,
32
- EmbeddingModelType,
33
- )
34
- from janus.embedding.vectorize import ChromaDBVectorizer
35
- from janus.language.binary import BinarySplitter
36
- from janus.language.mumps import MumpsSplitter
37
- from janus.language.naive.registry import CUSTOM_SPLITTERS
38
- from janus.language.treesitter import TreeSitterSplitter
39
- from janus.llm.model_callbacks import COST_PER_1K_TOKENS
40
- from janus.llm.models_info import (
41
- MODEL_CONFIG_DIR,
42
- MODEL_ID_TO_LONG_ID,
43
- MODEL_TYPE_CONSTRUCTORS,
44
- MODEL_TYPES,
45
- TOKEN_LIMITS,
46
- azure_models,
47
- bedrock_models,
48
- openai_models,
49
- )
50
- from janus.metrics.cli import evaluate
51
- from janus.utils.enums import LANGUAGES
52
- from janus.utils.logger import create_logger
53
-
54
- httpx_logger = logging.getLogger("httpx")
55
- httpx_logger.setLevel(logging.WARNING)
56
-
57
- log = create_logger(__name__)
58
- homedir = Path.home().expanduser()
59
-
60
- janus_dir = homedir / ".janus"
61
- if not janus_dir.exists():
62
- janus_dir.mkdir(parents=True)
63
-
64
- db_file = janus_dir / ".db"
65
- if not db_file.exists():
66
- with open(db_file, "w") as f:
67
- f.write(str(janus_dir / "chroma.db"))
68
-
69
- with open(db_file, "r") as f:
70
- db_loc = f.read()
71
-
72
- collections_config_file = Path(db_loc) / "collections.json"
73
-
74
-
75
- def get_subclasses(cls):
76
- return set(cls.__subclasses__()).union(
77
- set(s for c in cls.__subclasses__() for s in get_subclasses(c))
78
- )
79
-
80
-
81
- REFINER_TYPES = get_subclasses(janus.refiners.refiner.JanusRefiner).union(
82
- {janus.refiners.refiner.JanusRefiner}
83
- )
84
- REFINERS = {r.__name__: r for r in REFINER_TYPES}
85
-
86
-
87
- def get_collections_config():
88
- if collections_config_file.exists():
89
- with open(collections_config_file, "r") as f:
90
- config = json.load(f)
91
- else:
92
- config = {}
93
- return config
94
-
95
-
96
- app = typer.Typer(
97
- help=(
98
- "[bold][dark_orange]Janus[/dark_orange] is a CLI for translating, "
99
- "documenting, and diagramming code using large language models.[/bold]"
100
- ),
101
- add_completion=False,
102
- no_args_is_help=True,
103
- context_settings={"help_option_names": ["-h", "--help"]},
104
- rich_markup_mode="rich",
105
- )
106
-
107
-
108
- db = typer.Typer(
109
- help="Database commands",
110
- add_completion=False,
111
- no_args_is_help=True,
112
- context_settings={"help_option_names": ["-h", "--help"]},
113
- )
114
- llm = typer.Typer(
115
- help="LLM commands",
116
- add_completion=False,
117
- no_args_is_help=True,
118
- context_settings={"help_option_names": ["-h", "--help"]},
119
- )
120
-
121
- embedding = typer.Typer(
122
- help="Embedding model commands",
123
- add_completion=False,
124
- no_args_is_help=True,
125
- context_settings={"help_option_names": ["-h", "--help"]},
126
- )
127
-
128
-
129
- def version_callback(value: bool) -> None:
130
- if value:
131
- from . import __version__ as version
132
-
133
- print(f"Janus CLI [blue]v{version}[/blue]")
134
- raise typer.Exit()
135
-
136
-
137
- @app.callback()
138
- def common(
139
- ctx: typer.Context,
140
- version: bool = typer.Option(
141
- None,
142
- "--version",
143
- "-v",
144
- callback=version_callback,
145
- help="Print the version and exit.",
146
- ),
147
- ) -> None:
148
- """A function for getting the app version
149
-
150
- This will call the version_callback function to print the version and exit.
151
-
152
- Arguments:
153
- ctx: The typer context
154
- version: A boolean flag for the version
155
- """
156
- pass
157
-
158
-
159
- @app.command(
160
- help="Translate code from one language to another using an LLM.",
161
- no_args_is_help=True,
162
- )
163
- def translate(
164
- input_dir: Annotated[
165
- Path,
166
- typer.Option(
167
- "--input",
168
- "-i",
169
- help="The directory containing the source code to be translated. "
170
- "The files should all be in one flat directory.",
171
- ),
172
- ],
173
- source_lang: Annotated[
174
- str,
175
- typer.Option(
176
- "--source-language",
177
- "-s",
178
- help="The language of the source code.",
179
- click_type=click.Choice(sorted(LANGUAGES)),
180
- ),
181
- ],
182
- output_dir: Annotated[
183
- Path,
184
- typer.Option(
185
- "--output", "-o", help="The directory to store the translated code in."
186
- ),
187
- ],
188
- target_lang: Annotated[
189
- str,
190
- typer.Option(
191
- "--target-language",
192
- "-t",
193
- help="The desired output language to translate the source code to. The "
194
- "format can follow a 'language-version' syntax. Use 'text' to get plaintext"
195
- "results as returned by the LLM. Examples: `python-3.10`, `mumps`, `java-10`,"
196
- "text.",
197
- ),
198
- ],
199
- llm_name: Annotated[
200
- str,
201
- typer.Option(
202
- "--llm",
203
- "-L",
204
- help="The custom name of the model set with 'janus llm add'.",
205
- ),
206
- ],
207
- max_prompts: Annotated[
208
- int,
209
- typer.Option(
210
- "--max-prompts",
211
- "-m",
212
- help="The maximum number of times to prompt a model on one functional block "
213
- "before exiting the application. This is to prevent wasting too much money.",
214
- ),
215
- ] = 10,
216
- overwrite: Annotated[
217
- bool,
218
- typer.Option(
219
- "--overwrite/--preserve",
220
- help="Whether to overwrite existing files in the output directory",
221
- ),
222
- ] = False,
223
- skip_context: Annotated[
224
- bool,
225
- typer.Option(
226
- "--skip-context",
227
- help="Prompts will include any context information associated with source"
228
- " code blocks, unless this option is specified",
229
- ),
230
- ] = False,
231
- temp: Annotated[
232
- float,
233
- typer.Option("--temperature", "-T", help="Sampling temperature.", min=0, max=2),
234
- ] = 0.7,
235
- prompt_template: Annotated[
236
- str,
237
- typer.Option(
238
- "--prompt-template",
239
- "-p",
240
- help="Name of the Janus prompt template directory or "
241
- "path to a directory containing those template files.",
242
- ),
243
- ] = "simple",
244
- collection: Annotated[
245
- str,
246
- typer.Option(
247
- "--collection",
248
- "-c",
249
- help="If set, will put the translated result into a Chroma DB "
250
- "collection with the name provided.",
251
- ),
252
- ] = None,
253
- splitter_type: Annotated[
254
- str,
255
- typer.Option(
256
- "-S",
257
- "--splitter",
258
- help="Name of custom splitter to use",
259
- click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
260
- ),
261
- ] = "file",
262
- refiner_types: Annotated[
263
- list[str],
264
- typer.Option(
265
- "-r",
266
- "--refiner",
267
- help="List of refiner types to use. Add -r for each refiner to use in\
268
- refinement chain",
269
- click_type=click.Choice(list(REFINERS.keys())),
270
- ),
271
- ] = ["JanusRefiner"],
272
- retriever_type: Annotated[
273
- str,
274
- typer.Option(
275
- "-R",
276
- "--retriever",
277
- help="Name of custom retriever to use",
278
- click_type=click.Choice(["active_usings", "language_docs"]),
279
- ),
280
- ] = None,
281
- max_tokens: Annotated[
282
- int,
283
- typer.Option(
284
- "--max-tokens",
285
- "-M",
286
- help="The maximum number of tokens the model will take in. "
287
- "If unspecificed, model's default max will be used.",
288
- ),
289
- ] = None,
290
- ):
291
- refiner_types = [REFINERS[r] for r in refiner_types]
292
- try:
293
- target_language, target_version = target_lang.split("-")
294
- except ValueError:
295
- target_language = target_lang
296
- target_version = None
297
- # make sure not overwriting input
298
- if source_lang.lower() == target_language.lower() and input_dir == output_dir:
299
- log.error("Output files would overwrite input! Aborting...")
300
- raise ValueError
301
-
302
- model_arguments = dict(temperature=temp)
303
- collections_config = get_collections_config()
304
- translator = Translator(
305
- model=llm_name,
306
- model_arguments=model_arguments,
307
- source_language=source_lang,
308
- target_language=target_language,
309
- target_version=target_version,
310
- max_prompts=max_prompts,
311
- max_tokens=max_tokens,
312
- prompt_template=prompt_template,
313
- db_path=db_loc,
314
- db_config=collections_config,
315
- splitter_type=splitter_type,
316
- refiner_types=refiner_types,
317
- retriever_type=retriever_type,
318
- )
319
- translator.translate(input_dir, output_dir, overwrite, collection)
320
-
321
-
322
- @app.command(
323
- help="Document input code using an LLM.",
324
- no_args_is_help=True,
325
- )
326
- def document(
327
- input_dir: Annotated[
328
- Path,
329
- typer.Option(
330
- "--input",
331
- "-i",
332
- help="The directory containing the source code to be translated. "
333
- "The files should all be in one flat directory.",
334
- ),
335
- ],
336
- language: Annotated[
337
- str,
338
- typer.Option(
339
- "--language",
340
- "-l",
341
- help="The language of the source code.",
342
- click_type=click.Choice(sorted(LANGUAGES)),
343
- ),
344
- ],
345
- output_dir: Annotated[
346
- Path,
347
- typer.Option(
348
- "--output-dir", "-o", help="The directory to store the translated code in."
349
- ),
350
- ],
351
- llm_name: Annotated[
352
- str,
353
- typer.Option(
354
- "--llm",
355
- "-L",
356
- help="The custom name of the model set with 'janus llm add'.",
357
- ),
358
- ],
359
- max_prompts: Annotated[
360
- int,
361
- typer.Option(
362
- "--max-prompts",
363
- "-m",
364
- help="The maximum number of times to prompt a model on one functional block "
365
- "before exiting the application. This is to prevent wasting too much money.",
366
- ),
367
- ] = 10,
368
- overwrite: Annotated[
369
- bool,
370
- typer.Option(
371
- "--overwrite/--preserve",
372
- help="Whether to overwrite existing files in the output directory",
373
- ),
374
- ] = False,
375
- doc_mode: Annotated[
376
- str,
377
- typer.Option(
378
- "--doc-mode",
379
- "-d",
380
- help="The documentation mode.",
381
- click_type=click.Choice(["madlibs", "summary", "multidoc", "requirements"]),
382
- ),
383
- ] = "madlibs",
384
- comments_per_request: Annotated[
385
- int,
386
- typer.Option(
387
- "--comments-per-request",
388
- "-rc",
389
- help="The maximum number of comments to generate per request when using "
390
- "MadLibs documentation mode.",
391
- ),
392
- ] = None,
393
- drop_comments: Annotated[
394
- bool,
395
- typer.Option(
396
- "--drop-comments/--keep-comments",
397
- help="Whether to drop or keep comments in the code sent to the LLM",
398
- ),
399
- ] = False,
400
- temperature: Annotated[
401
- float,
402
- typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
403
- ] = 0.7,
404
- collection: Annotated[
405
- str,
406
- typer.Option(
407
- "--collection",
408
- "-c",
409
- help="If set, will put the translated result into a Chroma DB "
410
- "collection with the name provided.",
411
- ),
412
- ] = None,
413
- splitter_type: Annotated[
414
- str,
415
- typer.Option(
416
- "-S",
417
- "--splitter",
418
- help="Name of custom splitter to use",
419
- click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
420
- ),
421
- ] = "file",
422
- refiner_types: Annotated[
423
- list[str],
424
- typer.Option(
425
- "-r",
426
- "--refiner",
427
- help="List of refiner types to use. Add -r for each refiner to use in\
428
- refinement chain",
429
- click_type=click.Choice(list(REFINERS.keys())),
430
- ),
431
- ] = ["JanusRefiner"],
432
- retriever_type: Annotated[
433
- str,
434
- typer.Option(
435
- "-R",
436
- "--retriever",
437
- help="Name of custom retriever to use",
438
- click_type=click.Choice(["active_usings", "language_docs"]),
439
- ),
440
- ] = None,
441
- max_tokens: Annotated[
442
- int,
443
- typer.Option(
444
- "--max-tokens",
445
- "-M",
446
- help="The maximum number of tokens the model will take in. "
447
- "If unspecificed, model's default max will be used.",
448
- ),
449
- ] = None,
450
- ):
451
- refiner_types = [REFINERS[r] for r in refiner_types]
452
- model_arguments = dict(temperature=temperature)
453
- collections_config = get_collections_config()
454
- kwargs = dict(
455
- model=llm_name,
456
- model_arguments=model_arguments,
457
- source_language=language,
458
- max_prompts=max_prompts,
459
- max_tokens=max_tokens,
460
- db_path=db_loc,
461
- db_config=collections_config,
462
- splitter_type=splitter_type,
463
- refiner_types=refiner_types,
464
- retriever_type=retriever_type,
465
- )
466
- if doc_mode == "madlibs":
467
- documenter = MadLibsDocumenter(
468
- comments_per_request=comments_per_request, **kwargs
469
- )
470
- elif doc_mode == "multidoc":
471
- documenter = MultiDocumenter(drop_comments=drop_comments, **kwargs)
472
- elif doc_mode == "requirements":
473
- documenter = RequirementsDocumenter(drop_comments=drop_comments, **kwargs)
474
- else:
475
- documenter = Documenter(drop_comments=drop_comments, **kwargs)
476
-
477
- documenter.translate(input_dir, output_dir, overwrite, collection)
478
-
479
-
480
- @app.command()
481
- def aggregate(
482
- input_dir: Annotated[
483
- Path,
484
- typer.Option(
485
- "--input",
486
- "-i",
487
- help="The directory containing the source code to be translated. "
488
- "The files should all be in one flat directory.",
489
- ),
490
- ],
491
- language: Annotated[
492
- str,
493
- typer.Option(
494
- "--language",
495
- "-l",
496
- help="The language of the source code.",
497
- click_type=click.Choice(sorted(LANGUAGES)),
498
- ),
499
- ],
500
- output_dir: Annotated[
501
- Path,
502
- typer.Option(
503
- "--output-dir", "-o", help="The directory to store the translated code in."
504
- ),
505
- ],
506
- llm_name: Annotated[
507
- str,
508
- typer.Option(
509
- "--llm",
510
- "-L",
511
- help="The custom name of the model set with 'janus llm add'.",
512
- ),
513
- ],
514
- max_prompts: Annotated[
515
- int,
516
- typer.Option(
517
- "--max-prompts",
518
- "-m",
519
- help="The maximum number of times to prompt a model on one functional block "
520
- "before exiting the application. This is to prevent wasting too much money.",
521
- ),
522
- ] = 10,
523
- overwrite: Annotated[
524
- bool,
525
- typer.Option(
526
- "--overwrite/--preserve",
527
- help="Whether to overwrite existing files in the output directory",
528
- ),
529
- ] = False,
530
- temperature: Annotated[
531
- float,
532
- typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
533
- ] = 0.7,
534
- collection: Annotated[
535
- str,
536
- typer.Option(
537
- "--collection",
538
- "-c",
539
- help="If set, will put the translated result into a Chroma DB "
540
- "collection with the name provided.",
541
- ),
542
- ] = None,
543
- splitter_type: Annotated[
544
- str,
545
- typer.Option(
546
- "-S",
547
- "--splitter",
548
- help="Name of custom splitter to use",
549
- click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
550
- ),
551
- ] = "file",
552
- intermediate_converters: Annotated[
553
- List[str],
554
- typer.Option(
555
- "-C",
556
- "--converter",
557
- help="Name of an intermediate converter to use",
558
- click_type=click.Choice([c.__name__ for c in get_subclasses(Converter)]),
559
- ),
560
- ] = ["Documenter"],
561
- ):
562
- converter_subclasses = get_subclasses(Converter)
563
- converter_subclasses_map = {c.__name__: c for c in converter_subclasses}
564
- model_arguments = dict(temperature=temperature)
565
- collections_config = get_collections_config()
566
- converters = []
567
- for ic in intermediate_converters:
568
- converters.append(
569
- converter_subclasses_map[ic](
570
- model=llm_name,
571
- model_arguments=model_arguments,
572
- source_language=language,
573
- max_prompts=max_prompts,
574
- db_path=db_loc,
575
- db_config=collections_config,
576
- splitter_type=splitter_type,
577
- )
578
- )
579
-
580
- aggregator = Aggregator(
581
- intermediate_converters=converters,
582
- model=llm_name,
583
- model_arguments=model_arguments,
584
- source_language=language,
585
- max_prompts=max_prompts,
586
- db_path=db_loc,
587
- db_config=collections_config,
588
- splitter_type=splitter_type,
589
- prompt_template="basic_aggregation",
590
- )
591
- aggregator.translate(input_dir, output_dir, overwrite, collection)
592
-
593
-
594
- @app.command(
595
- help="Partition input code using an LLM.",
596
- no_args_is_help=True,
597
- )
598
- def partition(
599
- input_dir: Annotated[
600
- Path,
601
- typer.Option(
602
- "--input",
603
- "-i",
604
- help="The directory containing the source code to be partitioned. ",
605
- ),
606
- ],
607
- language: Annotated[
608
- str,
609
- typer.Option(
610
- "--language",
611
- "-l",
612
- help="The language of the source code.",
613
- click_type=click.Choice(sorted(LANGUAGES)),
614
- ),
615
- ],
616
- output_dir: Annotated[
617
- Path,
618
- typer.Option(
619
- "--output-dir", "-o", help="The directory to store the partitioned code in."
620
- ),
621
- ],
622
- llm_name: Annotated[
623
- str,
624
- typer.Option(
625
- "--llm",
626
- "-L",
627
- help="The custom name of the model set with 'janus llm add'.",
628
- ),
629
- ] = "gpt-4o",
630
- max_prompts: Annotated[
631
- int,
632
- typer.Option(
633
- "--max-prompts",
634
- "-m",
635
- help="The maximum number of times to prompt a model on one functional block "
636
- "before exiting the application. This is to prevent wasting too much money.",
637
- ),
638
- ] = 10,
639
- overwrite: Annotated[
640
- bool,
641
- typer.Option(
642
- "--overwrite/--preserve",
643
- help="Whether to overwrite existing files in the output directory",
644
- ),
645
- ] = False,
646
- temperature: Annotated[
647
- float,
648
- typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
649
- ] = 0.7,
650
- splitter_type: Annotated[
651
- str,
652
- typer.Option(
653
- "-S",
654
- "--splitter",
655
- help="Name of custom splitter to use",
656
- click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
657
- ),
658
- ] = "file",
659
- refiner_types: Annotated[
660
- list[str],
661
- typer.Option(
662
- "-r",
663
- "--refiner",
664
- help="List of refiner types to use. Add -r for each refiner to use in\
665
- refinement chain",
666
- click_type=click.Choice(list(REFINERS.keys())),
667
- ),
668
- ] = ["JanusRefiner"],
669
- max_tokens: Annotated[
670
- int,
671
- typer.Option(
672
- "--max-tokens",
673
- "-M",
674
- help="The maximum number of tokens the model will take in. "
675
- "If unspecificed, model's default max will be used.",
676
- ),
677
- ] = None,
678
- partition_token_limit: Annotated[
679
- int,
680
- typer.Option(
681
- "--partition-tokens",
682
- "-pt",
683
- help="The limit on the number of tokens per partition.",
684
- ),
685
- ] = 8192,
686
- ):
687
- refiner_types = [REFINERS[r] for r in refiner_types]
688
- model_arguments = dict(temperature=temperature)
689
- kwargs = dict(
690
- model=llm_name,
691
- model_arguments=model_arguments,
692
- source_language=language,
693
- max_prompts=max_prompts,
694
- max_tokens=max_tokens,
695
- splitter_type=splitter_type,
696
- refiner_types=refiner_types,
697
- partition_token_limit=partition_token_limit,
698
- )
699
- partitioner = Partitioner(**kwargs)
700
- partitioner.translate(input_dir, output_dir, overwrite)
701
-
702
-
703
- @app.command(
704
- help="Diagram input code using an LLM.",
705
- no_args_is_help=True,
706
- )
707
- def diagram(
708
- input_dir: Annotated[
709
- Path,
710
- typer.Option(
711
- "--input",
712
- "-i",
713
- help="The directory containing the source code to be translated. "
714
- "The files should all be in one flat directory.",
715
- ),
716
- ],
717
- language: Annotated[
718
- str,
719
- typer.Option(
720
- "--language",
721
- "-l",
722
- help="The language of the source code.",
723
- click_type=click.Choice(sorted(LANGUAGES)),
724
- ),
725
- ],
726
- output_dir: Annotated[
727
- Path,
728
- typer.Option(
729
- "--output-dir", "-o", help="The directory to store the translated code in."
730
- ),
731
- ],
732
- llm_name: Annotated[
733
- str,
734
- typer.Option(
735
- "--llm",
736
- "-L",
737
- help="The custom name of the model set with 'janus llm add'.",
738
- ),
739
- ],
740
- max_prompts: Annotated[
741
- int,
742
- typer.Option(
743
- "--max-prompts",
744
- "-m",
745
- help="The maximum number of times to prompt a model on one functional block "
746
- "before exiting the application. This is to prevent wasting too much money.",
747
- ),
748
- ] = 10,
749
- overwrite: Annotated[
750
- bool,
751
- typer.Option(
752
- "--overwrite/--preserve",
753
- help="Whether to overwrite existing files in the output directory",
754
- ),
755
- ] = False,
756
- temperature: Annotated[
757
- float,
758
- typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
759
- ] = 0.7,
760
- collection: Annotated[
761
- str,
762
- typer.Option(
763
- "--collection",
764
- "-c",
765
- help="If set, will put the translated result into a Chroma DB "
766
- "collection with the name provided.",
767
- ),
768
- ] = None,
769
- diagram_type: Annotated[
770
- str,
771
- typer.Option(
772
- "--diagram-type", "-dg", help="Diagram type to generate in PLANTUML"
773
- ),
774
- ] = "Activity",
775
- add_documentation: Annotated[
776
- bool,
777
- typer.Option(
778
- "--add-documentation/--no-documentation",
779
- "-ad",
780
- help="Whether to use documentation in generation",
781
- ),
782
- ] = False,
783
- splitter_type: Annotated[
784
- str,
785
- typer.Option(
786
- "-S",
787
- "--splitter",
788
- help="Name of custom splitter to use",
789
- click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
790
- ),
791
- ] = "file",
792
- refiner_types: Annotated[
793
- list[str],
794
- typer.Option(
795
- "-r",
796
- "--refiner",
797
- help="List of refiner types to use. Add -r for each refiner to use in\
798
- refinement chain",
799
- click_type=click.Choice(list(REFINERS.keys())),
800
- ),
801
- ] = ["JanusRefiner"],
802
- retriever_type: Annotated[
803
- str,
804
- typer.Option(
805
- "-R",
806
- "--retriever",
807
- help="Name of custom retriever to use",
808
- click_type=click.Choice(["active_usings", "language_docs"]),
809
- ),
810
- ] = None,
811
- ):
812
- refiner_types = [REFINERS[r] for r in refiner_types]
813
- model_arguments = dict(temperature=temperature)
814
- collections_config = get_collections_config()
815
- diagram_generator = DiagramGenerator(
816
- model=llm_name,
817
- model_arguments=model_arguments,
818
- source_language=language,
819
- max_prompts=max_prompts,
820
- db_path=db_loc,
821
- db_config=collections_config,
822
- splitter_type=splitter_type,
823
- refiner_types=refiner_types,
824
- retriever_type=retriever_type,
825
- diagram_type=diagram_type,
826
- add_documentation=add_documentation,
827
- )
828
- diagram_generator.translate(input_dir, output_dir, overwrite, collection)
829
-
830
-
831
@app.command(
    help="LLM self evaluation",
    no_args_is_help=True,
)
def llm_self_eval(
    input_dir: Annotated[
        Path,
        typer.Option(
            "--input",
            "-i",
            help="The directory containing the source code to be evaluated. "
            "The files should all be in one flat directory.",
        ),
    ],
    language: Annotated[
        str,
        typer.Option(
            "--language",
            "-l",
            help="The language of the source code.",
            click_type=click.Choice(sorted(LANGUAGES)),
        ),
    ],
    output_dir: Annotated[
        Path,
        typer.Option(
            "--output-dir", "-o", help="The directory to store the evaluations in."
        ),
    ],
    llm_name: Annotated[
        str,
        typer.Option(
            "--llm",
            "-L",
            help="The custom name of the model set with 'janus llm add'.",
        ),
    ] = "gpt-4o",
    evaluation_type: Annotated[
        str,
        typer.Option(
            "--evaluation-type",
            "-e",
            help="Type of output to evaluate.",
            click_type=click.Choice(["incose", "comments"]),
        ),
    ] = "incose",
    max_prompts: Annotated[
        int,
        typer.Option(
            "--max-prompts",
            "-m",
            help="The maximum number of times to prompt a model on one functional block "
            "before exiting the application. This is to prevent wasting too much money.",
        ),
    ] = 10,
    overwrite: Annotated[
        bool,
        typer.Option(
            "--overwrite/--preserve",
            help="Whether to overwrite existing files in the output directory",
        ),
    ] = False,
    temperature: Annotated[
        float,
        typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
    ] = 0.7,
    # Optional[...] annotations: these defaults are None, so plain `str`/`int`
    # annotations were inaccurate.
    collection: Annotated[
        Optional[str],
        typer.Option(
            "--collection",
            "-c",
            help="If set, will put the translated result into a Chroma DB "
            "collection with the name provided.",
        ),
    ] = None,
    splitter_type: Annotated[
        str,
        typer.Option(
            "-S",
            "--splitter",
            help="Name of custom splitter to use",
            click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
        ),
    ] = "file",
    refiner_types: Annotated[
        list[str],
        typer.Option(
            "-r",
            "--refiner",
            # Implicit concatenation instead of a backslash continuation, which
            # embedded source indentation into the user-visible help text.
            help="List of refiner types to use. Add -r for each refiner to use in "
            "refinement chain",
            click_type=click.Choice(list(REFINERS.keys())),
        ),
    ] = ["JanusRefiner"],
    eval_items_per_request: Annotated[
        Optional[int],
        typer.Option(
            "--eval-items-per-request",
            "-rc",
            help="The maximum number of evaluation items per request",
        ),
    ] = None,
    max_tokens: Annotated[
        Optional[int],
        typer.Option(
            "--max-tokens",
            "-M",
            help="The maximum number of tokens the model will take in. "
            "If unspecificed, model's default max will be used.",
        ),
    ] = None,
):
    """Evaluate LLM-generated output (INCOSE requirements or inline comments).

    Builds the evaluator matching ``evaluation_type`` and runs it over every
    file in ``input_dir``, writing results to ``output_dir``.
    """
    model_arguments = dict(temperature=temperature)
    refiner_types = [REFINERS[r] for r in refiner_types]
    kwargs = dict(
        eval_items_per_request=eval_items_per_request,
        model=llm_name,
        model_arguments=model_arguments,
        source_language=language,
        max_prompts=max_prompts,
        max_tokens=max_tokens,
        splitter_type=splitter_type,
        refiner_types=refiner_types,
    )
    # Setting parser type here
    if evaluation_type == "incose":
        evaluator = RequirementEvaluator(**kwargs)
    elif evaluation_type == "comments":
        evaluator = InlineCommentEvaluator(**kwargs)
    else:
        # click.Choice should make this unreachable, but fail loudly rather
        # than with an UnboundLocalError if it ever slips through.
        raise ValueError(f"Unknown evaluation type {evaluation_type}")

    evaluator.translate(input_dir, output_dir, overwrite, collection)
962
-
963
-
964
@db.command("init", help="Connect to or create a database.")
def db_init(
    path: Annotated[
        str, typer.Option("--path", "-p", help="The path to the database file.")
    ] = str(janus_dir / "chroma.db"),
    url: Annotated[
        str,
        typer.Option(
            "--url",
            "-u",
            help="The URL of the database if the database is running externally.",
        ),
    ] = "",
) -> None:
    """Point janus at a Chroma DB instance, creating a local one if needed."""
    global db_loc
    if url != "":
        # An explicit URL takes precedence over the local file path.
        print(f"Pointing to Chroma DB at {url}")
        location = url
    else:
        location = os.path.abspath(path)
        print(f"Setting up Chroma DB at {location}")
    # Persist the chosen location so later commands resolve the same DB.
    with open(db_file, "w") as f:
        f.write(location)
    db_loc = location
    global embedding_db
    embedding_db = ChromaEmbeddingDatabase(db_loc)
992
-
993
-
994
@db.command("status", help="Print current database location.")
def db_status():
    """Report where the Chroma DB pointer currently resolves."""
    print(f"Chroma DB currently pointing to {db_loc}")
997
-
998
-
999
@db.command(
    "ls",
    help="List the current database's collections. Or supply a collection name to list "
    "information about its contents.",
)
def db_ls(
    collection_name: Annotated[
        Optional[str], typer.Argument(help="The name of the collection.")
    ] = None,
    peek: Annotated[
        Optional[int],
        typer.Option("--peek", "-p", help="Peek at N entries for a specific collection."),
    ] = None,
) -> None:
    """List the current database's collections"""
    # Peeking only makes sense for a single, named collection.
    if peek is not None and collection_name is None:
        print(
            "\n[bold red]Cannot peek at all collections. Please specify a "
            "collection by name.[/bold red]"
        )
        return
    database = ChromaEmbeddingDatabase(db_loc)
    for coll in Collections(database).get(collection_name):
        print(
            f"\n[bold underline]Collection[/bold underline]: "
            f"[bold salmon1]{coll.name}[/bold salmon1]"
        )
        print(f" ID: {coll.id}")
        print(f" Metadata: {coll.metadata}")
        print(f" Tenant: [green]{coll.tenant}[/green]")
        print(f" Database: [green]{coll.database}[/green]")
        print(f" Length: {coll.count()}")
        if peek:
            entry = coll.peek(peek)
            # Truncate the (large) embedding vectors for display.
            entry["embeddings"] = entry["embeddings"][0][:2] + ["..."]
            header = (
                " [bold]Peeking at first entry[/bold]:"
                if peek == 1
                else f" [bold]Peeking at first {peek} entries[/bold]:"
            )
            print(header)
            print(entry)
        print()
1042
-
1043
-
1044
@db.command("add", help="Add a collection to the current database.")
def db_add(
    collection_name: Annotated[str, typer.Argument(help="The name of the collection.")],
    model_name: Annotated[str, typer.Argument(help="The name of the embedding model.")],
    input_dir: Annotated[
        str,
        typer.Option(
            "--input",
            "-i",
            help="The directory containing the source code to be added.",
        ),
    ] = "./",
    input_lang: Annotated[
        str, typer.Option("--language", "-l", help="The language of the source code.")
    ] = "python",
    max_tokens: Annotated[
        int,
        typer.Option(
            "--max-tokens",
            "-m",
            help="The maximum number of tokens for each chunk of input source code.",
        ),
    ] = 4096,
) -> None:
    """Add a collection to the database

    Arguments:
        collection_name: The name of the collection to add
        model_name: The name of the embedding model to use
        input_dir: The directory containing the source code to be added
        input_lang: The language of the source code
        max_tokens: The maximum number of tokens for each chunk of input source code
    """
    # TODO: import factory
    console = Console()

    added_to = _check_collection(collection_name, input_dir)
    collections_config = get_collections_config()

    with console.status(
        f"Adding collection: [bold salmon]{collection_name}[/bold salmon]",
        spinner="arrow3",
    ):
        vectorizer_factory = ChromaDBVectorizer()
        vectorizer = vectorizer_factory.create_vectorizer(
            path=db_loc, config=collections_config
        )
        vectorizer.get_or_create_collection(collection_name, model_name=model_name)
        input_dir = Path(input_dir)
        suffix = LANGUAGES[input_lang]["suffix"]
        source_glob = f"**/*.{suffix}"
        input_paths = [p for p in input_dir.rglob(source_glob)]
        # Pick a splitter for the source language. Fall through to the
        # tree-sitter splitter for anything without a dedicated custom
        # splitter; previously a custom-splitter language other than "mumps"
        # or "binary" could leave `splitter` unbound and raise NameError at
        # split time.
        if input_lang == "mumps":
            splitter = MumpsSplitter(
                max_tokens=max_tokens,
            )
        elif input_lang == "binary":
            splitter = BinarySplitter(
                max_tokens=max_tokens,
            )
        else:
            splitter = TreeSitterSplitter(
                language=input_lang,
                max_tokens=max_tokens,
            )
        for input_path in input_paths:
            input_block = splitter.split(input_path)
            vectorizer.add_nodes_recursively(
                input_block,
                collection_name,
                input_path.name,
            )
    # Count every non-directory file so we can report how many were skipped.
    total_files = len([p for p in input_dir.glob("**/*") if not p.is_dir()])
    if added_to:
        print(
            f"\nAdded to [bold salmon1]{collection_name}[/bold salmon1]:\n"
            f"  Embedding Model: [green]{model_name}[/green]\n"
            f"  Input Directory: {input_dir.absolute()}\n"
            f"  {input_lang.capitalize()} [green]*.{suffix}[/green] Files: "
            f"{len(input_paths)}\n"
            "  Other Files (skipped): "
            f"{total_files - len(input_paths)}\n"
        )
        # NOTE: a dead, result-discarding list comprehension over
        # Path.glob(f"**/*.{suffix}") was removed here — it had no effect.
    else:
        print(
            f"\nCreated [bold salmon1]{collection_name}[/bold salmon1]:\n"
            f"  Embedding Model: '{model_name}'\n"
            f"  Input Directory: {input_dir.absolute()}\n"
            f"  {input_lang.capitalize()} [green]*.{suffix}[/green] Files: "
            f"{len(input_paths)}\n"
            "  Other Files (skipped): "
            f"{total_files - len(input_paths)}\n"
        )
    with open(collections_config_file, "w") as f:
        json.dump(vectorizer.config, f, indent=2)
1141
-
1142
-
1143
@db.command(
    "rm",
    help="Remove a collection from the database.",
)
def db_rm(
    collection_name: Annotated[str, typer.Argument(help="The name of the collection.")],
    confirm: Annotated[
        bool,
        typer.Option(
            "--yes",
            "-y",
            help="Confirm the removal of the collection.",
        ),
    ],
) -> None:
    """Remove a collection from the database

    Arguments:
        collection_name: The name of the collection to remove
    """
    # Ask interactively unless the caller already confirmed with --yes.
    delete = confirm or Confirm.ask(
        f"\nAre you sure you want to [bold red]remove[/bold red] "
        f"[bold salmon1]{collection_name}[/bold salmon1]?",
    )
    if not delete:
        raise typer.Abort()
    collections = Collections(ChromaEmbeddingDatabase(db_loc))
    collections.delete(collection_name)
    print(
        f"[bold red]Removed[/bold red] collection "
        f"[bold salmon1]{collection_name}[/bold salmon1]"
    )
1179
-
1180
-
1181
def _check_collection(collection_name: str, input_dir: str | Path) -> bool:
    """Return True if ``collection_name`` already exists in the database.

    The caller uses the result only to phrase its summary ("Added to" vs
    "Created"); interactive confirmation was intentionally disabled.
    """
    collections = Collections(ChromaEmbeddingDatabase(db_loc))
    try:
        collections.get(collection_name)
    except ValueError:
        # The collection does not exist yet; the caller will create it.
        return False
    return True
1198
-
1199
-
1200
@llm.command("add", help="Add a model config to janus")
def llm_add(
    model_name: Annotated[
        str, typer.Argument(help="The user's custom name of the model")
    ],
    model_type: Annotated[
        str,
        typer.Option(
            "--type",
            "-t",
            help="The type of the model",
            click_type=click.Choice(sorted(list(MODEL_TYPE_CONSTRUCTORS.keys()))),
        ),
    ] = "Azure",
):
    """Interactively build a model config and write it to MODEL_CONFIG_DIR.

    Prompts differ per provider; the resulting JSON records the model type,
    its constructor arguments, its token limit, and its per-1k-token cost.

    Raises:
        ValueError: If ``model_type`` is not a recognized provider.
    """
    if not MODEL_CONFIG_DIR.exists():
        MODEL_CONFIG_DIR.mkdir(parents=True)
    model_cfg = MODEL_CONFIG_DIR / f"{model_name}.json"
    if model_type == "HuggingFace":
        url = typer.prompt("Enter the model's URL")
        max_tokens = typer.prompt(
            "Enter the model's maximum tokens", default=4096, type=int
        )
        in_cost = typer.prompt("Enter the cost per input token", default=0, type=float)
        out_cost = typer.prompt("Enter the cost per output token", default=0, type=float)
        params = dict(
            inference_server_url=url,
            max_new_tokens=max_tokens,
            top_k=10,
            top_p=0.95,
            typical_p=0.95,
            temperature=0.01,
            repetition_penalty=1.03,
            timeout=240,
        )
        cfg = {
            "model_type": model_type,
            "model_args": params,
            "token_limit": max_tokens,
            "model_cost": {"input": in_cost, "output": out_cost},
        }
    elif model_type == "HuggingFaceLocal":
        model_id = typer.prompt("Enter the model ID")
        task = typer.prompt("Enter the task")
        max_tokens = typer.prompt(
            "Enter the model's maximum tokens", default=4096, type=int
        )
        # Local models incur no per-token cost.
        in_cost = 0
        out_cost = 0
        params = {"model_id": model_id, "task": task}
        cfg = {
            "model_type": model_type,
            "model_args": params,
            "token_limit": max_tokens,
            "model_cost": {"input": in_cost, "output": out_cost},
        }
    elif model_type == "OpenAI":
        print("DEPRECATED: Use 'Azure' instead. CTRL+C to exit.")
        model_id = typer.prompt(
            "Enter the model ID (list model IDs with `janus llm ls -a`)",
            default="gpt-4o",
            type=click.Choice(openai_models),
            show_choices=False,
        )
        params = dict(
            # FIX: use the prompted OpenAI model ID, not the user's custom
            # alias (`model_name`), which need not be a key of the token/cost
            # tables. This matches the Azure and Bedrock branches below.
            model_name=model_id,
            temperature=0.7,
            n=1,
        )
        max_tokens = TOKEN_LIMITS[model_id]
        model_cost = COST_PER_1K_TOKENS[model_id]
        cfg = {
            "model_type": model_type,
            "model_id": model_id,
            "model_args": params,
            "token_limit": max_tokens,
            "model_cost": model_cost,
        }
    elif model_type == "Azure":
        model_id = typer.prompt(
            "Enter the model ID (list model IDs with `janus llm ls -a`)",
            default="gpt-4o",
            type=click.Choice(azure_models),
            show_choices=False,
        )
        params = dict(
            # Azure uses the "azure_deployment" key for what we're calling "long_model_id"
            azure_deployment=MODEL_ID_TO_LONG_ID[model_id],
            temperature=0.7,
            n=1,
        )
        max_tokens = TOKEN_LIMITS[MODEL_ID_TO_LONG_ID[model_id]]
        model_cost = COST_PER_1K_TOKENS[MODEL_ID_TO_LONG_ID[model_id]]
        cfg = {
            "model_type": model_type,
            "model_id": model_id,
            "model_args": params,
            "token_limit": max_tokens,
            "model_cost": model_cost,
        }
    elif model_type == "BedrockChat" or model_type == "Bedrock":
        model_id = typer.prompt(
            "Enter the model ID (list model IDs with `janus llm ls -a`)",
            default="bedrock-claude-sonnet",
            type=click.Choice(bedrock_models),
            show_choices=False,
        )
        params = dict(
            # Bedrock uses the "model_id" key for what we're calling "long_model_id"
            model_id=MODEL_ID_TO_LONG_ID[model_id],
            model_kwargs={"temperature": 0.7},
        )
        max_tokens = TOKEN_LIMITS[MODEL_ID_TO_LONG_ID[model_id]]
        model_cost = COST_PER_1K_TOKENS[MODEL_ID_TO_LONG_ID[model_id]]
        cfg = {
            "model_type": model_type,
            "model_id": model_id,
            "model_args": params,
            "token_limit": max_tokens,
            "model_cost": model_cost,
        }
    else:
        raise ValueError(f"Unknown model type {model_type}")
    with open(model_cfg, "w") as f:
        json.dump(cfg, f, indent=2)
    print(f"Model config written to {model_cfg}")
1326
-
1327
-
1328
@llm.command("ls", help="List all of the user-configured models")
def llm_ls(
    all: Annotated[
        bool,
        typer.Option(
            "--all",
            "-a",
            is_flag=True,
            help="List all models, including the default model IDs.",
            # FIX: removed a nonsensical click_type=click.Choice(...) that was
            # copy-pasted from llm_add's --type option onto this boolean flag.
        ),
    ] = False,
):
    """List user-configured models, optionally with the built-in model IDs."""
    print("\n[green]User-configured models[/green]:")
    for model_cfg in MODEL_CONFIG_DIR.glob("*.json"):
        with open(model_cfg, "r") as f:
            cfg = json.load(f)
        print(f"\t[blue]{model_cfg.stem}[/blue]: [purple]{cfg['model_type']}[/purple]")

    if all:
        print("\n[green]Available model IDs[/green]:")
        for model_id, model_type in MODEL_TYPES.items():
            print(f"\t[blue]{model_id}[/blue]: [purple]{model_type}[/purple]")
1351
-
1352
-
1353
@embedding.command("add", help="Add an embedding model config to janus")
def embedding_add(
    model_name: Annotated[
        str, typer.Argument(help="The user's custom name for the model")
    ],
    model_type: Annotated[
        str,
        typer.Option(
            "--type",
            "-t",
            help="The type of the model",
            click_type=click.Choice(list(val.value for val in EmbeddingModelType)),
        ),
    ] = "OpenAI",
):
    """Interactively collect an embedding-model config and write it to disk."""
    if not EMBEDDING_MODEL_CONFIG_DIR.exists():
        EMBEDDING_MODEL_CONFIG_DIR.mkdir(parents=True)
    model_cfg = EMBEDDING_MODEL_CONFIG_DIR / f"{model_name}.json"
    if model_type in EmbeddingModelType.HuggingFaceInferenceAPI.values:
        # Remote HuggingFace inference endpoint: needs URL, name, key, costs.
        hf = typer.style("HuggingFaceInferenceAPI", fg="yellow")
        url = typer.prompt(f"Enter the {hf} model's URL", type=str, value_proc=AnyHttpUrl)
        api_model_name = typer.prompt("Enter the model's name", type=str, default="")
        api_key = typer.prompt("Enter the API key", type=str, default="")
        max_tokens = typer.prompt(
            "Enter the model's maximum tokens", default=8191, type=int
        )
        in_cost = typer.prompt("Enter the cost per input token", default=0, type=float)
        out_cost = typer.prompt("Enter the cost per output token", default=0, type=float)
        cfg = {
            "model_type": model_type,
            "model_identifier": str(url),
            "model_args": dict(
                model_name=api_model_name,
                api_key=api_key,
            ),
            "token_limit": max_tokens,
            "model_cost": {"input": in_cost, "output": out_cost},
        }
    elif model_type in EmbeddingModelType.HuggingFaceLocal.values:
        # Local HuggingFace model: downloaded into a cache folder, no cost.
        hf = typer.style("HuggingFace", fg="yellow")
        model_id = typer.prompt(
            f"Enter the {hf} model ID",
            default="sentence-transformers/all-MiniLM-L6-v2",
            type=str,
        )
        cache_folder = str(
            Path(
                typer.prompt(
                    "Enter the model's cache folder",
                    default=EMBEDDING_MODEL_CONFIG_DIR / "cache",
                    type=str,
                )
            )
        )
        max_tokens = typer.prompt(
            "Enter the model's maximum tokens", default=8191, type=int
        )
        cfg = {
            "model_type": model_type,
            "model_identifier": model_id,
            "model_args": dict(
                cache_folder=str(cache_folder),
            ),
            "token_limit": max_tokens,
            "model_cost": {"input": 0, "output": 0},
        }
    elif model_type in EmbeddingModelType.OpenAI.values:
        # OpenAI embedding model: limits and costs come from lookup tables.
        available_models = list(EMBEDDING_COST_PER_MODEL.keys())

        open_ai = typer.style("OpenAI", fg="green")
        prompt = f"Enter the {open_ai} model name"

        model_name = typer.prompt(
            prompt,
            default="text-embedding-3-small",
            type=click.types.Choice(available_models),
            show_choices=False,
        )
        cfg = {
            "model_type": model_type,
            "model_identifier": model_name,
            "model_args": dict(
                model=model_name,
            ),
            "token_limit": EMBEDDING_TOKEN_LIMITS[model_name],
            "model_cost": EMBEDDING_COST_PER_MODEL[model_name],
        }
    else:
        raise ValueError(f"Unknown model type {model_type}")
    with open(model_cfg, "w") as f:
        json.dump(cfg, f, indent=2)
    print(f"Model config written to {model_cfg}")
1450
-
1451
-
1452
# Register the sub-command groups on the main Typer app.
for _sub_app, _sub_name in (
    (db, "db"),
    (llm, "llm"),
    (evaluate, "evaluate"),
    (embedding, "embedding"),
):
    app.add_typer(_sub_app, name=_sub_name)
1456
-
1457
-
1458
@app.command()
def render(
    input_dir: Annotated[
        str,
        typer.Option(
            "--input",
            "-i",
        ),
    ],
    output_dir: Annotated[str, typer.Option("--output", "-o")],
):
    """Render PlantUML text from janus JSON outputs into diagram images.

    For every ``*.json`` under ``input_dir``, the ``output`` field is
    unescaped, written to the mirrored ``.txt`` path under ``output_dir``,
    rendered with the bundled plantuml.jar, and the intermediate ``.txt``
    file is then removed.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    # Loop-invariant: resolve the PlantUML jar once, not per file.
    jar_path = homedir / ".janus/lib/plantuml.jar"
    for input_file in input_dir.rglob("*.json"):
        with open(input_file, "r") as f:
            data = json.load(f)

        output_file = output_dir / input_file.relative_to(input_dir).with_suffix(".txt")
        # FIX: parents=True handles mirrored paths nested more than one level
        # deep (bare mkdir() raised FileNotFoundError); exist_ok avoids the
        # exists()/mkdir() race.
        output_file.parent.mkdir(parents=True, exist_ok=True)

        text = data["output"].replace("\\n", "\n").strip()
        output_file.write_text(text)

        subprocess.run(["java", "-jar", jar_path, output_file])  # nosec
        output_file.unlink()
1485
-
1486
-
1487
if __name__ == "__main__":
    # Allow running this module directly as the janus CLI entry point.
    app()