janus-llm 4.3.1__py3-none-any.whl → 4.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. janus/__init__.py +1 -1
  2. janus/__main__.py +1 -1
  3. janus/_tests/evaluator_tests/EvalReadMe.md +85 -0
  4. janus/_tests/evaluator_tests/incose_tests/incose_large_test.json +39 -0
  5. janus/_tests/evaluator_tests/incose_tests/incose_small_test.json +17 -0
  6. janus/_tests/evaluator_tests/inline_comment_tests/mumps_inline_comment_test.m +71 -0
  7. janus/_tests/test_cli.py +3 -2
  8. janus/cli/aggregate.py +135 -0
  9. janus/cli/cli.py +117 -0
  10. janus/cli/constants.py +49 -0
  11. janus/cli/database.py +289 -0
  12. janus/cli/diagram.py +207 -0
  13. janus/cli/document.py +183 -0
  14. janus/cli/embedding.py +122 -0
  15. janus/cli/llm.py +191 -0
  16. janus/cli/partition.py +134 -0
  17. janus/cli/pipeline.py +123 -0
  18. janus/cli/self_eval.py +147 -0
  19. janus/cli/translate.py +192 -0
  20. janus/converter/__init__.py +1 -1
  21. janus/converter/_tests/test_translate.py +7 -5
  22. janus/converter/chain.py +180 -0
  23. janus/converter/converter.py +444 -153
  24. janus/converter/diagram.py +8 -6
  25. janus/converter/document.py +27 -16
  26. janus/converter/evaluate.py +143 -144
  27. janus/converter/partition.py +2 -10
  28. janus/converter/requirements.py +4 -40
  29. janus/converter/translate.py +3 -59
  30. janus/embedding/collections.py +1 -1
  31. janus/language/alc/_tests/alc.asm +3779 -0
  32. janus/language/binary/_tests/hello.bin +0 -0
  33. janus/language/block.py +78 -14
  34. janus/language/file.py +1 -1
  35. janus/language/mumps/_tests/mumps.m +235 -0
  36. janus/language/treesitter/_tests/languages/fortran.f90 +416 -0
  37. janus/language/treesitter/_tests/languages/ibmhlasm.asm +16 -0
  38. janus/language/treesitter/_tests/languages/matlab.m +225 -0
  39. janus/llm/models_info.py +9 -1
  40. janus/metrics/_tests/asm_test_file.asm +10 -0
  41. janus/metrics/_tests/mumps_test_file.m +6 -0
  42. janus/metrics/_tests/test_treesitter_metrics.py +1 -1
  43. janus/metrics/metric.py +47 -124
  44. janus/metrics/prompts/clarity.txt +8 -0
  45. janus/metrics/prompts/completeness.txt +16 -0
  46. janus/metrics/prompts/faithfulness.txt +10 -0
  47. janus/metrics/prompts/hallucination.txt +16 -0
  48. janus/metrics/prompts/quality.txt +8 -0
  49. janus/metrics/prompts/readability.txt +16 -0
  50. janus/metrics/prompts/usefulness.txt +16 -0
  51. janus/parsers/code_parser.py +4 -4
  52. janus/parsers/doc_parser.py +12 -9
  53. janus/parsers/parser.py +7 -0
  54. janus/parsers/partition_parser.py +6 -4
  55. janus/parsers/reqs_parser.py +11 -8
  56. janus/parsers/uml.py +5 -4
  57. janus/prompts/prompt.py +2 -2
  58. janus/prompts/templates/README.md +30 -0
  59. janus/prompts/templates/basic_aggregation/human.txt +6 -0
  60. janus/prompts/templates/basic_aggregation/system.txt +1 -0
  61. janus/prompts/templates/basic_refinement/human.txt +14 -0
  62. janus/prompts/templates/basic_refinement/system.txt +1 -0
  63. janus/prompts/templates/diagram/human.txt +9 -0
  64. janus/prompts/templates/diagram/system.txt +1 -0
  65. janus/prompts/templates/diagram_with_documentation/human.txt +15 -0
  66. janus/prompts/templates/diagram_with_documentation/system.txt +1 -0
  67. janus/prompts/templates/document/human.txt +10 -0
  68. janus/prompts/templates/document/system.txt +1 -0
  69. janus/prompts/templates/document_cloze/human.txt +11 -0
  70. janus/prompts/templates/document_cloze/system.txt +1 -0
  71. janus/prompts/templates/document_cloze/variables.json +4 -0
  72. janus/prompts/templates/document_cloze/variables_asm.json +4 -0
  73. janus/prompts/templates/document_inline/human.txt +13 -0
  74. janus/prompts/templates/eval_prompts/incose/human.txt +32 -0
  75. janus/prompts/templates/eval_prompts/incose/system.txt +1 -0
  76. janus/prompts/templates/eval_prompts/incose/variables.json +3 -0
  77. janus/prompts/templates/eval_prompts/inline_comments/human.txt +49 -0
  78. janus/prompts/templates/eval_prompts/inline_comments/system.txt +1 -0
  79. janus/prompts/templates/eval_prompts/inline_comments/variables.json +3 -0
  80. janus/prompts/templates/micromanaged_mumps_v1.0/human.txt +23 -0
  81. janus/prompts/templates/micromanaged_mumps_v1.0/system.txt +3 -0
  82. janus/prompts/templates/micromanaged_mumps_v2.0/human.txt +28 -0
  83. janus/prompts/templates/micromanaged_mumps_v2.0/system.txt +3 -0
  84. janus/prompts/templates/micromanaged_mumps_v2.1/human.txt +29 -0
  85. janus/prompts/templates/micromanaged_mumps_v2.1/system.txt +3 -0
  86. janus/prompts/templates/multidocument/human.txt +15 -0
  87. janus/prompts/templates/multidocument/system.txt +1 -0
  88. janus/prompts/templates/partition/human.txt +22 -0
  89. janus/prompts/templates/partition/system.txt +1 -0
  90. janus/prompts/templates/partition/variables.json +4 -0
  91. janus/prompts/templates/pseudocode/human.txt +7 -0
  92. janus/prompts/templates/pseudocode/system.txt +7 -0
  93. janus/prompts/templates/refinement/fix_exceptions/human.txt +19 -0
  94. janus/prompts/templates/refinement/fix_exceptions/system.txt +1 -0
  95. janus/prompts/templates/refinement/format/code_format/human.txt +12 -0
  96. janus/prompts/templates/refinement/format/code_format/system.txt +1 -0
  97. janus/prompts/templates/refinement/format/requirements_format/human.txt +14 -0
  98. janus/prompts/templates/refinement/format/requirements_format/system.txt +1 -0
  99. janus/prompts/templates/refinement/hallucination/human.txt +13 -0
  100. janus/prompts/templates/refinement/hallucination/system.txt +1 -0
  101. janus/prompts/templates/refinement/reflection/human.txt +15 -0
  102. janus/prompts/templates/refinement/reflection/incose/human.txt +26 -0
  103. janus/prompts/templates/refinement/reflection/incose/system.txt +1 -0
  104. janus/prompts/templates/refinement/reflection/incose_deduplicate/human.txt +16 -0
  105. janus/prompts/templates/refinement/reflection/incose_deduplicate/system.txt +1 -0
  106. janus/prompts/templates/refinement/reflection/system.txt +1 -0
  107. janus/prompts/templates/refinement/revision/human.txt +16 -0
  108. janus/prompts/templates/refinement/revision/incose/human.txt +16 -0
  109. janus/prompts/templates/refinement/revision/incose/system.txt +1 -0
  110. janus/prompts/templates/refinement/revision/incose_deduplicate/human.txt +17 -0
  111. janus/prompts/templates/refinement/revision/incose_deduplicate/system.txt +1 -0
  112. janus/prompts/templates/refinement/revision/system.txt +1 -0
  113. janus/prompts/templates/refinement/uml/alc_fix_variables/human.txt +15 -0
  114. janus/prompts/templates/refinement/uml/alc_fix_variables/system.txt +2 -0
  115. janus/prompts/templates/refinement/uml/fix_connections/human.txt +15 -0
  116. janus/prompts/templates/refinement/uml/fix_connections/system.txt +2 -0
  117. janus/prompts/templates/requirements/human.txt +13 -0
  118. janus/prompts/templates/requirements/system.txt +2 -0
  119. janus/prompts/templates/retrieval/language_docs/human.txt +10 -0
  120. janus/prompts/templates/retrieval/language_docs/system.txt +1 -0
  121. janus/prompts/templates/simple/human.txt +16 -0
  122. janus/prompts/templates/simple/system.txt +3 -0
  123. janus/refiners/format.py +49 -0
  124. janus/refiners/refiner.py +113 -4
  125. janus/utils/enums.py +127 -112
  126. janus/utils/logger.py +2 -0
  127. {janus_llm-4.3.1.dist-info → janus_llm-4.4.5.dist-info}/METADATA +18 -18
  128. janus_llm-4.4.5.dist-info/RECORD +210 -0
  129. {janus_llm-4.3.1.dist-info → janus_llm-4.4.5.dist-info}/WHEEL +1 -1
  130. janus_llm-4.4.5.dist-info/entry_points.txt +3 -0
  131. janus/cli.py +0 -1488
  132. janus/metrics/_tests/test_llm.py +0 -90
  133. janus/metrics/llm_metrics.py +0 -202
  134. janus_llm-4.3.1.dist-info/RECORD +0 -115
  135. janus_llm-4.3.1.dist-info/entry_points.txt +0 -3
  136. {janus_llm-4.3.1.dist-info → janus_llm-4.4.5.dist-info}/LICENSE +0 -0
janus/cli.py DELETED
@@ -1,1488 +0,0 @@
1
- import json
2
- import logging
3
- import os
4
- import subprocess # nosec
5
- from pathlib import Path
6
- from typing import List, Optional
7
-
8
- import click
9
- import typer
10
- from pydantic import AnyHttpUrl
11
- from rich import print
12
- from rich.console import Console
13
- from rich.prompt import Confirm
14
- from typing_extensions import Annotated
15
-
16
- import janus.refiners.refiner
17
- import janus.refiners.uml
18
- from janus.converter.aggregator import Aggregator
19
- from janus.converter.converter import Converter
20
- from janus.converter.diagram import DiagramGenerator
21
- from janus.converter.document import Documenter, MadLibsDocumenter, MultiDocumenter
22
- from janus.converter.evaluate import InlineCommentEvaluator, RequirementEvaluator
23
- from janus.converter.partition import Partitioner
24
- from janus.converter.requirements import RequirementsDocumenter
25
- from janus.converter.translate import Translator
26
- from janus.embedding.collections import Collections
27
- from janus.embedding.database import ChromaEmbeddingDatabase
28
- from janus.embedding.embedding_models_info import (
29
- EMBEDDING_COST_PER_MODEL,
30
- EMBEDDING_MODEL_CONFIG_DIR,
31
- EMBEDDING_TOKEN_LIMITS,
32
- EmbeddingModelType,
33
- )
34
- from janus.embedding.vectorize import ChromaDBVectorizer
35
- from janus.language.binary import BinarySplitter
36
- from janus.language.mumps import MumpsSplitter
37
- from janus.language.naive.registry import CUSTOM_SPLITTERS
38
- from janus.language.treesitter import TreeSitterSplitter
39
- from janus.llm.model_callbacks import COST_PER_1K_TOKENS
40
- from janus.llm.models_info import (
41
- MODEL_CONFIG_DIR,
42
- MODEL_ID_TO_LONG_ID,
43
- MODEL_TYPE_CONSTRUCTORS,
44
- MODEL_TYPES,
45
- TOKEN_LIMITS,
46
- azure_models,
47
- bedrock_models,
48
- openai_models,
49
- )
50
- from janus.metrics.cli import evaluate
51
- from janus.utils.enums import LANGUAGES
52
- from janus.utils.logger import create_logger
53
-
54
- httpx_logger = logging.getLogger("httpx")
55
- httpx_logger.setLevel(logging.WARNING)
56
-
57
- log = create_logger(__name__)
58
- homedir = Path.home().expanduser()
59
-
60
- janus_dir = homedir / ".janus"
61
- if not janus_dir.exists():
62
- janus_dir.mkdir(parents=True)
63
-
64
- db_file = janus_dir / ".db"
65
- if not db_file.exists():
66
- with open(db_file, "w") as f:
67
- f.write(str(janus_dir / "chroma.db"))
68
-
69
- with open(db_file, "r") as f:
70
- db_loc = f.read()
71
-
72
- collections_config_file = Path(db_loc) / "collections.json"
73
-
74
-
75
- def get_subclasses(cls):
76
- return set(cls.__subclasses__()).union(
77
- set(s for c in cls.__subclasses__() for s in get_subclasses(c))
78
- )
79
-
80
-
81
- REFINER_TYPES = get_subclasses(janus.refiners.refiner.JanusRefiner).union(
82
- {janus.refiners.refiner.JanusRefiner}
83
- )
84
- REFINERS = {r.__name__: r for r in REFINER_TYPES}
85
-
86
-
87
- def get_collections_config():
88
- if collections_config_file.exists():
89
- with open(collections_config_file, "r") as f:
90
- config = json.load(f)
91
- else:
92
- config = {}
93
- return config
94
-
95
-
96
- app = typer.Typer(
97
- help=(
98
- "[bold][dark_orange]Janus[/dark_orange] is a CLI for translating, "
99
- "documenting, and diagramming code using large language models.[/bold]"
100
- ),
101
- add_completion=False,
102
- no_args_is_help=True,
103
- context_settings={"help_option_names": ["-h", "--help"]},
104
- rich_markup_mode="rich",
105
- )
106
-
107
-
108
- db = typer.Typer(
109
- help="Database commands",
110
- add_completion=False,
111
- no_args_is_help=True,
112
- context_settings={"help_option_names": ["-h", "--help"]},
113
- )
114
- llm = typer.Typer(
115
- help="LLM commands",
116
- add_completion=False,
117
- no_args_is_help=True,
118
- context_settings={"help_option_names": ["-h", "--help"]},
119
- )
120
-
121
- embedding = typer.Typer(
122
- help="Embedding model commands",
123
- add_completion=False,
124
- no_args_is_help=True,
125
- context_settings={"help_option_names": ["-h", "--help"]},
126
- )
127
-
128
-
129
- def version_callback(value: bool) -> None:
130
- if value:
131
- from . import __version__ as version
132
-
133
- print(f"Janus CLI [blue]v{version}[/blue]")
134
- raise typer.Exit()
135
-
136
-
137
- @app.callback()
138
- def common(
139
- ctx: typer.Context,
140
- version: bool = typer.Option(
141
- None,
142
- "--version",
143
- "-v",
144
- callback=version_callback,
145
- help="Print the version and exit.",
146
- ),
147
- ) -> None:
148
- """A function for getting the app version
149
-
150
- This will call the version_callback function to print the version and exit.
151
-
152
- Arguments:
153
- ctx: The typer context
154
- version: A boolean flag for the version
155
- """
156
- pass
157
-
158
-
159
- @app.command(
160
- help="Translate code from one language to another using an LLM.",
161
- no_args_is_help=True,
162
- )
163
- def translate(
164
- input_dir: Annotated[
165
- Path,
166
- typer.Option(
167
- "--input",
168
- "-i",
169
- help="The directory containing the source code to be translated. "
170
- "The files should all be in one flat directory.",
171
- ),
172
- ],
173
- source_lang: Annotated[
174
- str,
175
- typer.Option(
176
- "--source-language",
177
- "-s",
178
- help="The language of the source code.",
179
- click_type=click.Choice(sorted(LANGUAGES)),
180
- ),
181
- ],
182
- output_dir: Annotated[
183
- Path,
184
- typer.Option(
185
- "--output", "-o", help="The directory to store the translated code in."
186
- ),
187
- ],
188
- target_lang: Annotated[
189
- str,
190
- typer.Option(
191
- "--target-language",
192
- "-t",
193
- help="The desired output language to translate the source code to. The "
194
- "format can follow a 'language-version' syntax. Use 'text' to get plaintext"
195
- "results as returned by the LLM. Examples: `python-3.10`, `mumps`, `java-10`,"
196
- "text.",
197
- ),
198
- ],
199
- llm_name: Annotated[
200
- str,
201
- typer.Option(
202
- "--llm",
203
- "-L",
204
- help="The custom name of the model set with 'janus llm add'.",
205
- ),
206
- ],
207
- max_prompts: Annotated[
208
- int,
209
- typer.Option(
210
- "--max-prompts",
211
- "-m",
212
- help="The maximum number of times to prompt a model on one functional block "
213
- "before exiting the application. This is to prevent wasting too much money.",
214
- ),
215
- ] = 10,
216
- overwrite: Annotated[
217
- bool,
218
- typer.Option(
219
- "--overwrite/--preserve",
220
- help="Whether to overwrite existing files in the output directory",
221
- ),
222
- ] = False,
223
- skip_context: Annotated[
224
- bool,
225
- typer.Option(
226
- "--skip-context",
227
- help="Prompts will include any context information associated with source"
228
- " code blocks, unless this option is specified",
229
- ),
230
- ] = False,
231
- temp: Annotated[
232
- float,
233
- typer.Option("--temperature", "-T", help="Sampling temperature.", min=0, max=2),
234
- ] = 0.7,
235
- prompt_template: Annotated[
236
- str,
237
- typer.Option(
238
- "--prompt-template",
239
- "-p",
240
- help="Name of the Janus prompt template directory or "
241
- "path to a directory containing those template files.",
242
- ),
243
- ] = "simple",
244
- collection: Annotated[
245
- str,
246
- typer.Option(
247
- "--collection",
248
- "-c",
249
- help="If set, will put the translated result into a Chroma DB "
250
- "collection with the name provided.",
251
- ),
252
- ] = None,
253
- splitter_type: Annotated[
254
- str,
255
- typer.Option(
256
- "-S",
257
- "--splitter",
258
- help="Name of custom splitter to use",
259
- click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
260
- ),
261
- ] = "file",
262
- refiner_types: Annotated[
263
- list[str],
264
- typer.Option(
265
- "-r",
266
- "--refiner",
267
- help="List of refiner types to use. Add -r for each refiner to use in\
268
- refinement chain",
269
- click_type=click.Choice(list(REFINERS.keys())),
270
- ),
271
- ] = ["JanusRefiner"],
272
- retriever_type: Annotated[
273
- str,
274
- typer.Option(
275
- "-R",
276
- "--retriever",
277
- help="Name of custom retriever to use",
278
- click_type=click.Choice(["active_usings", "language_docs"]),
279
- ),
280
- ] = None,
281
- max_tokens: Annotated[
282
- int,
283
- typer.Option(
284
- "--max-tokens",
285
- "-M",
286
- help="The maximum number of tokens the model will take in. "
287
- "If unspecificed, model's default max will be used.",
288
- ),
289
- ] = None,
290
- ):
291
- refiner_types = [REFINERS[r] for r in refiner_types]
292
- try:
293
- target_language, target_version = target_lang.split("-")
294
- except ValueError:
295
- target_language = target_lang
296
- target_version = None
297
- # make sure not overwriting input
298
- if source_lang.lower() == target_language.lower() and input_dir == output_dir:
299
- log.error("Output files would overwrite input! Aborting...")
300
- raise ValueError
301
-
302
- model_arguments = dict(temperature=temp)
303
- collections_config = get_collections_config()
304
- translator = Translator(
305
- model=llm_name,
306
- model_arguments=model_arguments,
307
- source_language=source_lang,
308
- target_language=target_language,
309
- target_version=target_version,
310
- max_prompts=max_prompts,
311
- max_tokens=max_tokens,
312
- prompt_template=prompt_template,
313
- db_path=db_loc,
314
- db_config=collections_config,
315
- splitter_type=splitter_type,
316
- refiner_types=refiner_types,
317
- retriever_type=retriever_type,
318
- )
319
- translator.translate(input_dir, output_dir, overwrite, collection)
320
-
321
-
322
- @app.command(
323
- help="Document input code using an LLM.",
324
- no_args_is_help=True,
325
- )
326
- def document(
327
- input_dir: Annotated[
328
- Path,
329
- typer.Option(
330
- "--input",
331
- "-i",
332
- help="The directory containing the source code to be translated. "
333
- "The files should all be in one flat directory.",
334
- ),
335
- ],
336
- language: Annotated[
337
- str,
338
- typer.Option(
339
- "--language",
340
- "-l",
341
- help="The language of the source code.",
342
- click_type=click.Choice(sorted(LANGUAGES)),
343
- ),
344
- ],
345
- output_dir: Annotated[
346
- Path,
347
- typer.Option(
348
- "--output-dir", "-o", help="The directory to store the translated code in."
349
- ),
350
- ],
351
- llm_name: Annotated[
352
- str,
353
- typer.Option(
354
- "--llm",
355
- "-L",
356
- help="The custom name of the model set with 'janus llm add'.",
357
- ),
358
- ],
359
- max_prompts: Annotated[
360
- int,
361
- typer.Option(
362
- "--max-prompts",
363
- "-m",
364
- help="The maximum number of times to prompt a model on one functional block "
365
- "before exiting the application. This is to prevent wasting too much money.",
366
- ),
367
- ] = 10,
368
- overwrite: Annotated[
369
- bool,
370
- typer.Option(
371
- "--overwrite/--preserve",
372
- help="Whether to overwrite existing files in the output directory",
373
- ),
374
- ] = False,
375
- doc_mode: Annotated[
376
- str,
377
- typer.Option(
378
- "--doc-mode",
379
- "-d",
380
- help="The documentation mode.",
381
- click_type=click.Choice(["madlibs", "summary", "multidoc", "requirements"]),
382
- ),
383
- ] = "madlibs",
384
- comments_per_request: Annotated[
385
- int,
386
- typer.Option(
387
- "--comments-per-request",
388
- "-rc",
389
- help="The maximum number of comments to generate per request when using "
390
- "MadLibs documentation mode.",
391
- ),
392
- ] = None,
393
- drop_comments: Annotated[
394
- bool,
395
- typer.Option(
396
- "--drop-comments/--keep-comments",
397
- help="Whether to drop or keep comments in the code sent to the LLM",
398
- ),
399
- ] = False,
400
- temperature: Annotated[
401
- float,
402
- typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
403
- ] = 0.7,
404
- collection: Annotated[
405
- str,
406
- typer.Option(
407
- "--collection",
408
- "-c",
409
- help="If set, will put the translated result into a Chroma DB "
410
- "collection with the name provided.",
411
- ),
412
- ] = None,
413
- splitter_type: Annotated[
414
- str,
415
- typer.Option(
416
- "-S",
417
- "--splitter",
418
- help="Name of custom splitter to use",
419
- click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
420
- ),
421
- ] = "file",
422
- refiner_types: Annotated[
423
- list[str],
424
- typer.Option(
425
- "-r",
426
- "--refiner",
427
- help="List of refiner types to use. Add -r for each refiner to use in\
428
- refinement chain",
429
- click_type=click.Choice(list(REFINERS.keys())),
430
- ),
431
- ] = ["JanusRefiner"],
432
- retriever_type: Annotated[
433
- str,
434
- typer.Option(
435
- "-R",
436
- "--retriever",
437
- help="Name of custom retriever to use",
438
- click_type=click.Choice(["active_usings", "language_docs"]),
439
- ),
440
- ] = None,
441
- max_tokens: Annotated[
442
- int,
443
- typer.Option(
444
- "--max-tokens",
445
- "-M",
446
- help="The maximum number of tokens the model will take in. "
447
- "If unspecificed, model's default max will be used.",
448
- ),
449
- ] = None,
450
- ):
451
- refiner_types = [REFINERS[r] for r in refiner_types]
452
- model_arguments = dict(temperature=temperature)
453
- collections_config = get_collections_config()
454
- kwargs = dict(
455
- model=llm_name,
456
- model_arguments=model_arguments,
457
- source_language=language,
458
- max_prompts=max_prompts,
459
- max_tokens=max_tokens,
460
- db_path=db_loc,
461
- db_config=collections_config,
462
- splitter_type=splitter_type,
463
- refiner_types=refiner_types,
464
- retriever_type=retriever_type,
465
- )
466
- if doc_mode == "madlibs":
467
- documenter = MadLibsDocumenter(
468
- comments_per_request=comments_per_request, **kwargs
469
- )
470
- elif doc_mode == "multidoc":
471
- documenter = MultiDocumenter(drop_comments=drop_comments, **kwargs)
472
- elif doc_mode == "requirements":
473
- documenter = RequirementsDocumenter(drop_comments=drop_comments, **kwargs)
474
- else:
475
- documenter = Documenter(drop_comments=drop_comments, **kwargs)
476
-
477
- documenter.translate(input_dir, output_dir, overwrite, collection)
478
-
479
-
480
- @app.command()
481
- def aggregate(
482
- input_dir: Annotated[
483
- Path,
484
- typer.Option(
485
- "--input",
486
- "-i",
487
- help="The directory containing the source code to be translated. "
488
- "The files should all be in one flat directory.",
489
- ),
490
- ],
491
- language: Annotated[
492
- str,
493
- typer.Option(
494
- "--language",
495
- "-l",
496
- help="The language of the source code.",
497
- click_type=click.Choice(sorted(LANGUAGES)),
498
- ),
499
- ],
500
- output_dir: Annotated[
501
- Path,
502
- typer.Option(
503
- "--output-dir", "-o", help="The directory to store the translated code in."
504
- ),
505
- ],
506
- llm_name: Annotated[
507
- str,
508
- typer.Option(
509
- "--llm",
510
- "-L",
511
- help="The custom name of the model set with 'janus llm add'.",
512
- ),
513
- ],
514
- max_prompts: Annotated[
515
- int,
516
- typer.Option(
517
- "--max-prompts",
518
- "-m",
519
- help="The maximum number of times to prompt a model on one functional block "
520
- "before exiting the application. This is to prevent wasting too much money.",
521
- ),
522
- ] = 10,
523
- overwrite: Annotated[
524
- bool,
525
- typer.Option(
526
- "--overwrite/--preserve",
527
- help="Whether to overwrite existing files in the output directory",
528
- ),
529
- ] = False,
530
- temperature: Annotated[
531
- float,
532
- typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
533
- ] = 0.7,
534
- collection: Annotated[
535
- str,
536
- typer.Option(
537
- "--collection",
538
- "-c",
539
- help="If set, will put the translated result into a Chroma DB "
540
- "collection with the name provided.",
541
- ),
542
- ] = None,
543
- splitter_type: Annotated[
544
- str,
545
- typer.Option(
546
- "-S",
547
- "--splitter",
548
- help="Name of custom splitter to use",
549
- click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
550
- ),
551
- ] = "file",
552
- intermediate_converters: Annotated[
553
- List[str],
554
- typer.Option(
555
- "-C",
556
- "--converter",
557
- help="Name of an intermediate converter to use",
558
- click_type=click.Choice([c.__name__ for c in get_subclasses(Converter)]),
559
- ),
560
- ] = ["Documenter"],
561
- ):
562
- converter_subclasses = get_subclasses(Converter)
563
- converter_subclasses_map = {c.__name__: c for c in converter_subclasses}
564
- model_arguments = dict(temperature=temperature)
565
- collections_config = get_collections_config()
566
- converters = []
567
- for ic in intermediate_converters:
568
- converters.append(
569
- converter_subclasses_map[ic](
570
- model=llm_name,
571
- model_arguments=model_arguments,
572
- source_language=language,
573
- max_prompts=max_prompts,
574
- db_path=db_loc,
575
- db_config=collections_config,
576
- splitter_type=splitter_type,
577
- )
578
- )
579
-
580
- aggregator = Aggregator(
581
- intermediate_converters=converters,
582
- model=llm_name,
583
- model_arguments=model_arguments,
584
- source_language=language,
585
- max_prompts=max_prompts,
586
- db_path=db_loc,
587
- db_config=collections_config,
588
- splitter_type=splitter_type,
589
- prompt_template="basic_aggregation",
590
- )
591
- aggregator.translate(input_dir, output_dir, overwrite, collection)
592
-
593
-
594
- @app.command(
595
- help="Partition input code using an LLM.",
596
- no_args_is_help=True,
597
- )
598
- def partition(
599
- input_dir: Annotated[
600
- Path,
601
- typer.Option(
602
- "--input",
603
- "-i",
604
- help="The directory containing the source code to be partitioned. ",
605
- ),
606
- ],
607
- language: Annotated[
608
- str,
609
- typer.Option(
610
- "--language",
611
- "-l",
612
- help="The language of the source code.",
613
- click_type=click.Choice(sorted(LANGUAGES)),
614
- ),
615
- ],
616
- output_dir: Annotated[
617
- Path,
618
- typer.Option(
619
- "--output-dir", "-o", help="The directory to store the partitioned code in."
620
- ),
621
- ],
622
- llm_name: Annotated[
623
- str,
624
- typer.Option(
625
- "--llm",
626
- "-L",
627
- help="The custom name of the model set with 'janus llm add'.",
628
- ),
629
- ] = "gpt-4o",
630
- max_prompts: Annotated[
631
- int,
632
- typer.Option(
633
- "--max-prompts",
634
- "-m",
635
- help="The maximum number of times to prompt a model on one functional block "
636
- "before exiting the application. This is to prevent wasting too much money.",
637
- ),
638
- ] = 10,
639
- overwrite: Annotated[
640
- bool,
641
- typer.Option(
642
- "--overwrite/--preserve",
643
- help="Whether to overwrite existing files in the output directory",
644
- ),
645
- ] = False,
646
- temperature: Annotated[
647
- float,
648
- typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
649
- ] = 0.7,
650
- splitter_type: Annotated[
651
- str,
652
- typer.Option(
653
- "-S",
654
- "--splitter",
655
- help="Name of custom splitter to use",
656
- click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
657
- ),
658
- ] = "file",
659
- refiner_types: Annotated[
660
- list[str],
661
- typer.Option(
662
- "-r",
663
- "--refiner",
664
- help="List of refiner types to use. Add -r for each refiner to use in\
665
- refinement chain",
666
- click_type=click.Choice(list(REFINERS.keys())),
667
- ),
668
- ] = ["JanusRefiner"],
669
- max_tokens: Annotated[
670
- int,
671
- typer.Option(
672
- "--max-tokens",
673
- "-M",
674
- help="The maximum number of tokens the model will take in. "
675
- "If unspecificed, model's default max will be used.",
676
- ),
677
- ] = None,
678
- partition_token_limit: Annotated[
679
- int,
680
- typer.Option(
681
- "--partition-tokens",
682
- "-pt",
683
- help="The limit on the number of tokens per partition.",
684
- ),
685
- ] = 8192,
686
- ):
687
- refiner_types = [REFINERS[r] for r in refiner_types]
688
- model_arguments = dict(temperature=temperature)
689
- kwargs = dict(
690
- model=llm_name,
691
- model_arguments=model_arguments,
692
- source_language=language,
693
- max_prompts=max_prompts,
694
- max_tokens=max_tokens,
695
- splitter_type=splitter_type,
696
- refiner_types=refiner_types,
697
- partition_token_limit=partition_token_limit,
698
- )
699
- partitioner = Partitioner(**kwargs)
700
- partitioner.translate(input_dir, output_dir, overwrite)
701
-
702
-
703
- @app.command(
704
- help="Diagram input code using an LLM.",
705
- no_args_is_help=True,
706
- )
707
- def diagram(
708
- input_dir: Annotated[
709
- Path,
710
- typer.Option(
711
- "--input",
712
- "-i",
713
- help="The directory containing the source code to be translated. "
714
- "The files should all be in one flat directory.",
715
- ),
716
- ],
717
- language: Annotated[
718
- str,
719
- typer.Option(
720
- "--language",
721
- "-l",
722
- help="The language of the source code.",
723
- click_type=click.Choice(sorted(LANGUAGES)),
724
- ),
725
- ],
726
- output_dir: Annotated[
727
- Path,
728
- typer.Option(
729
- "--output-dir", "-o", help="The directory to store the translated code in."
730
- ),
731
- ],
732
- llm_name: Annotated[
733
- str,
734
- typer.Option(
735
- "--llm",
736
- "-L",
737
- help="The custom name of the model set with 'janus llm add'.",
738
- ),
739
- ],
740
- max_prompts: Annotated[
741
- int,
742
- typer.Option(
743
- "--max-prompts",
744
- "-m",
745
- help="The maximum number of times to prompt a model on one functional block "
746
- "before exiting the application. This is to prevent wasting too much money.",
747
- ),
748
- ] = 10,
749
- overwrite: Annotated[
750
- bool,
751
- typer.Option(
752
- "--overwrite/--preserve",
753
- help="Whether to overwrite existing files in the output directory",
754
- ),
755
- ] = False,
756
- temperature: Annotated[
757
- float,
758
- typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
759
- ] = 0.7,
760
- collection: Annotated[
761
- str,
762
- typer.Option(
763
- "--collection",
764
- "-c",
765
- help="If set, will put the translated result into a Chroma DB "
766
- "collection with the name provided.",
767
- ),
768
- ] = None,
769
- diagram_type: Annotated[
770
- str,
771
- typer.Option(
772
- "--diagram-type", "-dg", help="Diagram type to generate in PLANTUML"
773
- ),
774
- ] = "Activity",
775
- add_documentation: Annotated[
776
- bool,
777
- typer.Option(
778
- "--add-documentation/--no-documentation",
779
- "-ad",
780
- help="Whether to use documentation in generation",
781
- ),
782
- ] = False,
783
- splitter_type: Annotated[
784
- str,
785
- typer.Option(
786
- "-S",
787
- "--splitter",
788
- help="Name of custom splitter to use",
789
- click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
790
- ),
791
- ] = "file",
792
- refiner_types: Annotated[
793
- list[str],
794
- typer.Option(
795
- "-r",
796
- "--refiner",
797
- help="List of refiner types to use. Add -r for each refiner to use in\
798
- refinement chain",
799
- click_type=click.Choice(list(REFINERS.keys())),
800
- ),
801
- ] = ["JanusRefiner"],
802
- retriever_type: Annotated[
803
- str,
804
- typer.Option(
805
- "-R",
806
- "--retriever",
807
- help="Name of custom retriever to use",
808
- click_type=click.Choice(["active_usings", "language_docs"]),
809
- ),
810
- ] = None,
811
- ):
812
- refiner_types = [REFINERS[r] for r in refiner_types]
813
- model_arguments = dict(temperature=temperature)
814
- collections_config = get_collections_config()
815
- diagram_generator = DiagramGenerator(
816
- model=llm_name,
817
- model_arguments=model_arguments,
818
- source_language=language,
819
- max_prompts=max_prompts,
820
- db_path=db_loc,
821
- db_config=collections_config,
822
- splitter_type=splitter_type,
823
- refiner_types=refiner_types,
824
- retriever_type=retriever_type,
825
- diagram_type=diagram_type,
826
- add_documentation=add_documentation,
827
- )
828
- diagram_generator.translate(input_dir, output_dir, overwrite, collection)
829
-
830
-
831
- @app.command(
832
- help="LLM self evaluation",
833
- no_args_is_help=True,
834
- )
835
- def llm_self_eval(
836
- input_dir: Annotated[
837
- Path,
838
- typer.Option(
839
- "--input",
840
- "-i",
841
- help="The directory containing the source code to be evaluated. "
842
- "The files should all be in one flat directory.",
843
- ),
844
- ],
845
- language: Annotated[
846
- str,
847
- typer.Option(
848
- "--language",
849
- "-l",
850
- help="The language of the source code.",
851
- click_type=click.Choice(sorted(LANGUAGES)),
852
- ),
853
- ],
854
- output_dir: Annotated[
855
- Path,
856
- typer.Option(
857
- "--output-dir", "-o", help="The directory to store the evaluations in."
858
- ),
859
- ],
860
- llm_name: Annotated[
861
- str,
862
- typer.Option(
863
- "--llm",
864
- "-L",
865
- help="The custom name of the model set with 'janus llm add'.",
866
- ),
867
- ] = "gpt-4o",
868
- evaluation_type: Annotated[
869
- str,
870
- typer.Option(
871
- "--evaluation-type",
872
- "-e",
873
- help="Type of output to evaluate.",
874
- click_type=click.Choice(["incose", "comments"]),
875
- ),
876
- ] = "incose",
877
- max_prompts: Annotated[
878
- int,
879
- typer.Option(
880
- "--max-prompts",
881
- "-m",
882
- help="The maximum number of times to prompt a model on one functional block "
883
- "before exiting the application. This is to prevent wasting too much money.",
884
- ),
885
- ] = 10,
886
- overwrite: Annotated[
887
- bool,
888
- typer.Option(
889
- "--overwrite/--preserve",
890
- help="Whether to overwrite existing files in the output directory",
891
- ),
892
- ] = False,
893
- temperature: Annotated[
894
- float,
895
- typer.Option("--temperature", "-t", help="Sampling temperature.", min=0, max=2),
896
- ] = 0.7,
897
- collection: Annotated[
898
- str,
899
- typer.Option(
900
- "--collection",
901
- "-c",
902
- help="If set, will put the translated result into a Chroma DB "
903
- "collection with the name provided.",
904
- ),
905
- ] = None,
906
- splitter_type: Annotated[
907
- str,
908
- typer.Option(
909
- "-S",
910
- "--splitter",
911
- help="Name of custom splitter to use",
912
- click_type=click.Choice(list(CUSTOM_SPLITTERS.keys())),
913
- ),
914
- ] = "file",
915
- refiner_types: Annotated[
916
- list[str],
917
- typer.Option(
918
- "-r",
919
- "--refiner",
920
- help="List of refiner types to use. Add -r for each refiner to use in\
921
- refinement chain",
922
- click_type=click.Choice(list(REFINERS.keys())),
923
- ),
924
- ] = ["JanusRefiner"],
925
- eval_items_per_request: Annotated[
926
- int,
927
- typer.Option(
928
- "--eval-items-per-request",
929
- "-rc",
930
- help="The maximum number of evaluation items per request",
931
- ),
932
- ] = None,
933
- max_tokens: Annotated[
934
- int,
935
- typer.Option(
936
- "--max-tokens",
937
- "-M",
938
- help="The maximum number of tokens the model will take in. "
939
- "If unspecificed, model's default max will be used.",
940
- ),
941
- ] = None,
942
- ):
943
- model_arguments = dict(temperature=temperature)
944
- refiner_types = [REFINERS[r] for r in refiner_types]
945
- kwargs = dict(
946
- eval_items_per_request=eval_items_per_request,
947
- model=llm_name,
948
- model_arguments=model_arguments,
949
- source_language=language,
950
- max_prompts=max_prompts,
951
- max_tokens=max_tokens,
952
- splitter_type=splitter_type,
953
- refiner_types=refiner_types,
954
- )
955
- # Setting parser type here
956
- if evaluation_type == "incose":
957
- evaluator = RequirementEvaluator(**kwargs)
958
- elif evaluation_type == "comments":
959
- evaluator = InlineCommentEvaluator(**kwargs)
960
-
961
- evaluator.translate(input_dir, output_dir, overwrite, collection)
962
-
963
-
964
- @db.command("init", help="Connect to or create a database.")
965
- def db_init(
966
- path: Annotated[
967
- str, typer.Option("--path", "-p", help="The path to the database file.")
968
- ] = str(janus_dir / "chroma.db"),
969
- url: Annotated[
970
- str,
971
- typer.Option(
972
- "--url",
973
- "-u",
974
- help="The URL of the database if the database is running externally.",
975
- ),
976
- ] = "",
977
- ) -> None:
978
- global db_loc
979
- if url != "":
980
- print(f"Pointing to Chroma DB at {url}")
981
- with open(db_file, "w") as f:
982
- f.write(url)
983
- db_loc = url
984
- else:
985
- path = os.path.abspath(path)
986
- print(f"Setting up Chroma DB at {path}")
987
- with open(db_file, "w") as f:
988
- f.write(path)
989
- db_loc = path
990
- global embedding_db
991
- embedding_db = ChromaEmbeddingDatabase(db_loc)
992
-
993
-
994
- @db.command("status", help="Print current database location.")
995
- def db_status():
996
- print(f"Chroma DB currently pointing to {db_loc}")
997
-
998
-
999
- @db.command(
1000
- "ls",
1001
- help="List the current database's collections. Or supply a collection name to list "
1002
- "information about its contents.",
1003
- )
1004
- def db_ls(
1005
- collection_name: Annotated[
1006
- Optional[str], typer.Argument(help="The name of the collection.")
1007
- ] = None,
1008
- peek: Annotated[
1009
- Optional[int],
1010
- typer.Option("--peek", "-p", help="Peek at N entries for a specific collection."),
1011
- ] = None,
1012
- ) -> None:
1013
- """List the current database's collections"""
1014
- if peek is not None and collection_name is None:
1015
- print(
1016
- "\n[bold red]Cannot peek at all collections. Please specify a "
1017
- "collection by name.[/bold red]"
1018
- )
1019
- return
1020
- db = ChromaEmbeddingDatabase(db_loc)
1021
- collections = Collections(db)
1022
- collection_list = collections.get(collection_name)
1023
- for collection in collection_list:
1024
- print(
1025
- f"\n[bold underline]Collection[/bold underline]: "
1026
- f"[bold salmon1]{collection.name}[/bold salmon1]"
1027
- )
1028
- print(f" ID: {collection.id}")
1029
- print(f" Metadata: {collection.metadata}")
1030
- print(f" Tenant: [green]{collection.tenant}[/green]")
1031
- print(f" Database: [green]{collection.database}[/green]")
1032
- print(f" Length: {collection.count()}")
1033
- if peek:
1034
- entry = collection.peek(peek)
1035
- entry["embeddings"] = entry["embeddings"][0][:2] + ["..."]
1036
- if peek == 1:
1037
- print(" [bold]Peeking at first entry[/bold]:")
1038
- else:
1039
- print(f" [bold]Peeking at first {peek} entries[/bold]:")
1040
- print(entry)
1041
- print()
1042
-
1043
-
1044
- @db.command("add", help="Add a collection to the current database.")
1045
- def db_add(
1046
- collection_name: Annotated[str, typer.Argument(help="The name of the collection.")],
1047
- model_name: Annotated[str, typer.Argument(help="The name of the embedding model.")],
1048
- input_dir: Annotated[
1049
- str,
1050
- typer.Option(
1051
- "--input",
1052
- "-i",
1053
- help="The directory containing the source code to be added.",
1054
- ),
1055
- ] = "./",
1056
- input_lang: Annotated[
1057
- str, typer.Option("--language", "-l", help="The language of the source code.")
1058
- ] = "python",
1059
- max_tokens: Annotated[
1060
- int,
1061
- typer.Option(
1062
- "--max-tokens",
1063
- "-m",
1064
- help="The maximum number of tokens for each chunk of input source code.",
1065
- ),
1066
- ] = 4096,
1067
- ) -> None:
1068
- """Add a collection to the database
1069
-
1070
- Arguments:
1071
- collection_name: The name of the collection to add
1072
- model_name: The name of the embedding model to use
1073
- input_dir: The directory containing the source code to be added
1074
- input_lang: The language of the source code
1075
- max_tokens: The maximum number of tokens for each chunk of input source code
1076
- """
1077
- # TODO: import factory
1078
- console = Console()
1079
-
1080
- added_to = _check_collection(collection_name, input_dir)
1081
- collections_config = get_collections_config()
1082
-
1083
- with console.status(
1084
- f"Adding collection: [bold salmon]{collection_name}[/bold salmon]",
1085
- spinner="arrow3",
1086
- ):
1087
- vectorizer_factory = ChromaDBVectorizer()
1088
- vectorizer = vectorizer_factory.create_vectorizer(
1089
- path=db_loc, config=collections_config
1090
- )
1091
- vectorizer.get_or_create_collection(collection_name, model_name=model_name)
1092
- input_dir = Path(input_dir)
1093
- suffix = LANGUAGES[input_lang]["suffix"]
1094
- source_glob = f"**/*.{suffix}"
1095
- input_paths = [p for p in input_dir.rglob(source_glob)]
1096
- if input_lang in CUSTOM_SPLITTERS:
1097
- if input_lang == "mumps":
1098
- splitter = MumpsSplitter(
1099
- max_tokens=max_tokens,
1100
- )
1101
- elif input_lang == "binary":
1102
- splitter = BinarySplitter(
1103
- max_tokens=max_tokens,
1104
- )
1105
- else:
1106
- splitter = TreeSitterSplitter(
1107
- language=input_lang,
1108
- max_tokens=max_tokens,
1109
- )
1110
- for input_path in input_paths:
1111
- input_block = splitter.split(input_path)
1112
- vectorizer.add_nodes_recursively(
1113
- input_block,
1114
- collection_name,
1115
- input_path.name,
1116
- )
1117
- total_files = len([p for p in Path.glob(input_dir, "**/*") if not p.is_dir()])
1118
- if added_to:
1119
- print(
1120
- f"\nAdded to [bold salmon1]{collection_name}[/bold salmon1]:\n"
1121
- f" Embedding Model: [green]{model_name}[/green]\n"
1122
- f" Input Directory: {input_dir.absolute()}\n"
1123
- f" {input_lang.capitalize()} [green]*.{suffix}[/green] Files: "
1124
- f"{len(input_paths)}\n"
1125
- " Other Files (skipped): "
1126
- f"{total_files - len(input_paths)}\n"
1127
- )
1128
- [p for p in Path.glob(input_dir, f"**/*.{suffix}") if not p.is_dir()]
1129
- else:
1130
- print(
1131
- f"\nCreated [bold salmon1]{collection_name}[/bold salmon1]:\n"
1132
- f" Embedding Model: '{model_name}'\n"
1133
- f" Input Directory: {input_dir.absolute()}\n"
1134
- f" {input_lang.capitalize()} [green]*.{suffix}[/green] Files: "
1135
- f"{len(input_paths)}\n"
1136
- " Other Files (skipped): "
1137
- f"{total_files - len(input_paths)}\n"
1138
- )
1139
- with open(collections_config_file, "w") as f:
1140
- json.dump(vectorizer.config, f, indent=2)
1141
-
1142
-
1143
- @db.command(
1144
- "rm",
1145
- help="Remove a collection from the database.",
1146
- )
1147
- def db_rm(
1148
- collection_name: Annotated[str, typer.Argument(help="The name of the collection.")],
1149
- confirm: Annotated[
1150
- bool,
1151
- typer.Option(
1152
- "--yes",
1153
- "-y",
1154
- help="Confirm the removal of the collection.",
1155
- ),
1156
- ],
1157
- ) -> None:
1158
- """Remove a collection from the database
1159
-
1160
- Arguments:
1161
- collection_name: The name of the collection to remove
1162
- """
1163
- if not confirm:
1164
- delete = Confirm.ask(
1165
- f"\nAre you sure you want to [bold red]remove[/bold red] "
1166
- f"[bold salmon1]{collection_name}[/bold salmon1]?",
1167
- )
1168
- else:
1169
- delete = True
1170
- if not delete:
1171
- raise typer.Abort()
1172
- db = ChromaEmbeddingDatabase(db_loc)
1173
- collections = Collections(db)
1174
- collections.delete(collection_name)
1175
- print(
1176
- f"[bold red]Removed[/bold red] collection "
1177
- f"[bold salmon1]{collection_name}[/bold salmon1]"
1178
- )
1179
-
1180
-
1181
- def _check_collection(collection_name: str, input_dir: str | Path) -> bool:
1182
- db = ChromaEmbeddingDatabase(db_loc)
1183
- collections = Collections(db)
1184
- added_to = False
1185
- try:
1186
- collections.get(collection_name)
1187
- # confirm_add = Confirm.ask(
1188
- # f"\nCollection [bold salmon1]{collection_name}[/bold salmon1] exists. Are "
1189
- # "you sure you want to update it with the contents of"
1190
- # f"[bold green]{input_dir}[/bold green]?"
1191
- # )
1192
- added_to = True
1193
- # if not confirm_add:
1194
- # raise typer.Abort()
1195
- except ValueError:
1196
- pass
1197
- return added_to
1198
-
1199
-
1200
- @llm.command("add", help="Add a model config to janus")
1201
- def llm_add(
1202
- model_name: Annotated[
1203
- str, typer.Argument(help="The user's custom name of the model")
1204
- ],
1205
- model_type: Annotated[
1206
- str,
1207
- typer.Option(
1208
- "--type",
1209
- "-t",
1210
- help="The type of the model",
1211
- click_type=click.Choice(sorted(list(MODEL_TYPE_CONSTRUCTORS.keys()))),
1212
- ),
1213
- ] = "Azure",
1214
- ):
1215
- if not MODEL_CONFIG_DIR.exists():
1216
- MODEL_CONFIG_DIR.mkdir(parents=True)
1217
- model_cfg = MODEL_CONFIG_DIR / f"{model_name}.json"
1218
- if model_type == "HuggingFace":
1219
- url = typer.prompt("Enter the model's URL")
1220
- max_tokens = typer.prompt(
1221
- "Enter the model's maximum tokens", default=4096, type=int
1222
- )
1223
- in_cost = typer.prompt("Enter the cost per input token", default=0, type=float)
1224
- out_cost = typer.prompt("Enter the cost per output token", default=0, type=float)
1225
- params = dict(
1226
- inference_server_url=url,
1227
- max_new_tokens=max_tokens,
1228
- top_k=10,
1229
- top_p=0.95,
1230
- typical_p=0.95,
1231
- temperature=0.01,
1232
- repetition_penalty=1.03,
1233
- timeout=240,
1234
- )
1235
- cfg = {
1236
- "model_type": model_type,
1237
- "model_args": params,
1238
- "token_limit": max_tokens,
1239
- "model_cost": {"input": in_cost, "output": out_cost},
1240
- }
1241
- elif model_type == "HuggingFaceLocal":
1242
- model_id = typer.prompt("Enter the model ID")
1243
- task = typer.prompt("Enter the task")
1244
- max_tokens = typer.prompt(
1245
- "Enter the model's maximum tokens", default=4096, type=int
1246
- )
1247
- in_cost = 0
1248
- out_cost = 0
1249
- params = {"model_id": model_id, "task": task}
1250
- cfg = {
1251
- "model_type": model_type,
1252
- "model_args": params,
1253
- "token_limit": max_tokens,
1254
- "model_cost": {"input": in_cost, "output": out_cost},
1255
- }
1256
- elif model_type == "OpenAI":
1257
- print("DEPRECATED: Use 'Azure' instead. CTRL+C to exit.")
1258
- model_id = typer.prompt(
1259
- "Enter the model ID (list model IDs with `janus llm ls -a`)",
1260
- default="gpt-4o",
1261
- type=click.Choice(openai_models),
1262
- show_choices=False,
1263
- )
1264
- params = dict(
1265
- model_name=model_name,
1266
- temperature=0.7,
1267
- n=1,
1268
- )
1269
- max_tokens = TOKEN_LIMITS[model_name]
1270
- model_cost = COST_PER_1K_TOKENS[model_name]
1271
- cfg = {
1272
- "model_type": model_type,
1273
- "model_id": model_id,
1274
- "model_args": params,
1275
- "token_limit": max_tokens,
1276
- "model_cost": model_cost,
1277
- }
1278
- elif model_type == "Azure":
1279
- model_id = typer.prompt(
1280
- "Enter the model ID (list model IDs with `janus llm ls -a`)",
1281
- default="gpt-4o",
1282
- type=click.Choice(azure_models),
1283
- show_choices=False,
1284
- )
1285
- params = dict(
1286
- # Azure uses the "azure_deployment" key for what we're calling "long_model_id"
1287
- azure_deployment=MODEL_ID_TO_LONG_ID[model_id],
1288
- temperature=0.7,
1289
- n=1,
1290
- )
1291
- max_tokens = TOKEN_LIMITS[MODEL_ID_TO_LONG_ID[model_id]]
1292
- model_cost = COST_PER_1K_TOKENS[MODEL_ID_TO_LONG_ID[model_id]]
1293
- cfg = {
1294
- "model_type": model_type,
1295
- "model_id": model_id,
1296
- "model_args": params,
1297
- "token_limit": max_tokens,
1298
- "model_cost": model_cost,
1299
- }
1300
- elif model_type == "BedrockChat" or model_type == "Bedrock":
1301
- model_id = typer.prompt(
1302
- "Enter the model ID (list model IDs with `janus llm ls -a`)",
1303
- default="bedrock-claude-sonnet",
1304
- type=click.Choice(bedrock_models),
1305
- show_choices=False,
1306
- )
1307
- params = dict(
1308
- # Bedrock uses the "model_id" key for what we're calling "long_model_id"
1309
- model_id=MODEL_ID_TO_LONG_ID[model_id],
1310
- model_kwargs={"temperature": 0.7},
1311
- )
1312
- max_tokens = TOKEN_LIMITS[MODEL_ID_TO_LONG_ID[model_id]]
1313
- model_cost = COST_PER_1K_TOKENS[MODEL_ID_TO_LONG_ID[model_id]]
1314
- cfg = {
1315
- "model_type": model_type,
1316
- "model_id": model_id,
1317
- "model_args": params,
1318
- "token_limit": max_tokens,
1319
- "model_cost": model_cost,
1320
- }
1321
- else:
1322
- raise ValueError(f"Unknown model type {model_type}")
1323
- with open(model_cfg, "w") as f:
1324
- json.dump(cfg, f, indent=2)
1325
- print(f"Model config written to {model_cfg}")
1326
-
1327
-
1328
- @llm.command("ls", help="List all of the user-configured models")
1329
- def llm_ls(
1330
- all: Annotated[
1331
- bool,
1332
- typer.Option(
1333
- "--all",
1334
- "-a",
1335
- is_flag=True,
1336
- help="List all models, including the default model IDs.",
1337
- click_type=click.Choice(sorted(list(MODEL_TYPE_CONSTRUCTORS.keys()))),
1338
- ),
1339
- ] = False,
1340
- ):
1341
- print("\n[green]User-configured models[/green]:")
1342
- for model_cfg in MODEL_CONFIG_DIR.glob("*.json"):
1343
- with open(model_cfg, "r") as f:
1344
- cfg = json.load(f)
1345
- print(f"\t[blue]{model_cfg.stem}[/blue]: [purple]{cfg['model_type']}[/purple]")
1346
-
1347
- if all:
1348
- print("\n[green]Available model IDs[/green]:")
1349
- for model_id, model_type in MODEL_TYPES.items():
1350
- print(f"\t[blue]{model_id}[/blue]: [purple]{model_type}[/purple]")
1351
-
1352
-
1353
- @embedding.command("add", help="Add an embedding model config to janus")
1354
- def embedding_add(
1355
- model_name: Annotated[
1356
- str, typer.Argument(help="The user's custom name for the model")
1357
- ],
1358
- model_type: Annotated[
1359
- str,
1360
- typer.Option(
1361
- "--type",
1362
- "-t",
1363
- help="The type of the model",
1364
- click_type=click.Choice(list(val.value for val in EmbeddingModelType)),
1365
- ),
1366
- ] = "OpenAI",
1367
- ):
1368
- if not EMBEDDING_MODEL_CONFIG_DIR.exists():
1369
- EMBEDDING_MODEL_CONFIG_DIR.mkdir(parents=True)
1370
- model_cfg = EMBEDDING_MODEL_CONFIG_DIR / f"{model_name}.json"
1371
- if model_type in EmbeddingModelType.HuggingFaceInferenceAPI.values:
1372
- hf = typer.style("HuggingFaceInferenceAPI", fg="yellow")
1373
- url = typer.prompt(f"Enter the {hf} model's URL", type=str, value_proc=AnyHttpUrl)
1374
- api_model_name = typer.prompt("Enter the model's name", type=str, default="")
1375
- api_key = typer.prompt("Enter the API key", type=str, default="")
1376
- max_tokens = typer.prompt(
1377
- "Enter the model's maximum tokens", default=8191, type=int
1378
- )
1379
- in_cost = typer.prompt("Enter the cost per input token", default=0, type=float)
1380
- out_cost = typer.prompt("Enter the cost per output token", default=0, type=float)
1381
- params = dict(
1382
- model_name=api_model_name,
1383
- api_key=api_key,
1384
- )
1385
- cfg = {
1386
- "model_type": model_type,
1387
- "model_identifier": str(url),
1388
- "model_args": params,
1389
- "token_limit": max_tokens,
1390
- "model_cost": {"input": in_cost, "output": out_cost},
1391
- }
1392
- elif model_type in EmbeddingModelType.HuggingFaceLocal.values:
1393
- hf = typer.style("HuggingFace", fg="yellow")
1394
- model_id = typer.prompt(
1395
- f"Enter the {hf} model ID",
1396
- default="sentence-transformers/all-MiniLM-L6-v2",
1397
- type=str,
1398
- )
1399
- cache_folder = str(
1400
- Path(
1401
- typer.prompt(
1402
- "Enter the model's cache folder",
1403
- default=EMBEDDING_MODEL_CONFIG_DIR / "cache",
1404
- type=str,
1405
- )
1406
- )
1407
- )
1408
- max_tokens = typer.prompt(
1409
- "Enter the model's maximum tokens", default=8191, type=int
1410
- )
1411
- params = dict(
1412
- cache_folder=str(cache_folder),
1413
- )
1414
- cfg = {
1415
- "model_type": model_type,
1416
- "model_identifier": model_id,
1417
- "model_args": params,
1418
- "token_limit": max_tokens,
1419
- "model_cost": {"input": 0, "output": 0},
1420
- }
1421
- elif model_type in EmbeddingModelType.OpenAI.values:
1422
- available_models = list(EMBEDDING_COST_PER_MODEL.keys())
1423
-
1424
- open_ai = typer.style("OpenAI", fg="green")
1425
- prompt = f"Enter the {open_ai} model name"
1426
-
1427
- model_name = typer.prompt(
1428
- prompt,
1429
- default="text-embedding-3-small",
1430
- type=click.types.Choice(available_models),
1431
- show_choices=False,
1432
- )
1433
- params = dict(
1434
- model=model_name,
1435
- )
1436
- max_tokens = EMBEDDING_TOKEN_LIMITS[model_name]
1437
- model_cost = EMBEDDING_COST_PER_MODEL[model_name]
1438
- cfg = {
1439
- "model_type": model_type,
1440
- "model_identifier": model_name,
1441
- "model_args": params,
1442
- "token_limit": max_tokens,
1443
- "model_cost": model_cost,
1444
- }
1445
- else:
1446
- raise ValueError(f"Unknown model type {model_type}")
1447
- with open(model_cfg, "w") as f:
1448
- json.dump(cfg, f, indent=2)
1449
- print(f"Model config written to {model_cfg}")
1450
-
1451
-
1452
- app.add_typer(db, name="db")
1453
- app.add_typer(llm, name="llm")
1454
- app.add_typer(evaluate, name="evaluate")
1455
- app.add_typer(embedding, name="embedding")
1456
-
1457
-
1458
- @app.command()
1459
- def render(
1460
- input_dir: Annotated[
1461
- str,
1462
- typer.Option(
1463
- "--input",
1464
- "-i",
1465
- ),
1466
- ],
1467
- output_dir: Annotated[str, typer.Option("--output", "-o")],
1468
- ):
1469
- input_dir = Path(input_dir)
1470
- output_dir = Path(output_dir)
1471
- for input_file in input_dir.rglob("*.json"):
1472
- with open(input_file, "r") as f:
1473
- data = json.load(f)
1474
-
1475
- output_file = output_dir / input_file.relative_to(input_dir).with_suffix(".txt")
1476
- if not output_file.parent.exists():
1477
- output_file.parent.mkdir()
1478
-
1479
- text = data["output"].replace("\\n", "\n").strip()
1480
- output_file.write_text(text)
1481
-
1482
- jar_path = homedir / ".janus/lib/plantuml.jar"
1483
- subprocess.run(["java", "-jar", jar_path, output_file]) # nosec
1484
- output_file.unlink()
1485
-
1486
-
1487
- if __name__ == "__main__":
1488
- app()