janus-llm 2.0.2__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- janus/__init__.py +2 -2
- janus/__main__.py +1 -1
- janus/_tests/test_cli.py +1 -2
- janus/cli.py +43 -51
- janus/converter/__init__.py +6 -0
- janus/converter/_tests/__init__.py +0 -0
- janus/{_tests → converter/_tests}/test_translate.py +11 -22
- janus/converter/converter.py +614 -0
- janus/converter/diagram.py +124 -0
- janus/converter/document.py +131 -0
- janus/converter/evaluate.py +15 -0
- janus/converter/requirements.py +50 -0
- janus/converter/translate.py +108 -0
- janus/embedding/_tests/test_collections.py +2 -2
- janus/language/_tests/test_splitter.py +1 -1
- janus/language/alc/__init__.py +1 -0
- janus/language/alc/_tests/__init__.py +0 -0
- janus/language/alc/_tests/test_alc.py +28 -0
- janus/language/alc/alc.py +87 -0
- janus/language/block.py +4 -2
- janus/language/combine.py +0 -1
- janus/language/mumps/mumps.py +2 -3
- janus/language/naive/__init__.py +1 -1
- janus/language/naive/basic_splitter.py +4 -4
- janus/language/naive/chunk_splitter.py +4 -4
- janus/language/naive/registry.py +1 -1
- janus/language/naive/simple_ast.py +23 -12
- janus/language/naive/tag_splitter.py +4 -4
- janus/language/splitter.py +10 -4
- janus/language/treesitter/treesitter.py +26 -8
- janus/llm/model_callbacks.py +34 -37
- janus/llm/models_info.py +16 -3
- janus/metrics/_tests/test_llm.py +2 -3
- janus/metrics/_tests/test_rouge_score.py +1 -1
- janus/metrics/_tests/test_similarity_score.py +1 -1
- janus/metrics/complexity_metrics.py +3 -4
- janus/metrics/metric.py +3 -4
- janus/metrics/reading.py +27 -5
- janus/prompts/prompt.py +67 -7
- janus/utils/enums.py +6 -5
- {janus_llm-2.0.2.dist-info → janus_llm-3.0.0.dist-info}/METADATA +1 -1
- {janus_llm-2.0.2.dist-info → janus_llm-3.0.0.dist-info}/RECORD +45 -35
- janus/converter.py +0 -158
- janus/translate.py +0 -981
- {janus_llm-2.0.2.dist-info → janus_llm-3.0.0.dist-info}/LICENSE +0 -0
- {janus_llm-2.0.2.dist-info → janus_llm-3.0.0.dist-info}/WHEEL +0 -0
- {janus_llm-2.0.2.dist-info → janus_llm-3.0.0.dist-info}/entry_points.txt +0 -0
@@ -1,14 +1,21 @@
|
|
1
|
-
janus/__init__.py,sha256=
|
2
|
-
janus/__main__.py,sha256=
|
1
|
+
janus/__init__.py,sha256=LVDmiK9hI9u2qwCVVKQtBo3fR1FomBig_DphrdUtr3E,351
|
2
|
+
janus/__main__.py,sha256=lEkpNtLVPtFo8ySDZeXJ_NXDHb0GVdZFPWB4gD4RPS8,64
|
3
3
|
janus/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
janus/_tests/conftest.py,sha256=V7uW-oq3YbFiRPvrq15YoVVrA1n_83pjgiyTZ-IUGW8,963
|
5
|
-
janus/_tests/test_cli.py,sha256=
|
6
|
-
janus/
|
7
|
-
janus/
|
8
|
-
janus/converter.py,sha256=
|
5
|
+
janus/_tests/test_cli.py,sha256=mi7wAWV07ZFli5nQdExRGIGA3AMFD9s39-HcmDV4B6Y,4232
|
6
|
+
janus/cli.py,sha256=-aeg8R6CobK2EG_BPoZgBy_x1d6G9gp-KKKhnLMepo4,29541
|
7
|
+
janus/converter/__init__.py,sha256=kzVmWOPXRDayqqBZ8ZDaFQzA_q8PEdv407dc-DefPxY,255
|
8
|
+
janus/converter/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
+
janus/converter/_tests/test_translate.py,sha256=eiLbmouokZrAeAYmdoJgnlx5-k4QiO6i0N6e6ZvZsvM,15885
|
10
|
+
janus/converter/converter.py,sha256=Bq07_9N_3Dv9NBqVACvb7LC2HxdQmfVZ1b0BlWrxjgo,23521
|
11
|
+
janus/converter/diagram.py,sha256=v-3ZZ4t1q74lDOjF2N6NRPkC3IK-sjLDn5_VChZTEGA,4608
|
12
|
+
janus/converter/document.py,sha256=hsW512veNjFWbdl5WriuUdNmMEqZy8ktRvqn9rRmA6E,4566
|
13
|
+
janus/converter/evaluate.py,sha256=APWQUY3gjAXqkJkPzvj0UA4wPK3Cv9QSJLM-YK9t-ng,476
|
14
|
+
janus/converter/requirements.py,sha256=orOVFymFSqSVyyII1DeDqeo3pTZLxw6dJ-JKBPigR0A,1751
|
15
|
+
janus/converter/translate.py,sha256=kMlGUiBYGQBXSxwX5in3CUyUifPM95wynCaRMxSDxMw,4238
|
9
16
|
janus/embedding/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
17
|
janus/embedding/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
-
janus/embedding/_tests/test_collections.py,sha256=
|
18
|
+
janus/embedding/_tests/test_collections.py,sha256=eT0cYv-qmPrHJRjDZqWPFTkqVzFDRoPrRKR__FPiz58,2651
|
12
19
|
janus/embedding/_tests/test_database.py,sha256=uqI2Jgj8DEIlciqiwiZx_n0osjcspIPrHOSSN1NRZSk,1019
|
13
20
|
janus/embedding/_tests/test_vectorize.py,sha256=NnJLHBwgMVycAProRJxuLVSByxrpJ35eaZCFca52gNY,1964
|
14
21
|
janus/embedding/collections.py,sha256=ZE8QGYQ82DCLqhV0m1y7PiqpuHjEfxHPcS5SCKU0LAw,5411
|
@@ -18,35 +25,39 @@ janus/embedding/vectorize.py,sha256=ap3e6ZMai8U3M5vdpLc_st4Sw31xyqoaqEno0IJlVOU,
|
|
18
25
|
janus/language/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
26
|
janus/language/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
27
|
janus/language/_tests/test_combine.py,sha256=ydCYNbTxvaxT-5axiEBzPQLn6s4arSyZ5Tx2SYKLpJY,1830
|
21
|
-
janus/language/_tests/test_splitter.py,sha256=
|
28
|
+
janus/language/_tests/test_splitter.py,sha256=VK48eqp5PYJfjdhD_x7IkeAjbF1KC3AyNnICfK8XnUQ,360
|
29
|
+
janus/language/alc/__init__.py,sha256=j7vOMGhT1Vri6p8dsjSaY-fkO5uFn0sJ0nrNGGvcizM,42
|
30
|
+
janus/language/alc/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
+
janus/language/alc/_tests/test_alc.py,sha256=DttXpouP9Vkdlf23_a0qFalKdGtadGv6oXTsmN1pk8Q,994
|
32
|
+
janus/language/alc/alc.py,sha256=n8KVHTb6FFILw50N8UM3gfT60gLVvkTjk37easwluWs,3061
|
22
33
|
janus/language/binary/__init__.py,sha256=AlNAe12ZA366kcGSrQ1FJyOdbwxFqGBFkYR2K6yL818,51
|
23
34
|
janus/language/binary/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
24
35
|
janus/language/binary/_tests/test_binary.py,sha256=a-8RSfKA23UrJC9c1xPQK792XZCz8npCHI7isN2dAP8,1727
|
25
36
|
janus/language/binary/binary.py,sha256=CS1RAieN8klSsCeXQEFYKUWioatUX-sOPXKQr5S6NzE,6534
|
26
37
|
janus/language/binary/reveng/decompile_script.py,sha256=veW51oJzuO-4UD3Er062jXZ_FYtTFo9OCkl82Z2xr6A,2182
|
27
|
-
janus/language/block.py,sha256=
|
28
|
-
janus/language/combine.py,sha256=
|
38
|
+
janus/language/block.py,sha256=57hfOY-KSVMioKhkCvfDtovQt4h8lCg9cJbRF7ddV1s,9280
|
39
|
+
janus/language/combine.py,sha256=e7j8zQO_D3_LElaVCsGgtnzia7aFFK56m-mhArQBlR0,2908
|
29
40
|
janus/language/file.py,sha256=X2MYcAMlCABK77uhMdI_J2foXLrqEdinapYRfLPyKB8,563
|
30
41
|
janus/language/mumps/__init__.py,sha256=-Ou_wJ-JgHezfp1dub2_qCYNiK9wO-zo2MlqxM9qiwE,48
|
31
42
|
janus/language/mumps/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
32
43
|
janus/language/mumps/_tests/test_mumps.py,sha256=6l7q14lPnKf231iWwMdRbf-dg9QuHa26YMS7-K7yC4A,1001
|
33
|
-
janus/language/mumps/mumps.py,sha256=
|
44
|
+
janus/language/mumps/mumps.py,sha256=MkF_TZB1SOIj3JQfGKYow1Hh2Bja0EglUlpd4aAY5Iw,7351
|
34
45
|
janus/language/mumps/patterns.py,sha256=FW5T6Nt5kBO2UKgSL1KLVDbYRgMaJAzDvEmvBkxHppA,2310
|
35
|
-
janus/language/naive/__init__.py,sha256=
|
36
|
-
janus/language/naive/basic_splitter.py,sha256=
|
37
|
-
janus/language/naive/chunk_splitter.py,sha256=
|
38
|
-
janus/language/naive/registry.py,sha256=
|
39
|
-
janus/language/naive/simple_ast.py,sha256=
|
40
|
-
janus/language/naive/tag_splitter.py,sha256=
|
46
|
+
janus/language/naive/__init__.py,sha256=gsdC543qsIX8y_RxblCBIgyW0tfucljFms6v2WTrEz0,178
|
47
|
+
janus/language/naive/basic_splitter.py,sha256=NFW3TvMFQwEmcj5r4jvQXBJCzgNcSZI-3Arjb191gAo,407
|
48
|
+
janus/language/naive/chunk_splitter.py,sha256=g1nqbhvaOZ31SjO-smIwAg6lHGTy2rPOOnQ-m6fIKAA,713
|
49
|
+
janus/language/naive/registry.py,sha256=CDUkMIgscdPBV_qu49u9TGnOIgr9mRasinPRwViTWz8,281
|
50
|
+
janus/language/naive/simple_ast.py,sha256=boX_pJ8x52_MxiM6hJ-0oa6MR75Fu4pyebBEtYJJZUc,907
|
51
|
+
janus/language/naive/tag_splitter.py,sha256=6DHBJdM3IllcVV-MrAyj8KPg5zXTiRdeD42CYrmEFHk,1986
|
41
52
|
janus/language/node.py,sha256=-ymv--oILEYLVO2KSOrzOlzL2cZHNQpQJYwE1cKA-pY,200
|
42
|
-
janus/language/splitter.py,sha256=
|
53
|
+
janus/language/splitter.py,sha256=4XAe0hXka7njS30UHGCngJzDgHxn3lygUjikSHuV7Xo,16924
|
43
54
|
janus/language/treesitter/__init__.py,sha256=mUliw7ZJLZ8NkJKyUQMSoUV82hYXE0HvLHrEdGPJF4Q,43
|
44
55
|
janus/language/treesitter/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
45
56
|
janus/language/treesitter/_tests/test_treesitter.py,sha256=nsavUV0aI6cpT9FkQve58eTTehLyQG6qJJBGlNa_bIw,2170
|
46
|
-
janus/language/treesitter/treesitter.py,sha256=
|
57
|
+
janus/language/treesitter/treesitter.py,sha256=UiV4OuWTt6IwMohHSw4FHsVNA_zxr9lNk4_Du09APdo,7509
|
47
58
|
janus/llm/__init__.py,sha256=8Pzn3Jdx867PzDc4xmwm8wvJDGzWSIhpN0NCEYFe0LQ,36
|
48
|
-
janus/llm/model_callbacks.py,sha256=
|
49
|
-
janus/llm/models_info.py,sha256=
|
59
|
+
janus/llm/model_callbacks.py,sha256=h_xlBAHRx-gxQBBjVKRpGXxdxYf6d9L6kBoXjbEAEdI,7106
|
60
|
+
janus/llm/models_info.py,sha256=B9Dn5mHc43OeZe5mHFj5wuhO194XHCTwShAa2ybnPyY,7688
|
50
61
|
janus/metrics/__init__.py,sha256=AsxtZJUzZiXJPr2ehPPltuYP-ddechjg6X85WZUO7mA,241
|
51
62
|
janus/metrics/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
52
63
|
janus/metrics/_tests/reference.py,sha256=hiaJPP9CXkvFBV_wL-gOe_BzELTw0nvB6uCxhxtIiE8,13
|
@@ -54,19 +65,19 @@ janus/metrics/_tests/target.py,sha256=hiaJPP9CXkvFBV_wL-gOe_BzELTw0nvB6uCxhxtIiE
|
|
54
65
|
janus/metrics/_tests/test_bleu.py,sha256=TcSnNGpMh00Nkkk1zq5wDfdCANMUq9eXscU_hcBRU8A,1640
|
55
66
|
janus/metrics/_tests/test_chrf.py,sha256=O4v1Cj513H8NYffJILpSI7CuR_dnm7F8CeB3C7sZYr0,2202
|
56
67
|
janus/metrics/_tests/test_file_pairing.py,sha256=A4Qy6JIesFXUcaig45Ze6LiViuHQS7MFSQzDHQP3j9w,1880
|
57
|
-
janus/metrics/_tests/test_llm.py,sha256=
|
68
|
+
janus/metrics/_tests/test_llm.py,sha256=dGXrdd79v-ix_560t6Q8RJEx-6mgZ-pkzJgm-O2ZBwA,2998
|
58
69
|
janus/metrics/_tests/test_reading.py,sha256=NDLFyjmOpM5gWf1LLTjGIw3aUR8Qf22zTt9hwe7NABs,840
|
59
|
-
janus/metrics/_tests/test_rouge_score.py,sha256=
|
60
|
-
janus/metrics/_tests/test_similarity_score.py,sha256=
|
70
|
+
janus/metrics/_tests/test_rouge_score.py,sha256=rcHmrpy55cW507PnTnGQnp9Tsn5rk7JEyXmusY7la3Q,2020
|
71
|
+
janus/metrics/_tests/test_similarity_score.py,sha256=jc3r0lWW5Iqm6AMKc36ewz5rboKwVw29fliBHClkzIg,799
|
61
72
|
janus/metrics/_tests/test_treesitter_metrics.py,sha256=tqpAg9LY811gfQ3n2ypRqBJesAFQodMf6Gz7dvOsqp4,4337
|
62
73
|
janus/metrics/bleu.py,sha256=eRoHIQulPp5mezJzHCNkwUB_89tAj4PqV2pF9eV9HfI,1746
|
63
74
|
janus/metrics/chrf.py,sha256=zNGWZ40CPMgj8rctnmwkbf25_PvSOLPbOjv-iN2cGXM,1472
|
64
75
|
janus/metrics/cli.py,sha256=Duuw2RF47Z-t1pal0cg3L_-N_91rx29krirqtIwjYLY,157
|
65
|
-
janus/metrics/complexity_metrics.py,sha256=
|
76
|
+
janus/metrics/complexity_metrics.py,sha256=1Z9n0o_CrILqayk40wRkjR1f7yvHIsJG38DxAbqj614,6560
|
66
77
|
janus/metrics/file_pairing.py,sha256=WNHRV1D8GOJMq8Pla5SPkTDAT7yVaS4-UU0XIGKvEVs,3729
|
67
78
|
janus/metrics/llm_metrics.py,sha256=3677S6GYcoVcokpmAN-fwvNu-lYWAKd7M5mebiE6RZc,5687
|
68
|
-
janus/metrics/metric.py,sha256=
|
69
|
-
janus/metrics/reading.py,sha256=
|
79
|
+
janus/metrics/metric.py,sha256=Lgdtq87oJ-kWC_6jdPQ6-d1MqoeTnhkRszo6IZJV6c0,16974
|
80
|
+
janus/metrics/reading.py,sha256=srLb2MO-vZL5ccRjaHz-dA4MwAvXVNyIKnOrvJXg77E,2244
|
70
81
|
janus/metrics/rouge_score.py,sha256=HfUJwUWI-yq5pOjML2ee4QTOMl0NQahnqEY2Mt8Dtnw,2865
|
71
82
|
janus/metrics/similarity.py,sha256=9pjWWpLKCsk0QfFfSgQNdPXiisqi7WJYOOHaiT8S0iY,1613
|
72
83
|
janus/metrics/splitting.py,sha256=610ScHRvALwdkqA6YyGI-tr3a18_cUofldBxGYX0SwE,968
|
@@ -78,17 +89,16 @@ janus/parsers/doc_parser.py,sha256=X8eCb1QXbL6sVWLEFGjsPyxrpJ9XnOPg7G4KZSo9A9E,5
|
|
78
89
|
janus/parsers/eval_parser.py,sha256=HB5-zY_Jpmkj6FDbuNCCVCRxwmzhViSAjPKbyyC0Ebc,2723
|
79
90
|
janus/parsers/reqs_parser.py,sha256=MFBvtR3otpyPZlkZxu0dVH1YeEJhvhNzhaGKGHaQVHA,2359
|
80
91
|
janus/prompts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
81
|
-
janus/prompts/prompt.py,sha256=
|
82
|
-
janus/translate.py,sha256=bsQ1YvjCPrVrL3y-rAA2PrCv2-x3ObCF7a6LuPWQPuE,38747
|
92
|
+
janus/prompts/prompt.py,sha256=vd7UbitF0VFCi21RsggDebD51xcuyls_lQLGKkphfI8,10578
|
83
93
|
janus/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
84
94
|
janus/utils/_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
85
95
|
janus/utils/_tests/test_logger.py,sha256=4jZFm8LX828Dt9lOjiFHZIPbxYy_hHaswyrMPkscgdM,2199
|
86
96
|
janus/utils/_tests/test_progress.py,sha256=Yh5NDNq-24n2nhHHbJm39pENAH70PYnh9ymwdcn0_UU,481
|
87
|
-
janus/utils/enums.py,sha256=
|
97
|
+
janus/utils/enums.py,sha256=AoilbdiYyMvY2Mp0AM4xlbLSELfut2XMwhIM1S_msP4,27610
|
88
98
|
janus/utils/logger.py,sha256=KZeuaMAnlSZCsj4yL0P6N-JzZwpxXygzACWfdZFeuek,2337
|
89
99
|
janus/utils/progress.py,sha256=pKcCzO9JOU9fSD7qTmLWcqY5smc8mujqQMXoPgqNysE,1458
|
90
|
-
janus_llm-
|
91
|
-
janus_llm-
|
92
|
-
janus_llm-
|
93
|
-
janus_llm-
|
94
|
-
janus_llm-
|
100
|
+
janus_llm-3.0.0.dist-info/LICENSE,sha256=_j0st0a-HB6MRbP3_BW3PUqpS16v54luyy-1zVyl8NU,10789
|
101
|
+
janus_llm-3.0.0.dist-info/METADATA,sha256=arXVqb1tVW76Bpjj1b1hq-rWsvu25DHg9rNv3ZaFZW0,4184
|
102
|
+
janus_llm-3.0.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
103
|
+
janus_llm-3.0.0.dist-info/entry_points.txt,sha256=OGhQwzj6pvXp79B0SaBD5apGekCu7Dwe9fZZT_TZ544,39
|
104
|
+
janus_llm-3.0.0.dist-info/RECORD,,
|
janus/converter.py
DELETED
@@ -1,158 +0,0 @@
|
|
1
|
-
import functools
|
2
|
-
from typing import Any
|
3
|
-
|
4
|
-
from langchain.schema.language_model import BaseLanguageModel
|
5
|
-
|
6
|
-
from .language.binary import BinarySplitter
|
7
|
-
from .language.mumps import MumpsSplitter
|
8
|
-
from .language.splitter import Splitter
|
9
|
-
from .language.treesitter import TreeSitterSplitter
|
10
|
-
from .utils.enums import CUSTOM_SPLITTERS, LANGUAGES
|
11
|
-
from .utils.logger import create_logger
|
12
|
-
|
13
|
-
log = create_logger(__name__)
|
14
|
-
|
15
|
-
|
16
|
-
def run_if_changed(*tracked_vars):
|
17
|
-
"""Wrapper to skip function calls if the given instance attributes haven't
|
18
|
-
been updated. Requires the _changed_attrs set to exist, and the __setattr__
|
19
|
-
method to be overridden to track parameter updates in _changed_attrs.
|
20
|
-
"""
|
21
|
-
|
22
|
-
def wrapper(func):
|
23
|
-
@functools.wraps(func)
|
24
|
-
def wrapped(self, *args, **kwargs):
|
25
|
-
# If there is overlap between the tracked variables and the changed
|
26
|
-
# ones, then call the function as normal
|
27
|
-
if self._changed_attrs.intersection(tracked_vars):
|
28
|
-
func(self, *args, **kwargs)
|
29
|
-
|
30
|
-
return wrapped
|
31
|
-
|
32
|
-
return wrapper
|
33
|
-
|
34
|
-
|
35
|
-
class Converter:
|
36
|
-
"""Parent class that converts code into something else.
|
37
|
-
|
38
|
-
Children will determine what the code gets converted into. Whether that's translated
|
39
|
-
into another language, into pseudocode, requirements, documentation, etc., or
|
40
|
-
converted into embeddings
|
41
|
-
"""
|
42
|
-
|
43
|
-
def __init__(
|
44
|
-
self,
|
45
|
-
source_language: str = "fortran",
|
46
|
-
max_tokens: None | int = None,
|
47
|
-
protected_node_types: set[str] | list[str] | tuple[str] = (),
|
48
|
-
prune_node_types: set[str] | list[str] | tuple[str] = (),
|
49
|
-
) -> None:
|
50
|
-
"""Initialize a Converter instance.
|
51
|
-
|
52
|
-
Arguments:
|
53
|
-
source_language: The source programming language.
|
54
|
-
parser_type: The type of parser to use for parsing the LLM output. Valid
|
55
|
-
values are `"code"`, `"text"`, `"eval"`, and `None` (default). If `None`,
|
56
|
-
the `Converter` assumes you won't be parsing an output (i.e., adding to an
|
57
|
-
embedding DB).
|
58
|
-
"""
|
59
|
-
self._changed_attrs: set = set()
|
60
|
-
|
61
|
-
self._source_language: None | str
|
62
|
-
self._source_glob: None | str
|
63
|
-
self._protected_node_types: tuple[str] = ()
|
64
|
-
self._prune_node_types: tuple[str] = ()
|
65
|
-
self._splitter: None | Splitter
|
66
|
-
self._llm: None | BaseLanguageModel = None
|
67
|
-
self._max_tokens: None | int = max_tokens
|
68
|
-
|
69
|
-
self.set_source_language(source_language)
|
70
|
-
self.set_protected_node_types(protected_node_types)
|
71
|
-
self.set_prune_node_types(prune_node_types)
|
72
|
-
|
73
|
-
# Child class must call this. Should we enforce somehow?
|
74
|
-
# self._load_parameters()
|
75
|
-
|
76
|
-
def __setattr__(self, key: Any, value: Any) -> None:
|
77
|
-
if hasattr(self, "_changed_attrs"):
|
78
|
-
if not hasattr(self, key) or getattr(self, key) != value:
|
79
|
-
self._changed_attrs.add(key)
|
80
|
-
# Avoid infinite recursion
|
81
|
-
elif key != "_changed_attrs":
|
82
|
-
self._changed_attrs = set()
|
83
|
-
super().__setattr__(key, value)
|
84
|
-
|
85
|
-
def _load_parameters(self) -> None:
|
86
|
-
self._load_splitter()
|
87
|
-
self._changed_attrs.clear()
|
88
|
-
|
89
|
-
def set_source_language(self, source_language: str) -> None:
|
90
|
-
"""Validate and set the source language.
|
91
|
-
|
92
|
-
The affected objects will not be updated until _load_parameters() is called.
|
93
|
-
|
94
|
-
Arguments:
|
95
|
-
source_language: The source programming language.
|
96
|
-
"""
|
97
|
-
source_language = source_language.lower()
|
98
|
-
if source_language not in LANGUAGES:
|
99
|
-
raise ValueError(
|
100
|
-
f"Invalid source language: {source_language}. "
|
101
|
-
"Valid source languages are found in `janus.utils.enums.LANGUAGES`."
|
102
|
-
)
|
103
|
-
|
104
|
-
self._source_glob = f"**/*.{LANGUAGES[source_language]['suffix']}"
|
105
|
-
self._source_language = source_language
|
106
|
-
|
107
|
-
def set_protected_node_types(
|
108
|
-
self, protected_node_types: set[str] | list[str] | tuple[str]
|
109
|
-
) -> None:
|
110
|
-
"""Set the protected (non-mergeable) node types. This will often be structures
|
111
|
-
like functions, classes, or modules which you might want to keep separate
|
112
|
-
|
113
|
-
The affected objects will not be updated until _load_parameters() is called.
|
114
|
-
|
115
|
-
Arguments:
|
116
|
-
protected_node_types: A set of node types that aren't to be merged
|
117
|
-
"""
|
118
|
-
self._protected_node_types = tuple(set(protected_node_types or []))
|
119
|
-
|
120
|
-
def set_prune_node_types(
|
121
|
-
self, prune_node_types: set[str] | list[str] | tuple[str]
|
122
|
-
) -> None:
|
123
|
-
"""Set the node types to prune. This will often be structures
|
124
|
-
like comments or whitespace which you might want to keep out of the LLM
|
125
|
-
|
126
|
-
The affected objects will not be updated until _load_parameters() is called.
|
127
|
-
|
128
|
-
Arguments:
|
129
|
-
prune_node_types: A set of node types which should be pruned
|
130
|
-
"""
|
131
|
-
self._prune_node_types = tuple(set(prune_node_types or []))
|
132
|
-
|
133
|
-
@run_if_changed(
|
134
|
-
"_source_language",
|
135
|
-
"_max_tokens",
|
136
|
-
"_llm",
|
137
|
-
"_protected_node_types",
|
138
|
-
"_prune_node_types",
|
139
|
-
)
|
140
|
-
def _load_splitter(self) -> None:
|
141
|
-
"""Load the splitter according to this instance's attributes.
|
142
|
-
|
143
|
-
If the relevant fields have not been changed since the last time this method was
|
144
|
-
called, nothing happens.
|
145
|
-
"""
|
146
|
-
kwargs = dict(
|
147
|
-
max_tokens=self._max_tokens,
|
148
|
-
model=self._llm,
|
149
|
-
protected_node_types=self._protected_node_types,
|
150
|
-
prune_node_types=self._prune_node_types,
|
151
|
-
)
|
152
|
-
if self._source_language in CUSTOM_SPLITTERS:
|
153
|
-
if self._source_language == "mumps":
|
154
|
-
self._splitter = MumpsSplitter(**kwargs)
|
155
|
-
elif self._source_language == "binary":
|
156
|
-
self._splitter = BinarySplitter(**kwargs)
|
157
|
-
else:
|
158
|
-
self._splitter = TreeSitterSplitter(language=self._source_language, **kwargs)
|