xinference 0.12.0__py3-none-any.whl → 0.12.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (85) hide show
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +108 -14
  3. xinference/client/restful/restful_client.py +78 -5
  4. xinference/constants.py +1 -0
  5. xinference/core/cache_tracker.py +48 -28
  6. xinference/core/event.py +5 -6
  7. xinference/core/model.py +59 -42
  8. xinference/core/scheduler.py +46 -18
  9. xinference/core/supervisor.py +73 -24
  10. xinference/core/worker.py +68 -2
  11. xinference/deploy/cmdline.py +86 -2
  12. xinference/deploy/test/test_cmdline.py +19 -10
  13. xinference/model/audio/__init__.py +14 -1
  14. xinference/model/audio/core.py +12 -1
  15. xinference/model/audio/custom.py +6 -4
  16. xinference/model/audio/model_spec_modelscope.json +20 -0
  17. xinference/model/llm/__init__.py +34 -2
  18. xinference/model/llm/llm_family.json +8 -2
  19. xinference/model/llm/llm_family.py +86 -1
  20. xinference/model/llm/llm_family_csghub.json +66 -0
  21. xinference/model/llm/llm_family_modelscope.json +8 -2
  22. xinference/model/llm/pytorch/chatglm.py +41 -12
  23. xinference/model/llm/pytorch/core.py +128 -88
  24. xinference/model/llm/pytorch/glm4v.py +24 -3
  25. xinference/model/llm/pytorch/internlm2.py +15 -0
  26. xinference/model/llm/pytorch/qwen_vl.py +1 -1
  27. xinference/model/llm/pytorch/utils.py +69 -189
  28. xinference/model/llm/utils.py +27 -14
  29. xinference/model/llm/vllm/core.py +10 -4
  30. xinference/model/rerank/core.py +35 -6
  31. xinference/model/utils.py +8 -2
  32. xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
  33. xinference/thirdparty/ChatTTS/experimental/llm.py +40 -0
  34. xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
  35. xinference/thirdparty/ChatTTS/infer/api.py +125 -0
  36. xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
  37. xinference/thirdparty/ChatTTS/model/dvae.py +155 -0
  38. xinference/thirdparty/ChatTTS/model/gpt.py +265 -0
  39. xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
  40. xinference/thirdparty/ChatTTS/utils/gpu_utils.py +23 -0
  41. xinference/thirdparty/ChatTTS/utils/infer_utils.py +141 -0
  42. xinference/thirdparty/ChatTTS/utils/io_utils.py +14 -0
  43. xinference/types.py +28 -0
  44. xinference/web/ui/build/asset-manifest.json +6 -6
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.4bafd904.css +2 -0
  47. xinference/web/ui/build/static/css/main.4bafd904.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.b80d9c08.js +3 -0
  49. xinference/web/ui/build/static/js/main.b80d9c08.js.map +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/16537795de12c61903b6110c241f62a7855b2d0fc1e7c3d1faa347267f3a6893.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/17b8f071491402d70b146532358b1a612226e5dc7b3e8755a1322d27b4680cee.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/395409bd005e19d48b437c48d88e5126c7865ba9631fe98535333c952e383dc5.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/43991bb67c3136863e6fb37f796466b12eb547a1465408cc77820fddafb3bed3.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/{15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json → 935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json} +1 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/a7109d4425e3d94ca2726fc7020fd33bf5030afd4c9cf4bf71e21776cd70646a.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +1 -0
  63. {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/METADATA +1 -1
  64. {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/RECORD +69 -56
  65. xinference/web/ui/build/static/css/main.54bca460.css +0 -2
  66. xinference/web/ui/build/static/css/main.54bca460.css.map +0 -1
  67. xinference/web/ui/build/static/js/main.551aa479.js +0 -3
  68. xinference/web/ui/build/static/js/main.551aa479.js.map +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +0 -1
  72. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +0 -1
  73. xinference/web/ui/node_modules/.cache/babel-loader/3e737bcdbcbc407ccd65b90e199ef0c3214b261e8e41dbf14d921384a717d9ee.json +0 -1
  74. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +0 -1
  75. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +0 -1
  76. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +0 -1
  77. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +0 -1
  78. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +0 -1
  79. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +0 -1
  80. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +0 -1
  81. /xinference/web/ui/build/static/js/{main.551aa479.js.LICENSE.txt → main.b80d9c08.js.LICENSE.txt} +0 -0
  82. {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/LICENSE +0 -0
  83. {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/WHEEL +0 -0
  84. {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/entry_points.txt +0 -0
  85. {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,125 @@
1
+
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from transformers.generation import TopKLogitsWarper, TopPLogitsWarper
5
+ from ..utils.infer_utils import CustomRepetitionPenaltyLogitsProcessorRepeat
6
+
7
def infer_code(
    models,
    text,
    spk_emb=None,
    top_P=0.7,
    top_K=20,
    temperature=0.3,
    repetition_penalty=1.05,
    max_new_token=2048,
    **kwargs
):
    """Run ``models['gpt'].generate`` in code mode (``infer_text=False``).

    Args:
        models: Mapping that provides at least ``'gpt'`` (the generator) and
            ``'tokenizer'`` (a callable tokenizer).
        text: A single prompt or a list of prompts. Anything that is not a
            ``list`` is wrapped into a one-element list.
        spk_emb: Optional speaker embedding tensor. When given, its
            L2-normalised value is written into the input embedding at every
            position holding the ``[spk_emb]`` token.
            NOTE(review): presumably a 1-D tensor of the model's hidden size,
            since it is expanded to ``(len(text), -1)`` - confirm with caller.
        top_P: Nucleus sampling threshold; ``None`` disables the warper.
        top_K: Top-k sampling size; ``None`` disables the warper.
        temperature: Either a list used as-is, or a single value that is
            repeated ``models['gpt'].num_vq`` times.
        repetition_penalty: Penalty handed to the repetition processor;
            ``None`` or ``1`` disables it.
        max_new_token: Upper bound on generated steps, forwarded to
            ``generate``.
        **kwargs: Forwarded unchanged to ``models['gpt'].generate``.

    Returns:
        Whatever ``models['gpt'].generate`` returns.
    """
    gpt = models['gpt']
    device = next(gpt.parameters()).device

    if not isinstance(text, list):
        text = [text]
    if not isinstance(temperature, list):
        temperature = [temperature] * gpt.num_vq

    # The speaker placeholder token depends on whether an embedding is given.
    if spk_emb is None:
        text = [f'[Stts][empty_spk]{i}[Ptts]' for i in text]
    else:
        text = [f'[Stts][spk_emb]{i}[Ptts]' for i in text]

    tokenizer = models['tokenizer']
    text_token = tokenizer(
        text, return_tensors='pt', add_special_tokens=False, padding=True
    ).to(device)
    token_ids = text_token['input_ids']

    # Every prompt position is a text token, so the text mask is all True and
    # the token ids are simply repeated along a trailing num_vq axis.
    inputs = {
        'input_ids': token_ids[..., None].expand(-1, -1, gpt.num_vq),
        'text_mask': torch.ones(token_ids.shape, dtype=bool, device=device),
        'attention_mask': text_token['attention_mask'],
    }

    emb = gpt.get_emb(**inputs)
    if spk_emb is not None:
        spk_token_id = tokenizer.convert_tokens_to_ids('[spk_emb]')
        speaker = spk_emb.to(device).to(emb.dtype)[None].expand(len(text), -1)
        emb[inputs['input_ids'][..., 0] == spk_token_id] = F.normalize(
            speaker, p=2.0, dim=1, eps=1e-12
        )

    # The last index of the code embedding table is used as the EOS id.
    num_code = gpt.emb_code[0].num_embeddings - 1

    warpers = []
    if top_P is not None:
        warpers.append(TopPLogitsWarper(top_P, min_tokens_to_keep=3))
    if top_K is not None:
        warpers.append(TopKLogitsWarper(top_K, min_tokens_to_keep=3))

    processors = []
    if repetition_penalty is not None and repetition_penalty != 1:
        processors.append(
            CustomRepetitionPenaltyLogitsProcessorRepeat(repetition_penalty, num_code, 16)
        )

    return gpt.generate(
        emb,
        inputs['input_ids'],
        temperature=torch.tensor(temperature, device=device),
        attention_mask=inputs['attention_mask'],
        LogitsWarpers=warpers,
        LogitsProcessors=processors,
        eos_token=num_code,
        max_new_token=max_new_token,
        infer_text=False,
        **kwargs
    )
73
+
74
+
75
def refine_text(
    models,
    text,
    top_P=0.7,
    top_K=20,
    temperature=0.7,
    repetition_penalty=1.0,
    max_new_token=384,
    prompt='',
    **kwargs
):
    """Run ``models['gpt'].generate`` in text mode (``infer_text=True``).

    Each input is wrapped as ``[Sbreak]<text>[Pbreak]<prompt>`` before being
    tokenised, and generation stops on the ``[Ebreak]`` token id.

    Args:
        models: Mapping that provides at least ``'gpt'`` and ``'tokenizer'``.
        text: A single string or a list of strings; a non-list value is
            wrapped into a one-element list. Must not be empty.
        top_P: Nucleus sampling threshold; ``None`` disables the warper.
        top_K: Top-k sampling size; ``None`` disables the warper.
        temperature: Single sampling temperature.
        repetition_penalty: Penalty handed to the repetition processor;
            ``None`` or ``1`` disables it.
        max_new_token: Upper bound on generated steps.
        prompt: Text appended after ``[Pbreak]`` for every input.
        **kwargs: Forwarded unchanged to ``models['gpt'].generate``.

    Returns:
        Whatever ``models['gpt'].generate`` returns.

    Raises:
        AssertionError: If ``text`` is empty.
    """
    gpt = models['gpt']
    device = next(gpt.parameters()).device

    if not isinstance(text, list):
        text = [text]

    assert len(text), 'text should not be empty'

    tokenizer = models['tokenizer']
    text = [f"[Sbreak]{i}[Pbreak]{prompt}" for i in text]
    text_token = tokenizer(
        text, return_tensors='pt', add_special_tokens=False, padding=True
    ).to(device)
    token_ids = text_token['input_ids']

    # All prompt positions are text tokens; ids are repeated across num_vq.
    inputs = {
        'input_ids': token_ids[..., None].expand(-1, -1, gpt.num_vq),
        'text_mask': torch.ones(token_ids.shape, dtype=bool, device=device),
        'attention_mask': text_token['attention_mask'],
    }

    warpers = []
    if top_P is not None:
        warpers.append(TopPLogitsWarper(top_P, min_tokens_to_keep=3))
    if top_K is not None:
        warpers.append(TopKLogitsWarper(top_K, min_tokens_to_keep=3))

    processors = []
    if repetition_penalty is not None and repetition_penalty != 1:
        processors.append(
            CustomRepetitionPenaltyLogitsProcessorRepeat(repetition_penalty, len(tokenizer), 16)
        )

    prompt_emb = gpt.get_emb(**inputs)
    stop_id = torch.tensor(tokenizer.convert_tokens_to_ids('[Ebreak]'), device=device)[None]

    return gpt.generate(
        prompt_emb,
        inputs['input_ids'],
        temperature=torch.tensor([temperature,], device=device),
        attention_mask=inputs['attention_mask'],
        LogitsWarpers=warpers,
        LogitsProcessors=processors,
        eos_token=stop_id,
        max_new_token=max_new_token,
        infer_text=True,
        **kwargs
    )
File without changes
@@ -0,0 +1,155 @@
1
+ import math
2
+ from einops import rearrange
3
+ from vector_quantize_pytorch import GroupedResidualFSQ
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
class ConvNeXtBlock(nn.Module):
    """Residual 1-D ConvNeXt block (copied from Vocos, per the original note).

    The block applies a depthwise convolution, layer normalisation, a
    two-layer pointwise MLP with GELU, an optional learnable per-channel
    scale, and finally adds the input back.

    Args:
        dim: Number of input/output channels.
        intermediate_dim: Width of the hidden pointwise layer.
        kernel: Kernel size of the depthwise convolution.
        dilation: Dilation of the depthwise convolution; padding is
            ``dilation * (kernel // 2)``.
        layer_scale_init_value: Initial value of the per-channel scale
            ``gamma``. A value ``<= 0`` disables the scale entirely.
    """

    def __init__(
        self,
        dim: int,
        intermediate_dim: int,
        kernel, dilation,
        layer_scale_init_value: float = 1e-6,
    ):
        super().__init__()
        # Depthwise convolution: one filter group per channel.
        self.dwconv = nn.Conv1d(
            dim,
            dim,
            kernel_size=kernel,
            padding=dilation * (kernel // 2),
            dilation=dilation,
            groups=dim,
        )
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        # Pointwise (1x1) convolutions expressed as linear layers.
        self.pwconv1 = nn.Linear(dim, intermediate_dim)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(intermediate_dim, dim)
        if layer_scale_init_value > 0:
            self.gamma = nn.Parameter(
                layer_scale_init_value * torch.ones(dim), requires_grad=True
            )
        else:
            self.gamma = None

    def forward(self, x: torch.Tensor, cond=None) -> torch.Tensor:
        """Apply the block to ``x`` of layout (B, C, T); ``cond`` is unused."""
        # (B, C, T) -> (B, T, C) so LayerNorm/Linear act on the channel axis.
        branch = self.dwconv(x).transpose(1, 2)
        branch = self.pwconv2(self.act(self.pwconv1(self.norm(branch))))
        if self.gamma is not None:
            branch = self.gamma * branch
        # Back to (B, C, T) before the residual connection.
        return x + branch.transpose(1, 2)
48
+
49
+
50
+
51
class GFSQ(nn.Module):
    """Thin wrapper around ``GroupedResidualFSQ``.

    Args:
        dim: Feature dimension passed to the quantizer.
        levels: FSQ levels; their product is the number of indices per
            codebook (``n_ind``).
        G: Number of groups.
        R: Number of residual quantizers per group.
        eps: Small constant used when normalising and taking logs.
        transpose: When True, axes 1 and 2 of inputs and outputs are swapped
            around the quantizer call.
    """

    def __init__(self, dim, levels, G, R, eps=1e-5, transpose=True):
        super().__init__()
        self.quantizer = GroupedResidualFSQ(
            dim=dim,
            levels=levels,
            num_quantizers=R,
            groups=G,
        )
        self.n_ind = math.prod(levels)
        self.eps = eps
        self.transpose = transpose
        self.G = G
        self.R = R

    def _embed(self, x):
        """Turn code indices back into features via the quantizer."""
        indices = x.transpose(1, 2) if self.transpose else x
        # Split the flat (g r) index axis and move the group axis first.
        indices = rearrange(
            indices, "b t (g r) -> g b t r", g = self.G, r = self.R,
        )
        feat = self.quantizer.get_output_from_indices(indices)
        if self.transpose:
            return feat.transpose(1, 2)
        return feat

    def forward(self, x,):
        """Quantize ``x``.

        Returns:
            A 5-tuple ``(zeros, feat, perplexity, None, ind)`` where
            ``zeros`` is an all-zero tensor shaped like ``perplexity``,
            ``feat`` is the quantized feature and ``ind`` the code indices
            (both with axes 1 and 2 swapped back when ``transpose`` is set).
        """
        if self.transpose:
            x = x.transpose(1, 2)

        feat, ind = self.quantizer(x)
        ind = rearrange(
            ind, "g b t r ->b t (g r)",
        )

        # Index usage statistics per (g r) slot, averaged over batch and time.
        usage = F.one_hot(ind.long(), self.n_ind).to(x.dtype)
        usage = torch.mean(usage, dim=[0, 1])
        usage = usage / (usage.sum(dim=1) + self.eps).unsqueeze(1)
        entropy = -torch.sum(usage * torch.log(usage + self.eps), dim=1)
        perplexity = torch.exp(entropy)

        if self.transpose:
            feat_out = feat.transpose(1, 2)
            ind_out = ind.transpose(1, 2)
        else:
            feat_out = feat
            ind_out = ind

        placeholder = torch.zeros(perplexity.shape, dtype=x.dtype, device=x.device)
        return (placeholder, feat_out, perplexity, None, ind_out)
97
+
98
class DVAEDecoder(nn.Module):
    """Convolutional decoder built from a stack of ``ConvNeXtBlock`` layers.

    Args:
        idim: Number of input channels.
        odim: Number of output channels.
        n_layer: Number of ConvNeXt blocks.
        bn_dim: Channel width between the two input convolutions.
        hidden: Channel width used by the ConvNeXt blocks.
        kernel: Kernel size forwarded to every block.
        dilation: Dilation forwarded to every block.
        up: Stored on the instance as ``self.up``; not otherwise used here.
    """

    def __init__(
        self, idim, odim,
        n_layer=12, bn_dim=64, hidden=256,
        kernel=7, dilation=2, up=False,
    ):
        super().__init__()
        self.up = up

        # Two kernel-3 convolutions (stride 1, padding 1) with a GELU between.
        self.conv_in = nn.Sequential(
            nn.Conv1d(idim, bn_dim, kernel_size=3, stride=1, padding=1),
            nn.GELU(),
            nn.Conv1d(bn_dim, hidden, kernel_size=3, stride=1, padding=1),
        )

        self.decoder_block = nn.ModuleList()
        for _ in range(n_layer):
            self.decoder_block.append(
                ConvNeXtBlock(hidden, hidden * 4, kernel, dilation)
            )

        self.conv_out = nn.Conv1d(hidden, odim, kernel_size=1, bias=False)

    def forward(self, input, conditioning=None):
        """Decode ``input`` given as (B, T, C) and return (B, T, odim)."""
        # Convolutions expect channels on axis 1.
        hidden_states = self.conv_in(input.transpose(1, 2))
        for block in self.decoder_block:
            hidden_states = block(hidden_states, conditioning)
        return self.conv_out(hidden_states).transpose(1, 2)
123
+
124
+
125
class DVAE(nn.Module):
    """Decoder-only DVAE producing a 100-channel output from code features.

    Args:
        decoder_config: Keyword arguments for ``DVAEDecoder``.
        vq_config: Keyword arguments for ``GFSQ``; ``None`` means the input
            is already a feature tensor and no quantizer is built.
        dim: Number of channels the decoder emits, i.e. the input width of
            the final convolution.
    """

    def __init__(self, decoder_config, vq_config, dim=512):
        super().__init__()
        # Per-channel output scale; randomly initialised here.
        # NOTE(review): presumably overwritten by a checkpoint - confirm.
        self.register_buffer('coef', torch.randn(1, 100, 1))

        self.decoder = DVAEDecoder(**decoder_config)
        self.out_conv = nn.Conv1d(dim, 100, 3, 1, 1, bias=False)
        self.vq_layer = GFSQ(**vq_config) if vq_config is not None else None

    def forward(self, inp):
        """Decode ``inp`` (code indices, or features when no VQ layer exists)."""
        if self.vq_layer is None:
            vq_feats = inp.detach().clone()
        else:
            vq_feats = self.vq_layer._embed(inp)

        # "Flatten trick": split axis 1 into two halves, stack them on a new
        # trailing axis, then merge the last two axes into one.
        halves = torch.stack(torch.chunk(vq_feats, 2, dim=1), -1)
        vq_feats = halves.reshape(*halves.shape[:2], -1)

        decoded = self.decoder(input=vq_feats.transpose(1, 2))
        projected = self.out_conv(decoded.transpose(1, 2))
        return projected * self.coef
@@ -0,0 +1,265 @@
1
+ import os
2
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
3
+
4
+ import logging
5
+ from tqdm import tqdm
6
+ from einops import rearrange
7
+ from transformers.cache_utils import Cache
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import torch.nn.utils.parametrize as P
13
+ from torch.nn.utils.parametrizations import weight_norm
14
+ from transformers import LlamaModel, LlamaConfig
15
+
16
+
17
class LlamaMLP(nn.Module):
    """Gated feed-forward layer: ``down(silu(gate(x)) * up(x))``.

    Args:
        hidden_size: Input and output feature size.
        intermediate_size: Width of the gated hidden projection.
    """

    def __init__(self, hidden_size, intermediate_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        # All three projections are bias-free.
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.act_fn = F.silu

    def forward(self, x):
        """Return the gated projection of ``x``."""
        gate = self.act_fn(self.gate_proj(x))
        return self.down_proj(gate * self.up_proj(x))
30
+
31
+
32
+ class GPT_warpper(nn.Module):
33
+ def __init__(
34
+ self,
35
+ gpt_config,
36
+ num_audio_tokens,
37
+ num_text_tokens,
38
+ num_vq=4,
39
+ **kwargs,
40
+ ):
41
+ super().__init__()
42
+
43
+ self.logger = logging.getLogger(__name__)
44
+ self.gpt = self.build_model(gpt_config)
45
+ self.model_dim = self.gpt.config.hidden_size
46
+
47
+ self.num_vq = num_vq
48
+ self.emb_code = nn.ModuleList([nn.Embedding(num_audio_tokens, self.model_dim) for i in range(self.num_vq)])
49
+ self.emb_text = nn.Embedding(num_text_tokens, self.model_dim)
50
+ self.head_text = weight_norm(nn.Linear(self.model_dim, num_text_tokens, bias=False), name='weight')
51
+ self.head_code = nn.ModuleList([weight_norm(nn.Linear(self.model_dim, num_audio_tokens, bias=False), name='weight') for i in range(self.num_vq)])
52
+
53
+ def build_model(self, config):
54
+
55
+ configuration = LlamaConfig(**config)
56
+ model = LlamaModel(configuration)
57
+ del model.embed_tokens
58
+
59
+ return model
60
+
61
+ def get_emb(self, input_ids, text_mask, **kwargs):
62
+
63
+ emb_text = self.emb_text(input_ids[text_mask][:, 0])
64
+
65
+ emb_code = [self.emb_code[i](input_ids[~text_mask][:, i]) for i in range(self.num_vq)]
66
+ emb_code = torch.stack(emb_code, 2).sum(2)
67
+
68
+ emb = torch.zeros((input_ids.shape[:-1])+(emb_text.shape[-1],), device=emb_text.device, dtype=emb_text.dtype)
69
+ emb[text_mask] = emb_text
70
+ emb[~text_mask] = emb_code.to(emb.dtype)
71
+
72
+ return emb
73
+
74
+ def prepare_inputs_for_generation(
75
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs
76
+ ):
77
+ # With static cache, the `past_key_values` is None
78
+ # TODO joao: standardize interface for the different Cache classes and remove of this if
79
+ has_static_cache = False
80
+ if past_key_values is None:
81
+ past_key_values = getattr(self.gpt.layers[0].self_attn, "past_key_value", None)
82
+ has_static_cache = past_key_values is not None
83
+
84
+ past_length = 0
85
+ if past_key_values is not None:
86
+ if isinstance(past_key_values, Cache):
87
+ past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length()
88
+ max_cache_length = (
89
+ torch.tensor(past_key_values.get_max_length(), device=input_ids.device)
90
+ if past_key_values.get_max_length() is not None
91
+ else None
92
+ )
93
+ cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length)
94
+ # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects
95
+ else:
96
+ cache_length = past_length = past_key_values[0][0].shape[2]
97
+ max_cache_length = None
98
+
99
+ # Keep only the unprocessed tokens:
100
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
101
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
102
+ # input)
103
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
104
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
105
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
106
+ # input_ids based on the past_length.
107
+ elif past_length < input_ids.shape[1]:
108
+ input_ids = input_ids[:, past_length:]
109
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
110
+
111
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
112
+ if (
113
+ max_cache_length is not None
114
+ and attention_mask is not None
115
+ and cache_length + input_ids.shape[1] > max_cache_length
116
+ ):
117
+ attention_mask = attention_mask[:, -max_cache_length:]
118
+
119
+ position_ids = kwargs.get("position_ids", None)
120
+ if attention_mask is not None and position_ids is None:
121
+ # create position_ids on the fly for batch generation
122
+ position_ids = attention_mask.long().cumsum(-1) - 1
123
+ position_ids.masked_fill_(attention_mask == 0, 1)
124
+ if past_key_values:
125
+ position_ids = position_ids[:, -input_ids.shape[1] :]
126
+
127
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
128
+ if inputs_embeds is not None and past_key_values is None:
129
+ model_inputs = {"inputs_embeds": inputs_embeds}
130
+ else:
131
+ # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
132
+ # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114
133
+ # TODO: use `next_tokens` directly instead.
134
+ model_inputs = {"input_ids": input_ids.contiguous()}
135
+
136
+ input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
137
+ if cache_position is None:
138
+ cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device)
139
+ else:
140
+ cache_position = cache_position[-input_length:]
141
+
142
+ if has_static_cache:
143
+ past_key_values = None
144
+
145
+ model_inputs.update(
146
+ {
147
+ "position_ids": position_ids,
148
+ "cache_position": cache_position,
149
+ "past_key_values": past_key_values,
150
+ "use_cache": kwargs.get("use_cache"),
151
+ "attention_mask": attention_mask,
152
+ }
153
+ )
154
+ return model_inputs
155
+
156
+ def generate(
157
+ self,
158
+ emb,
159
+ inputs_ids,
160
+ temperature,
161
+ eos_token,
162
+ attention_mask = None,
163
+ max_new_token = 2048,
164
+ min_new_token = 0,
165
+ LogitsWarpers = [],
166
+ LogitsProcessors = [],
167
+ infer_text=False,
168
+ return_attn=False,
169
+ return_hidden=False,
170
+ ):
171
+
172
+ with torch.no_grad():
173
+
174
+ attentions = []
175
+ hiddens = []
176
+
177
+ start_idx, end_idx = inputs_ids.shape[1], torch.zeros(inputs_ids.shape[0], device=inputs_ids.device, dtype=torch.long)
178
+ finish = torch.zeros(inputs_ids.shape[0], device=inputs_ids.device).bool()
179
+
180
+ temperature = temperature[None].expand(inputs_ids.shape[0], -1)
181
+ temperature = rearrange(temperature, "b n -> (b n) 1")
182
+
183
+ attention_mask_cache = torch.ones((inputs_ids.shape[0], inputs_ids.shape[1]+max_new_token,), dtype=torch.bool, device=inputs_ids.device)
184
+ if attention_mask is not None:
185
+ attention_mask_cache[:, :attention_mask.shape[1]] = attention_mask
186
+
187
+ for i in tqdm(range(max_new_token)):
188
+
189
+ model_input = self.prepare_inputs_for_generation(inputs_ids,
190
+ outputs.past_key_values if i!=0 else None,
191
+ attention_mask_cache[:, :inputs_ids.shape[1]], use_cache=True)
192
+
193
+ if i == 0:
194
+ model_input['inputs_embeds'] = emb
195
+ else:
196
+ if infer_text:
197
+ model_input['inputs_embeds'] = self.emb_text(model_input['input_ids'][:,:,0])
198
+ else:
199
+ code_emb = [self.emb_code[i](model_input['input_ids'][:,:,i]) for i in range(self.num_vq)]
200
+ model_input['inputs_embeds'] = torch.stack(code_emb, 3).sum(3)
201
+
202
+ model_input['input_ids'] = None
203
+ outputs = self.gpt.forward(**model_input, output_attentions=return_attn)
204
+ attentions.append(outputs.attentions)
205
+ hidden_states = outputs[0] # 🐻
206
+ if return_hidden:
207
+ hiddens.append(hidden_states[:, -1])
208
+
209
+ with P.cached():
210
+ if infer_text:
211
+ logits = self.head_text(hidden_states)
212
+ else:
213
+ logits = torch.stack([self.head_code[i](hidden_states) for i in range(self.num_vq)], 3)
214
+
215
+ logits = logits[:, -1].float()
216
+
217
+ if not infer_text:
218
+ logits = rearrange(logits, "b c n -> (b n) c")
219
+ logits_token = rearrange(inputs_ids[:, start_idx:], "b c n -> (b n) c")
220
+ else:
221
+ logits_token = inputs_ids[:, start_idx:, 0]
222
+
223
+ logits = logits / temperature
224
+
225
+ for logitsProcessors in LogitsProcessors:
226
+ logits = logitsProcessors(logits_token, logits)
227
+
228
+ for logitsWarpers in LogitsWarpers:
229
+ logits = logitsWarpers(logits_token, logits)
230
+
231
+ if i < min_new_token:
232
+ logits[:, eos_token] = -torch.inf
233
+
234
+ scores = F.softmax(logits, dim=-1)
235
+
236
+ idx_next = torch.multinomial(scores, num_samples=1)
237
+
238
+ if not infer_text:
239
+ idx_next = rearrange(idx_next, "(b n) 1 -> b n", n=self.num_vq)
240
+ finish = finish | (idx_next == eos_token).any(1)
241
+ inputs_ids = torch.cat([inputs_ids, idx_next.unsqueeze(1)], 1)
242
+ else:
243
+ finish = finish | (idx_next == eos_token).any(1)
244
+ inputs_ids = torch.cat([inputs_ids, idx_next.unsqueeze(-1).expand(-1, -1, self.num_vq)], 1)
245
+
246
+ end_idx = end_idx + (~finish).int()
247
+
248
+ if finish.all():
249
+ break
250
+
251
+ inputs_ids = [inputs_ids[idx, start_idx: start_idx+i] for idx, i in enumerate(end_idx.int())]
252
+ inputs_ids = [i[:, 0] for i in inputs_ids] if infer_text else inputs_ids
253
+
254
+ if return_hidden:
255
+ hiddens = torch.stack(hiddens, 1)
256
+ hiddens = [hiddens[idx, :i] for idx, i in enumerate(end_idx.int())]
257
+
258
+ if not finish.all():
259
+ self.logger.warn(f'Incomplete result. hit max_new_token: {max_new_token}')
260
+
261
+ return {
262
+ 'ids': inputs_ids,
263
+ 'attentions': attentions,
264
+ 'hiddens':hiddens,
265
+ }
File without changes
@@ -0,0 +1,23 @@
1
+
2
+ import torch
3
+ import logging
4
+
5
def select_device(min_memory = 2048):
    """Pick the CUDA device with the most unreserved memory, else the CPU.

    For every visible CUDA device the function computes
    ``total_memory - torch.cuda.memory_reserved(i)`` and selects the device
    with the largest value (the first one on ties).

    Args:
        min_memory: Minimum amount, in MB, the selected GPU must have left;
            below this threshold the CPU is returned instead.

    Returns:
        ``torch.device``: ``cuda:<index>`` of the selected GPU, or ``cpu``
        when no GPU is available or the best GPU has too little memory.
    """
    logger = logging.getLogger(__name__)

    if not torch.cuda.is_available():
        logger.warning(f'No GPU found, use CPU instead')
        return torch.device('cpu')

    def _unreserved_bytes(index):
        total = torch.cuda.get_device_properties(index).total_memory
        return total - torch.cuda.memory_reserved(index)

    candidates = [
        (index, _unreserved_bytes(index))
        for index in range(torch.cuda.device_count())
    ]
    selected_gpu, max_free_memory = max(candidates, key=lambda pair: pair[1])

    free_memory_mb = max_free_memory / (1024 * 1024)
    if free_memory_mb < min_memory:
        # Not enough room on the best GPU: warn and fall back to the CPU.
        logger.warning(f'GPU {selected_gpu} has {round(free_memory_mb, 2)} MB memory left.')
        return torch.device('cpu')

    return torch.device(f'cuda:{selected_gpu}')