xinference 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic. Click here for more details.

Files changed (124) hide show
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +4 -7
  3. xinference/client/handlers.py +3 -0
  4. xinference/core/chat_interface.py +6 -1
  5. xinference/core/model.py +2 -0
  6. xinference/core/scheduler.py +4 -7
  7. xinference/core/supervisor.py +114 -23
  8. xinference/core/worker.py +70 -4
  9. xinference/deploy/local.py +2 -1
  10. xinference/model/audio/core.py +11 -0
  11. xinference/model/audio/cosyvoice.py +16 -5
  12. xinference/model/audio/kokoro.py +139 -0
  13. xinference/model/audio/melotts.py +110 -0
  14. xinference/model/audio/model_spec.json +80 -0
  15. xinference/model/audio/model_spec_modelscope.json +18 -0
  16. xinference/model/audio/whisper.py +35 -10
  17. xinference/model/llm/llama_cpp/core.py +21 -14
  18. xinference/model/llm/llm_family.json +527 -1
  19. xinference/model/llm/llm_family.py +4 -1
  20. xinference/model/llm/llm_family_modelscope.json +495 -3
  21. xinference/model/llm/memory.py +1 -1
  22. xinference/model/llm/mlx/core.py +24 -6
  23. xinference/model/llm/transformers/core.py +9 -1
  24. xinference/model/llm/transformers/qwen2_audio.py +3 -1
  25. xinference/model/llm/transformers/qwen2_vl.py +20 -3
  26. xinference/model/llm/transformers/utils.py +22 -11
  27. xinference/model/llm/utils.py +115 -1
  28. xinference/model/llm/vllm/core.py +14 -4
  29. xinference/model/llm/vllm/xavier/block.py +3 -4
  30. xinference/model/llm/vllm/xavier/block_tracker.py +71 -58
  31. xinference/model/llm/vllm/xavier/collective.py +74 -0
  32. xinference/model/llm/vllm/xavier/collective_manager.py +147 -0
  33. xinference/model/llm/vllm/xavier/executor.py +18 -16
  34. xinference/model/llm/vllm/xavier/scheduler.py +79 -63
  35. xinference/model/llm/vllm/xavier/test/test_xavier.py +60 -35
  36. xinference/model/llm/vllm/xavier/transfer.py +53 -32
  37. xinference/thirdparty/cosyvoice/bin/spk2info.pt +0 -0
  38. xinference/thirdparty/melo/__init__.py +0 -0
  39. xinference/thirdparty/melo/api.py +135 -0
  40. xinference/thirdparty/melo/app.py +61 -0
  41. xinference/thirdparty/melo/attentions.py +459 -0
  42. xinference/thirdparty/melo/commons.py +160 -0
  43. xinference/thirdparty/melo/configs/config.json +94 -0
  44. xinference/thirdparty/melo/data/example/metadata.list +20 -0
  45. xinference/thirdparty/melo/data_utils.py +413 -0
  46. xinference/thirdparty/melo/download_utils.py +67 -0
  47. xinference/thirdparty/melo/infer.py +25 -0
  48. xinference/thirdparty/melo/init_downloads.py +14 -0
  49. xinference/thirdparty/melo/losses.py +58 -0
  50. xinference/thirdparty/melo/main.py +36 -0
  51. xinference/thirdparty/melo/mel_processing.py +174 -0
  52. xinference/thirdparty/melo/models.py +1030 -0
  53. xinference/thirdparty/melo/modules.py +598 -0
  54. xinference/thirdparty/melo/monotonic_align/__init__.py +16 -0
  55. xinference/thirdparty/melo/monotonic_align/core.py +46 -0
  56. xinference/thirdparty/melo/preprocess_text.py +135 -0
  57. xinference/thirdparty/melo/split_utils.py +174 -0
  58. xinference/thirdparty/melo/text/__init__.py +35 -0
  59. xinference/thirdparty/melo/text/chinese.py +199 -0
  60. xinference/thirdparty/melo/text/chinese_bert.py +107 -0
  61. xinference/thirdparty/melo/text/chinese_mix.py +253 -0
  62. xinference/thirdparty/melo/text/cleaner.py +36 -0
  63. xinference/thirdparty/melo/text/cleaner_multiling.py +110 -0
  64. xinference/thirdparty/melo/text/cmudict.rep +129530 -0
  65. xinference/thirdparty/melo/text/cmudict_cache.pickle +0 -0
  66. xinference/thirdparty/melo/text/english.py +284 -0
  67. xinference/thirdparty/melo/text/english_bert.py +39 -0
  68. xinference/thirdparty/melo/text/english_utils/__init__.py +0 -0
  69. xinference/thirdparty/melo/text/english_utils/abbreviations.py +35 -0
  70. xinference/thirdparty/melo/text/english_utils/number_norm.py +97 -0
  71. xinference/thirdparty/melo/text/english_utils/time_norm.py +47 -0
  72. xinference/thirdparty/melo/text/es_phonemizer/__init__.py +0 -0
  73. xinference/thirdparty/melo/text/es_phonemizer/base.py +140 -0
  74. xinference/thirdparty/melo/text/es_phonemizer/cleaner.py +109 -0
  75. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.json +79 -0
  76. xinference/thirdparty/melo/text/es_phonemizer/es_symbols.txt +1 -0
  77. xinference/thirdparty/melo/text/es_phonemizer/es_symbols_v2.json +83 -0
  78. xinference/thirdparty/melo/text/es_phonemizer/es_to_ipa.py +12 -0
  79. xinference/thirdparty/melo/text/es_phonemizer/example_ipa.txt +400 -0
  80. xinference/thirdparty/melo/text/es_phonemizer/gruut_wrapper.py +253 -0
  81. xinference/thirdparty/melo/text/es_phonemizer/punctuation.py +174 -0
  82. xinference/thirdparty/melo/text/es_phonemizer/spanish_symbols.txt +1 -0
  83. xinference/thirdparty/melo/text/es_phonemizer/test.ipynb +124 -0
  84. xinference/thirdparty/melo/text/fr_phonemizer/__init__.py +0 -0
  85. xinference/thirdparty/melo/text/fr_phonemizer/base.py +140 -0
  86. xinference/thirdparty/melo/text/fr_phonemizer/cleaner.py +122 -0
  87. xinference/thirdparty/melo/text/fr_phonemizer/en_symbols.json +78 -0
  88. xinference/thirdparty/melo/text/fr_phonemizer/example_ipa.txt +1 -0
  89. xinference/thirdparty/melo/text/fr_phonemizer/fr_symbols.json +89 -0
  90. xinference/thirdparty/melo/text/fr_phonemizer/fr_to_ipa.py +30 -0
  91. xinference/thirdparty/melo/text/fr_phonemizer/french_abbreviations.py +48 -0
  92. xinference/thirdparty/melo/text/fr_phonemizer/french_symbols.txt +1 -0
  93. xinference/thirdparty/melo/text/fr_phonemizer/gruut_wrapper.py +258 -0
  94. xinference/thirdparty/melo/text/fr_phonemizer/punctuation.py +172 -0
  95. xinference/thirdparty/melo/text/french.py +94 -0
  96. xinference/thirdparty/melo/text/french_bert.py +39 -0
  97. xinference/thirdparty/melo/text/japanese.py +647 -0
  98. xinference/thirdparty/melo/text/japanese_bert.py +49 -0
  99. xinference/thirdparty/melo/text/ko_dictionary.py +44 -0
  100. xinference/thirdparty/melo/text/korean.py +192 -0
  101. xinference/thirdparty/melo/text/opencpop-strict.txt +429 -0
  102. xinference/thirdparty/melo/text/spanish.py +122 -0
  103. xinference/thirdparty/melo/text/spanish_bert.py +39 -0
  104. xinference/thirdparty/melo/text/symbols.py +290 -0
  105. xinference/thirdparty/melo/text/tone_sandhi.py +769 -0
  106. xinference/thirdparty/melo/train.py +635 -0
  107. xinference/thirdparty/melo/train.sh +19 -0
  108. xinference/thirdparty/melo/transforms.py +209 -0
  109. xinference/thirdparty/melo/utils.py +424 -0
  110. xinference/types.py +2 -0
  111. xinference/web/ui/build/asset-manifest.json +3 -3
  112. xinference/web/ui/build/index.html +1 -1
  113. xinference/web/ui/build/static/js/{main.1eb206d1.js → main.b0936c54.js} +3 -3
  114. xinference/web/ui/build/static/js/main.b0936c54.js.map +1 -0
  115. xinference/web/ui/node_modules/.cache/babel-loader/a3ff866acddf34917a7ee399e0e571a4dfd8ba66d5057db885f243e16a6eb17d.json +1 -0
  116. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/METADATA +37 -27
  117. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/RECORD +122 -45
  118. xinference/web/ui/build/static/js/main.1eb206d1.js.map +0 -1
  119. xinference/web/ui/node_modules/.cache/babel-loader/2213d49de260e1f67c888081b18f120f5225462b829ae57c9e05a05cec83689d.json +0 -1
  120. /xinference/web/ui/build/static/js/{main.1eb206d1.js.LICENSE.txt → main.b0936c54.js.LICENSE.txt} +0 -0
  121. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/LICENSE +0 -0
  122. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/WHEEL +0 -0
  123. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/entry_points.txt +0 -0
  124. {xinference-1.2.0.dist-info → xinference-1.2.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,769 @@
1
+ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import List
15
+ from typing import Tuple
16
+
17
+ import jieba
18
+ from pypinyin import lazy_pinyin
19
+ from pypinyin import Style
20
+
21
+
22
class ToneSandhi:
    """Rule-based Mandarin tone sandhi applied to pinyin finals.

    Finals are strings whose last character is the tone digit ("1".."5"),
    e.g. ``"ia1"``; tone "5" denotes the neutral tone.  The class offers two
    entry points:

    * ``pre_merge_for_modify`` re-groups a ``(word, pos)`` segmentation so that
      the sandhi rules see the right word boundaries.
    * ``modified_tone`` rewrites the tone digits of one word's finals.
    """

    def __init__(self):
        # Words whose last syllable is conventionally read with neutral tone.
        self.must_neural_tone_words = {
            "麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", "馄饨",
            "风筝", "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", "铃铛", "铁匠",
            "钥匙", "里脊", "里头", "部分", "那么", "道士", "造化", "迷糊", "连累", "这么",
            "这个", "运气", "过去", "软和", "转悠", "踏实", "跳蚤", "跟头", "趔趄", "财主",
            "豆腐", "讲究", "记性", "记号", "认识", "规矩", "见识", "裁缝", "补丁", "衣裳",
            "衣服", "衙门", "街坊", "行李", "行当", "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄",
            "萝卜", "荸荠", "苗条", "苗头", "苍蝇", "芝麻", "舒服", "舒坦", "舌头", "自在",
            "膏药", "脾气", "脑袋", "脊梁", "能耐", "胳膊", "胭脂", "胡萝", "胡琴", "胡同",
            "聪明", "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆", "老头", "老太",
            "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂", "精神",
            "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", "笑话", "窟窿",
            "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", "秀才", "福气", "祖宗",
            "砚台", "码头", "石榴", "石头", "石匠", "知识", "眼睛", "眯缝", "眨巴", "眉毛",
            "相声", "盘算", "白净", "痢疾", "痛快", "疟疾", "疙瘩", "疏忽", "畜生", "生意",
            "甘蔗", "琵琶", "琢磨", "琉璃", "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务",
            "牲口", "牙碜", "牌楼", "爽快", "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心",
            "炊帚", "灯笼", "火候", "漂亮", "滑溜", "溜达", "温和", "清楚", "消息", "浪头",
            "活泼", "比方", "正经", "欺负", "模糊", "槟榔", "棺材", "棒槌", "棉花", "核桃",
            "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事", "木头", "木匠", "朋友",
            "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾", "收成", "提防",
            "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", "招牌", "招呼", "抬举",
            "护士", "折腾", "扫帚", "打量", "打算", "打点", "打扮", "打听", "打发", "扎实",
            "扁担", "戒指", "懒得", "意识", "意思", "情形", "悟性", "怪物", "思量", "怎么",
            "念头", "念叨", "快活", "忙活", "志气", "心思", "得罪", "张罗", "弟兄", "开通",
            "应酬", "庄稼", "干事", "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌",
            "差事", "工夫", "岁数", "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头",
            "对付", "寡妇", "家伙", "客气", "实在", "官司", "学问", "学生", "字号", "嫁妆",
            "媳妇", "媒人", "婆家", "娘家", "委屈", "姑娘", "姐夫", "妯娌", "妥当", "妖精",
            "奴才", "女婿", "头发", "太阳", "大爷", "大方", "大意", "大夫", "多少", "多么",
            "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴", "嘱咐", "嘟囔", "嘀咕",
            "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", "哈欠", "哆嗦", "咳嗽", "和尚",
            "告诉", "告示", "含糊", "吓唬", "后头", "名字", "名堂", "合同", "吆喝", "叫唤",
            "口袋", "厚道", "厉害", "千斤", "包袱", "包涵", "匀称", "勤快", "动静", "动弹",
            "功夫", "力气", "前头", "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析",
            "出息", "凑合", "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟",
            "便宜", "使唤", "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么",
            "人家", "亲戚", "亲家", "交情", "云彩", "事情", "买卖", "主意", "丫头", "丧气",
            "两口", "东西", "东家", "世故", "不由", "不在", "下水", "下巴", "上头", "上司",
            "丈夫", "丈人", "一辈", "那个", "菩萨", "父亲", "母亲", "咕噜", "邋遢", "费用",
            "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅", "幸福", "熟悉", "计划", "扑腾",
            "蜡烛", "姥爷", "照顾", "喉咙", "吉他", "弄堂", "蚂蚱", "凤凰", "拖沓", "寒碜",
            "糟蹋", "倒腾", "报复", "逻辑", "盘缠", "喽啰", "牢骚", "咖喱", "扫把", "惦记",
        }
        # Words that look like neutral-tone candidates (reduplication or a
        # trailing "子") but must keep their lexical tone.
        self.must_not_neural_tone_words = {
            "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人",
            "虎虎",
        }
        # Punctuation after which "一" keeps its first tone (see _yi_sandhi).
        self.punc = ":,;。?!“”‘’':,;.?!"

    @staticmethod
    def _with_tone(final: str, tone: str) -> str:
        """Return *final* with its last character replaced by *tone*."""
        return final[:-1] + tone

    # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
    # e.g.
    # word: "家里"
    # pos: "s"
    # finals: ['ia1', 'i3']
    def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
        """Apply neutral-tone (tone 5) rules to *finals* of *word*.

        Args:
            word: the word being processed.
            pos: jieba part-of-speech tag of *word*.
            finals: one final per character of *word*; modified in place.

        Returns:
            The finals with neutral tones applied (a new list built from the
            two halves produced by ``_split_word``).
        """
        # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
        # pos[:1] (instead of pos[0]) keeps an empty tag from raising.
        may_reduplicate = (
            pos[:1] in {"n", "v", "a"}
            and word not in self.must_not_neural_tone_words
        )
        if may_reduplicate:
            for j, item in enumerate(word):
                if j >= 1 and item == word[j - 1]:
                    finals[j] = self._with_tone(finals[j], "5")

        ge_idx = word.find("个")
        # The `word and` guards matter: "" is "in" every string.
        if word and (
            word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶"
            or word[-1] in "的地得"
        ):
            # sentence-final particles and structural particles
            finals[-1] = self._with_tone(finals[-1], "5")
        # NOTE: a rule for the aspect particles 了/着/过 (e.g. 走了, 看着, 去过)
        # existed here only as commented-out code and is not applied.
        elif (
            len(word) > 1
            and word[-1] in "们子"
            and pos in {"r", "n"}
            and word not in self.must_not_neural_tone_words
        ):
            finals[-1] = self._with_tone(finals[-1], "5")
        # e.g. 桌上, 地下, 家里
        elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
            finals[-1] = self._with_tone(finals[-1], "5")
        # e.g. 上来, 下去
        elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
            finals[-1] = self._with_tone(finals[-1], "5")
        # "个" used as a measure word
        elif (
            ge_idx >= 1
            and (
                word[ge_idx - 1].isnumeric()
                or word[ge_idx - 1] in "几有两半多各整每做是"
            )
        ) or word == "个":
            finals[ge_idx] = self._with_tone(finals[ge_idx], "5")
        elif (
            word in self.must_neural_tone_words
            or word[-2:] in self.must_neural_tone_words
        ):
            finals[-1] = self._with_tone(finals[-1], "5")

        # Re-check each half of the word against the lexicon.
        word_list = self._split_word(word)
        head_len = len(word_list[0])
        finals_list = [finals[:head_len], finals[head_len:]]
        for i, sub_word in enumerate(word_list):
            # conventional neural in Chinese
            if finals_list[i] and (
                sub_word in self.must_neural_tone_words
                or sub_word[-2:] in self.must_neural_tone_words
            ):
                finals_list[i][-1] = self._with_tone(finals_list[i][-1], "5")
        return finals_list[0] + finals_list[1]

    def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
        """Apply the tone rules for "不"; *finals* is modified in place."""
        # e.g. 看不懂
        if len(word) == 3 and word[1] == "不":
            finals[1] = self._with_tone(finals[1], "5")
            return finals
        for i, char in enumerate(word):
            # "不" before tone4 should be bu2, e.g. 不怕
            if char == "不" and i + 1 < len(word) and finals[i + 1].endswith("4"):
                finals[i] = self._with_tone(finals[i], "2")
        return finals

    def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
        """Apply the tone rules for "一"; *finals* is modified in place."""
        # "一" in number sequences, e.g. 一零零, 二一零
        if "一" in word and all(item.isnumeric() for item in word if item != "一"):
            return finals
        # "一" between reduplication words should be yi5, e.g. 看一看
        if len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
            finals[1] = self._with_tone(finals[1], "5")
        # when "一" is ordinal word, it should be yi1
        elif word.startswith("第一"):
            finals[1] = self._with_tone(finals[1], "1")
        else:
            for i, char in enumerate(word):
                if char != "一" or i + 1 >= len(word):
                    continue
                # "一" before tone4 should be yi2, e.g. 一段
                if finals[i + 1].endswith("4"):
                    finals[i] = self._with_tone(finals[i], "2")
                # "一" before non-tone4 should be yi4, e.g. 一天;
                # if "一" is followed by punctuation it keeps its first tone
                elif word[i + 1] not in self.punc:
                    finals[i] = self._with_tone(finals[i], "4")
        return finals

    def _split_word(self, word: str) -> List[str]:
        """Split *word* in two around its shortest jieba search token.

        Returns:
            ``[first_part, second_part]``; one part is the shortest token
            reported by ``jieba.cut_for_search`` and the other is the rest of
            *word* (possibly empty).
        """
        tokens = sorted(jieba.cut_for_search(word), key=len)
        if not tokens:
            # Nothing to split (e.g. empty input): avoid IndexError below.
            return [word, ""]
        shortest = tokens[0]
        if word.find(shortest) == 0:
            return [shortest, word[len(shortest):]]
        # Otherwise the shortest token is treated as the suffix of the word.
        return [word[: -len(shortest)], shortest]

    def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
        """Apply third-tone sandhi (3 + 3 -> 2 + 3) for words of length 2-4."""
        if len(word) == 2 and self._all_tone_three(finals):
            finals[0] = self._with_tone(finals[0], "2")
        elif len(word) == 3:
            word_list = self._split_word(word)
            head_len = len(word_list[0])
            if self._all_tone_three(finals):
                # disyllabic + monosyllabic, e.g. 蒙古/包
                if head_len == 2:
                    finals[0] = self._with_tone(finals[0], "2")
                    finals[1] = self._with_tone(finals[1], "2")
                # monosyllabic + disyllabic, e.g. 纸/老虎
                elif head_len == 1:
                    finals[1] = self._with_tone(finals[1], "2")
            else:
                head, tail = finals[:head_len], finals[head_len:]
                # e.g. 所有/人
                if len(head) == 2 and self._all_tone_three(head):
                    head[0] = self._with_tone(head[0], "2")
                if len(tail) == 2 and self._all_tone_three(tail):
                    tail[0] = self._with_tone(tail[0], "2")
                # e.g. 好/喜欢
                elif (
                    head
                    and tail
                    and not self._all_tone_three(tail)
                    and tail[0].endswith("3")
                    and head[-1].endswith("3")
                ):
                    head[-1] = self._with_tone(head[-1], "2")
                finals = head + tail
        # split a four-character idiom into two words of length 2
        elif len(word) == 4:
            halves = [finals[:2], finals[2:]]
            finals = []
            for half in halves:
                if half and self._all_tone_three(half):
                    half[0] = self._with_tone(half[0], "2")
                finals += half

        return finals

    def _all_tone_three(self, finals: List[str]) -> bool:
        """Return True when every final ends with tone digit "3".

        An empty final string counts as "not tone three" instead of raising.
        """
        return all(x.endswith("3") for x in finals)

    # merge "不" and the word behind it
    # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
    def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Glue a standalone "不" onto the following word.

        The merged word keeps the part-of-speech of the following word; a
        trailing "不" with nothing after it is emitted with tag "d".
        """
        merged = []
        previous = ""
        for word, pos in seg:
            if previous == "不":
                word = previous + word
            if word != "不":
                merged.append((word, pos))
            previous = word
        if previous == "不":
            merged.append((previous, "d"))
        return merged

    # function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听"
    # function 2: merge single "一" and the word behind it
    # if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error
    # e.g.
    # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
    # output seg: [['听一听', 'v']]
    def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Merge "X 一 X" verb reduplications and standalone "一" prefixes.

        Returns:
            A list of two-element ``[word, pos]`` lists.
        """
        # function 1
        merged = []
        skip_idx = -1  # index of the trailing "X" already consumed by a merge
        for i, (word, pos) in enumerate(seg):
            if i == skip_idx:
                continue
            if (
                word == "一"
                and 0 < i < len(seg) - 1
                and seg[i - 1][0] == seg[i + 1][0]
                and seg[i - 1][1] == "v"
                # The previous output entry must still be the bare verb.
                # Indexing the output with the *input* index (as was done
                # before) breaks once an earlier merge has shortened it.
                and merged
                and merged[-1][0] == seg[i - 1][0]
            ):
                merged[-1][0] = seg[i - 1][0] + "一" + seg[i + 1][0]
                skip_idx = i + 1
            else:
                merged.append([word, pos])

        # function 2
        result = []
        for word, pos in merged:
            if result and result[-1][0] == "一":
                result[-1][0] = result[-1][0] + word
            else:
                result.append([word, pos])
        return result

    def _both_all_tone_three(self, prev: List[str], cur: List[str]) -> bool:
        """True when every final of both words is tone three."""
        return self._all_tone_three(prev) and self._all_tone_three(cur)

    def _boundary_tone_three(self, prev: List[str], cur: List[str]) -> bool:
        """True when the finals touching the word boundary are both tone three."""
        return bool(prev) and bool(cur) and prev[-1].endswith("3") and cur[0].endswith("3")

    def _merge_adjacent_by_finals(self, seg, should_merge):
        """Merge a word into its predecessor when *should_merge* accepts them.

        *should_merge* receives the finals of the previous and current word.
        A word is never merged into a predecessor that was itself merged, nor
        into a reduplication (those need ``_neural_sandhi``), and the merged
        word may not exceed three characters.
        """
        sub_finals_list = [
            lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
            for (word, _pos) in seg
        ]
        merged = []
        was_merged = [False] * len(seg)
        for i, (word, pos) in enumerate(seg):
            if (
                i >= 1
                and should_merge(sub_finals_list[i - 1], sub_finals_list[i])
                and not was_merged[i - 1]
                and not self._is_reduplication(seg[i - 1][0])
                and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
            ):
                merged[-1][0] = merged[-1][0] + seg[i][0]
                was_merged[i] = True
            else:
                merged.append([word, pos])
        return merged

    # the first and the second words are all_tone_three
    def _merge_continuous_three_tones(
        self, seg: List[Tuple[str, str]]
    ) -> List[Tuple[str, str]]:
        """Merge neighbouring words whose finals are all tone three."""
        return self._merge_adjacent_by_finals(seg, self._both_all_tone_three)

    def _is_reduplication(self, word: str) -> bool:
        """Return True for a two-character word made of one repeated character."""
        return len(word) == 2 and word[0] == word[1]

    # the last char of first word and the first char of second word is tone_three
    def _merge_continuous_three_tones_2(
        self, seg: List[Tuple[str, str]]
    ) -> List[Tuple[str, str]]:
        """Merge neighbouring words that meet at two tone-three syllables."""
        return self._merge_adjacent_by_finals(seg, self._boundary_tone_three)

    def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Attach a standalone "儿" to the preceding word (unless it is "#")."""
        merged = []
        for i, (word, pos) in enumerate(seg):
            if i >= 1 and word == "儿" and seg[i - 1][0] != "#":
                merged[-1][0] = merged[-1][0] + word
            else:
                merged.append([word, pos])
        return merged

    def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Merge a word into the previous output entry when both are equal."""
        merged = []
        for word, pos in seg:
            if merged and word == merged[-1][0]:
                merged[-1][0] = merged[-1][0] + word
            else:
                merged.append([word, pos])
        return merged

    def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """Run all segment-merging passes, in order, before tone modification.

        Args:
            seg: ``(word, pos)`` pairs from the segmenter.

        Returns:
            The re-grouped segmentation as ``[word, pos]`` entries.
        """
        seg = self._merge_bu(seg)
        try:
            seg = self._merge_yi(seg)
        except Exception:
            # Best effort: keep the un-merged segmentation, but leave a
            # traceback instead of a bare print, and let KeyboardInterrupt /
            # SystemExit propagate.
            import logging

            logging.getLogger(__name__).exception("_merge_yi failed")
        seg = self._merge_reduplication(seg)
        seg = self._merge_continuous_three_tones(seg)
        seg = self._merge_continuous_three_tones_2(seg)
        seg = self._merge_er(seg)
        return seg

    def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
        """Apply the "不", "一", neutral-tone and third-tone rules, in that order.

        *finals* may be modified in place; use the returned list.
        """
        finals = self._bu_sandhi(word, finals)
        finals = self._yi_sandhi(word, finals)
        finals = self._neural_sandhi(word, pos, finals)
        finals = self._three_sandhi(word, finals)
        return finals