xinference 0.14.4.post1__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic. Click here for more details.
- xinference/_compat.py +51 -0
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +209 -40
- xinference/client/restful/restful_client.py +7 -26
- xinference/conftest.py +1 -1
- xinference/constants.py +5 -0
- xinference/core/cache_tracker.py +1 -1
- xinference/core/chat_interface.py +8 -14
- xinference/core/event.py +1 -1
- xinference/core/image_interface.py +28 -0
- xinference/core/model.py +110 -31
- xinference/core/scheduler.py +37 -37
- xinference/core/status_guard.py +1 -1
- xinference/core/supervisor.py +17 -10
- xinference/core/utils.py +80 -22
- xinference/core/worker.py +17 -16
- xinference/deploy/cmdline.py +8 -16
- xinference/deploy/local.py +1 -1
- xinference/deploy/supervisor.py +1 -1
- xinference/deploy/utils.py +1 -1
- xinference/deploy/worker.py +1 -1
- xinference/model/audio/cosyvoice.py +86 -41
- xinference/model/audio/fish_speech.py +9 -9
- xinference/model/audio/model_spec.json +9 -9
- xinference/model/audio/whisper.py +4 -1
- xinference/model/embedding/core.py +52 -31
- xinference/model/image/core.py +2 -1
- xinference/model/image/model_spec.json +16 -4
- xinference/model/image/model_spec_modelscope.json +16 -4
- xinference/model/image/sdapi.py +136 -0
- xinference/model/image/stable_diffusion/core.py +164 -19
- xinference/model/llm/__init__.py +29 -11
- xinference/model/llm/llama_cpp/core.py +16 -33
- xinference/model/llm/llm_family.json +1011 -1296
- xinference/model/llm/llm_family.py +34 -53
- xinference/model/llm/llm_family_csghub.json +18 -35
- xinference/model/llm/llm_family_modelscope.json +981 -1122
- xinference/model/llm/lmdeploy/core.py +56 -88
- xinference/model/llm/mlx/core.py +46 -69
- xinference/model/llm/sglang/core.py +36 -18
- xinference/model/llm/transformers/chatglm.py +168 -306
- xinference/model/llm/transformers/cogvlm2.py +36 -63
- xinference/model/llm/transformers/cogvlm2_video.py +33 -223
- xinference/model/llm/transformers/core.py +55 -50
- xinference/model/llm/transformers/deepseek_v2.py +340 -0
- xinference/model/llm/transformers/deepseek_vl.py +53 -96
- xinference/model/llm/transformers/glm4v.py +55 -111
- xinference/model/llm/transformers/intern_vl.py +39 -70
- xinference/model/llm/transformers/internlm2.py +32 -54
- xinference/model/llm/transformers/minicpmv25.py +22 -55
- xinference/model/llm/transformers/minicpmv26.py +158 -68
- xinference/model/llm/transformers/omnilmm.py +5 -28
- xinference/model/llm/transformers/qwen2_audio.py +168 -0
- xinference/model/llm/transformers/qwen2_vl.py +234 -0
- xinference/model/llm/transformers/qwen_vl.py +34 -86
- xinference/model/llm/transformers/utils.py +32 -38
- xinference/model/llm/transformers/yi_vl.py +32 -72
- xinference/model/llm/utils.py +280 -554
- xinference/model/llm/vllm/core.py +161 -100
- xinference/model/rerank/core.py +41 -8
- xinference/model/rerank/model_spec.json +7 -0
- xinference/model/rerank/model_spec_modelscope.json +7 -1
- xinference/model/utils.py +1 -31
- xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
- xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
- xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
- xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
- xinference/thirdparty/cosyvoice/cli/model.py +139 -26
- xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
- xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
- xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
- xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
- xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
- xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
- xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
- xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
- xinference/thirdparty/cosyvoice/utils/common.py +36 -0
- xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
- xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
- xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
- xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
- xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
- xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +33 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/pt_BR.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +1 -1
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/__init__.py +0 -3
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +169 -198
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +4 -27
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +9 -47
- xinference/thirdparty/fish_speech/fish_speech/text/spliter.py +2 -2
- xinference/thirdparty/fish_speech/fish_speech/train.py +2 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +12 -10
- xinference/thirdparty/fish_speech/tools/api.py +79 -134
- xinference/thirdparty/fish_speech/tools/commons.py +35 -0
- xinference/thirdparty/fish_speech/tools/download_models.py +3 -3
- xinference/thirdparty/fish_speech/tools/file.py +17 -0
- xinference/thirdparty/fish_speech/tools/llama/build_dataset.py +1 -1
- xinference/thirdparty/fish_speech/tools/llama/generate.py +29 -24
- xinference/thirdparty/fish_speech/tools/llama/merge_lora.py +1 -1
- xinference/thirdparty/fish_speech/tools/llama/quantize.py +2 -2
- xinference/thirdparty/fish_speech/tools/msgpack_api.py +34 -0
- xinference/thirdparty/fish_speech/tools/post_api.py +85 -44
- xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
- xinference/thirdparty/fish_speech/tools/sensevoice/fun_asr.py +1 -1
- xinference/thirdparty/fish_speech/tools/smart_pad.py +16 -3
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +2 -2
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +4 -2
- xinference/thirdparty/fish_speech/tools/webui.py +12 -146
- xinference/thirdparty/matcha/VERSION +1 -0
- xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
- xinference/thirdparty/matcha/hifigan/README.md +101 -0
- xinference/thirdparty/omnilmm/LICENSE +201 -0
- xinference/thirdparty/whisper/__init__.py +156 -0
- xinference/thirdparty/whisper/__main__.py +3 -0
- xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
- xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
- xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
- xinference/thirdparty/whisper/audio.py +157 -0
- xinference/thirdparty/whisper/decoding.py +826 -0
- xinference/thirdparty/whisper/model.py +314 -0
- xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
- xinference/thirdparty/whisper/normalizers/basic.py +76 -0
- xinference/thirdparty/whisper/normalizers/english.json +1741 -0
- xinference/thirdparty/whisper/normalizers/english.py +550 -0
- xinference/thirdparty/whisper/timing.py +386 -0
- xinference/thirdparty/whisper/tokenizer.py +395 -0
- xinference/thirdparty/whisper/transcribe.py +605 -0
- xinference/thirdparty/whisper/triton_ops.py +109 -0
- xinference/thirdparty/whisper/utils.py +316 -0
- xinference/thirdparty/whisper/version.py +1 -0
- xinference/types.py +14 -53
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/{main.4bafd904.css → main.5061c4c3.css} +2 -2
- xinference/web/ui/build/static/css/main.5061c4c3.css.map +1 -0
- xinference/web/ui/build/static/js/main.754740c0.js +3 -0
- xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.754740c0.js.LICENSE.txt} +2 -0
- xinference/web/ui/build/static/js/main.754740c0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/cd90b08d177025dfe84209596fc51878f8a86bcaa6a240848a3d2e5fd4c7ff24.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e42b72d4cc1ea412ebecbb8d040dc6c6bfee462c33903c2f1f3facb602ad742e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
- xinference/web/ui/node_modules/.package-lock.json +37 -0
- xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
- xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
- xinference/web/ui/node_modules/nunjucks/package.json +112 -0
- xinference/web/ui/package-lock.json +38 -0
- xinference/web/ui/package.json +1 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/METADATA +16 -10
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/RECORD +179 -127
- xinference/model/llm/transformers/llama_2.py +0 -108
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/lit_module.py +0 -442
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/discriminator.py +0 -44
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/reference.py +0 -115
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/wavenet.py +0 -225
- xinference/thirdparty/fish_speech/tools/auto_rerank.py +0 -159
- xinference/thirdparty/fish_speech/tools/gen_ref.py +0 -36
- xinference/thirdparty/fish_speech/tools/merge_asr_files.py +0 -55
- xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
- xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
- xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/LICENSE +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/WHEEL +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.14.4.post1.dist-info → xinference-0.15.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,550 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
from fractions import Fraction
|
|
5
|
+
from typing import Iterator, List, Match, Optional, Union
|
|
6
|
+
|
|
7
|
+
from more_itertools import windowed
|
|
8
|
+
|
|
9
|
+
from .basic import remove_symbols_and_diacritics
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EnglishNumberNormalizer:
    """
    Convert any spelled-out numbers into arabic numbers, while handling:

    - remove any commas
    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
    - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
    - spell out `one` and `ones`
    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`

    The instance is callable: ``normalizer(text)`` runs `preprocess`,
    `process_words` (on the whitespace-split text) and `postprocess` in order.
    """

    def __init__(self):
        super().__init__()

        # words that all stand for the digit 0
        self.zeros = {"o", "oh", "zero"}
        # "one" .. "nineteen" -> 1 .. 19
        self.ones = {
            name: i
            for i, name in enumerate(
                [
                    "one",
                    "two",
                    "three",
                    "four",
                    "five",
                    "six",
                    "seven",
                    "eight",
                    "nine",
                    "ten",
                    "eleven",
                    "twelve",
                    "thirteen",
                    "fourteen",
                    "fifteen",
                    "sixteen",
                    "seventeen",
                    "eighteen",
                    "nineteen",
                ],
                start=1,
            )
        }
        # suffixed forms map to (value, suffix) pairs
        self.ones_plural = {
            "sixes" if name == "six" else name + "s": (value, "s")
            for name, value in self.ones.items()
        }
        self.ones_ordinal = {
            "zeroth": (0, "th"),
            "first": (1, "st"),
            "second": (2, "nd"),
            "third": (3, "rd"),
            "fifth": (5, "th"),
            # irregular spelling: the generic rule below only yields "nineth"
            "ninth": (9, "th"),
            "twelfth": (12, "th"),
            **{
                name + ("h" if name.endswith("t") else "th"): (value, "th")
                for name, value in self.ones.items()
                if value > 3 and value != 5 and value != 12
            },
        }
        self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}

        self.tens = {
            "twenty": 20,
            "thirty": 30,
            "forty": 40,
            "fifty": 50,
            "sixty": 60,
            "seventy": 70,
            "eighty": 80,
            "ninety": 90,
        }
        self.tens_plural = {
            name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
        }
        self.tens_ordinal = {
            name.replace("y", "ieth"): (value, "th")
            for name, value in self.tens.items()
        }
        self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}

        self.multipliers = {
            "hundred": 100,
            "thousand": 1_000,
            "million": 1_000_000,
            "billion": 1_000_000_000,
            "trillion": 1_000_000_000_000,
            "quadrillion": 1_000_000_000_000_000,
            "quintillion": 1_000_000_000_000_000_000,
            "sextillion": 1_000_000_000_000_000_000_000,
            "septillion": 1_000_000_000_000_000_000_000_000,
            "octillion": 1_000_000_000_000_000_000_000_000_000,
            "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
            "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
        }
        self.multipliers_plural = {
            name + "s": (value, "s") for name, value in self.multipliers.items()
        }
        self.multipliers_ordinal = {
            name + "th": (value, "th") for name, value in self.multipliers.items()
        }
        self.multipliers_suffixed = {
            **self.multipliers_plural,
            **self.multipliers_ordinal,
        }
        # words that may follow "point"
        self.decimals = {*self.ones, *self.tens, *self.zeros}

        # sign words spoken before the number
        self.preceding_prefixers = {
            "minus": "-",
            "negative": "-",
            "plus": "+",
            "positive": "+",
        }
        # currency words spoken after the number but written before it
        self.following_prefixers = {
            "pound": "£",
            "pounds": "£",
            "euro": "€",
            "euros": "€",
            "dollar": "$",
            "dollars": "$",
            "cent": "¢",
            "cents": "¢",
        }
        self.prefixes = {
            *self.preceding_prefixers.values(),
            *self.following_prefixers.values(),
        }
        # a dict value means the suffix needs a second word ("per" + "cent")
        self.suffixers = {
            "per": {"cent": "%"},
            "percent": "%",
        }
        self.specials = {"and", "double", "triple", "point"}

        # every word that process_words treats as (potentially) numeric
        self.words = {
            key
            for mapping in [
                self.zeros,
                self.ones,
                self.ones_suffixed,
                self.tens,
                self.tens_suffixed,
                self.multipliers,
                self.multipliers_suffixed,
                self.preceding_prefixers,
                self.following_prefixers,
                self.suffixers,
                self.specials,
            ]
            for key in mapping
        }
        self.literal_words = {"one", "ones"}

    def process_words(self, words: List[str]) -> Iterator[str]:
        """
        Walk over `words` and yield the normalized tokens.

        Number words are accumulated in `value` (an int while plain arithmetic
        is possible, a str once digits are being concatenated) and flushed
        through `output()` as soon as a token that cannot extend the number is
        seen. `prefix` holds a pending sign or currency symbol.

        Raises:
            ValueError: if a token listed in `self.words` is not handled by
                any branch.
        """
        prefix: Optional[str] = None
        value: Optional[Union[str, int]] = None
        skip = False
        numeric = re.compile(r"^\d+(\.\d+)?$")

        def to_fraction(s: str):
            try:
                return Fraction(s)
            except ValueError:
                return None

        def output(result: Union[str, int]):
            # emit `result` with the pending prefix and reset the accumulator
            nonlocal prefix, value
            result = str(result)
            if prefix is not None:
                result = prefix + result
            value = None
            prefix = None
            return result

        if not words:
            return

        # pad with None so every word has a previous and a next neighbour
        for prev, current, nxt in windowed([None] + words + [None], 3):
            if skip:
                # this word was already consumed as the previous word's lookahead
                skip = False
                continue

            next_is_numeric = nxt is not None and numeric.match(nxt)
            has_prefix = current[0] in self.prefixes
            current_without_prefix = current[1:] if has_prefix else current
            if numeric.match(current_without_prefix):
                # arabic numbers (potentially with signs and fractions)
                f = to_fraction(current_without_prefix)
                assert f is not None
                if value is not None:
                    if isinstance(value, str) and value.endswith("."):
                        # concatenate decimals / ip address components
                        value = str(value) + str(current)
                        continue
                    else:
                        yield output(value)

                prefix = current[0] if has_prefix else prefix
                if f.denominator == 1:
                    value = f.numerator  # store integers as int
                else:
                    value = current_without_prefix
            elif current not in self.words:
                # non-numeric words
                if value is not None:
                    yield output(value)
                yield output(current)
            elif current in self.zeros:
                value = str(value or "") + "0"
            elif current in self.ones:
                ones = self.ones[current]

                if value is None:
                    value = ones
                elif isinstance(value, str) or prev in self.ones:
                    if (
                        prev in self.tens and ones < 10
                    ):  # replace the last zero with the digit
                        assert value[-1] == "0"
                        value = value[:-1] + str(ones)
                    else:
                        value = str(value) + str(ones)
                elif ones < 10:
                    if value % 10 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        value += ones
                    else:
                        value = str(value) + str(ones)
            elif current in self.ones_suffixed:
                # ordinal or cardinal; yield the number right away
                ones, suffix = self.ones_suffixed[current]
                if value is None:
                    yield output(str(ones) + suffix)
                elif isinstance(value, str) or prev in self.ones:
                    if prev in self.tens and ones < 10:
                        assert value[-1] == "0"
                        yield output(value[:-1] + str(ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                elif ones < 10:
                    if value % 10 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                else:  # eleven to nineteen
                    if value % 100 == 0:
                        yield output(str(value + ones) + suffix)
                    else:
                        yield output(str(value) + str(ones) + suffix)
                value = None
            elif current in self.tens:
                tens = self.tens[current]
                if value is None:
                    value = tens
                elif isinstance(value, str):
                    value = str(value) + str(tens)
                else:
                    if value % 100 == 0:
                        value += tens
                    else:
                        value = str(value) + str(tens)
            elif current in self.tens_suffixed:
                # ordinal or cardinal; yield the number right away
                tens, suffix = self.tens_suffixed[current]
                if value is None:
                    yield output(str(tens) + suffix)
                elif isinstance(value, str):
                    yield output(str(value) + str(tens) + suffix)
                else:
                    if value % 100 == 0:
                        yield output(str(value + tens) + suffix)
                    else:
                        yield output(str(value) + str(tens) + suffix)
            elif current in self.multipliers:
                multiplier = self.multipliers[current]
                if value is None:
                    value = multiplier
                elif isinstance(value, str) or value == 0:
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        value = p.numerator
                    else:
                        yield output(value)
                        value = multiplier
                else:
                    # multiply only the part below one thousand
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
            elif current in self.multipliers_suffixed:
                multiplier, suffix = self.multipliers_suffixed[current]
                if value is None:
                    yield output(str(multiplier) + suffix)
                elif isinstance(value, str):
                    f = to_fraction(value)
                    p = f * multiplier if f is not None else None
                    if f is not None and p.denominator == 1:
                        yield output(str(p.numerator) + suffix)
                    else:
                        yield output(value)
                        yield output(str(multiplier) + suffix)
                else:  # int
                    before = value // 1000 * 1000
                    residual = value % 1000
                    value = before + residual * multiplier
                    yield output(str(value) + suffix)
                value = None
            elif current in self.preceding_prefixers:
                # apply prefix (positive, minus, etc.) if it precedes a number
                if value is not None:
                    yield output(value)

                if nxt in self.words or next_is_numeric:
                    prefix = self.preceding_prefixers[current]
                else:
                    yield output(current)
            elif current in self.following_prefixers:
                # apply prefix (dollars, cents, etc.) only after a number
                if value is not None:
                    prefix = self.following_prefixers[current]
                    yield output(value)
                else:
                    yield output(current)
            elif current in self.suffixers:
                # apply suffix symbols (percent -> '%')
                if value is not None:
                    suffix = self.suffixers[current]
                    if isinstance(suffix, dict):
                        if nxt in suffix:
                            yield output(str(value) + suffix[nxt])
                            skip = True
                        else:
                            yield output(value)
                            yield output(current)
                    else:
                        yield output(str(value) + suffix)
                else:
                    yield output(current)
            elif current in self.specials:
                if nxt not in self.words and not next_is_numeric:
                    # apply special handling only if the next word can be numeric
                    if value is not None:
                        yield output(value)
                    yield output(current)
                elif current == "and":
                    # ignore "and" after hundreds, thousands, etc.
                    if prev not in self.multipliers:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "double" or current == "triple":
                    if nxt in self.ones or nxt in self.zeros:
                        repeats = 2 if current == "double" else 3
                        ones = self.ones.get(nxt, 0)
                        value = str(value or "") + str(ones) * repeats
                        skip = True
                    else:
                        if value is not None:
                            yield output(value)
                        yield output(current)
                elif current == "point":
                    if nxt in self.decimals or next_is_numeric:
                        value = str(value or "") + "."
                else:
                    # should all have been covered at this point
                    raise ValueError(f"Unexpected token: {current}")
            else:
                # all should have been covered at this point
                raise ValueError(f"Unexpected token: {current}")

        if value is not None:
            yield output(value)

    def preprocess(self, s: str):
        """
        Prepare `s` for `process_words`.

        Rewrites "<number> and a half" as "<number> point five", puts a space
        at every letter/digit boundary and re-attaches numeric suffixes such
        as "st", "nd", "rd", "th" and "s".
        """
        # replace "<number> and a half" with "<number> point five"
        results = []

        segments = re.split(r"\band\s+a\s+half\b", s)
        last_index = len(segments) - 1
        for i, segment in enumerate(segments):
            if segment.strip():
                results.append(segment)
            if i == last_index:
                continue

            # A separator was matched after this segment. Decide what to put
            # back even when the segment is blank, so that an "and a half"
            # with nothing in front of it is kept rather than dropped.
            preceding = segment.split()
            if preceding and (
                preceding[-1] in self.decimals or preceding[-1] in self.multipliers
            ):
                results.append("point five")
            else:
                results.append("and a half")

        s = " ".join(results)

        # put a space at number/letter boundary
        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)

        # but remove spaces which could be a suffix
        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)

        return s

    def postprocess(self, s: str):
        """
        Tidy up the joined output of `process_words`.

        Merges a currency amount with a following cent amount, turns amounts
        below one currency unit into cents, and writes a standalone "1"/"1s"
        as "one"/"ones".
        """

        def combine_cents(m: Match):
            try:
                currency = m.group(1)
                integer = m.group(2)
                cents = int(m.group(3))
                return f"{currency}{integer}.{cents:02d}"
            except ValueError:
                # leave the matched text untouched (not the whole input)
                return m.group(0)

        def extract_cents(m: Match):
            try:
                return f"¢{int(m.group(1))}"
            except ValueError:
                return m.group(0)

        # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
        # the dot is escaped so that only a real decimal point matches
        s = re.sub(r"[€£$]0\.([0-9]{1,2})\b", extract_cents, s)

        # write "one(s)" instead of "1(s)", just for the readability
        s = re.sub(r"\b1(s?)\b", r"one\1", s)

        return s

    def __call__(self, s: str):
        """Return `s` with its spelled-out numbers normalized."""
        s = self.preprocess(s)
        s = " ".join(word for word in self.process_words(s.split()) if word is not None)
        s = self.postprocess(s)

        return s
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
class EnglishSpellingNormalizer:
    """
    Applies British-American spelling mappings as listed in [1].

    The mapping is read once, at construction time, from the `english.json`
    file located next to this module.

    [1] https://www.tysto.com/uk-us-spelling-list.html
    """

    def __init__(self):
        mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
        # use a context manager so the handle is closed deterministically, and
        # an explicit encoding so the result does not depend on the locale
        with open(mapping_path, encoding="utf-8") as f:
            self.mapping = json.load(f)

    def __call__(self, s: str):
        """Return `s` with every whitespace-separated word found in the mapping replaced."""
        return " ".join(self.mapping.get(word, word) for word in s.split())
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
class EnglishTextNormalizer:
    """
    Normalize English text through a fixed sequence of regex rewrites.

    Calling the instance lowercases the text, removes bracketed and
    parenthesised spans and filler words, expands contractions and title
    abbreviations, then delegates to `EnglishNumberNormalizer` and
    `EnglishSpellingNormalizer` before collapsing whitespace.
    """

    def __init__(self):
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"

        # The groups below are merged in this order; the order matters because
        # the specific patterns must be applied before the general ones.
        common_contractions = {
            r"\bwon't\b": "will not",
            r"\bcan't\b": "can not",
            r"\blet's\b": "let us",
            r"\bain't\b": "aint",
            r"\by'all\b": "you all",
            r"\bwanna\b": "want to",
            r"\bgotta\b": "got to",
            r"\bgonna\b": "going to",
            r"\bi'ma\b": "i am going to",
            r"\bimma\b": "i am going to",
            r"\bwoulda\b": "would have",
            r"\bcoulda\b": "could have",
            r"\bshoulda\b": "should have",
            r"\bma'am\b": "madam",
        }
        # contractions in titles/prefixes
        title_abbreviations = {
            r"\bmr\b": "mister ",
            r"\bmrs\b": "missus ",
            r"\bst\b": "saint ",
            r"\bdr\b": "doctor ",
            r"\bprof\b": "professor ",
            r"\bcapt\b": "captain ",
            r"\bgov\b": "governor ",
            r"\bald\b": "alderman ",
            r"\bgen\b": "general ",
            r"\bsen\b": "senator ",
            r"\brep\b": "representative ",
            r"\bpres\b": "president ",
            r"\brev\b": "reverend ",
            r"\bhon\b": "honorable ",
            r"\basst\b": "assistant ",
            r"\bassoc\b": "associate ",
            r"\blt\b": "lieutenant ",
            r"\bcol\b": "colonel ",
            r"\bjr\b": "junior ",
            r"\bsr\b": "senior ",
            r"\besq\b": "esquire ",
        }
        # perfect tenses; ideally any past participle, but that is harder
        perfect_tenses = {
            r"'d been\b": " had been",
            r"'s been\b": " has been",
            r"'d gone\b": " had gone",
            r"'s gone\b": " has gone",
            r"'d done\b": " had done",  # "'s done" is ambiguous
            r"'s got\b": " has got",
        }
        general_contractions = {
            r"n't\b": " not",
            r"'re\b": " are",
            r"'s\b": " is",
            r"'d\b": " would",
            r"'ll\b": " will",
            r"'t\b": " not",
            r"'ve\b": " have",
            r"'m\b": " am",
        }
        self.replacers = {
            **common_contractions,
            **title_abbreviations,
            **perfect_tenses,
            **general_contractions,
        }

        self.standardize_numbers = EnglishNumberNormalizer()
        self.standardize_spellings = EnglishSpellingNormalizer()

    def __call__(self, s: str):
        """Return the normalized form of `s`."""
        text = s.lower()

        for removable in (
            r"[<\[][^>\]]*[>\]]",  # words between brackets
            r"\(([^)]+?)\)",  # words between parenthesis
            self.ignore_patterns,  # filler words
        ):
            text = re.sub(removable, "", text)

        # when there's a space before an apostrophe
        text = re.sub(r"\s+'", "'", text)

        for pattern in self.replacers:
            text = re.sub(pattern, self.replacers[pattern], text)

        # remove commas between digits
        text = re.sub(r"(\d),(\d)", r"\1\2", text)
        # remove periods not followed by numbers
        text = re.sub(r"\.([^0-9]|$)", r" \1", text)
        # keep numeric symbols
        text = remove_symbols_and_diacritics(text, keep=".%$¢€£")

        text = self.standardize_spellings(self.standardize_numbers(text))

        # now remove prefix/suffix symbols that are not preceded/followed by numbers
        text = re.sub(r"[.$¢€£]([^0-9])", r" \1", text)
        text = re.sub(r"([^0-9])%", r"\1 ", text)

        # replace any successive whitespaces with a space
        return re.sub(r"\s+", " ", text)
|