@qvac/translation-nmtcpp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +470 -0
- package/binding.js +1 -0
- package/index.d.ts +82 -0
- package/index.js +188 -0
- package/lib/error.js +65 -0
- package/marian.js +186 -0
- package/package.json +69 -0
- package/prebuilds/android-arm/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-ia32/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/android-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-arm64/qvac__translation-nmtcpp.bare.exports +3622 -0
- package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/darwin-x64/qvac__translation-nmtcpp.bare.exports +3731 -0
- package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-arm64/qvac__translation-nmtcpp.bare.exports +3603 -0
- package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-arm64-simulator/qvac__translation-nmtcpp.bare.exports +3603 -0
- package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/ios-x64-simulator/qvac__translation-nmtcpp.bare.exports +3720 -0
- package/prebuilds/linux-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare +0 -0
- package/prebuilds/win32-x64/qvac__translation-nmtcpp.bare.exports +0 -0
- package/third-party/indic-processor-deps/indicnlp/INDIC_NLP_LICENCE +9 -0
- package/third-party/indic-processor-deps/indicnlp/index.js +11 -0
- package/third-party/indic-processor-deps/indicnlp/indic_detokenize.js +141 -0
- package/third-party/indic-processor-deps/indicnlp/indic_normalize.js +1213 -0
- package/third-party/indic-processor-deps/indicnlp/indic_tokenize.js +123 -0
- package/third-party/indic-processor-deps/indicnlp/langinfo.js +609 -0
- package/third-party/indic-processor-deps/indicnlp/sinhala_transliterator.js +197 -0
- package/third-party/indic-processor-deps/indicnlp/unicode_transliterator.js +120 -0
- package/third-party/indic-processor-deps/sacremoses/SACREMOSES_LICENCE +21 -0
- package/third-party/indic-processor-deps/sacremoses/cjk.js +202 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/README.txt +8 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.as +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.bn +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ca +75 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.cs +390 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.de +325 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.el +1568 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.en +123 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.es +118 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.et +138 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fi +138 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.fr +153 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ga +48 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.gu +105 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hi +113 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hu +103 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.is +251 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.it +180 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.kn +70 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lt +698 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.lv +100 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ml +67 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mni +65 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.mr +113 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.nl +115 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.or +101 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pa +102 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pl +283 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.pt +210 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ro +38 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ru +293 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sk +474 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sl +78 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.sv +97 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.ta +71 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.tdt +210 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.te +70 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.yue +53 -0
- package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.zh +53 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJK.txt +23246 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/CJKSymbols.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Close_Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Currency_Symbol.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Han.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hangul_Syllables.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Hiragana.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum-unichars-au.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlnum.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha-unichars-au.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsAlpha.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsLower.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsN.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPf.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsPi.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSc.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsSo.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/IsUpper.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Katakana.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Line_Separator.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Lowercase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Number.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Open_Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Punctuation.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Separator.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Symbol.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Titlecase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/data/perluniprops/Uppercase_Letter.txt +1 -0
- package/third-party/indic-processor-deps/sacremoses/index.js +8 -0
- package/third-party/indic-processor-deps/sacremoses/indic.js +76 -0
- package/third-party/indic-processor-deps/sacremoses/normalizer.js +264 -0
- package/third-party/indic-processor-deps/sacremoses/pernuliprops.js +287 -0
- package/third-party/indic-processor-deps/sacremoses/tokenizer.js +1217 -0
- package/third-party/indic-processor.js +565 -0
package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.gu
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
|
2
|
+
|
|
3
|
+
#common exceptions
|
|
4
|
+
# Rs
|
|
5
|
+
રૂ
|
|
6
|
+
# Dr
|
|
7
|
+
ડો
|
|
8
|
+
# Dr
|
|
9
|
+
ડૉ
|
|
10
|
+
# Mr
|
|
11
|
+
શ્રી
|
|
12
|
+
|
|
13
|
+
#others
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
#phonetics
|
|
17
|
+
# A
|
|
18
|
+
એ
|
|
19
|
+
# B
|
|
20
|
+
બી
|
|
21
|
+
# C
|
|
22
|
+
સી
|
|
23
|
+
# D
|
|
24
|
+
ડી
|
|
25
|
+
# E
|
|
26
|
+
ઇ
|
|
27
|
+
# F
|
|
28
|
+
એફ
|
|
29
|
+
# G
|
|
30
|
+
જી
|
|
31
|
+
# H
|
|
32
|
+
એચ
|
|
33
|
+
# I
|
|
34
|
+
આઈ
|
|
35
|
+
# J
|
|
36
|
+
જે
|
|
37
|
+
# K
|
|
38
|
+
કે
|
|
39
|
+
# L
|
|
40
|
+
એલ
|
|
41
|
+
# M
|
|
42
|
+
એમ
|
|
43
|
+
# N
|
|
44
|
+
એન
|
|
45
|
+
# O
|
|
46
|
+
ઓ
|
|
47
|
+
# P
|
|
48
|
+
પી
|
|
49
|
+
# Q
|
|
50
|
+
ક્યૂ
|
|
51
|
+
# R
|
|
52
|
+
આર
|
|
53
|
+
# S
|
|
54
|
+
એસ
|
|
55
|
+
# T
|
|
56
|
+
ટી
|
|
57
|
+
# U
|
|
58
|
+
યુ
|
|
59
|
+
# V
|
|
60
|
+
વી
|
|
61
|
+
# W
|
|
62
|
+
ડબલ્યુ
|
|
63
|
+
# X
|
|
64
|
+
એક્સ
|
|
65
|
+
# Y
|
|
66
|
+
વાય
|
|
67
|
+
# Z
|
|
68
|
+
ઝેડ
|
|
69
|
+
|
|
70
|
+
#consonants
|
|
71
|
+
ક
|
|
72
|
+
ખ
|
|
73
|
+
ગ
|
|
74
|
+
ઘ
|
|
75
|
+
ઙ
|
|
76
|
+
ચ
|
|
77
|
+
છ
|
|
78
|
+
જ
|
|
79
|
+
ઝ
|
|
80
|
+
ઞ
|
|
81
|
+
ટ
|
|
82
|
+
ઠ
|
|
83
|
+
ડ
|
|
84
|
+
ઢ
|
|
85
|
+
ણ
|
|
86
|
+
ત
|
|
87
|
+
થ
|
|
88
|
+
દ
|
|
89
|
+
ધ
|
|
90
|
+
ન
|
|
91
|
+
પ
|
|
92
|
+
ફ
|
|
93
|
+
બ
|
|
94
|
+
ભ
|
|
95
|
+
મ
|
|
96
|
+
ય
|
|
97
|
+
ર
|
|
98
|
+
લ
|
|
99
|
+
ળ
|
|
100
|
+
વ
|
|
101
|
+
શ
|
|
102
|
+
ષ
|
|
103
|
+
સ
|
|
104
|
+
હ
|
|
105
|
+
|
package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hi
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
|
2
|
+
|
|
3
|
+
#common exceptions
|
|
4
|
+
# Rs
|
|
5
|
+
रु
|
|
6
|
+
# Dr
|
|
7
|
+
डॉ
|
|
8
|
+
# Dr
|
|
9
|
+
डा
|
|
10
|
+
# Mr
|
|
11
|
+
श्री
|
|
12
|
+
|
|
13
|
+
#others
|
|
14
|
+
टीवी
|
|
15
|
+
|
|
16
|
+
#phonetics
|
|
17
|
+
# A
|
|
18
|
+
ए
|
|
19
|
+
ऐ
|
|
20
|
+
# B
|
|
21
|
+
बी
|
|
22
|
+
# C
|
|
23
|
+
सी
|
|
24
|
+
# D
|
|
25
|
+
डी
|
|
26
|
+
# E
|
|
27
|
+
ई
|
|
28
|
+
# F
|
|
29
|
+
ऐफ
|
|
30
|
+
एफ
|
|
31
|
+
# G
|
|
32
|
+
जी
|
|
33
|
+
# H
|
|
34
|
+
ऐच
|
|
35
|
+
एच
|
|
36
|
+
# I
|
|
37
|
+
आइ
|
|
38
|
+
# J
|
|
39
|
+
जे
|
|
40
|
+
# K
|
|
41
|
+
के
|
|
42
|
+
# L
|
|
43
|
+
ऐल
|
|
44
|
+
एल
|
|
45
|
+
# M
|
|
46
|
+
ऐम
|
|
47
|
+
एम
|
|
48
|
+
# N
|
|
49
|
+
ऐन
|
|
50
|
+
एन
|
|
51
|
+
# O
|
|
52
|
+
ओ
|
|
53
|
+
# P
|
|
54
|
+
पी
|
|
55
|
+
# Q
|
|
56
|
+
क्यू
|
|
57
|
+
# R
|
|
58
|
+
आर
|
|
59
|
+
# S
|
|
60
|
+
ऐस
|
|
61
|
+
एस
|
|
62
|
+
# T
|
|
63
|
+
टी
|
|
64
|
+
# U
|
|
65
|
+
यू
|
|
66
|
+
# V
|
|
67
|
+
वी
|
|
68
|
+
# W
|
|
69
|
+
डब्ल्यू
|
|
70
|
+
# X
|
|
71
|
+
ऐक्स
|
|
72
|
+
एक्स
|
|
73
|
+
# Y
|
|
74
|
+
वाय
|
|
75
|
+
वाई
|
|
76
|
+
# Z
|
|
77
|
+
ज़ैड
|
|
78
|
+
|
|
79
|
+
#consonants
|
|
80
|
+
क
|
|
81
|
+
ख
|
|
82
|
+
ग
|
|
83
|
+
घ
|
|
84
|
+
ङ
|
|
85
|
+
च
|
|
86
|
+
छ
|
|
87
|
+
ज
|
|
88
|
+
झ
|
|
89
|
+
ञ
|
|
90
|
+
ट
|
|
91
|
+
ठ
|
|
92
|
+
ड
|
|
93
|
+
ढ
|
|
94
|
+
ण
|
|
95
|
+
त
|
|
96
|
+
थ
|
|
97
|
+
द
|
|
98
|
+
ध
|
|
99
|
+
न
|
|
100
|
+
प
|
|
101
|
+
फ
|
|
102
|
+
ब
|
|
103
|
+
भ
|
|
104
|
+
म
|
|
105
|
+
य
|
|
106
|
+
र
|
|
107
|
+
ल
|
|
108
|
+
व
|
|
109
|
+
श
|
|
110
|
+
ष
|
|
111
|
+
स
|
|
112
|
+
ह
|
|
113
|
+
|
package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.hu
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
|
2
|
+
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
|
|
3
|
+
|
|
4
|
+
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
|
|
5
|
+
#usually upper case letters are initials in a name
|
|
6
|
+
A
|
|
7
|
+
B
|
|
8
|
+
C
|
|
9
|
+
D
|
|
10
|
+
E
|
|
11
|
+
F
|
|
12
|
+
G
|
|
13
|
+
H
|
|
14
|
+
I
|
|
15
|
+
J
|
|
16
|
+
K
|
|
17
|
+
L
|
|
18
|
+
M
|
|
19
|
+
N
|
|
20
|
+
O
|
|
21
|
+
P
|
|
22
|
+
Q
|
|
23
|
+
R
|
|
24
|
+
S
|
|
25
|
+
T
|
|
26
|
+
U
|
|
27
|
+
V
|
|
28
|
+
W
|
|
29
|
+
X
|
|
30
|
+
Y
|
|
31
|
+
Z
|
|
32
|
+
Á
|
|
33
|
+
É
|
|
34
|
+
Í
|
|
35
|
+
Ó
|
|
36
|
+
Ö
|
|
37
|
+
Ő
|
|
38
|
+
Ú
|
|
39
|
+
Ü
|
|
40
|
+
Ű
|
|
41
|
+
|
|
42
|
+
#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
|
|
43
|
+
Dr
|
|
44
|
+
dr
|
|
45
|
+
kb
|
|
46
|
+
Kb
|
|
47
|
+
vö
|
|
48
|
+
Vö
|
|
49
|
+
pl
|
|
50
|
+
Pl
|
|
51
|
+
ca
|
|
52
|
+
Ca
|
|
53
|
+
min
|
|
54
|
+
Min
|
|
55
|
+
max
|
|
56
|
+
Max
|
|
57
|
+
ún
|
|
58
|
+
Ún
|
|
59
|
+
prof
|
|
60
|
+
Prof
|
|
61
|
+
de
|
|
62
|
+
De
|
|
63
|
+
du
|
|
64
|
+
Du
|
|
65
|
+
Szt
|
|
66
|
+
St
|
|
67
|
+
|
|
68
|
+
#Numbers only. These act as non-breaking prefixes only when followed by a numeric sequence
|
|
69
|
+
# add NUMERIC_ONLY after the word for this function
|
|
70
|
+
#This case is mostly for the English "No." which can either be a sentence of its own, or
|
|
71
|
+
#if followed by a number, a non-breaking prefix
|
|
72
|
+
|
|
73
|
+
# Month name abbreviations
|
|
74
|
+
jan #NUMERIC_ONLY#
|
|
75
|
+
Jan #NUMERIC_ONLY#
|
|
76
|
+
Feb #NUMERIC_ONLY#
|
|
77
|
+
feb #NUMERIC_ONLY#
|
|
78
|
+
márc #NUMERIC_ONLY#
|
|
79
|
+
Márc #NUMERIC_ONLY#
|
|
80
|
+
ápr #NUMERIC_ONLY#
|
|
81
|
+
Ápr #NUMERIC_ONLY#
|
|
82
|
+
máj #NUMERIC_ONLY#
|
|
83
|
+
Máj #NUMERIC_ONLY#
|
|
84
|
+
jún #NUMERIC_ONLY#
|
|
85
|
+
Jún #NUMERIC_ONLY#
|
|
86
|
+
Júl #NUMERIC_ONLY#
|
|
87
|
+
júl #NUMERIC_ONLY#
|
|
88
|
+
aug #NUMERIC_ONLY#
|
|
89
|
+
Aug #NUMERIC_ONLY#
|
|
90
|
+
Szept #NUMERIC_ONLY#
|
|
91
|
+
szept #NUMERIC_ONLY#
|
|
92
|
+
okt #NUMERIC_ONLY#
|
|
93
|
+
Okt #NUMERIC_ONLY#
|
|
94
|
+
nov #NUMERIC_ONLY#
|
|
95
|
+
Nov #NUMERIC_ONLY#
|
|
96
|
+
dec #NUMERIC_ONLY#
|
|
97
|
+
Dec #NUMERIC_ONLY#
|
|
98
|
+
|
|
99
|
+
# Other abbreviations
|
|
100
|
+
tel #NUMERIC_ONLY#
|
|
101
|
+
Tel #NUMERIC_ONLY#
|
|
102
|
+
Fax #NUMERIC_ONLY#
|
|
103
|
+
fax #NUMERIC_ONLY#
|
package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.is
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
no #NUMERIC_ONLY#
|
|
2
|
+
No #NUMERIC_ONLY#
|
|
3
|
+
nr #NUMERIC_ONLY#
|
|
4
|
+
Nr #NUMERIC_ONLY#
|
|
5
|
+
nR #NUMERIC_ONLY#
|
|
6
|
+
NR #NUMERIC_ONLY#
|
|
7
|
+
a
|
|
8
|
+
b
|
|
9
|
+
c
|
|
10
|
+
d
|
|
11
|
+
e
|
|
12
|
+
f
|
|
13
|
+
g
|
|
14
|
+
h
|
|
15
|
+
i
|
|
16
|
+
j
|
|
17
|
+
k
|
|
18
|
+
l
|
|
19
|
+
m
|
|
20
|
+
n
|
|
21
|
+
o
|
|
22
|
+
p
|
|
23
|
+
q
|
|
24
|
+
r
|
|
25
|
+
s
|
|
26
|
+
t
|
|
27
|
+
u
|
|
28
|
+
v
|
|
29
|
+
w
|
|
30
|
+
x
|
|
31
|
+
y
|
|
32
|
+
z
|
|
33
|
+
^
|
|
34
|
+
í
|
|
35
|
+
á
|
|
36
|
+
ó
|
|
37
|
+
æ
|
|
38
|
+
A
|
|
39
|
+
B
|
|
40
|
+
C
|
|
41
|
+
D
|
|
42
|
+
E
|
|
43
|
+
F
|
|
44
|
+
G
|
|
45
|
+
H
|
|
46
|
+
I
|
|
47
|
+
J
|
|
48
|
+
K
|
|
49
|
+
L
|
|
50
|
+
M
|
|
51
|
+
N
|
|
52
|
+
O
|
|
53
|
+
P
|
|
54
|
+
Q
|
|
55
|
+
R
|
|
56
|
+
S
|
|
57
|
+
T
|
|
58
|
+
U
|
|
59
|
+
V
|
|
60
|
+
W
|
|
61
|
+
X
|
|
62
|
+
Y
|
|
63
|
+
Z
|
|
64
|
+
ab.fn
|
|
65
|
+
a.fn
|
|
66
|
+
afs
|
|
67
|
+
al
|
|
68
|
+
alm
|
|
69
|
+
alg
|
|
70
|
+
andh
|
|
71
|
+
ath
|
|
72
|
+
aths
|
|
73
|
+
atr
|
|
74
|
+
ao
|
|
75
|
+
au
|
|
76
|
+
aukaf
|
|
77
|
+
áfn
|
|
78
|
+
áhrl.s
|
|
79
|
+
áhrs
|
|
80
|
+
ákv.gr
|
|
81
|
+
ákv
|
|
82
|
+
bh
|
|
83
|
+
bls
|
|
84
|
+
dr
|
|
85
|
+
e.Kr
|
|
86
|
+
et
|
|
87
|
+
ef
|
|
88
|
+
efn
|
|
89
|
+
ennfr
|
|
90
|
+
eink
|
|
91
|
+
end
|
|
92
|
+
e.st
|
|
93
|
+
erl
|
|
94
|
+
fél
|
|
95
|
+
fskj
|
|
96
|
+
fh
|
|
97
|
+
f.hl
|
|
98
|
+
físl
|
|
99
|
+
fl
|
|
100
|
+
fn
|
|
101
|
+
fo
|
|
102
|
+
forl
|
|
103
|
+
frb
|
|
104
|
+
frl
|
|
105
|
+
frh
|
|
106
|
+
frt
|
|
107
|
+
fsl
|
|
108
|
+
fsh
|
|
109
|
+
fs
|
|
110
|
+
fsk
|
|
111
|
+
fst
|
|
112
|
+
f.Kr
|
|
113
|
+
ft
|
|
114
|
+
fv
|
|
115
|
+
fyrrn
|
|
116
|
+
fyrrv
|
|
117
|
+
germ
|
|
118
|
+
gm
|
|
119
|
+
gr
|
|
120
|
+
hdl
|
|
121
|
+
hdr
|
|
122
|
+
hf
|
|
123
|
+
hl
|
|
124
|
+
hlsk
|
|
125
|
+
hljsk
|
|
126
|
+
hljv
|
|
127
|
+
hljóðv
|
|
128
|
+
hr
|
|
129
|
+
hv
|
|
130
|
+
hvk
|
|
131
|
+
holl
|
|
132
|
+
Hos
|
|
133
|
+
höf
|
|
134
|
+
hk
|
|
135
|
+
hrl
|
|
136
|
+
ísl
|
|
137
|
+
kaf
|
|
138
|
+
kap
|
|
139
|
+
Khöfn
|
|
140
|
+
kk
|
|
141
|
+
kg
|
|
142
|
+
kk
|
|
143
|
+
km
|
|
144
|
+
kl
|
|
145
|
+
klst
|
|
146
|
+
kr
|
|
147
|
+
kt
|
|
148
|
+
kgúrsk
|
|
149
|
+
kvk
|
|
150
|
+
leturbr
|
|
151
|
+
lh
|
|
152
|
+
lh.nt
|
|
153
|
+
lh.þt
|
|
154
|
+
lo
|
|
155
|
+
ltr
|
|
156
|
+
mlja
|
|
157
|
+
mljó
|
|
158
|
+
millj
|
|
159
|
+
mm
|
|
160
|
+
mms
|
|
161
|
+
m.fl
|
|
162
|
+
miðm
|
|
163
|
+
mgr
|
|
164
|
+
mst
|
|
165
|
+
mín
|
|
166
|
+
nf
|
|
167
|
+
nh
|
|
168
|
+
nhm
|
|
169
|
+
nl
|
|
170
|
+
nk
|
|
171
|
+
nmgr
|
|
172
|
+
no
|
|
173
|
+
núv
|
|
174
|
+
nt
|
|
175
|
+
o.áfr
|
|
176
|
+
o.m.fl
|
|
177
|
+
ohf
|
|
178
|
+
o.fl
|
|
179
|
+
o.s.frv
|
|
180
|
+
ófn
|
|
181
|
+
ób
|
|
182
|
+
óákv.gr
|
|
183
|
+
óákv
|
|
184
|
+
pfn
|
|
185
|
+
PR
|
|
186
|
+
pr
|
|
187
|
+
Ritstj
|
|
188
|
+
Rvík
|
|
189
|
+
Rvk
|
|
190
|
+
samb
|
|
191
|
+
samhlj
|
|
192
|
+
samn
|
|
193
|
+
samn
|
|
194
|
+
sbr
|
|
195
|
+
sek
|
|
196
|
+
sérn
|
|
197
|
+
sf
|
|
198
|
+
sfn
|
|
199
|
+
sh
|
|
200
|
+
sfn
|
|
201
|
+
sh
|
|
202
|
+
s.hl
|
|
203
|
+
sk
|
|
204
|
+
skv
|
|
205
|
+
sl
|
|
206
|
+
sn
|
|
207
|
+
so
|
|
208
|
+
ss.us
|
|
209
|
+
s.st
|
|
210
|
+
samþ
|
|
211
|
+
sbr
|
|
212
|
+
shlj
|
|
213
|
+
sign
|
|
214
|
+
skál
|
|
215
|
+
st
|
|
216
|
+
st.s
|
|
217
|
+
stk
|
|
218
|
+
sþ
|
|
219
|
+
teg
|
|
220
|
+
tbl
|
|
221
|
+
tfn
|
|
222
|
+
tl
|
|
223
|
+
tvíhlj
|
|
224
|
+
tvt
|
|
225
|
+
till
|
|
226
|
+
to
|
|
227
|
+
umr
|
|
228
|
+
uh
|
|
229
|
+
us
|
|
230
|
+
uppl
|
|
231
|
+
útg
|
|
232
|
+
vb
|
|
233
|
+
Vf
|
|
234
|
+
vh
|
|
235
|
+
vkf
|
|
236
|
+
Vl
|
|
237
|
+
vl
|
|
238
|
+
vlf
|
|
239
|
+
vmf
|
|
240
|
+
8vo
|
|
241
|
+
vsk
|
|
242
|
+
vth
|
|
243
|
+
þt
|
|
244
|
+
þf
|
|
245
|
+
þjs
|
|
246
|
+
þgf
|
|
247
|
+
þlt
|
|
248
|
+
þolm
|
|
249
|
+
þm
|
|
250
|
+
þml
|
|
251
|
+
þýð
|
package/third-party/indic-processor-deps/sacremoses/data/nonbreaking_prefixes/nonbreaking_prefix.it
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
|
2
|
+
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
|
|
3
|
+
|
|
4
|
+
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
|
|
5
|
+
#usually upper case letters are initials in a name
|
|
6
|
+
A
|
|
7
|
+
B
|
|
8
|
+
C
|
|
9
|
+
D
|
|
10
|
+
E
|
|
11
|
+
F
|
|
12
|
+
G
|
|
13
|
+
H
|
|
14
|
+
I
|
|
15
|
+
J
|
|
16
|
+
K
|
|
17
|
+
L
|
|
18
|
+
M
|
|
19
|
+
N
|
|
20
|
+
O
|
|
21
|
+
P
|
|
22
|
+
Q
|
|
23
|
+
R
|
|
24
|
+
S
|
|
25
|
+
T
|
|
26
|
+
U
|
|
27
|
+
V
|
|
28
|
+
W
|
|
29
|
+
X
|
|
30
|
+
Y
|
|
31
|
+
Z
|
|
32
|
+
|
|
33
|
+
#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
|
|
34
|
+
Adj
|
|
35
|
+
Adm
|
|
36
|
+
Adv
|
|
37
|
+
Amn
|
|
38
|
+
Arch
|
|
39
|
+
Asst
|
|
40
|
+
Avv
|
|
41
|
+
Bart
|
|
42
|
+
Bcc
|
|
43
|
+
Bldg
|
|
44
|
+
Brig
|
|
45
|
+
Bros
|
|
46
|
+
C.A.P
|
|
47
|
+
C.P
|
|
48
|
+
Capt
|
|
49
|
+
Cc
|
|
50
|
+
Cmdr
|
|
51
|
+
Co
|
|
52
|
+
Col
|
|
53
|
+
Comdr
|
|
54
|
+
Con
|
|
55
|
+
Corp
|
|
56
|
+
Cpl
|
|
57
|
+
DR
|
|
58
|
+
Dott
|
|
59
|
+
Dr
|
|
60
|
+
Drs
|
|
61
|
+
Egr
|
|
62
|
+
Ens
|
|
63
|
+
Gen
|
|
64
|
+
Geom
|
|
65
|
+
Gov
|
|
66
|
+
Hon
|
|
67
|
+
Hosp
|
|
68
|
+
Hr
|
|
69
|
+
Id
|
|
70
|
+
Ing
|
|
71
|
+
Insp
|
|
72
|
+
Lt
|
|
73
|
+
MM
|
|
74
|
+
MR
|
|
75
|
+
MRS
|
|
76
|
+
MS
|
|
77
|
+
Maj
|
|
78
|
+
Messrs
|
|
79
|
+
Mlle
|
|
80
|
+
Mme
|
|
81
|
+
Mo
|
|
82
|
+
Mons
|
|
83
|
+
Mr
|
|
84
|
+
Mrs
|
|
85
|
+
Ms
|
|
86
|
+
Msgr
|
|
87
|
+
N.B
|
|
88
|
+
Op
|
|
89
|
+
Ord
|
|
90
|
+
P.S
|
|
91
|
+
P.T
|
|
92
|
+
Pfc
|
|
93
|
+
Ph
|
|
94
|
+
Prof
|
|
95
|
+
Pvt
|
|
96
|
+
RP
|
|
97
|
+
RSVP
|
|
98
|
+
Rag
|
|
99
|
+
Rep
|
|
100
|
+
Reps
|
|
101
|
+
Res
|
|
102
|
+
Rev
|
|
103
|
+
Rif
|
|
104
|
+
Rt
|
|
105
|
+
S.A
|
|
106
|
+
S.B.F
|
|
107
|
+
S.P.M
|
|
108
|
+
S.p.A
|
|
109
|
+
S.r.l
|
|
110
|
+
Sen
|
|
111
|
+
Sens
|
|
112
|
+
Sfc
|
|
113
|
+
Sgt
|
|
114
|
+
Sig
|
|
115
|
+
Sigg
|
|
116
|
+
Soc
|
|
117
|
+
Spett
|
|
118
|
+
Sr
|
|
119
|
+
St
|
|
120
|
+
Supt
|
|
121
|
+
Surg
|
|
122
|
+
V.P
|
|
123
|
+
|
|
124
|
+
# other
|
|
125
|
+
a.c
|
|
126
|
+
acc
|
|
127
|
+
all
|
|
128
|
+
banc
|
|
129
|
+
c.a
|
|
130
|
+
c.c.p
|
|
131
|
+
c.m
|
|
132
|
+
c.p
|
|
133
|
+
c.s
|
|
134
|
+
c.v
|
|
135
|
+
corr
|
|
136
|
+
dott
|
|
137
|
+
e.p.c
|
|
138
|
+
ecc
|
|
139
|
+
es
|
|
140
|
+
fatt
|
|
141
|
+
gg
|
|
142
|
+
int
|
|
143
|
+
lett
|
|
144
|
+
ogg
|
|
145
|
+
on
|
|
146
|
+
p.c
|
|
147
|
+
p.c.c
|
|
148
|
+
p.es
|
|
149
|
+
p.f
|
|
150
|
+
p.r
|
|
151
|
+
p.v
|
|
152
|
+
post
|
|
153
|
+
pp
|
|
154
|
+
racc
|
|
155
|
+
ric
|
|
156
|
+
s.n.c
|
|
157
|
+
seg
|
|
158
|
+
sgg
|
|
159
|
+
ss
|
|
160
|
+
tel
|
|
161
|
+
u.s
|
|
162
|
+
v.r
|
|
163
|
+
v.s
|
|
164
|
+
|
|
165
|
+
#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
|
|
166
|
+
v
|
|
167
|
+
vs
|
|
168
|
+
i.e
|
|
169
|
+
rev
|
|
170
|
+
e.g
|
|
171
|
+
|
|
172
|
+
#Numbers only. These act as non-breaking prefixes only when followed by a numeric sequence
|
|
173
|
+
# add NUMERIC_ONLY after the word for this function
|
|
174
|
+
#This case is mostly for the English "No." which can either be a sentence of its own, or
|
|
175
|
+
#if followed by a number, a non-breaking prefix
|
|
176
|
+
No #NUMERIC_ONLY#
|
|
177
|
+
Nos
|
|
178
|
+
Art #NUMERIC_ONLY#
|
|
179
|
+
Nr
|
|
180
|
+
pp #NUMERIC_ONLY#
|