OpenCC 1.2.0__cp38-cp38-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opencc/__init__.py +49 -0
- opencc/clib/__init__.py +0 -0
- opencc/clib/bin/opencc +0 -0
- opencc/clib/bin/opencc_dict +0 -0
- opencc/clib/bin/opencc_phrase_extract +0 -0
- opencc/clib/include/opencc/BinaryDict.hpp +53 -0
- opencc/clib/include/opencc/Common.hpp +82 -0
- opencc/clib/include/opencc/Config.hpp +49 -0
- opencc/clib/include/opencc/Conversion.hpp +47 -0
- opencc/clib/include/opencc/ConversionChain.hpp +43 -0
- opencc/clib/include/opencc/Converter.hpp +51 -0
- opencc/clib/include/opencc/DartsDict.hpp +60 -0
- opencc/clib/include/opencc/Dict.hpp +92 -0
- opencc/clib/include/opencc/DictConverter.hpp +32 -0
- opencc/clib/include/opencc/DictEntry.hpp +173 -0
- opencc/clib/include/opencc/DictGroup.hpp +57 -0
- opencc/clib/include/opencc/Exception.hpp +88 -0
- opencc/clib/include/opencc/Export.hpp +40 -0
- opencc/clib/include/opencc/Lexicon.hpp +70 -0
- opencc/clib/include/opencc/MarisaDict.hpp +63 -0
- opencc/clib/include/opencc/MaxMatchSegmentation.hpp +43 -0
- opencc/clib/include/opencc/Optional.hpp +76 -0
- opencc/clib/include/opencc/PhraseExtract.hpp +195 -0
- opencc/clib/include/opencc/Segmentation.hpp +32 -0
- opencc/clib/include/opencc/Segments.hpp +118 -0
- opencc/clib/include/opencc/SerializableDict.hpp +77 -0
- opencc/clib/include/opencc/SerializedValues.hpp +52 -0
- opencc/clib/include/opencc/SimpleConverter.hpp +113 -0
- opencc/clib/include/opencc/TextDict.hpp +60 -0
- opencc/clib/include/opencc/UTF8StringSlice.hpp +246 -0
- opencc/clib/include/opencc/UTF8Util.hpp +291 -0
- opencc/clib/include/opencc/opencc.h +161 -0
- opencc/clib/include/opencc/opencc_config.h +21 -0
- opencc/clib/lib/cmake/opencc/OpenCCConfig.cmake +31 -0
- opencc/clib/lib/cmake/opencc/OpenCCConfigVersion.cmake +65 -0
- opencc/clib/lib/cmake/opencc/OpenCCTargets-release.cmake +29 -0
- opencc/clib/lib/cmake/opencc/OpenCCTargets.cmake +110 -0
- opencc/clib/lib/libmarisa.a +0 -0
- opencc/clib/lib/libopencc.a +0 -0
- opencc/clib/lib/pkgconfig/opencc.pc +11 -0
- opencc/clib/opencc_clib.cpython-38-x86_64-linux-gnu.so +0 -0
- opencc/clib/share/opencc/HKVariants.ocd2 +0 -0
- opencc/clib/share/opencc/HKVariantsRev.ocd2 +0 -0
- opencc/clib/share/opencc/HKVariantsRevPhrases.ocd2 +0 -0
- opencc/clib/share/opencc/JPShinjitaiCharacters.ocd2 +0 -0
- opencc/clib/share/opencc/JPShinjitaiPhrases.ocd2 +0 -0
- opencc/clib/share/opencc/JPVariants.ocd2 +0 -0
- opencc/clib/share/opencc/JPVariantsRev.ocd2 +0 -0
- opencc/clib/share/opencc/STCharacters.ocd2 +0 -0
- opencc/clib/share/opencc/STPhrases.ocd2 +0 -0
- opencc/clib/share/opencc/TSCharacters.ocd2 +0 -0
- opencc/clib/share/opencc/TSPhrases.ocd2 +0 -0
- opencc/clib/share/opencc/TWPhrases.ocd2 +0 -0
- opencc/clib/share/opencc/TWPhrasesRev.ocd2 +0 -0
- opencc/clib/share/opencc/TWVariants.ocd2 +0 -0
- opencc/clib/share/opencc/TWVariantsRev.ocd2 +0 -0
- opencc/clib/share/opencc/TWVariantsRevPhrases.ocd2 +0 -0
- opencc/clib/share/opencc/hk2s.json +33 -0
- opencc/clib/share/opencc/hk2t.json +22 -0
- opencc/clib/share/opencc/jp2t.json +25 -0
- opencc/clib/share/opencc/s2hk.json +27 -0
- opencc/clib/share/opencc/s2t.json +22 -0
- opencc/clib/share/opencc/s2tw.json +27 -0
- opencc/clib/share/opencc/s2twp.json +32 -0
- opencc/clib/share/opencc/t2hk.json +16 -0
- opencc/clib/share/opencc/t2jp.json +16 -0
- opencc/clib/share/opencc/t2s.json +22 -0
- opencc/clib/share/opencc/t2tw.json +16 -0
- opencc/clib/share/opencc/tw2s.json +33 -0
- opencc/clib/share/opencc/tw2sp.json +36 -0
- opencc/clib/share/opencc/tw2t.json +22 -0
- opencc/py.typed +0 -0
- opencc-1.2.0.dist-info/AUTHORS +12 -0
- opencc-1.2.0.dist-info/LICENSE +56 -0
- opencc-1.2.0.dist-info/METADATA +347 -0
- opencc-1.2.0.dist-info/RECORD +78 -0
- opencc-1.2.0.dist-info/WHEEL +5 -0
- opencc-1.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Open Chinese Convert
|
|
3
|
+
*
|
|
4
|
+
* Copyright 2015 Carbo Kuo <byvoid@byvoid.com>
|
|
5
|
+
*
|
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
* you may not use this file except in compliance with the License.
|
|
8
|
+
* You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
* See the License for the specific language governing permissions and
|
|
16
|
+
* limitations under the License.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
#include <cstring>
|
|
20
|
+
|
|
21
|
+
#include "Common.hpp"
|
|
22
|
+
#include "UTF8Util.hpp"
|
|
23
|
+
|
|
24
|
+
namespace opencc {
|
|
25
|
+
|
|
26
|
+
namespace internal {
|
|
27
|
+
|
|
28
|
+
/**
 * Folds a byte range into a hash: every byte is XOR-ed into the running
 * value, which is then multiplied by the prime.
 *
 * @param text             First byte of the range.
 * @param byteLength       Number of bytes to hash; 0 yields FNV_offset_basis.
 * @param FNV_prime        Multiplier applied after each byte.
 * @param FNV_offset_basis Initial hash value.
 * @return The accumulated hash.
 */
inline size_t FNVHash(const char* text, const size_t byteLength,
                      const size_t FNV_prime, const size_t FNV_offset_basis) {
  size_t hash = FNV_offset_basis;
  const char* const end = text + byteLength;
  for (const char* pstr = text; pstr < end; pstr++) {
    // Go through unsigned char first: plain char may be signed, and a
    // negative value would sign-extend and flip every high bit of the hash
    // for each non-ASCII byte, making results differ between platforms.
    hash ^= static_cast<size_t>(static_cast<unsigned char>(*pstr));
    hash *= FNV_prime;
  }
  return hash;
}
|
|
37
|
+
|
|
38
|
+
template <int> size_t FNVHash(const char* text, const size_t byteLength);
|
|
39
|
+
|
|
40
|
+
template <>
|
|
41
|
+
inline size_t FNVHash<4>(const char* text, const size_t byteLength) {
|
|
42
|
+
return FNVHash(text, byteLength, 16777619UL, 2166136261UL);
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
#if SIZE_MAX == 0xffffffffffffffff
|
|
46
|
+
template <>
|
|
47
|
+
inline size_t FNVHash<8>(const char* text, const size_t byteLength) {
|
|
48
|
+
return FNVHash(text, byteLength, 1099511628211UL, 14695981039346656037UL);
|
|
49
|
+
}
|
|
50
|
+
#endif
|
|
51
|
+
|
|
52
|
+
} // namespace internal
|
|
53
|
+
|
|
54
|
+
template <typename LENGTH_TYPE> class UTF8StringSliceBase {
|
|
55
|
+
public:
|
|
56
|
+
typedef LENGTH_TYPE LengthType;
|
|
57
|
+
|
|
58
|
+
UTF8StringSliceBase(const char* _str)
|
|
59
|
+
: str(_str), utf8Length(static_cast<LengthType>(UTF8Util::Length(_str))),
|
|
60
|
+
byteLength(static_cast<LengthType>(strlen(_str))) {}
|
|
61
|
+
|
|
62
|
+
UTF8StringSliceBase(const char* _str, const LengthType _utf8Length)
|
|
63
|
+
: str(_str), utf8Length(_utf8Length) {
|
|
64
|
+
CalculateByteLength();
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
UTF8StringSliceBase(const char* _str, const LengthType _utf8Length,
|
|
68
|
+
const LengthType _byteLength)
|
|
69
|
+
: str(_str), utf8Length(_utf8Length), byteLength(_byteLength) {
|
|
70
|
+
CalculateByteLength();
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
/** Number of UTF-8 characters covered by the slice. */
LengthType UTF8Length() const {
  return utf8Length;
}

/** Number of bytes covered by the slice. */
LengthType ByteLength() const {
  return byteLength;
}
|
|
76
|
+
|
|
77
|
+
/**
 * Returns the slice made of the first numberOfCharacters characters.
 * A copy of this slice is returned when the count equals UTF8Length();
 * no other bounds check is performed here.
 */
UTF8StringSliceBase Left(const LengthType numberOfCharacters) const {
  if (numberOfCharacters != UTF8Length()) {
    return UTF8StringSliceBase(str, numberOfCharacters);
  }
  return *this;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Returns the slice made of the last numberOfCharacters characters.
 * A copy of this slice is returned when the count equals UTF8Length();
 * otherwise the start is found by stepping back from the end one character
 * at a time. No bounds check is performed here.
 */
UTF8StringSliceBase Right(const LengthType numberOfCharacters) const {
  if (numberOfCharacters == UTF8Length()) {
    return *this;
  }
  const char* cursor = str + byteLength;
  for (size_t step = 0; step < numberOfCharacters; ++step) {
    cursor = UTF8Util::PrevChar(cursor);
  }
  return UTF8StringSliceBase(cursor, numberOfCharacters);
}
|
|
96
|
+
|
|
97
|
+
/**
 * Returns numberOfCharacters characters starting offset characters into
 * the slice. An offset of 0 is delegated to Left(); otherwise the start is
 * found by stepping forward one character at a time. No bounds check is
 * performed here.
 */
UTF8StringSliceBase SubString(const LengthType offset,
                              const LengthType numberOfCharacters) const {
  if (offset == 0) {
    return Left(numberOfCharacters);
  }
  const char* cursor = str;
  for (size_t skipped = 0; skipped < offset; ++skipped) {
    cursor = UTF8Util::NextChar(cursor);
  }
  return UTF8StringSliceBase(cursor, numberOfCharacters);
}
|
|
109
|
+
|
|
110
|
+
std::string ToString() const { return std::string(str, str + byteLength); }
|
|
111
|
+
|
|
112
|
+
const char* CString() const { return str; }
|
|
113
|
+
|
|
114
|
+
/**
 * Counts the leading characters shared by this slice and `that`.
 *
 * @param that Slice to compare against.
 * @return Number of identical leading characters, at most the shorter of
 *         the two character counts.
 * @throws InvalidUTF8 via UTF8Util::NextCharLength on a malformed lead byte.
 */
LengthType CommonPrefixLength(const UTF8StringSliceBase& that) const {
  const LengthType limit = (std::min)(utf8Length, that.utf8Length);
  if (str == that.str) {
    // Same starting address: the shorter slice is entirely shared.
    return limit;
  }
  const char* pstr1 = str;
  const char* pstr2 = that.str;
  for (size_t length = 0; length < limit; length++) {
    const size_t charLen1 = UTF8Util::NextCharLength(pstr1);
    const size_t charLen2 = UTF8Util::NextCharLength(pstr2);
    if (charLen1 != charLen2 || strncmp(pstr1, pstr2, charLen1) != 0) {
      return static_cast<LengthType>(length);
    }
    pstr1 += charLen1;
    pstr2 += charLen2;
  }
  // No mismatch within the shorter slice, so all of it is a common prefix.
  // (Previously this path returned 0, disagreeing with the same-pointer
  // branch above.)
  return limit;
}
|
|
133
|
+
|
|
134
|
+
void MoveRight() {
|
|
135
|
+
if (utf8Length > 0) {
|
|
136
|
+
const size_t charLen = UTF8Util::NextCharLength(str);
|
|
137
|
+
str += charLen;
|
|
138
|
+
utf8Length--;
|
|
139
|
+
byteLength -= charLen;
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
void MoveLeft() {
|
|
144
|
+
if (utf8Length > 0) {
|
|
145
|
+
const size_t charLen = UTF8Util::PrevCharLength(str + byteLength);
|
|
146
|
+
utf8Length--;
|
|
147
|
+
byteLength -= charLen;
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
int ReverseCompare(const UTF8StringSliceBase& that) const {
|
|
152
|
+
const char* pstr1 = str + byteLength;
|
|
153
|
+
const char* pstr2 = that.str + that.byteLength;
|
|
154
|
+
const size_t length = (std::min)(utf8Length, that.utf8Length);
|
|
155
|
+
for (size_t i = 0; i < length; i++) {
|
|
156
|
+
const size_t charLen1 = UTF8Util::PrevCharLength(pstr1);
|
|
157
|
+
const size_t charLen2 = UTF8Util::PrevCharLength(pstr2);
|
|
158
|
+
pstr1 -= charLen1;
|
|
159
|
+
pstr2 -= charLen2;
|
|
160
|
+
const int cmp = strncmp(pstr1, pstr2, (std::min)(charLen1, charLen2));
|
|
161
|
+
if (cmp < 0) {
|
|
162
|
+
return -1;
|
|
163
|
+
} else if (cmp > 0) {
|
|
164
|
+
return 1;
|
|
165
|
+
} else if (charLen1 < charLen2) {
|
|
166
|
+
return -1;
|
|
167
|
+
} else if (charLen1 > charLen2) {
|
|
168
|
+
return 1;
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
if (utf8Length < that.utf8Length) {
|
|
172
|
+
return -1;
|
|
173
|
+
} else if (utf8Length > that.utf8Length) {
|
|
174
|
+
return 1;
|
|
175
|
+
} else {
|
|
176
|
+
return 0;
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
/**
 * Searches a copy of this slice for the bytes of `pattern` and returns the
 * byte offset of the first match.
 *
 * The result of std::string::find is cast to LengthType as-is, so a miss
 * is reported as std::string::npos converted to LengthType.
 */
LengthType FindBytePosition(const UTF8StringSliceBase& pattern) const {
  const std::string haystack = ToString();
  const std::string::size_type position =
      haystack.find(pattern.str, 0, pattern.byteLength);
  return static_cast<LengthType>(position);
}
|
|
184
|
+
|
|
185
|
+
bool operator<(const UTF8StringSliceBase& that) const {
|
|
186
|
+
return Compare(that) < 0;
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
bool operator>(const UTF8StringSliceBase& that) const {
|
|
190
|
+
return Compare(that) > 0;
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
bool operator==(const UTF8StringSliceBase& that) const {
|
|
194
|
+
return (str == that.str && utf8Length == that.utf8Length) ||
|
|
195
|
+
Compare(that) == 0;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
bool operator!=(const UTF8StringSliceBase& that) const {
|
|
199
|
+
return !this->operator==(that);
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
class Hasher {
|
|
203
|
+
public:
|
|
204
|
+
size_t operator()(const UTF8StringSliceBase& text) const {
|
|
205
|
+
return internal::FNVHash<sizeof(size_t)>(text.CString(),
|
|
206
|
+
text.ByteLength());
|
|
207
|
+
}
|
|
208
|
+
};
|
|
209
|
+
|
|
210
|
+
private:
|
|
211
|
+
inline int Compare(const UTF8StringSliceBase& that) const {
|
|
212
|
+
int cmp = strncmp(str, that.str, (std::min)(byteLength, that.byteLength));
|
|
213
|
+
if (cmp == 0) {
|
|
214
|
+
if (utf8Length < that.utf8Length) {
|
|
215
|
+
cmp = -1;
|
|
216
|
+
} else if (utf8Length > that.utf8Length) {
|
|
217
|
+
cmp = 1;
|
|
218
|
+
} else {
|
|
219
|
+
cmp = 0;
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
return cmp;
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
void CalculateByteLength() {
|
|
226
|
+
const char* pstr = str;
|
|
227
|
+
for (size_t i = 0; i < utf8Length; i++) {
|
|
228
|
+
pstr = UTF8Util::NextChar(pstr);
|
|
229
|
+
}
|
|
230
|
+
byteLength = static_cast<LengthType>(pstr - str);
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
const char* str;
|
|
234
|
+
LengthType utf8Length;
|
|
235
|
+
LengthType byteLength;
|
|
236
|
+
};
|
|
237
|
+
|
|
238
|
+
// Slice whose lengths are stored as size_t.
using UTF8StringSlice = UTF8StringSliceBase<size_t>;
|
|
239
|
+
|
|
240
|
+
template <typename LENGTH_TYPE>
|
|
241
|
+
std::ostream& operator<<(::std::ostream& os,
|
|
242
|
+
const UTF8StringSliceBase<LENGTH_TYPE>& str) {
|
|
243
|
+
return os << str.ToString();
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
} // namespace opencc
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Open Chinese Convert
|
|
3
|
+
*
|
|
4
|
+
* Copyright 2013 Carbo Kuo <byvoid@byvoid.com>
|
|
5
|
+
*
|
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
* you may not use this file except in compliance with the License.
|
|
8
|
+
* You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
* See the License for the specific language governing permissions and
|
|
16
|
+
* limitations under the License.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
#pragma once
|
|
20
|
+
|
|
21
|
+
#ifdef _MSC_VER
|
|
22
|
+
#ifndef NOMINMAX
|
|
23
|
+
#define NOMINMAX
|
|
24
|
+
#endif
|
|
25
|
+
#include <Windows.h>
|
|
26
|
+
#endif // _MSC_VER
|
|
27
|
+
|
|
28
|
+
#include <cstring>
|
|
29
|
+
|
|
30
|
+
#include "Common.hpp"
|
|
31
|
+
#include "Exception.hpp"
|
|
32
|
+
|
|
33
|
+
namespace opencc {
|
|
34
|
+
/**
|
|
35
|
+
* UTF-8 string utilities
|
|
36
|
+
* @ingroup opencc_cpp_api
|
|
37
|
+
*/
|
|
38
|
+
class OPENCC_EXPORT UTF8Util {
|
|
39
|
+
public:
|
|
40
|
+
/**
|
|
41
|
+
* Detect UTF8 BOM and skip it.
|
|
42
|
+
*/
|
|
43
|
+
static void SkipUtf8Bom(FILE* fp);
|
|
44
|
+
|
|
45
|
+
/**
 * Classifies the byte at str as a UTF-8 lead byte and returns the length
 * in bytes (1 to 6) of the sequence it introduces.
 * Returns 0 when the byte is not a lead byte (10xxxxxx, 0xFE or 0xFF).
 */
static size_t NextCharLengthNoException(const char* str) {
  const unsigned char lead = static_cast<unsigned char>(*str);
  if (lead < 0x80) {
    return 1; // 0xxxxxxx
  }
  if (lead < 0xC0) {
    return 0; // 10xxxxxx: continuation byte, not a lead
  }
  if (lead < 0xE0) {
    return 2; // 110xxxxx
  }
  if (lead < 0xF0) {
    return 3; // 1110xxxx
  }
  if (lead < 0xF8) {
    return 4; // 11110xxx
  }
  if (lead < 0xFC) {
    return 5; // 111110xx
  }
  if (lead < 0xFE) {
    return 6; // 1111110x
  }
  return 0; // 0xFE / 0xFF
}
|
|
66
|
+
|
|
67
|
+
/**
|
|
68
|
+
* Returns the length in byte for the next UTF8 character.
|
|
69
|
+
*/
|
|
70
|
+
static size_t NextCharLength(const char* str) {
|
|
71
|
+
size_t length = NextCharLengthNoException(str);
|
|
72
|
+
if (length == 0) {
|
|
73
|
+
throw InvalidUTF8(str);
|
|
74
|
+
}
|
|
75
|
+
return length;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* Returns the length in byte for the previous UTF8 character.
|
|
80
|
+
*/
|
|
81
|
+
static size_t PrevCharLength(const char* str) {
|
|
82
|
+
{
|
|
83
|
+
const size_t length = NextCharLengthNoException(str - 3);
|
|
84
|
+
if (length == 3) {
|
|
85
|
+
return length;
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
{
|
|
89
|
+
const size_t length = NextCharLengthNoException(str - 1);
|
|
90
|
+
if (length == 1) {
|
|
91
|
+
return length;
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
{
|
|
95
|
+
const size_t length = NextCharLengthNoException(str - 2);
|
|
96
|
+
if (length == 2) {
|
|
97
|
+
return length;
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
for (size_t i = 4; i <= 6; i++) {
|
|
101
|
+
const size_t length = NextCharLengthNoException(str - i);
|
|
102
|
+
if (length == i) {
|
|
103
|
+
return length;
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
throw InvalidUTF8(str);
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* Returns the char* pointer over the next UTF8 character.
|
|
111
|
+
*/
|
|
112
|
+
static const char* NextChar(const char* str) {
|
|
113
|
+
return str + NextCharLength(str);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/**
|
|
117
|
+
* Move the char* pointer before the previous UTF8 character.
|
|
118
|
+
*/
|
|
119
|
+
static const char* PrevChar(const char* str) {
|
|
120
|
+
return str - PrevCharLength(str);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
/**
|
|
124
|
+
* Returns the UTF8 length of a valid UTF8 std::string.
|
|
125
|
+
*/
|
|
126
|
+
static size_t Length(const char* str) {
|
|
127
|
+
size_t length = 0;
|
|
128
|
+
while (*str != '\0') {
|
|
129
|
+
str = NextChar(str);
|
|
130
|
+
length++;
|
|
131
|
+
}
|
|
132
|
+
return length;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Finds a character in the same line.
|
|
137
|
+
* @param str The text to be searched in.
|
|
138
|
+
* @param ch The character to find.
|
|
139
|
+
* @return The pointer that points to the found chacter in str or EOL/EOF.
|
|
140
|
+
*/
|
|
141
|
+
static const char* FindNextInline(const char* str, const char ch) {
|
|
142
|
+
while (!IsLineEndingOrFileEnding(*str) && *str != ch) {
|
|
143
|
+
str = NextChar(str);
|
|
144
|
+
}
|
|
145
|
+
return str;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/**
 * Returns true when ch is NUL, line feed or carriage return.
 */
static bool IsLineEndingOrFileEnding(const char ch) {
  switch (ch) {
  case '\0':
  case '\n':
  case '\r':
    return true;
  default:
    return false;
  }
}
|
|
154
|
+
|
|
155
|
+
/**
 * Copies a substring with the given length to a new string.
 *
 * The result always has exactly `length` bytes. Copying follows strncpy:
 * it stops at the first NUL in str and the remainder is zero-filled.
 *
 * @param str    Source bytes.
 * @param length Size of the returned string in bytes.
 * @return The new string.
 */
static std::string FromSubstr(const char* str, size_t length) {
  std::string newStr(length, '\0');
  if (length > 0) {
    // Write through &newStr[0], the string's own mutable storage, rather
    // than casting away the const of c_str().
    strncpy(&newStr[0], str, length);
  }
  return newStr;
}
|
|
164
|
+
|
|
165
|
+
/**
 * Returns true when the NUL-terminated string has at least byteLength
 * bytes before its terminator. At most byteLength bytes are inspected.
 */
static bool NotShorterThan(const char* str, size_t byteLength) {
  for (size_t index = 0; index < byteLength; ++index) {
    if (str[index] == '\0') {
      return false;
    }
  }
  return true;
}
|
|
179
|
+
|
|
180
|
+
/**
|
|
181
|
+
* Truncates a std::string with a maximal length in byte.
|
|
182
|
+
* No UTF8 character will be broken.
|
|
183
|
+
*/
|
|
184
|
+
static std::string TruncateUTF8(const char* str, size_t maxByteLength) {
|
|
185
|
+
std::string wordTrunc;
|
|
186
|
+
if (NotShorterThan(str, maxByteLength)) {
|
|
187
|
+
size_t len = 0;
|
|
188
|
+
const char* pStr = str;
|
|
189
|
+
for (;;) {
|
|
190
|
+
const size_t charLength = NextCharLength(pStr);
|
|
191
|
+
if (len + charLength > maxByteLength) {
|
|
192
|
+
break;
|
|
193
|
+
}
|
|
194
|
+
pStr += charLength;
|
|
195
|
+
len += charLength;
|
|
196
|
+
}
|
|
197
|
+
wordTrunc = FromSubstr(str, len);
|
|
198
|
+
} else {
|
|
199
|
+
wordTrunc = str;
|
|
200
|
+
}
|
|
201
|
+
return wordTrunc;
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
/**
 * Replaces every occurrence of `from` in str with `to`, in place.
 *
 * Scanning resumes after each inserted replacement, so text produced by a
 * replacement is never matched again. An empty `from` leaves str
 * unchanged: an empty pattern matches at every position, so the loop would
 * otherwise keep inserting `to` forever.
 *
 * @param str  String to modify.
 * @param from NUL-terminated pattern to search for.
 * @param to   NUL-terminated replacement text.
 */
static void ReplaceAll(std::string& str, const char* from, const char* to) {
  const std::string::size_type fromLen = strlen(from);
  if (fromLen == 0) {
    return;
  }
  const std::string::size_type toLen = strlen(to);
  std::string::size_type pos = 0;
  // Pass the precomputed lengths so neither call has to re-measure the
  // C strings on every iteration.
  while ((pos = str.find(from, pos, fromLen)) != std::string::npos) {
    str.replace(pos, fromLen, to, toLen);
    pos += toLen;
  }
}
|
|
216
|
+
|
|
217
|
+
/**
 * Concatenates the strings of a vector, writing the separator between
 * consecutive elements (never before the first or after the last).
 */
static std::string Join(const std::vector<std::string>& strings,
                        const std::string& separator) {
  std::ostringstream buffer;
  for (auto it = strings.begin(); it != strings.end(); ++it) {
    if (it != strings.begin()) {
      buffer << separator;
    }
    buffer << *it;
  }
  return buffer.str();
}
|
|
233
|
+
|
|
234
|
+
/**
 * Concatenates the strings of a vector with nothing between them.
 */
static std::string Join(const std::vector<std::string>& strings) {
  std::ostringstream buffer;
  for (auto it = strings.begin(); it != strings.end(); ++it) {
    buffer << *it;
  }
  return buffer.str();
}
|
|
244
|
+
|
|
245
|
+
static void GetByteMap(const char* str, const size_t utf8Length,
|
|
246
|
+
std::vector<size_t>* byteMap) {
|
|
247
|
+
if (byteMap->size() < utf8Length) {
|
|
248
|
+
byteMap->resize(utf8Length);
|
|
249
|
+
}
|
|
250
|
+
const char* pstr = str;
|
|
251
|
+
for (size_t i = 0; i < utf8Length; i++) {
|
|
252
|
+
(*byteMap)[i] = pstr - str;
|
|
253
|
+
pstr = NextChar(pstr);
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
#ifndef _MSC_VER
  /** Without _MSC_VER the string is returned unchanged. */
  static std::string GetPlatformString(const std::string& str) {
    return str;
  }
#else
  /** With _MSC_VER the UTF-8 string is converted through U8ToU16. */
  static std::wstring GetPlatformString(const std::string& str) {
    return U8ToU16(str);
  }
#endif // _MSC_VER
|
|
264
|
+
|
|
265
|
+
#ifdef _MSC_VER
  /**
   * Converts a wide string to UTF-8 with WideCharToMultiByte. The first
   * call measures the output, the second fills it. An empty string is
   * returned when the measured size is not positive.
   */
  static std::string U16ToU8(const std::wstring& wstr) {
    const int wideLength = static_cast<int>(wstr.length());
    const int needed = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(),
                                           wideLength, NULL, 0, NULL, NULL);
    std::string utf8;
    if (needed <= 0) {
      return utf8;
    }
    utf8.resize(needed);
    WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wideLength, &utf8[0],
                        needed, NULL, NULL);
    return utf8;
  }

  /**
   * Converts a UTF-8 string to a wide string with MultiByteToWideChar. The
   * first call measures the output, the second fills it. An empty string is
   * returned when the measured size is not positive.
   */
  static std::wstring U8ToU16(const std::string& str) {
    const int byteCount = static_cast<int>(str.length());
    const int needed =
        MultiByteToWideChar(CP_UTF8, 0, str.c_str(), byteCount, NULL, 0);
    std::wstring wide;
    if (needed <= 0) {
      return wide;
    }
    wide.resize(needed);
    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), byteCount, &wide[0], needed);
    return wide;
  }
#endif // _MSC_VER
|
|
290
|
+
};
|
|
291
|
+
} // namespace opencc
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Open Chinese Convert
|
|
3
|
+
*
|
|
4
|
+
* Copyright 2010-2014 Carbo Kuo <byvoid@byvoid.com>
|
|
5
|
+
*
|
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
* you may not use this file except in compliance with the License.
|
|
8
|
+
* You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
* See the License for the specific language governing permissions and
|
|
16
|
+
* limitations under the License.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
#ifndef __OPENCC_H_
|
|
20
|
+
#define __OPENCC_H_
|
|
21
|
+
|
|
22
|
+
#ifdef __cplusplus
|
|
23
|
+
|
|
24
|
+
#include "Export.hpp"
|
|
25
|
+
#include "SimpleConverter.hpp"
|
|
26
|
+
#include <string>
|
|
27
|
+
|
|
28
|
+
extern "C" {
|
|
29
|
+
#else
|
|
30
|
+
#include <stddef.h>
|
|
31
|
+
#endif
|
|
32
|
+
|
|
33
|
+
#ifndef OPENCC_EXPORT
|
|
34
|
+
#define OPENCC_EXPORT
|
|
35
|
+
#endif
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* @defgroup opencc_c_api OpenCC C API
|
|
39
|
+
*
|
|
40
|
+
* API in C language
|
|
41
|
+
*/
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Filename of default Simplified to Traditional configuration
|
|
45
|
+
*
|
|
46
|
+
* @ingroup opencc_c_api
|
|
47
|
+
*/
|
|
48
|
+
#define OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD "s2t.json"
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Filename of default Traditional to Simplified configuration
|
|
52
|
+
*
|
|
53
|
+
* @ingroup opencc_c_api
|
|
54
|
+
*/
|
|
55
|
+
#define OPENCC_DEFAULT_CONFIG_TRAD_TO_SIMP "t2s.json"
|
|
56
|
+
|
|
57
|
+
/**
|
|
58
|
+
* Type of opencc descriptor
|
|
59
|
+
*
|
|
60
|
+
* @ingroup opencc_c_api
|
|
61
|
+
*/
|
|
62
|
+
typedef void* opencc_t;
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* Makes an instance of opencc
|
|
66
|
+
*
|
|
67
|
+
* @param configFileName Location of configuration file. If this is set to NULL,
|
|
68
|
+
* OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD will be loaded.
|
|
69
|
+
* @return A description pointer of the newly allocated instance of
|
|
70
|
+
* opencc. On error the return value will be (opencc_t) -1.
|
|
71
|
+
* @ingroup opencc_c_api
|
|
72
|
+
*/
|
|
73
|
+
OPENCC_EXPORT opencc_t opencc_open(const char* configFileName);
|
|
74
|
+
#ifdef _MSC_VER
|
|
75
|
+
/**
|
|
76
|
+
* Makes an instance of opencc (wide char / Unicode)
|
|
77
|
+
*
|
|
78
|
+
* @param configFileName Location of configuration file. If this is set to NULL,
|
|
79
|
+
* OPENCC_DEFAULT_CONFIG_SIMP_TO_TRAD will be loaded.
|
|
80
|
+
* @return A description pointer of the newly allocated instance of
|
|
81
|
+
* opencc. On error the return value will be (opencc_t) -1.
|
|
82
|
+
* @ingroup opencc_c_api
|
|
83
|
+
*/
|
|
84
|
+
OPENCC_EXPORT opencc_t opencc_open_w(const wchar_t* configFileName);
|
|
85
|
+
#endif /* _MSC_VER */
|
|
86
|
+
|
|
87
|
+
/**
|
|
88
|
+
* Destroys an instance of opencc
|
|
89
|
+
*
|
|
90
|
+
* @param opencc The description pointer.
|
|
91
|
+
* @return 0 on success or non-zero number on failure.
|
|
92
|
+
* @ingroup opencc_c_api
|
|
93
|
+
*/
|
|
94
|
+
OPENCC_EXPORT int opencc_close(opencc_t opencc);
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* Converts a UTF-8 string
|
|
98
|
+
*
|
|
99
|
+
* @param opencc The opencc description pointer.
|
|
100
|
+
* @param input The UTF-8 encoded string.
|
|
101
|
+
* @param length The maximum length in byte to convert. If length is (size_t)-1,
|
|
102
|
+
* the whole string (terminated by '\0') will be converted.
|
|
103
|
+
* @param output The buffer to store converted text. You MUST make sure this
|
|
104
|
+
* buffer has sufficient space.
|
|
105
|
+
*
|
|
106
|
+
* @return The length of the converted string or (size_t)-1 on error.
|
|
107
|
+
*
|
|
108
|
+
* @ingroup opencc_c_api
|
|
109
|
+
*/
|
|
110
|
+
OPENCC_EXPORT size_t opencc_convert_utf8_to_buffer(opencc_t opencc,
|
|
111
|
+
const char* input,
|
|
112
|
+
size_t length, char* output);
|
|
113
|
+
|
|
114
|
+
/**
|
|
115
|
+
* Converts a UTF-8 string
|
|
116
|
+
* This function returns an allocated C-style string, which stores
|
|
117
|
+
* the converted string.
|
|
118
|
+
* You MUST call opencc_convert_utf8_free() to release allocated memory.
|
|
119
|
+
*
|
|
120
|
+
* @param opencc The opencc description pointer.
|
|
121
|
+
* @param input The UTF-8 encoded string.
|
|
122
|
+
* @param length The maximum length in byte to convert. If length is (size_t)-1,
|
|
123
|
+
* the whole string (terminated by '\0') will be converted.
|
|
124
|
+
*
|
|
125
|
+
* @return The newly allocated UTF-8 string that stores text
|
|
126
|
+
* converted, or NULL on error.
|
|
127
|
+
* @ingroup opencc_c_api
|
|
128
|
+
*/
|
|
129
|
+
OPENCC_EXPORT char* opencc_convert_utf8(opencc_t opencc, const char* input,
|
|
130
|
+
size_t length);
|
|
131
|
+
|
|
132
|
+
/**
|
|
133
|
+
* Releases allocated buffer by opencc_convert_utf8
|
|
134
|
+
*
|
|
135
|
+
* @param str Pointer to the string buffer allocated by
|
|
136
|
+
* opencc_convert_utf8.
|
|
137
|
+
*
|
|
138
|
+
* @ingroup opencc_c_api
|
|
139
|
+
*/
|
|
140
|
+
OPENCC_EXPORT void opencc_convert_utf8_free(char* str);
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* Returns the last error message
|
|
144
|
+
*
|
|
145
|
+
* Note that this function is the only one which is NOT thread-safe.
|
|
146
|
+
*
|
|
147
|
+
* @ingroup opencc_c_api
|
|
148
|
+
*/
|
|
149
|
+
OPENCC_EXPORT const char* opencc_error(void);
|
|
150
|
+
|
|
151
|
+
#ifdef __cplusplus
|
|
152
|
+
} // extern "C"
|
|
153
|
+
#endif
|
|
154
|
+
|
|
155
|
+
/**
|
|
156
|
+
* @defgroup opencc_cpp_api OpenCC C++ Comprehensive API
|
|
157
|
+
*
|
|
158
|
+
* Comprehensive API in C++ language
|
|
159
|
+
*/
|
|
160
|
+
|
|
161
|
+
#endif
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Open Chinese Convert
|
|
3
|
+
*
|
|
4
|
+
* Copyright 2021 Carbo Kuo <byvoid@byvoid.com>
|
|
5
|
+
*
|
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
* you may not use this file except in compliance with the License.
|
|
8
|
+
* You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
* See the License for the specific language governing permissions and
|
|
16
|
+
* limitations under the License.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
#pragma once
|
|
20
|
+
|
|
21
|
+
#define OPENCC_ENABLE_DARTS
|