OpenCC 1.2.0__cp38-cp38-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opencc/__init__.py +49 -0
- opencc/clib/__init__.py +0 -0
- opencc/clib/bin/opencc +0 -0
- opencc/clib/bin/opencc_dict +0 -0
- opencc/clib/bin/opencc_phrase_extract +0 -0
- opencc/clib/include/opencc/BinaryDict.hpp +53 -0
- opencc/clib/include/opencc/Common.hpp +82 -0
- opencc/clib/include/opencc/Config.hpp +49 -0
- opencc/clib/include/opencc/Conversion.hpp +47 -0
- opencc/clib/include/opencc/ConversionChain.hpp +43 -0
- opencc/clib/include/opencc/Converter.hpp +51 -0
- opencc/clib/include/opencc/DartsDict.hpp +60 -0
- opencc/clib/include/opencc/Dict.hpp +92 -0
- opencc/clib/include/opencc/DictConverter.hpp +32 -0
- opencc/clib/include/opencc/DictEntry.hpp +173 -0
- opencc/clib/include/opencc/DictGroup.hpp +57 -0
- opencc/clib/include/opencc/Exception.hpp +88 -0
- opencc/clib/include/opencc/Export.hpp +40 -0
- opencc/clib/include/opencc/Lexicon.hpp +70 -0
- opencc/clib/include/opencc/MarisaDict.hpp +63 -0
- opencc/clib/include/opencc/MaxMatchSegmentation.hpp +43 -0
- opencc/clib/include/opencc/Optional.hpp +76 -0
- opencc/clib/include/opencc/PhraseExtract.hpp +195 -0
- opencc/clib/include/opencc/Segmentation.hpp +32 -0
- opencc/clib/include/opencc/Segments.hpp +118 -0
- opencc/clib/include/opencc/SerializableDict.hpp +77 -0
- opencc/clib/include/opencc/SerializedValues.hpp +52 -0
- opencc/clib/include/opencc/SimpleConverter.hpp +113 -0
- opencc/clib/include/opencc/TextDict.hpp +60 -0
- opencc/clib/include/opencc/UTF8StringSlice.hpp +246 -0
- opencc/clib/include/opencc/UTF8Util.hpp +291 -0
- opencc/clib/include/opencc/opencc.h +161 -0
- opencc/clib/include/opencc/opencc_config.h +21 -0
- opencc/clib/lib/cmake/opencc/OpenCCConfig.cmake +31 -0
- opencc/clib/lib/cmake/opencc/OpenCCConfigVersion.cmake +65 -0
- opencc/clib/lib/cmake/opencc/OpenCCTargets-release.cmake +29 -0
- opencc/clib/lib/cmake/opencc/OpenCCTargets.cmake +110 -0
- opencc/clib/lib/libmarisa.a +0 -0
- opencc/clib/lib/libopencc.a +0 -0
- opencc/clib/lib/pkgconfig/opencc.pc +11 -0
- opencc/clib/opencc_clib.cpython-38-x86_64-linux-gnu.so +0 -0
- opencc/clib/share/opencc/HKVariants.ocd2 +0 -0
- opencc/clib/share/opencc/HKVariantsRev.ocd2 +0 -0
- opencc/clib/share/opencc/HKVariantsRevPhrases.ocd2 +0 -0
- opencc/clib/share/opencc/JPShinjitaiCharacters.ocd2 +0 -0
- opencc/clib/share/opencc/JPShinjitaiPhrases.ocd2 +0 -0
- opencc/clib/share/opencc/JPVariants.ocd2 +0 -0
- opencc/clib/share/opencc/JPVariantsRev.ocd2 +0 -0
- opencc/clib/share/opencc/STCharacters.ocd2 +0 -0
- opencc/clib/share/opencc/STPhrases.ocd2 +0 -0
- opencc/clib/share/opencc/TSCharacters.ocd2 +0 -0
- opencc/clib/share/opencc/TSPhrases.ocd2 +0 -0
- opencc/clib/share/opencc/TWPhrases.ocd2 +0 -0
- opencc/clib/share/opencc/TWPhrasesRev.ocd2 +0 -0
- opencc/clib/share/opencc/TWVariants.ocd2 +0 -0
- opencc/clib/share/opencc/TWVariantsRev.ocd2 +0 -0
- opencc/clib/share/opencc/TWVariantsRevPhrases.ocd2 +0 -0
- opencc/clib/share/opencc/hk2s.json +33 -0
- opencc/clib/share/opencc/hk2t.json +22 -0
- opencc/clib/share/opencc/jp2t.json +25 -0
- opencc/clib/share/opencc/s2hk.json +27 -0
- opencc/clib/share/opencc/s2t.json +22 -0
- opencc/clib/share/opencc/s2tw.json +27 -0
- opencc/clib/share/opencc/s2twp.json +32 -0
- opencc/clib/share/opencc/t2hk.json +16 -0
- opencc/clib/share/opencc/t2jp.json +16 -0
- opencc/clib/share/opencc/t2s.json +22 -0
- opencc/clib/share/opencc/t2tw.json +16 -0
- opencc/clib/share/opencc/tw2s.json +33 -0
- opencc/clib/share/opencc/tw2sp.json +36 -0
- opencc/clib/share/opencc/tw2t.json +22 -0
- opencc/py.typed +0 -0
- opencc-1.2.0.dist-info/AUTHORS +12 -0
- opencc-1.2.0.dist-info/LICENSE +56 -0
- opencc-1.2.0.dist-info/METADATA +347 -0
- opencc-1.2.0.dist-info/RECORD +78 -0
- opencc-1.2.0.dist-info/WHEEL +5 -0
- opencc-1.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Open Chinese Convert
|
|
3
|
+
*
|
|
4
|
+
* Copyright 2015 Carbo Kuo <byvoid@byvoid.com>
|
|
5
|
+
*
|
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
* you may not use this file except in compliance with the License.
|
|
8
|
+
* You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
* See the License for the specific language governing permissions and
|
|
16
|
+
* limitations under the License.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
#pragma once
|
|
20
|
+
|
|
21
|
+
#include <functional>
|
|
22
|
+
#include <unordered_map>
|
|
23
|
+
|
|
24
|
+
#include "Common.hpp"
|
|
25
|
+
#include "UTF8StringSlice.hpp"
|
|
26
|
+
|
|
27
|
+
namespace opencc {
|
|
28
|
+
|
|
29
|
+
class OPENCC_EXPORT PhraseExtract {
|
|
30
|
+
public:
|
|
31
|
+
typedef UTF8StringSlice::LengthType LengthType;
|
|
32
|
+
|
|
33
|
+
typedef UTF8StringSliceBase<unsigned char> UTF8StringSlice8Bit;
|
|
34
|
+
|
|
35
|
+
PhraseExtract();
|
|
36
|
+
|
|
37
|
+
virtual ~PhraseExtract();
|
|
38
|
+
|
|
39
|
+
void Extract(const std::string& text) {
|
|
40
|
+
SetFullText(text);
|
|
41
|
+
ExtractSuffixes();
|
|
42
|
+
CalculateFrequency();
|
|
43
|
+
CalculateSuffixEntropy();
|
|
44
|
+
ReleaseSuffixes();
|
|
45
|
+
ExtractPrefixes();
|
|
46
|
+
CalculatePrefixEntropy();
|
|
47
|
+
ReleasePrefixes();
|
|
48
|
+
ExtractWordCandidates();
|
|
49
|
+
CalculateCohesions();
|
|
50
|
+
SelectWords();
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
void SetFullText(const std::string& fullText) {
|
|
54
|
+
utf8FullText = UTF8StringSlice(fullText.c_str());
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
void SetFullText(const char* fullText) {
|
|
58
|
+
utf8FullText = UTF8StringSlice(fullText);
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
void SetFullText(const UTF8StringSlice& fullText) { utf8FullText = fullText; }
|
|
62
|
+
|
|
63
|
+
void SetWordMinLength(const LengthType _wordMinLength) {
|
|
64
|
+
wordMinLength = _wordMinLength;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
void SetWordMaxLength(const LengthType _wordMaxLength) {
|
|
68
|
+
wordMaxLength = _wordMaxLength;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
void SetPrefixSetLength(const LengthType _prefixSetLength) {
|
|
72
|
+
prefixSetLength = _prefixSetLength;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
void SetSuffixSetLength(const LengthType _suffixSetLength) {
|
|
76
|
+
suffixSetLength = _suffixSetLength;
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
// PreCalculationFilter is called after frequencies statistics.
|
|
80
|
+
void SetPreCalculationFilter(
|
|
81
|
+
const std::function<bool(const PhraseExtract&,
|
|
82
|
+
const UTF8StringSlice8Bit&)>& filter) {
|
|
83
|
+
preCalculationFilter = filter;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
void SetPostCalculationFilter(
|
|
87
|
+
const std::function<bool(const PhraseExtract&,
|
|
88
|
+
const UTF8StringSlice8Bit&)>& filter) {
|
|
89
|
+
postCalculationFilter = filter;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
void ReleaseSuffixes() { std::vector<UTF8StringSlice8Bit>().swap(suffixes); }
|
|
93
|
+
|
|
94
|
+
void ReleasePrefixes() { std::vector<UTF8StringSlice8Bit>().swap(prefixes); }
|
|
95
|
+
|
|
96
|
+
const std::vector<UTF8StringSlice8Bit>& Words() const { return words; }
|
|
97
|
+
|
|
98
|
+
const std::vector<UTF8StringSlice8Bit>& WordCandidates() const {
|
|
99
|
+
return wordCandidates;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
struct Signals {
|
|
103
|
+
size_t frequency;
|
|
104
|
+
double cohesion;
|
|
105
|
+
double suffixEntropy;
|
|
106
|
+
double prefixEntropy;
|
|
107
|
+
};
|
|
108
|
+
|
|
109
|
+
const Signals& Signal(const UTF8StringSlice8Bit& wordCandidate) const;
|
|
110
|
+
|
|
111
|
+
double Cohesion(const UTF8StringSlice8Bit& wordCandidate) const;
|
|
112
|
+
|
|
113
|
+
double Entropy(const UTF8StringSlice8Bit& wordCandidate) const;
|
|
114
|
+
|
|
115
|
+
double SuffixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
|
|
116
|
+
|
|
117
|
+
double PrefixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
|
|
118
|
+
|
|
119
|
+
size_t Frequency(const UTF8StringSlice8Bit& word) const;
|
|
120
|
+
|
|
121
|
+
double Probability(const UTF8StringSlice8Bit& word) const;
|
|
122
|
+
|
|
123
|
+
double LogProbability(const UTF8StringSlice8Bit& word) const;
|
|
124
|
+
|
|
125
|
+
void Reset();
|
|
126
|
+
|
|
127
|
+
void ExtractSuffixes();
|
|
128
|
+
|
|
129
|
+
void ExtractPrefixes();
|
|
130
|
+
|
|
131
|
+
void ExtractWordCandidates();
|
|
132
|
+
|
|
133
|
+
void CalculateFrequency();
|
|
134
|
+
|
|
135
|
+
void CalculateCohesions();
|
|
136
|
+
|
|
137
|
+
void CalculateSuffixEntropy();
|
|
138
|
+
|
|
139
|
+
void CalculatePrefixEntropy();
|
|
140
|
+
|
|
141
|
+
void SelectWords();
|
|
142
|
+
|
|
143
|
+
static bool
|
|
144
|
+
DefaultPreCalculationFilter(const PhraseExtract&,
|
|
145
|
+
const PhraseExtract::UTF8StringSlice8Bit&);
|
|
146
|
+
|
|
147
|
+
static bool
|
|
148
|
+
DefaultPostCalculationFilter(const PhraseExtract&,
|
|
149
|
+
const PhraseExtract::UTF8StringSlice8Bit&);
|
|
150
|
+
|
|
151
|
+
private:
|
|
152
|
+
class DictType;
|
|
153
|
+
|
|
154
|
+
// Pointwise Mutual Information
|
|
155
|
+
double PMI(const UTF8StringSlice8Bit& wordCandidate,
|
|
156
|
+
const UTF8StringSlice8Bit& part1,
|
|
157
|
+
const UTF8StringSlice8Bit& part2) const;
|
|
158
|
+
|
|
159
|
+
double CalculateCohesion(const UTF8StringSlice8Bit& wordCandidate) const;
|
|
160
|
+
|
|
161
|
+
double CalculateEntropy(
|
|
162
|
+
const std::unordered_map<UTF8StringSlice8Bit, size_t,
|
|
163
|
+
UTF8StringSlice8Bit::Hasher>& choices) const;
|
|
164
|
+
|
|
165
|
+
LengthType wordMinLength;
|
|
166
|
+
LengthType wordMaxLength;
|
|
167
|
+
LengthType prefixSetLength;
|
|
168
|
+
LengthType suffixSetLength;
|
|
169
|
+
std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
|
|
170
|
+
preCalculationFilter;
|
|
171
|
+
std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
|
|
172
|
+
postCalculationFilter;
|
|
173
|
+
|
|
174
|
+
bool prefixesExtracted;
|
|
175
|
+
bool suffixesExtracted;
|
|
176
|
+
bool frequenciesCalculated;
|
|
177
|
+
bool wordCandidatesExtracted;
|
|
178
|
+
bool cohesionsCalculated;
|
|
179
|
+
bool prefixEntropiesCalculated;
|
|
180
|
+
bool suffixEntropiesCalculated;
|
|
181
|
+
bool wordsSelected;
|
|
182
|
+
|
|
183
|
+
UTF8StringSlice utf8FullText;
|
|
184
|
+
size_t totalOccurrence;
|
|
185
|
+
double logTotalOccurrence;
|
|
186
|
+
std::vector<UTF8StringSlice8Bit> prefixes;
|
|
187
|
+
std::vector<UTF8StringSlice8Bit> suffixes;
|
|
188
|
+
std::vector<UTF8StringSlice8Bit> wordCandidates;
|
|
189
|
+
std::vector<UTF8StringSlice8Bit> words;
|
|
190
|
+
DictType* signals;
|
|
191
|
+
|
|
192
|
+
friend class PhraseExtractTest;
|
|
193
|
+
};
|
|
194
|
+
|
|
195
|
+
} // namespace opencc
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Open Chinese Convert
|
|
3
|
+
*
|
|
4
|
+
* Copyright 2010-2014 Carbo Kuo <byvoid@byvoid.com>
|
|
5
|
+
*
|
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
* you may not use this file except in compliance with the License.
|
|
8
|
+
* You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
* See the License for the specific language governing permissions and
|
|
16
|
+
* limitations under the License.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
#pragma once
|
|
20
|
+
|
|
21
|
+
#include "Common.hpp"
|
|
22
|
+
|
|
23
|
+
namespace opencc {
|
|
24
|
+
/**
|
|
25
|
+
* Abstract segmentation
|
|
26
|
+
* @ingroup opencc_cpp_api
|
|
27
|
+
*/
|
|
28
|
+
class OPENCC_EXPORT Segmentation {
|
|
29
|
+
public:
|
|
30
|
+
virtual SegmentsPtr Segment(const std::string& text) const = 0;
|
|
31
|
+
};
|
|
32
|
+
} // namespace opencc
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Open Chinese Convert
|
|
3
|
+
*
|
|
4
|
+
* Copyright 2010-2014 Carbo Kuo <byvoid@byvoid.com>
|
|
5
|
+
*
|
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
* you may not use this file except in compliance with the License.
|
|
8
|
+
* You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
* See the License for the specific language governing permissions and
|
|
16
|
+
* limitations under the License.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
#pragma once
|
|
20
|
+
|
|
21
|
+
#include <iterator>
|
|
22
|
+
#include <sstream>
|
|
23
|
+
|
|
24
|
+
#include "Common.hpp"
|
|
25
|
+
|
|
26
|
+
namespace opencc {
|
|
27
|
+
/**
|
|
28
|
+
* Segmented text
|
|
29
|
+
* @ingroup opencc_cpp_api
|
|
30
|
+
*/
|
|
31
|
+
class OPENCC_EXPORT Segments {
|
|
32
|
+
public:
|
|
33
|
+
Segments() {}
|
|
34
|
+
|
|
35
|
+
Segments(std::initializer_list<const char*> initList) {
|
|
36
|
+
for (const char* item : initList) {
|
|
37
|
+
AddSegment(item);
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
Segments(std::initializer_list<std::string> initList) {
|
|
42
|
+
for (const std::string& item : initList) {
|
|
43
|
+
AddSegment(item);
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
void AddSegment(const char* unmanagedString) {
|
|
48
|
+
indexes.push_back(std::make_pair(unmanaged.size(), false));
|
|
49
|
+
unmanaged.push_back(unmanagedString);
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
void AddSegment(const std::string& str) {
|
|
53
|
+
indexes.push_back(std::make_pair(managed.size(), true));
|
|
54
|
+
managed.push_back(str);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
class iterator {
|
|
58
|
+
public:
|
|
59
|
+
using iterator_category = std::input_iterator_tag;
|
|
60
|
+
using value_type = const char*;
|
|
61
|
+
|
|
62
|
+
iterator(const Segments* const _segments, size_t _cursor)
|
|
63
|
+
: segments(_segments), cursor(_cursor) {}
|
|
64
|
+
|
|
65
|
+
iterator& operator++() {
|
|
66
|
+
cursor++;
|
|
67
|
+
return *this;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
bool operator==(const iterator& that) const {
|
|
71
|
+
return cursor == that.cursor && segments == that.segments;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
bool operator!=(const iterator& that) const {
|
|
75
|
+
return !this->operator==(that);
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
const char* operator*() const { return segments->At(cursor); }
|
|
79
|
+
|
|
80
|
+
private:
|
|
81
|
+
const Segments* const segments;
|
|
82
|
+
size_t cursor;
|
|
83
|
+
};
|
|
84
|
+
|
|
85
|
+
const char* At(size_t cursor) const {
|
|
86
|
+
const auto& index = indexes[cursor];
|
|
87
|
+
if (index.second) {
|
|
88
|
+
return managed[index.first].c_str();
|
|
89
|
+
} else {
|
|
90
|
+
return unmanaged[index.first];
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
size_t Length() const { return indexes.size(); }
|
|
95
|
+
|
|
96
|
+
iterator begin() const { return iterator(this, 0); }
|
|
97
|
+
|
|
98
|
+
iterator end() const { return iterator(this, indexes.size()); }
|
|
99
|
+
|
|
100
|
+
std::string ToString() const {
|
|
101
|
+
// TODO implement a nested structure to reduce concatenation,
|
|
102
|
+
// like a purely functional differential list
|
|
103
|
+
std::ostringstream buffer;
|
|
104
|
+
for (const char* segment : *this) {
|
|
105
|
+
buffer << segment;
|
|
106
|
+
}
|
|
107
|
+
return buffer.str();
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
private:
|
|
111
|
+
Segments(const Segments&) {}
|
|
112
|
+
|
|
113
|
+
std::vector<const char*> unmanaged;
|
|
114
|
+
std::vector<std::string> managed;
|
|
115
|
+
// index, managed
|
|
116
|
+
std::vector<std::pair<size_t, bool>> indexes;
|
|
117
|
+
};
|
|
118
|
+
} // namespace opencc
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Open Chinese Convert
|
|
3
|
+
*
|
|
4
|
+
* Copyright 2010-2014 Carbo Kuo <byvoid@byvoid.com>
|
|
5
|
+
*
|
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
* you may not use this file except in compliance with the License.
|
|
8
|
+
* You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
* See the License for the specific language governing permissions and
|
|
16
|
+
* limitations under the License.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
#pragma once
|
|
20
|
+
|
|
21
|
+
#include "Dict.hpp"
|
|
22
|
+
|
|
23
|
+
namespace opencc {
|
|
24
|
+
/**
|
|
25
|
+
* Serializable dictionary interface
|
|
26
|
+
* @ingroup opencc_cpp_api
|
|
27
|
+
*/
|
|
28
|
+
class OPENCC_EXPORT SerializableDict {
|
|
29
|
+
public:
|
|
30
|
+
/**
|
|
31
|
+
* Serializes the dictionary and writes in to a file.
|
|
32
|
+
*/
|
|
33
|
+
virtual void SerializeToFile(FILE* fp) const = 0;
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Serializes the dictionary and writes in to a file.
|
|
37
|
+
*/
|
|
38
|
+
virtual void SerializeToFile(const std::string& fileName) const {
|
|
39
|
+
FILE* fp = fopen(fileName.c_str(), "wb");
|
|
40
|
+
if (fp == NULL) {
|
|
41
|
+
throw FileNotWritable(fileName);
|
|
42
|
+
}
|
|
43
|
+
SerializeToFile(fp);
|
|
44
|
+
fclose(fp);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
template <typename DICT>
|
|
48
|
+
static bool TryLoadFromFile(const std::string& fileName,
|
|
49
|
+
std::shared_ptr<DICT>* dict) {
|
|
50
|
+
FILE* fp =
|
|
51
|
+
#ifdef _MSC_VER
|
|
52
|
+
// well, the 'GetPlatformString' shall return a 'wstring'
|
|
53
|
+
_wfopen(UTF8Util::GetPlatformString(fileName).c_str(), L"rb")
|
|
54
|
+
#else
|
|
55
|
+
fopen(UTF8Util::GetPlatformString(fileName).c_str(), "rb")
|
|
56
|
+
#endif // _MSC_VER
|
|
57
|
+
;
|
|
58
|
+
|
|
59
|
+
if (fp == NULL) {
|
|
60
|
+
return false;
|
|
61
|
+
}
|
|
62
|
+
std::shared_ptr<DICT> loadedDict = DICT::NewFromFile(fp);
|
|
63
|
+
fclose(fp);
|
|
64
|
+
*dict = loadedDict;
|
|
65
|
+
return true;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
template <typename DICT>
|
|
69
|
+
static std::shared_ptr<DICT> NewFromFile(const std::string& fileName) {
|
|
70
|
+
std::shared_ptr<DICT> dict;
|
|
71
|
+
if (!TryLoadFromFile<DICT>(fileName, &dict)) {
|
|
72
|
+
throw FileNotFound(fileName);
|
|
73
|
+
}
|
|
74
|
+
return dict;
|
|
75
|
+
}
|
|
76
|
+
};
|
|
77
|
+
} // namespace opencc
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Open Chinese Convert
|
|
3
|
+
*
|
|
4
|
+
* Copyright 2020 Carbo Kuo <byvoid@byvoid.com>
|
|
5
|
+
*
|
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
* you may not use this file except in compliance with the License.
|
|
8
|
+
* You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
* See the License for the specific language governing permissions and
|
|
16
|
+
* limitations under the License.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
#pragma once
|
|
20
|
+
|
|
21
|
+
#include <cstdint>
|
|
22
|
+
|
|
23
|
+
#include "Common.hpp"
|
|
24
|
+
#include "SerializableDict.hpp"
|
|
25
|
+
|
|
26
|
+
namespace opencc {
|
|
27
|
+
/**
|
|
28
|
+
* Binary format for dictionary values serialization.
|
|
29
|
+
* @ingroup opencc_cpp_api
|
|
30
|
+
*/
|
|
31
|
+
class OPENCC_EXPORT SerializedValues : public SerializableDict {
|
|
32
|
+
public:
|
|
33
|
+
SerializedValues(const LexiconPtr& _lexicon) : lexicon(_lexicon) {}
|
|
34
|
+
|
|
35
|
+
virtual ~SerializedValues() {}
|
|
36
|
+
|
|
37
|
+
virtual void SerializeToFile(FILE* fp) const;
|
|
38
|
+
|
|
39
|
+
static std::shared_ptr<SerializedValues> NewFromFile(FILE* fp);
|
|
40
|
+
|
|
41
|
+
const LexiconPtr& GetLexicon() const { return lexicon; }
|
|
42
|
+
|
|
43
|
+
size_t KeyMaxLength() const;
|
|
44
|
+
|
|
45
|
+
private:
|
|
46
|
+
LexiconPtr lexicon;
|
|
47
|
+
|
|
48
|
+
void ConstructBuffer(std::string* valueBuffer,
|
|
49
|
+
std::vector<uint16_t>* valueBytes,
|
|
50
|
+
uint32_t* valueTotalLength) const;
|
|
51
|
+
};
|
|
52
|
+
} // namespace opencc
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Open Chinese Convert
|
|
3
|
+
*
|
|
4
|
+
* Copyright 2010-2014 Carbo Kuo <byvoid@byvoid.com>
|
|
5
|
+
*
|
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
* you may not use this file except in compliance with the License.
|
|
8
|
+
* You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
* See the License for the specific language governing permissions and
|
|
16
|
+
* limitations under the License.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
#include "Export.hpp"
|
|
20
|
+
#include <string>
|
|
21
|
+
#include <vector>
|
|
22
|
+
|
|
23
|
+
#ifndef __OPENCC_SIMPLECONVERTER_HPP_
|
|
24
|
+
#define __OPENCC_SIMPLECONVERTER_HPP_
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* @defgroup opencc_simple_api OpenCC C++ Simple API
|
|
28
|
+
*
|
|
29
|
+
* Simple API in C++ language
|
|
30
|
+
*/
|
|
31
|
+
|
|
32
|
+
namespace opencc {
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* A high level converter
|
|
36
|
+
* This interface does not require C++11 to compile.
|
|
37
|
+
* @ingroup opencc_simple_api
|
|
38
|
+
*/
|
|
39
|
+
class OPENCC_EXPORT SimpleConverter {
|
|
40
|
+
public:
|
|
41
|
+
/**
|
|
42
|
+
* Constructor of SimpleConverter
|
|
43
|
+
* @param configFileName File name of configuration.
|
|
44
|
+
*/
|
|
45
|
+
explicit SimpleConverter(const std::string& configFileName);
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* Constructor of SimpleConverter
|
|
49
|
+
* @param configFileName File name of configuration.
|
|
50
|
+
* @param paths Additional paths to locate configuration and dictionary files.
|
|
51
|
+
*/
|
|
52
|
+
SimpleConverter(const std::string& configFileName,
|
|
53
|
+
const std::vector<std::string>& paths);
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* Constructor of SimpleConverter
|
|
57
|
+
* @param configFileName File name of configuration.
|
|
58
|
+
* @param paths Additional paths to locate configuration and dictionary files.
|
|
59
|
+
* @param argv0 Path of the executable (argv[0]), in addition to additional
|
|
60
|
+
* paths.
|
|
61
|
+
*/
|
|
62
|
+
SimpleConverter(const std::string& configFileName,
|
|
63
|
+
const std::vector<std::string>& paths, const char* argv0);
|
|
64
|
+
|
|
65
|
+
~SimpleConverter();
|
|
66
|
+
|
|
67
|
+
/**
|
|
68
|
+
* Converts a text
|
|
69
|
+
* @param input Text to be converted.
|
|
70
|
+
*/
|
|
71
|
+
std::string Convert(const std::string& input) const;
|
|
72
|
+
|
|
73
|
+
/**
|
|
74
|
+
* Converts a text
|
|
75
|
+
* @param input A C-Style std::string (terminated by '\0') to be converted.
|
|
76
|
+
*/
|
|
77
|
+
std::string Convert(const char* input) const;
|
|
78
|
+
|
|
79
|
+
/**
|
|
80
|
+
* Converts a text
|
|
81
|
+
* @param input A C-Style std::string limited by a given length to be
|
|
82
|
+
* converted.
|
|
83
|
+
* @param length Maximal length in byte of the input std::string.
|
|
84
|
+
*/
|
|
85
|
+
std::string Convert(const char* input, size_t length) const;
|
|
86
|
+
|
|
87
|
+
/**
|
|
88
|
+
* Converts a text and writes to an allocated buffer
|
|
89
|
+
* Please make sure the buffer has sufficient space.
|
|
90
|
+
* @param input A C-Style std::string (terminated by '\0') to be converted.
|
|
91
|
+
* @param output Buffer to write the converted text.
|
|
92
|
+
* @return Length of converted text.
|
|
93
|
+
*/
|
|
94
|
+
size_t Convert(const char* input, char* output) const;
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* Converts a text and writes to an allocated buffer
|
|
98
|
+
* Please make sure the buffer has sufficient space.
|
|
99
|
+
* @param input A C-Style std::string limited by a given length to be
|
|
100
|
+
* converted.
|
|
101
|
+
* @param length Maximal length in byte of the input std::string.
|
|
102
|
+
* @param output Buffer to write the converted text.
|
|
103
|
+
* @return Length of converted text.
|
|
104
|
+
*/
|
|
105
|
+
size_t Convert(const char* input, size_t length, char* output) const;
|
|
106
|
+
|
|
107
|
+
private:
|
|
108
|
+
const void* internalData;
|
|
109
|
+
};
|
|
110
|
+
|
|
111
|
+
} // namespace opencc
|
|
112
|
+
|
|
113
|
+
#endif
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Open Chinese Convert
|
|
3
|
+
*
|
|
4
|
+
* Copyright 2010-2020 Carbo Kuo <byvoid@byvoid.com>
|
|
5
|
+
*
|
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
* you may not use this file except in compliance with the License.
|
|
8
|
+
* You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
* See the License for the specific language governing permissions and
|
|
16
|
+
* limitations under the License.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
#pragma once
|
|
20
|
+
|
|
21
|
+
#include "Common.hpp"
|
|
22
|
+
#include "SerializableDict.hpp"
|
|
23
|
+
|
|
24
|
+
namespace opencc {
|
|
25
|
+
/**
|
|
26
|
+
* Text dictionary
|
|
27
|
+
* @ingroup opencc_cpp_api
|
|
28
|
+
*/
|
|
29
|
+
class OPENCC_EXPORT TextDict : public Dict, public SerializableDict {
|
|
30
|
+
public:
|
|
31
|
+
/**
|
|
32
|
+
* Constructor of TextDict.
|
|
33
|
+
* _lexicon must be sorted.
|
|
34
|
+
*/
|
|
35
|
+
TextDict(const LexiconPtr& _lexicon);
|
|
36
|
+
|
|
37
|
+
virtual ~TextDict();
|
|
38
|
+
|
|
39
|
+
virtual size_t KeyMaxLength() const;
|
|
40
|
+
|
|
41
|
+
virtual Optional<const DictEntry*> Match(const char* word, size_t len) const;
|
|
42
|
+
|
|
43
|
+
virtual LexiconPtr GetLexicon() const;
|
|
44
|
+
|
|
45
|
+
virtual void SerializeToFile(FILE* fp) const;
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* Constructs a TextDict from another dictionary.
|
|
49
|
+
*/
|
|
50
|
+
static TextDictPtr NewFromDict(const Dict& dict);
|
|
51
|
+
|
|
52
|
+
static TextDictPtr NewFromFile(FILE* fp);
|
|
53
|
+
|
|
54
|
+
static TextDictPtr NewFromSortedFile(FILE* fp);
|
|
55
|
+
|
|
56
|
+
private:
|
|
57
|
+
const size_t maxLength;
|
|
58
|
+
const LexiconPtr lexicon;
|
|
59
|
+
};
|
|
60
|
+
} // namespace opencc
|