cppjieba_rb 0.4.1 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.editorconfig +21 -0
- data/.github/workflows/linting.yml +30 -0
- data/.github/workflows/release.yml +42 -0
- data/.github/workflows/tests.yml +47 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +45 -0
- data/.ruby-version +1 -0
- data/.yamllint +35 -0
- data/CHANGELOG.md +17 -0
- data/Gemfile +11 -0
- data/README.md +5 -5
- data/Rakefile +16 -7
- data/cppjieba_rb.gemspec +46 -33
- data/ext/cppjieba/.github/workflows/cmake.yml +52 -0
- data/ext/cppjieba/.github/workflows/stale-issues.yml +24 -0
- data/ext/cppjieba/.gitmodules +3 -0
- data/ext/cppjieba/{ChangeLog.md → CHANGELOG.md} +50 -1
- data/ext/cppjieba/CMakeLists.txt +11 -14
- data/ext/cppjieba/LICENSE +20 -0
- data/ext/cppjieba/README.md +9 -18
- data/ext/cppjieba/deps/limonp/.github/workflows/cmake.yml +43 -0
- data/ext/cppjieba/deps/limonp/.gitignore +9 -0
- data/ext/cppjieba/deps/limonp/CHANGELOG.md +160 -0
- data/ext/cppjieba/deps/limonp/CMakeLists.txt +61 -0
- data/ext/cppjieba/deps/limonp/LICENSE +20 -0
- data/ext/cppjieba/deps/limonp/README.md +38 -0
- data/ext/cppjieba/deps/limonp/{LocalVector.hpp → include/limonp/LocalVector.hpp} +3 -3
- data/ext/cppjieba/deps/limonp/{Logging.hpp → include/limonp/Logging.hpp} +17 -3
- data/ext/cppjieba/deps/limonp/{StringUtil.hpp → include/limonp/StringUtil.hpp} +31 -10
- data/ext/cppjieba/deps/limonp/test/CMakeLists.txt +8 -0
- data/ext/cppjieba/deps/limonp/test/demo.cpp +40 -0
- data/ext/cppjieba/deps/limonp/test/testdata/1.conf +5 -0
- data/ext/cppjieba/deps/limonp/test/testdata/StdExtension.data +3 -0
- data/ext/cppjieba/deps/limonp/test/testdata/dict.gbk +50 -0
- data/ext/cppjieba/deps/limonp/test/testdata/dict.utf8 +50 -0
- data/ext/cppjieba/deps/limonp/test/testdata/io_testfile +2 -0
- data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.0.1.utf8 +93 -0
- data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.0.utf8 +93 -0
- data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.1.utf8 +67 -0
- data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.2.utf8 +64 -0
- data/ext/cppjieba/deps/limonp/test/unittest/CMakeLists.txt +30 -0
- data/ext/cppjieba/deps/limonp/test/unittest/TArgvContext.cpp +16 -0
- data/ext/cppjieba/deps/limonp/test/unittest/TCastFloat.cpp +19 -0
- data/ext/cppjieba/deps/limonp/test/unittest/TClosure.cpp +85 -0
- data/ext/cppjieba/deps/limonp/test/unittest/TColorPrint.cpp +20 -0
- data/ext/cppjieba/deps/limonp/test/unittest/TConfig.cpp +17 -0
- data/ext/cppjieba/deps/limonp/test/unittest/TLocalVector.cpp +41 -0
- data/ext/cppjieba/deps/limonp/test/unittest/TLogging.cpp +12 -0
- data/ext/cppjieba/deps/limonp/test/unittest/TStdExtension.cpp +95 -0
- data/ext/cppjieba/deps/limonp/test/unittest/TStringUtil.cpp +183 -0
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +9 -0
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +4 -0
- data/ext/cppjieba/include/cppjieba/Trie.hpp +27 -1
- data/ext/cppjieba/test/CMakeLists.txt +4 -3
- data/ext/cppjieba/test/unittest/CMakeLists.txt +16 -7
- data/ext/cppjieba_rb/extconf.rb +11 -6
- data/ext/cppjieba_rb/internal.cc +1 -1
- data/lib/cppjieba_rb/segment.rb +4 -1
- data/lib/cppjieba_rb/version.rb +3 -1
- data/lib/cppjieba_rb.rb +12 -5
- data/test/test_keyword.rb +8 -8
- data/test/test_segment.rb +14 -10
- data/test/test_stop_word_filter.rb +5 -3
- data/test/test_tagging.rb +5 -2
- metadata +63 -140
- data/.travis.yml +0 -30
- data/ext/cppjieba/.travis.yml +0 -21
- data/ext/cppjieba/README_EN.md +0 -115
- data/ext/cppjieba/appveyor.yml +0 -32
- data/ext/cppjieba/deps/CMakeLists.txt +0 -1
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
- data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
- data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
- data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
- data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
- data/ext/cppjieba/test/demo.cpp +0 -80
- /data/ext/cppjieba/deps/{gtest/src/.deps/.dirstamp → limonp/.gitmodules} +0 -0
- /data/ext/cppjieba/deps/limonp/{ArgvContext.hpp → include/limonp/ArgvContext.hpp} +0 -0
- /data/ext/cppjieba/deps/limonp/{Closure.hpp → include/limonp/Closure.hpp} +0 -0
- /data/ext/cppjieba/deps/limonp/{Colors.hpp → include/limonp/Colors.hpp} +0 -0
- /data/ext/cppjieba/deps/limonp/{Condition.hpp → include/limonp/Condition.hpp} +0 -0
- /data/ext/cppjieba/deps/limonp/{Config.hpp → include/limonp/Config.hpp} +0 -0
- /data/ext/cppjieba/deps/limonp/{ForcePublic.hpp → include/limonp/ForcePublic.hpp} +0 -0
- /data/ext/cppjieba/deps/limonp/{NonCopyable.hpp → include/limonp/NonCopyable.hpp} +0 -0
- /data/ext/cppjieba/deps/limonp/{StdExtension.hpp → include/limonp/StdExtension.hpp} +0 -0
- /data/ext/cppjieba/deps/{gtest/src/gtest_main.cc → limonp/test/unittest/gtest_main.cpp} +0 -0
@@ -0,0 +1,183 @@
|
|
1
|
+
#include "limonp/StringUtil.hpp"
|
2
|
+
#include "gtest/gtest.h"
|
3
|
+
using namespace limonp;
|
4
|
+
|
5
|
+
TEST(StringUtilTest, Test1) {
|
6
|
+
vector<string> vec;
|
7
|
+
string s;
|
8
|
+
Split("\t1\t3\t4\t", vec, "\t");
|
9
|
+
ASSERT_EQ(s << vec, "[\"\", \"1\", \"3\", \"4\"]");
|
10
|
+
s = " \t\n ni hao ad \r\n";
|
11
|
+
ASSERT_EQ("ni hao ad", Trim(s));
|
12
|
+
ASSERT_EQ("select * from table1 limit 1;" ,StringFormat("select %s from %s %s;", "*","table1","limit 1"));
|
13
|
+
s = StringFormat("select %s from %s %s;", "*","table1","limit 1");
|
14
|
+
ASSERT_EQ("select * from table1 limit 1;" ,s);
|
15
|
+
vec.clear();
|
16
|
+
vec.push_back("1");
|
17
|
+
vec.push_back("2");
|
18
|
+
vec.push_back("3");
|
19
|
+
s.clear();
|
20
|
+
Join(vec.begin(), vec.end(), s,",");
|
21
|
+
ASSERT_EQ("1,2,3",s);
|
22
|
+
s = Join(vec.begin(), vec.end(), "..");
|
23
|
+
ASSERT_EQ("1..2..3", s);
|
24
|
+
const char* arr[] = {"2","3","5"};
|
25
|
+
ASSERT_EQ("2,3,5", Join(arr, arr + sizeof(arr)/sizeof(arr[0]), ","));
|
26
|
+
map<string , int> mp;
|
27
|
+
mp["key1"] =2;
|
28
|
+
ASSERT_EQ("{key1:2}", s << mp);
|
29
|
+
std::unordered_map<int,int> hmp;
|
30
|
+
hmp[1]=2;
|
31
|
+
ASSERT_EQ("{1:2}", s << hmp);
|
32
|
+
}
|
33
|
+
|
34
|
+
TEST(StringUtilTest, Test2) {
|
35
|
+
string s, gbks;
|
36
|
+
ifstream ifs("../test/testdata/dict.gbk");
|
37
|
+
ASSERT_TRUE(!!ifs);
|
38
|
+
|
39
|
+
vector<uint16_t> uni;
|
40
|
+
while(getline(ifs, s)) {
|
41
|
+
GBKTrans(s, uni);
|
42
|
+
GBKTrans(uni.begin(), uni.end(), gbks);
|
43
|
+
ASSERT_EQ(s, gbks);
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
TEST(StringUtilTest, Test3) {
|
48
|
+
string s, utf8;
|
49
|
+
ifstream ifs("../test/testdata/dict.utf8");
|
50
|
+
ASSERT_TRUE(!!ifs);
|
51
|
+
|
52
|
+
vector<uint16_t> uni;
|
53
|
+
while(getline(ifs, s)) {
|
54
|
+
ASSERT_TRUE(Utf8ToUnicode(s, uni));
|
55
|
+
UnicodeToUtf8(uni.begin(), uni.end(), utf8);
|
56
|
+
ASSERT_EQ(s, utf8);
|
57
|
+
}
|
58
|
+
}
|
59
|
+
|
60
|
+
TEST(StringUtilTest, Test4) {
|
61
|
+
//ASSERT_TRUE(StartsWith("--help",NULL));
|
62
|
+
ASSERT_TRUE(StartsWith("--help","--"));
|
63
|
+
ASSERT_TRUE(StartsWith("--help","-"));
|
64
|
+
ASSERT_FALSE(StartsWith("--help","he"));
|
65
|
+
ASSERT_TRUE(StartsWith("help","help"));
|
66
|
+
ASSERT_FALSE(StartsWith("","help"));
|
67
|
+
ASSERT_TRUE(StartsWith("hel",""));
|
68
|
+
ASSERT_TRUE(EndsWith("hel",""));
|
69
|
+
ASSERT_TRUE(EndsWith("hel","el"));
|
70
|
+
}
|
71
|
+
|
72
|
+
TEST(StringUtilTest, Test5) {
|
73
|
+
const char* str = "1,2,3,4";
|
74
|
+
vector<string> vec;
|
75
|
+
string res;
|
76
|
+
Split(str, vec, ",");
|
77
|
+
ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
|
78
|
+
Split("1,2,3,4,", vec, ",");
|
79
|
+
ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
|
80
|
+
Split(str, vec, ",", 3);
|
81
|
+
ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
|
82
|
+
|
83
|
+
Split("1", vec, ",");
|
84
|
+
ASSERT_EQ("[\"1\"]", res << vec);
|
85
|
+
|
86
|
+
Split(str, vec, ",", 1);
|
87
|
+
ASSERT_EQ("[\"1\", \"2,3,4\"]", res << vec);
|
88
|
+
|
89
|
+
Split("", vec, ",");
|
90
|
+
ASSERT_EQ("[]", res << vec);
|
91
|
+
|
92
|
+
Split("1, 2", vec, ",");
|
93
|
+
ASSERT_EQ("[\"1\", \" 2\"]", res << vec);
|
94
|
+
|
95
|
+
Split("1==2", vec, "==");
|
96
|
+
ASSERT_EQ("[\"1\", \"\", \"2\"]", res << vec);
|
97
|
+
|
98
|
+
Split("1,", vec, ",");
|
99
|
+
ASSERT_EQ("[\"1\"]", res << vec);
|
100
|
+
|
101
|
+
Split(",1,", vec, ",");
|
102
|
+
ASSERT_EQ("[\"\", \"1\"]", res << vec);
|
103
|
+
|
104
|
+
Split("1, ", vec, ",");
|
105
|
+
ASSERT_EQ("[\"1\", \" \"]", res << vec);
|
106
|
+
|
107
|
+
res << Split("1|2,3", "|,");
|
108
|
+
ASSERT_EQ("[\"1\", \"2\", \"3\"]", res);
|
109
|
+
}
|
110
|
+
|
111
|
+
TEST(StringUtilTest, Trim) {
|
112
|
+
string s;
|
113
|
+
s = "xxxyyyxx";
|
114
|
+
ASSERT_EQ(RTrim(s, 'x'), "xxxyyy");
|
115
|
+
ASSERT_EQ(LTrim(s, 'x'), "yyy");
|
116
|
+
s = "xxxyyyxx";
|
117
|
+
ASSERT_EQ(Trim(s, 'x'), "yyy");
|
118
|
+
|
119
|
+
s = " x y ";
|
120
|
+
ASSERT_EQ(Trim(s), "x y");
|
121
|
+
|
122
|
+
// check if it core dump when using isalpha
|
123
|
+
wchar_t w = 1000024;
|
124
|
+
ASSERT_FALSE(IsSpace(w));
|
125
|
+
w = 0x20;
|
126
|
+
ASSERT_TRUE(IsSpace(w));
|
127
|
+
}
|
128
|
+
|
129
|
+
TEST(StringUtilTest, GetTime) {
|
130
|
+
string s;
|
131
|
+
GetTime("%Y-%m-%d %H:%M:%S", s);
|
132
|
+
//print(s);
|
133
|
+
}
|
134
|
+
|
135
|
+
TEST(StringUtilTest, PathJoin) {
|
136
|
+
const char * path1 = "/home/foo/dir";
|
137
|
+
const char * path2 = "file";
|
138
|
+
const char * path3 = "/home/foo/dir/";
|
139
|
+
const char * path4 = "file";
|
140
|
+
const char * answer = "/home/foo/dir/file";
|
141
|
+
|
142
|
+
ASSERT_EQ(answer, PathJoin(path1, path2));
|
143
|
+
ASSERT_EQ(answer, PathJoin(path3, path4));
|
144
|
+
}
|
145
|
+
|
146
|
+
TEST(StringUtilTest, JapaneseUnicode) {
|
147
|
+
// Japanese
|
148
|
+
const char* s = "がんば";
|
149
|
+
vector<uint16_t> unicode;
|
150
|
+
ASSERT_TRUE(Utf8ToUnicode(s, unicode));
|
151
|
+
ASSERT_EQ(3u, unicode.size());
|
152
|
+
}
|
153
|
+
|
154
|
+
TEST(StringUtilTest, RareChinese) {
|
155
|
+
//U+10000 – U+10FFFF
|
156
|
+
const char* s = "𪚥";
|
157
|
+
vector<uint16_t> unicode;
|
158
|
+
ASSERT_FALSE(Utf8ToUnicode(s, unicode));
|
159
|
+
ASSERT_EQ(0u, unicode.size());
|
160
|
+
}
|
161
|
+
|
162
|
+
TEST(StringUtilTest, RareChineseUnicode32) {
|
163
|
+
//U+10000 – U+10FFFF
|
164
|
+
const char* s = "𪚥";
|
165
|
+
vector<uint32_t> unicode;
|
166
|
+
ASSERT_TRUE(Utf8ToUnicode32(s, unicode));
|
167
|
+
ASSERT_EQ(1u, unicode.size());
|
168
|
+
|
169
|
+
string s2;
|
170
|
+
Unicode32ToUtf8(unicode.begin(), unicode.end(), s2);
|
171
|
+
ASSERT_EQ(s2, s);
|
172
|
+
}
|
173
|
+
|
174
|
+
TEST(StringUtilTest, Unicode32) {
|
175
|
+
const char* s = "1+1=2你好世界,。";
|
176
|
+
vector<uint32_t> unicode;
|
177
|
+
ASSERT_TRUE(Utf8ToUnicode32(s, unicode));
|
178
|
+
ASSERT_EQ(unicode.size(), 11u);
|
179
|
+
|
180
|
+
string s2;
|
181
|
+
Unicode32ToUtf8(unicode.begin(), unicode.end(), s2);
|
182
|
+
ASSERT_EQ(s2, s);
|
183
|
+
}
|
@@ -61,6 +61,15 @@ class DictTrie {
|
|
61
61
|
return true;
|
62
62
|
}
|
63
63
|
|
64
|
+
bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
65
|
+
DictUnit node_info;
|
66
|
+
if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
|
67
|
+
return false;
|
68
|
+
}
|
69
|
+
trie_->DeleteNode(node_info.word, &node_info);
|
70
|
+
return true;
|
71
|
+
}
|
72
|
+
|
64
73
|
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
|
65
74
|
return trie_->Find(begin, end);
|
66
75
|
}
|
@@ -76,6 +76,10 @@ class Jieba {
|
|
76
76
|
return dict_trie_.InsertUserWord(word,freq, tag);
|
77
77
|
}
|
78
78
|
|
79
|
+
bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
|
80
|
+
return dict_trie_.DeleteUserWord(word, tag);
|
81
|
+
}
|
82
|
+
|
79
83
|
bool Find(const string& word)
|
80
84
|
{
|
81
85
|
return dict_trie_.Find(word);
|
@@ -141,7 +141,33 @@ class Trie {
|
|
141
141
|
assert(ptNode != NULL);
|
142
142
|
ptNode->ptValue = ptValue;
|
143
143
|
}
|
144
|
-
|
144
|
+
void DeleteNode(const Unicode& key, const DictUnit* ptValue) {
|
145
|
+
if (key.begin() == key.end()) {
|
146
|
+
return;
|
147
|
+
}
|
148
|
+
//定义一个NextMap迭代器
|
149
|
+
TrieNode::NextMap::const_iterator kmIter;
|
150
|
+
//定义一个指向root的TrieNode指针
|
151
|
+
TrieNode *ptNode = root_;
|
152
|
+
for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
|
153
|
+
//链表不存在元素
|
154
|
+
if (NULL == ptNode->next) {
|
155
|
+
return;
|
156
|
+
}
|
157
|
+
kmIter = ptNode->next->find(*citer);
|
158
|
+
//如果map中不存在,跳出循环
|
159
|
+
if (ptNode->next->end() == kmIter) {
|
160
|
+
break;
|
161
|
+
}
|
162
|
+
//从unordered_map中擦除该项
|
163
|
+
ptNode->next->erase(*citer);
|
164
|
+
//删除该node
|
165
|
+
ptNode = kmIter->second;
|
166
|
+
delete ptNode;
|
167
|
+
break;
|
168
|
+
}
|
169
|
+
return;
|
170
|
+
}
|
145
171
|
private:
|
146
172
|
void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
|
147
173
|
if (valuePointers.empty() || keys.empty()) {
|
@@ -1,8 +1,21 @@
|
|
1
|
+
if (MSVC)
|
2
|
+
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL")
|
3
|
+
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
|
4
|
+
endif()
|
5
|
+
|
6
|
+
include(FetchContent)
|
7
|
+
|
8
|
+
FetchContent_Declare(
|
9
|
+
googletest
|
10
|
+
GIT_REPOSITORY https://github.com/google/googletest.git
|
11
|
+
GIT_TAG release-1.11.0
|
12
|
+
)
|
13
|
+
FetchContent_MakeAvailable(googletest)
|
14
|
+
|
15
|
+
|
1
16
|
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test)
|
2
17
|
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
|
3
18
|
|
4
|
-
INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/deps/gtest/include)
|
5
|
-
|
6
19
|
ADD_DEFINITIONS(-DLOGGING_LEVEL=LL_WARNING)
|
7
20
|
|
8
21
|
ADD_EXECUTABLE(test.run
|
@@ -17,8 +30,4 @@ ADD_EXECUTABLE(test.run
|
|
17
30
|
textrank_test.cpp
|
18
31
|
)
|
19
32
|
|
20
|
-
|
21
|
-
TARGET_LINK_LIBRARIES(test.run gtest)
|
22
|
-
else()
|
23
|
-
TARGET_LINK_LIBRARIES(test.run gtest pthread)
|
24
|
-
endif()
|
33
|
+
TARGET_LINK_LIBRARIES(test.run gtest)
|
data/ext/cppjieba_rb/extconf.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'mkmf'
|
4
|
+
abs = __dir__
|
3
5
|
|
4
6
|
LIBDIR = RbConfig::CONFIG['libdir']
|
5
7
|
INCLUDEDIR = RbConfig::CONFIG['includedir']
|
@@ -7,17 +9,20 @@ INCLUDEDIR = RbConfig::CONFIG['includedir']
|
|
7
9
|
HEADER_DIRS = [
|
8
10
|
INCLUDEDIR,
|
9
11
|
"#{abs}/../cppjieba/include",
|
10
|
-
"#{abs}/../cppjieba/deps"
|
11
|
-
]
|
12
|
+
"#{abs}/../cppjieba/deps/limonp/include"
|
13
|
+
].freeze
|
12
14
|
|
13
15
|
LIB_DIRS = [
|
14
16
|
LIBDIR
|
15
|
-
]
|
17
|
+
].freeze
|
16
18
|
|
17
19
|
dir_config('cppjieba_rb', HEADER_DIRS, LIB_DIRS)
|
18
20
|
|
19
|
-
|
21
|
+
# rubocop:disable Style/GlobalVars
|
22
|
+
CONFIG['CXXFLAGS'] += ' -std=c++11 -O3'
|
20
23
|
$CXXFLAGS = "#{$CXXFLAGS} -std=c++11 -O3"
|
24
|
+
# rubocop:enable Style/GlobalVars
|
25
|
+
|
21
26
|
create_makefile 'cppjieba_rb/cppjieba_rb'
|
22
27
|
# respect header changes
|
23
28
|
headers = Dir.glob('*.{hpp,h}').join ' '
|
data/ext/cppjieba_rb/internal.cc
CHANGED
@@ -163,7 +163,7 @@ void Init_internal()
|
|
163
163
|
rb_sFull = rb_intern("full");
|
164
164
|
u8_enc = rb_utf8_encoding();
|
165
165
|
|
166
|
-
rb_cCppjiebaRb_Internal = rb_define_class_under(rb_mCppjiebaRb, "Internal",
|
166
|
+
rb_cCppjiebaRb_Internal = rb_define_class_under(rb_mCppjiebaRb, "Internal", rb_cObject);
|
167
167
|
rb_define_alloc_func(rb_cCppjiebaRb_Internal, internal_alloc);
|
168
168
|
rb_define_method(rb_cCppjiebaRb_Internal, "initialize", (ruby_method*) &internal_initialize, 5);
|
169
169
|
rb_define_method(rb_cCppjiebaRb_Internal, "extract_keyword", (ruby_method*) &internal_extract_keyword, 2);
|
data/lib/cppjieba_rb/segment.rb
CHANGED
@@ -1,4 +1,7 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module CppjiebaRb
|
4
|
+
# Sentence segmentation
|
2
5
|
class Segment
|
3
6
|
VALID_MODES = %i[mix hmm mp query full].freeze
|
4
7
|
|
@@ -17,4 +20,4 @@ module CppjiebaRb
|
|
17
20
|
CppjiebaRb.internal.segment(str, @mode, @max_word_length, @hmm)
|
18
21
|
end
|
19
22
|
end
|
20
|
-
end
|
23
|
+
end
|
data/lib/cppjieba_rb/version.rb
CHANGED
data/lib/cppjieba_rb.rb
CHANGED
@@ -1,7 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'cppjieba_rb/cppjieba_rb'
|
2
4
|
require 'cppjieba_rb/version'
|
3
5
|
require 'cppjieba_rb/segment'
|
4
6
|
|
7
|
+
# CppjiebaRb segments a Chinese sentence into words.
|
8
|
+
#
|
9
|
+
# Available segmentation methods include HMM, MP, and mix mode.
|
10
|
+
# Dictionaries takes a strong part in CppjiebaRb's accuracy.
|
11
|
+
# Read more https://github.com/yanyiwu/cppjieba
|
5
12
|
module CppjiebaRb
|
6
13
|
EXT_BASE = File.join(File.dirname(__FILE__), '..', 'ext', 'cppjieba', 'dict')
|
7
14
|
DICT_PATH = File.join(EXT_BASE, 'jieba.dict.utf8')
|
@@ -28,11 +35,11 @@ module CppjiebaRb
|
|
28
35
|
|
29
36
|
class << self
|
30
37
|
def internal
|
31
|
-
@
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
38
|
+
@internal ||= CppjiebaRb::Internal.new(DICT_PATH,
|
39
|
+
HMM_DICT_PATH,
|
40
|
+
USER_DICT,
|
41
|
+
IDF_PATH,
|
42
|
+
STOP_WORD_PATH)
|
36
43
|
end
|
37
44
|
end
|
38
45
|
end
|
data/test/test_keyword.rb
CHANGED
@@ -1,17 +1,17 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'minitest/autorun'
|
3
4
|
require 'cppjieba_rb'
|
4
5
|
|
5
6
|
class JiebaTest < Minitest::Test
|
6
7
|
def test_keywords
|
7
|
-
results = CppjiebaRb.extract_keyword
|
8
|
+
results = CppjiebaRb.extract_keyword '我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。', 5
|
8
9
|
|
9
|
-
assert_equal [[
|
10
|
+
assert_equal [['CEO',
|
10
11
|
11.739204307083542],
|
11
|
-
[
|
12
|
-
[
|
13
|
-
[
|
14
|
-
[
|
15
|
-
|
12
|
+
['升职', 10.8561552143],
|
13
|
+
['加薪', 10.642581114],
|
14
|
+
['手扶拖拉机', 10.0088573539],
|
15
|
+
['巅峰', 9.49395840471]], results
|
16
16
|
end
|
17
17
|
end
|
data/test/test_segment.rb
CHANGED
@@ -1,24 +1,28 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'minitest/autorun'
|
3
4
|
require 'cppjieba_rb'
|
4
5
|
|
5
6
|
class JiebaTest < Minitest::Test
|
6
7
|
def test_mix_segment
|
7
|
-
words = CppjiebaRb.segment
|
8
|
-
|
8
|
+
words = CppjiebaRb.segment '我来到南京市长江大桥'
|
9
|
+
|
10
|
+
assert_equal %w[我 来到 南京市 长江大桥], words
|
11
|
+
|
12
|
+
words = CppjiebaRb.segment '令狐冲是云计算行业的专家'
|
9
13
|
|
10
|
-
|
11
|
-
assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
|
14
|
+
assert_equal %w[令狐冲 是 云计算 行业 的 专家], words
|
12
15
|
end
|
13
16
|
|
14
17
|
def test_hmm_segment
|
15
|
-
words = CppjiebaRb.segment
|
16
|
-
|
18
|
+
words = CppjiebaRb.segment '令狐冲是云计算行业的专家', mode: :hmm
|
19
|
+
|
20
|
+
assert_equal %w[令狐冲 是 云计算 行业 的 专家], words
|
17
21
|
end
|
18
22
|
|
19
23
|
def test_max_prob_segment
|
20
|
-
words = CppjiebaRb.segment
|
21
|
-
assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
|
22
|
-
end
|
24
|
+
words = CppjiebaRb.segment '令狐冲是云计算行业的专家', mode: :mp
|
23
25
|
|
26
|
+
assert_equal %w[令狐冲 是 云计算 行业 的 专家], words
|
27
|
+
end
|
24
28
|
end
|
@@ -1,10 +1,12 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'minitest/autorun'
|
3
4
|
require 'cppjieba_rb'
|
4
5
|
|
5
6
|
class JiebaTest < Minitest::Test
|
6
7
|
def test_filter
|
7
|
-
words = CppjiebaRb.filter_stop_word %w
|
8
|
-
|
8
|
+
words = CppjiebaRb.filter_stop_word %w[令狐冲 是 云计算 行业 的 专家]
|
9
|
+
|
10
|
+
assert_equal %w[令狐冲 云计算 行业 专家], words
|
9
11
|
end
|
10
12
|
end
|
data/test/test_tagging.rb
CHANGED
@@ -1,9 +1,12 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'minitest/autorun'
|
3
4
|
require 'cppjieba_rb'
|
5
|
+
|
4
6
|
class JiebaTest < Minitest::Test
|
5
7
|
def test_tagging
|
6
8
|
pairs = CppjiebaRb.segment_tag '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
|
9
|
+
|
7
10
|
assert_equal({ '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
|
8
11
|
'拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
|
9
12
|
'的' => 'uj', '。' => 'x' }, pairs)
|
@@ -11,9 +14,9 @@ class JiebaTest < Minitest::Test
|
|
11
14
|
|
12
15
|
def test_tagging_with_user_dict
|
13
16
|
pairs = CppjiebaRb.segment_tag '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
|
17
|
+
|
14
18
|
assert_equal({ '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
|
15
19
|
'拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
|
16
20
|
'的' => 'uj', '。' => 'x' }, pairs)
|
17
21
|
end
|
18
|
-
|
19
22
|
end
|