cppjieba_rb 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.gitmodules +3 -0
- data/.travis.yml +26 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +81 -0
- data/Rakefile +20 -0
- data/cppjieba_rb.gemspec +50 -0
- data/ext/cppjieba/.gitignore +17 -0
- data/ext/cppjieba/.travis.yml +22 -0
- data/ext/cppjieba/CMakeLists.txt +28 -0
- data/ext/cppjieba/ChangeLog.md +236 -0
- data/ext/cppjieba/README.md +285 -0
- data/ext/cppjieba/README_EN.md +111 -0
- data/ext/cppjieba/appveyor.yml +32 -0
- data/ext/cppjieba/deps/CMakeLists.txt +1 -0
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
- data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
- data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
- data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
- data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
- data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
- data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
- data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
- data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
- data/ext/cppjieba/dict/README.md +31 -0
- data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
- data/ext/cppjieba/dict/idf.utf8 +258826 -0
- data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
- data/ext/cppjieba/dict/user.dict.utf8 +4 -0
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
- data/ext/cppjieba/test/CMakeLists.txt +5 -0
- data/ext/cppjieba/test/demo.cpp +80 -0
- data/ext/cppjieba/test/load_test.cpp +54 -0
- data/ext/cppjieba/test/testdata/curl.res +1 -0
- data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
- data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
- data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
- data/ext/cppjieba/test/testdata/load_test.urls +2 -0
- data/ext/cppjieba/test/testdata/review.100 +100 -0
- data/ext/cppjieba/test/testdata/review.100.res +200 -0
- data/ext/cppjieba/test/testdata/server.conf +19 -0
- data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
- data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
- data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
- data/ext/cppjieba/test/testdata/userdict.english +2 -0
- data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
- data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
- data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
- data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
- data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
- data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
- data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
- data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
- data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
- data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
- data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
- data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
- data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
- data/ext/cppjieba_rb/extconf.rb +26 -0
- data/ext/cppjieba_rb/internal.cc +148 -0
- data/lib/cppjieba_rb/segment.rb +20 -0
- data/lib/cppjieba_rb/version.rb +3 -0
- data/lib/cppjieba_rb.rb +34 -0
- data/test/test_keyword.rb +17 -0
- data/test/test_segment.rb +24 -0
- data/test/test_tagging.rb +19 -0
- metadata +244 -0
data/ext/cppjieba/test/unittest/CMakeLists.txt
@@ -0,0 +1,24 @@
+SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test)
+SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
+
+INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/deps/gtest/include)
+
+ADD_DEFINITIONS(-DLOGGING_LEVEL=LL_WARNING)
+
+ADD_EXECUTABLE(test.run
+  gtest_main.cpp
+  keyword_extractor_test.cpp
+  trie_test.cpp
+  segments_test.cpp
+  pos_tagger_test.cpp
+  jieba_test.cpp
+  pre_filter_test.cpp
+  unicode_test.cpp
+  textrank_test.cpp
+  )
+
+if(MSVC)
+  TARGET_LINK_LIBRARIES(test.run gtest)
+else()
+  TARGET_LINK_LIBRARIES(test.run gtest pthread)
+endif()
data/ext/cppjieba/test/unittest/gtest_main.cpp
@@ -0,0 +1,39 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <iostream>
+
+#include "gtest/gtest.h"
+
+GTEST_API_ int main(int argc, char **argv) {
+  std::cout << "Running main() from gtest_main.cc\n";
+
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
data/ext/cppjieba/test/unittest/jieba_test.cpp
@@ -0,0 +1,133 @@
+#include "cppjieba/Jieba.hpp"
+#include "gtest/gtest.h"
+
+using namespace cppjieba;
+
+TEST(JiebaTest, Test1) {
+  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
+    "../dict/hmm_model.utf8",
+    "../dict/user.dict.utf8",
+    "../dict/idf.utf8",
+    "../dict/stop_words.utf8");
+  vector<string> words;
+  string result;
+
+  jieba.Cut("他来到了网易杭研大厦", words);
+  result << words;
+  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
+
+  jieba.Cut("我来自北京邮电大学。", words, false);
+  result << words;
+  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);
+
+  jieba.CutSmall("南京市长江大桥", words, 3);
+  ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words);
+
+  jieba.CutHMM("我来自北京邮电大学。。。学号123456", words);
+  result << words;
+  ASSERT_EQ("[\"我来\", \"自北京\", \"邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\"]", result);
+
+  jieba.Cut("我来自北京邮电大学。。。学号123456,用AK47", words);
+  result << words;
+  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result);
+
+  jieba.CutAll("我来自北京邮电大学", words);
+  result << words;
+  ASSERT_EQ(result, "[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]");
+
+  jieba.CutForSearch("他来到了网易杭研大厦", words);
+  result << words;
+  ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
+
+}
+TEST(JiebaTest, WordTest) {
+  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
+    "../dict/hmm_model.utf8",
+    "../dict/user.dict.utf8",
+    "../dict/idf.utf8",
+    "../dict/stop_words.utf8");
+  vector<Word> words;
+  string result;
+
+  jieba.Cut("他来到了网易杭研大厦", words);
+  result << words;
+  ASSERT_EQ("[{\"word\": \"\xE4\xBB\x96\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE5\x88\xB0\", \"offset\": 3}, {\"word\": \"\xE4\xBA\x86\", \"offset\": 9}, {\"word\": \"\xE7\xBD\x91\xE6\x98\x93\", \"offset\": 12}, {\"word\": \"\xE6\x9D\xAD\xE7\xA0\x94\", \"offset\": 18}, {\"word\": \"\xE5\xA4\xA7\xE5\x8E\xA6\", \"offset\": 24}]", result);
+
+  jieba.Cut("我来自北京邮电大学。", words, false);
+  result << words;
+  //ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", result);
+  ASSERT_EQ("[{\"word\": \"\xE6\x88\x91\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}, {\"word\": \"\xE3\x80\x82\", \"offset\": 27}]", result);
+
+  jieba.CutSmall("南京市长江大桥", words, 3);
+  //ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", result << words);
+  ASSERT_EQ("[{\"word\": \"\xE5\x8D\x97\xE4\xBA\xAC\xE5\xB8\x82\", \"offset\": 0}, {\"word\": \"\xE9\x95\xBF\xE6\xB1\x9F\", \"offset\": 9}, {\"word\": \"\xE5\xA4\xA7\xE6\xA1\xA5\", \"offset\": 15}]", result << words);
+
+  jieba.CutHMM("我来自北京邮电大学。。。学号123456", words);
+  result << words;
+  ASSERT_EQ("[{\"word\": \"\xE6\x88\x91\xE6\x9D\xA5\", \"offset\": 0}, {\"word\": \"\xE8\x87\xAA\xE5\x8C\x97\xE4\xBA\xAC\", \"offset\": 6}, {\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 15}, {\"word\": \"\xE3\x80\x82\", \"offset\": 27}, {\"word\": \"\xE3\x80\x82\", \"offset\": 30}, {\"word\": \"\xE3\x80\x82\", \"offset\": 33}, {\"word\": \"\xE5\xAD\xA6\xE5\x8F\xB7\", \"offset\": 36}, {\"word\": \"123456\", \"offset\": 42}]", result);
+
+  jieba.Cut("我来自北京邮电大学。。。学号123456,用AK47", words);
+  result << words;
+  //ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\", \"。\", \"。\", \"学号\", \"123456\", \",\", \"用\", \"AK47\"]", result);
+  ASSERT_EQ("[{\"word\": \"\xE6\x88\x91\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}, {\"word\": \"\xE3\x80\x82\", \"offset\": 27}, {\"word\": \"\xE3\x80\x82\", \"offset\": 30}, {\"word\": \"\xE3\x80\x82\", \"offset\": 33}, {\"word\": \"\xE5\xAD\xA6\xE5\x8F\xB7\", \"offset\": 36}, {\"word\": \"123456\", \"offset\": 42}, {\"word\": \"\xEF\xBC\x8C\", \"offset\": 48}, {\"word\": \"\xE7\x94\xA8\", \"offset\": 51}, {\"word\": \"AK47\", \"offset\": 54}]", result);
+
+  jieba.CutAll("我来自北京邮电大学", words);
+  result << words;
+  //ASSERT_EQ(result, "[\"我\", \"来自\", \"北京\", \"北京邮电\", \"北京邮电大学\", \"邮电\", \"邮电大学\", \"电大\", \"大学\"]");
+  ASSERT_EQ("[{\"word\": \"\xE6\x88\x91\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE8\x87\xAA\", \"offset\": 3}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\", \"offset\": 9}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\", \"offset\": 9}, {\"word\": \"\xE5\x8C\x97\xE4\xBA\xAC\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 9}, {\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\", \"offset\": 15}, {\"word\": \"\xE9\x82\xAE\xE7\x94\xB5\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 15}, {\"word\": \"\xE7\x94\xB5\xE5\xA4\xA7\", \"offset\": 18}, {\"word\": \"\xE5\xA4\xA7\xE5\xAD\xA6\", \"offset\": 21}]", result);
+
+  jieba.CutForSearch("他来到了网易杭研大厦", words);
+  result << words;
+  //ASSERT_EQ("[\"他\", \"来到\", \"了\", \"网易\", \"杭研\", \"大厦\"]", result);
+  ASSERT_EQ("[{\"word\": \"\xE4\xBB\x96\", \"offset\": 0}, {\"word\": \"\xE6\x9D\xA5\xE5\x88\xB0\", \"offset\": 3}, {\"word\": \"\xE4\xBA\x86\", \"offset\": 9}, {\"word\": \"\xE7\xBD\x91\xE6\x98\x93\", \"offset\": 12}, {\"word\": \"\xE6\x9D\xAD\xE7\xA0\x94\", \"offset\": 18}, {\"word\": \"\xE5\xA4\xA7\xE5\x8E\xA6\", \"offset\": 24}]", result);
+}
+
+TEST(JiebaTest, InsertUserWord) {
+  cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
+    "../dict/hmm_model.utf8",
+    "../dict/user.dict.utf8",
+    "../dict/idf.utf8",
+    "../dict/stop_words.utf8");
+  vector<string> words;
+  string result;
+
+  jieba.Cut("男默女泪", words);
+  result << words;
+  ASSERT_EQ("[\"男默\", \"女泪\"]", result);
+
+  ASSERT_TRUE(jieba.InsertUserWord("男默女泪"));
+
+  jieba.Cut("男默女泪", words);
+  result << words;
+  ASSERT_EQ("[\"男默女泪\"]", result);
+
+  for (size_t i = 0; i < 100; i++) {
+    string newWord;
+    newWord << rand();
+    ASSERT_TRUE(jieba.InsertUserWord(newWord));
+    jieba.Cut(newWord, words);
+    result << words;
+    ASSERT_EQ(result, StringFormat("[\"%s\"]", newWord.c_str()));
+  }
+
+  ASSERT_TRUE(jieba.InsertUserWord("同一个世界,同一个梦想"));
+  jieba.Cut("同一个世界,同一个梦想", words);
+  result = Join(words.begin(), words.end(), "/");
+  ASSERT_EQ(result, "同一个/世界/,/同一个/梦想");
+
+  jieba.ResetSeparators("");
+
+  jieba.Cut("同一个世界,同一个梦想", words);
+  result = Join(words.begin(), words.end(), "/");
+  ASSERT_EQ(result, "同一个世界,同一个梦想");
+
+  {
+    string s("一部iPhone6");
+    string res;
+    vector<KeywordExtractor::Word> wordweights;
+    size_t topN = 5;
+    jieba.extractor.Extract(s, wordweights, topN);
+    res << wordweights;
+    ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
+  }
+}
data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp
@@ -0,0 +1,79 @@
+#include "cppjieba/KeywordExtractor.hpp"
+#include "gtest/gtest.h"
+
+using namespace cppjieba;
+
+TEST(KeywordExtractorTest, Test1) {
+  KeywordExtractor Extractor("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
+
+  {
+    string s("你好世界世界而且而且");
+    string res;
+    size_t topN = 5;
+
+    {
+      vector<string> words;
+      Extractor.Extract(s, words, topN);
+      res << words;
+      ASSERT_EQ(res, "[\"世界\", \"你好\"]");
+    }
+
+    {
+      vector<pair<string, double> > words;
+      Extractor.Extract(s, words, topN);
+      res << words;
+      ASSERT_EQ(res, "[世界:8.73506, 你好:7.95788]");
+    }
+
+    {
+      vector<KeywordExtractor::Word> words;
+      Extractor.Extract(s, words, topN);
+      res << words;
+      ASSERT_EQ(res, "[{\"word\": \"\xE4\xB8\x96\xE7\x95\x8C\", \"offset\": [6, 12], \"weight\": 8.73506}, {\"word\": \"\xE4\xBD\xA0\xE5\xA5\xBD\", \"offset\": [0], \"weight\": 7.95788}]");
+    }
+  }
+
+  {
+    string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。");
+    string res;
+    vector<KeywordExtractor::Word> wordweights;
+    size_t topN = 5;
+    Extractor.Extract(s, wordweights, topN);
+    res << wordweights;
+    ASSERT_EQ(res, "[{\"word\": \"CEO\", \"offset\": [93], \"weight\": 11.7392}, {\"word\": \"\xE5\x8D\x87\xE8\x81\x8C\", \"offset\": [72], \"weight\": 10.8562}, {\"word\": \"\xE5\x8A\xA0\xE8\x96\xAA\", \"offset\": [78], \"weight\": 10.6426}, {\"word\": \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA\", \"offset\": [21], \"weight\": 10.0089}, {\"word\": \"\xE5\xB7\x85\xE5\xB3\xB0\", \"offset\": [111], \"weight\": 9.49396}]");
+  }
+
+  {
+    string s("一部iPhone6");
+    string res;
+    vector<KeywordExtractor::Word> wordweights;
+    size_t topN = 5;
+    Extractor.Extract(s, wordweights, topN);
+    res << wordweights;
+    ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
+  }
+}
+
+TEST(KeywordExtractorTest, Test2) {
+  KeywordExtractor Extractor("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8");
+
+  {
+    string s("蓝翔优秀毕业生");
+    string res;
+    vector<KeywordExtractor::Word> wordweights;
+    size_t topN = 5;
+    Extractor.Extract(s, wordweights, topN);
+    res << wordweights;
+    ASSERT_EQ(res, "[{\"word\": \"\xE8\x93\x9D\xE7\xBF\x94\", \"offset\": [0], \"weight\": 11.7392}, {\"word\": \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F\", \"offset\": [12], \"weight\": 8.13549}, {\"word\": \"\xE4\xBC\x98\xE7\xA7\x80\", \"offset\": [6], \"weight\": 6.78347}]");
+  }
+
+  {
+    string s("一部iPhone6");
+    string res;
+    vector<KeywordExtractor::Word> wordweights;
+    size_t topN = 5;
+    Extractor.Extract(s, wordweights, topN);
+    res << wordweights;
+    ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
+  }
+}
data/ext/cppjieba/test/unittest/pos_tagger_test.cpp
@@ -0,0 +1,41 @@
+#include "cppjieba/MixSegment.hpp"
+#include "gtest/gtest.h"
+
+using namespace cppjieba;
+
+static const char * const QUERY_TEST1 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。";
+static const char * const ANS_TEST1 = "[我:r, 是:v, 蓝翔:x, 技工:n, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, 总经理:n, ,:x, 出任:v, CEO:eng, ,:x, 迎娶:v, 白富:x, 美:ns, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]";
+static const char * const QUERY_TEST2 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。";
+static const char * const ANS_TEST2 = "[我:r, 是:v, 蓝翔:nz, 技工:n, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, 总经理:n, ,:x, 出任:v, CEO:eng, ,:x, 迎娶:v, 白富:x, 美:ns, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]";
+
+static const char * const QUERY_TEST3 = "iPhone6手机的最大特点是很容易弯曲。";
+static const char * const ANS_TEST3 = "[iPhone6:eng, 手机:n, 的:uj, 最大:a, 特点:n, 是:v, 很:zg, 容易:a, 弯曲:v, 。:x]";
+//static const char * const ANS_TEST3 = "";
+
+TEST(PosTaggerTest, Test) {
+  MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
+  {
+    vector<pair<string, string> > res;
+    tagger.Tag(QUERY_TEST1, res);
+    string s;
+    s << res;
+    ASSERT_TRUE(s == ANS_TEST1);
+  }
+}
+TEST(PosTagger, TestUserDict) {
+  MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
+  {
+    vector<pair<string, string> > res;
+    tagger.Tag(QUERY_TEST2, res);
+    string s;
+    s << res;
+    ASSERT_EQ(s, ANS_TEST2);
+  }
+  {
+    vector<pair<string, string> > res;
+    tagger.Tag(QUERY_TEST3, res);
+    string s;
+    s << res;
+    ASSERT_EQ(s, ANS_TEST3);
+  }
+}
data/ext/cppjieba/test/unittest/pre_filter_test.cpp
@@ -0,0 +1,43 @@
+#include "gtest/gtest.h"
+#include "cppjieba/PreFilter.hpp"
+#include "limonp/StringUtil.hpp"
+
+using namespace cppjieba;
+
+TEST(PreFilterTest, Test1) {
+  unordered_set<Rune> symbol;
+  symbol.insert(65292u); // ","
+  symbol.insert(12290u); // "。"
+  string expected;
+  string res;
+
+  {
+    string s = "你好,美丽的,世界";
+    PreFilter filter(symbol, s);
+    expected = "你好/,/美丽的/,/世界";
+    ASSERT_TRUE(filter.HasNext());
+    vector<string> words;
+    while (filter.HasNext()) {
+      PreFilter::Range range;
+      range = filter.Next();
+      words.push_back(GetStringFromRunes(s, range.begin, range.end - 1));
+    }
+    res = limonp::Join(words.begin(), words.end(), "/");
+    ASSERT_EQ(res, expected);
+  }
+
+  {
+    string s = "我来自北京邮电大学。。。学号123456,用AK47";
+    PreFilter filter(symbol, s);
+    expected = "我来自北京邮电大学/。/。/。/学号123456/,/用AK47";
+    ASSERT_TRUE(filter.HasNext());
+    vector<string> words;
+    while (filter.HasNext()) {
+      PreFilter::Range range;
+      range = filter.Next();
+      words.push_back(GetStringFromRunes(s, range.begin, range.end - 1));
+    }
+    res = limonp::Join(words.begin(), words.end(), "/");
+    ASSERT_EQ(res, expected);
+  }
+}
data/ext/cppjieba/test/unittest/segments_test.cpp
@@ -0,0 +1,256 @@
+#include "cppjieba/SegmentBase.hpp"
+#include "cppjieba/MixSegment.hpp"
+#include "cppjieba/MPSegment.hpp"
+#include "cppjieba/HMMSegment.hpp"
+#include "cppjieba/FullSegment.hpp"
+#include "cppjieba/QuerySegment.hpp"
+#include "gtest/gtest.h"
+
+using namespace cppjieba;
+
+TEST(MixSegmentTest, Test1) {
+  MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
+  string sentence;
+  vector<string> words;
+  string actual;
+  string expected;
+
+  {
+    sentence = "我来自北京邮电大学。。。学号123456,用AK47";
+    expected = "我/来自/北京邮电大学/。/。/。/学号/123456/,/用/AK47";
+    segment.Cut(sentence, words);
+    actual = Join(words.begin(), words.end(), "/");
+    ASSERT_EQ(actual, expected);
+  }
+
+  {
+    sentence = "B超 T恤";
+    expected = "B超/ /T恤";
+    segment.Cut(sentence, words);
+    actual = Join(words.begin(), words.end(), "/");
+    ASSERT_EQ(actual, expected);
+  }
+
+  {
+    sentence = "他来到了网易杭研大厦";
+    expected = "他/来到/了/网易/杭/研/大厦";
+    segment.Cut(sentence, words, false);
+    actual = Join(words.begin(), words.end(), "/");
+    ASSERT_EQ(actual, expected);
+  }
+
+  {
+    sentence = "他来到了网易杭研大厦";
+    expected = "他/来到/了/网易/杭研/大厦";
+    segment.Cut(sentence, words);
+    actual = Join(words.begin(), words.end(), "/");
+    ASSERT_EQ(actual, expected);
+  }
+}
+
+TEST(MixSegmentTest, NoUserDict) {
+  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
+  const char* str = "令狐冲是云计算方面的专家";
+  vector<string> words;
+  segment.Cut(str, words);
+  string res;
+  ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words);
+
+}
+TEST(MixSegmentTest, UserDict) {
+  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
+  {
+    const char* str = "令狐冲是云计算方面的专家";
+    vector<string> words;
+    segment.Cut(str, words);
+    string res;
+    ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
+  }
+  {
+    const char* str = "小明先就职于IBM,后在日本京都大学深造";
+    vector<string> words;
+    segment.Cut(str, words);
+    string res;
+    res << words;
+    ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
+  }
+  {
+    const char* str = "IBM,3.14";
+    vector<string> words;
+    segment.Cut(str, words);
+    string res;
+    res << words;
+    ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
+  }
+}
+TEST(MixSegmentTest, TestUserDict) {
+  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8",
+    "../test/testdata/userdict.utf8");
+  vector<string> words;
+  string res;
+
+  segment.Cut("令狐冲是云计算方面的专家", words);
+  ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
+
+  segment.Cut("小明先就职于IBM,后在日本京都大学深造", words);
+  res << words;
+  ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
+
+  segment.Cut("IBM,3.14", words);
+  res << words;
+  ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
+
+  segment.Cut("忽如一夜春风来,千树万树梨花开", words);
+  res = limonp::Join(words.begin(), words.end(), "/");
+  ASSERT_EQ("忽如一夜春风来/,/千树/万树/梨花/开", res);
+
+  // rand input
+  {
+    const size_t ITERATION = 16;
+    const size_t MAX_LEN = 256;
+    string s;
+    srand(time(NULL));
+
+    for (size_t i = 0; i < ITERATION; i++) {
+      size_t len = rand() % MAX_LEN;
+      s.resize(len);
+      for (size_t j = 0; j < len; j++) {
+        s[rand() % len] = rand();
+      }
+      segment.Cut(s, words);
+    }
+  }
+}
+
+TEST(MixSegmentTest, TestMultiUserDict) {
+  MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8",
+    "../test/testdata/userdict.utf8;../test/testdata/userdict.2.utf8");
+  vector<string> words;
+  string res;
+
+  segment.Cut("忽如一夜春风来,千树万树梨花开", words);
+  res = limonp::Join(words.begin(), words.end(), "/");
+  ASSERT_EQ("忽如一夜春风来/,/千树万树梨花开", res);
+}
+
+TEST(MPSegmentTest, Test1) {
+  MPSegment segment("../dict/jieba.dict.utf8");;
+  string s;
+  vector<string> words;
+  segment.Cut("我来自北京邮电大学。", words);
+  ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", s << words);
+
+  segment.Cut("B超 T恤", words);
+  ASSERT_EQ(s << words, "[\"B超\", \" \", \"T恤\"]");
+
+  segment.Cut("南京市长江大桥", words);
+  ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
+
+  // MaxWordLen
+  segment.Cut("南京市长江大桥", words, 3);
+  ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
+
+  segment.Cut("南京市长江大桥", words, 0);
+  ASSERT_EQ("[\"南\", \"京\", \"市\", \"长\", \"江\", \"大\", \"桥\"]", s << words);
+
+  segment.Cut("湖南长沙市天心区", words);
+  s = Join(words.begin(), words.end(), "/");
+  ASSERT_EQ("湖南长沙市/天心区", s);
+
+  segment.Cut("湖南长沙市天心区", words, 3);
+  s = Join(words.begin(), words.end(), "/");
+  ASSERT_EQ("湖南/长沙市/天心区", s);
+}
+
+TEST(HMMSegmentTest, Test1) {
+  HMMSegment segment("../dict/hmm_model.utf8");;
+  {
+    const char* str = "我来自北京邮电大学。。。学号123456";
+    const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
+    vector<string> words;
+    segment.Cut(str, words);
+    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+  }
+
+  {
+    const char* str = "IBM,1.2,123";
+    const char* res[] = {"IBM", ",", "1.2", ",", "123"};
+    vector<string> words;
+    segment.Cut(str, words);
+    ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
+  }
+}
+
+TEST(FullSegment, Test1) {
+  FullSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8");
+  vector<string> words;
+  string s;
+
+  segment.Cut("我来自北京邮电大学", words);
+  s << words;
+  ASSERT_EQ(s, "[\"我\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]");
+
+
+  segment.Cut("上市公司CEO", words);
+  s << words;
+  ASSERT_EQ(s, "[\"上市\", \"公司\", \"C\", \"E\", \"O\"]");
+}
+
+TEST(QuerySegment, Test1) {
+  QuerySegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "");
+  vector<string> words;
+  string s1, s2;
+
+  segment.Cut("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", words);
+  s1 = Join(words.begin(), words.end(), "/");
+  s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造";
+  ASSERT_EQ(s1, s2);
+
+  segment.Cut("亲口交代", words);
+  s1 = Join(words.begin(), words.end(), "/");
+  s2 = "亲口/交代";
+  ASSERT_EQ(s1, s2);
+
+  segment.Cut("他心理健康", words);
+  s1 = Join(words.begin(), words.end(), "/");
+  s2 = "他/心理/健康/心理健康";
+  ASSERT_EQ(s1, s2);
+}
+
+TEST(QuerySegment, Test2) {
+  QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8|../test/testdata/userdict.english");
+  vector<string> words;
+  string s1, s2;
+
+  {
+    segment.Cut("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", words);
+    s1 = Join(words.begin(), words.end(), "/");
+    s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/京都大学/深造";
+    ASSERT_EQ(s1, s2);
+  }
+
+  {
+    segment.Cut("小明硕士毕业于中国科学院计算所iPhone6", words);
+    s1 = Join(words.begin(), words.end(), "/");
+    s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/iPhone6";
+    ASSERT_EQ(s1, s2);
+  }
+
+  {
+    segment.Cut("中国科学院", words);
+    s1 = Join(words.begin(), words.end(), "/");
+    s2 = "中国/科学/学院/科学院/中国科学院";
+    ASSERT_EQ(s1, s2);
+  }
+
+}
+
+TEST(MPSegmentTest, Unicode32) {
+  string s("天气很好,🙋 我们去郊游。");
+  vector<string> words;
+
+  MPSegment segment("../dict/jieba.dict.utf8");;
+  segment.Cut(s, words);
+
+  ASSERT_EQ(Join(words.begin(), words.end(), "/"), "天气/很/好/,/🙋/ /我们/去/郊游/。");
+}