cppjieba_rb 0.3.3 → 0.4.0

Files changed (130)
  1. checksums.yaml +5 -5
  2. data/.travis.yml +3 -0
  3. data/README.md +1 -1
  4. data/Rakefile +2 -2
  5. data/cppjieba_rb.gemspec +4 -4
  6. data/lib/cppjieba_rb/version.rb +1 -1
  7. metadata +17 -135
  8. data/ext/cppjieba/.gitignore +0 -17
  9. data/ext/cppjieba/.travis.yml +0 -21
  10. data/ext/cppjieba/CMakeLists.txt +0 -28
  11. data/ext/cppjieba/ChangeLog.md +0 -236
  12. data/ext/cppjieba/README.md +0 -292
  13. data/ext/cppjieba/README_EN.md +0 -113
  14. data/ext/cppjieba/appveyor.yml +0 -32
  15. data/ext/cppjieba/deps/CMakeLists.txt +0 -1
  16. data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
  17. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
  18. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
  28. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
  29. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
  41. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  42. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
  43. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
  44. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  45. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
  46. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
  47. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
  48. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
  49. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
  50. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
  51. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
  52. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
  53. data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
  54. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +0 -39
  55. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +0 -70
  56. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
  57. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
  58. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
  59. data/ext/cppjieba/deps/limonp/Closure.hpp +0 -206
  60. data/ext/cppjieba/deps/limonp/Colors.hpp +0 -31
  61. data/ext/cppjieba/deps/limonp/Condition.hpp +0 -38
  62. data/ext/cppjieba/deps/limonp/Config.hpp +0 -103
  63. data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
  64. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +0 -7
  65. data/ext/cppjieba/deps/limonp/LocalVector.hpp +0 -139
  66. data/ext/cppjieba/deps/limonp/Logging.hpp +0 -76
  67. data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
  68. data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
  69. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +0 -21
  70. data/ext/cppjieba/deps/limonp/StdExtension.hpp +0 -159
  71. data/ext/cppjieba/deps/limonp/StringUtil.hpp +0 -365
  72. data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
  73. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
  74. data/ext/cppjieba/dict/README.md +0 -31
  75. data/ext/cppjieba/dict/hmm_model.utf8 +0 -34
  76. data/ext/cppjieba/dict/idf.utf8 +0 -258826
  77. data/ext/cppjieba/dict/jieba.dict.utf8 +0 -348982
  78. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +0 -6653
  79. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +0 -166
  80. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +0 -259
  81. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +0 -5222
  82. data/ext/cppjieba/dict/stop_words.utf8 +0 -1534
  83. data/ext/cppjieba/dict/user.dict.utf8 +0 -4
  84. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +0 -277
  85. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +0 -93
  86. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +0 -129
  87. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +0 -190
  88. data/ext/cppjieba/include/cppjieba/Jieba.hpp +0 -130
  89. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +0 -153
  90. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +0 -137
  91. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +0 -109
  92. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +0 -77
  93. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +0 -54
  94. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +0 -90
  95. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +0 -46
  96. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +0 -23
  97. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +0 -190
  98. data/ext/cppjieba/include/cppjieba/Trie.hpp +0 -174
  99. data/ext/cppjieba/include/cppjieba/Unicode.hpp +0 -227
  100. data/ext/cppjieba/test/CMakeLists.txt +0 -5
  101. data/ext/cppjieba/test/demo.cpp +0 -80
  102. data/ext/cppjieba/test/load_test.cpp +0 -54
  103. data/ext/cppjieba/test/testdata/curl.res +0 -1
  104. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +0 -109750
  105. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +0 -34
  106. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +0 -348982
  107. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +0 -93
  108. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +0 -93
  109. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +0 -67
  110. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +0 -64
  111. data/ext/cppjieba/test/testdata/load_test.urls +0 -2
  112. data/ext/cppjieba/test/testdata/review.100 +0 -100
  113. data/ext/cppjieba/test/testdata/review.100.res +0 -200
  114. data/ext/cppjieba/test/testdata/server.conf +0 -19
  115. data/ext/cppjieba/test/testdata/testlines.gbk +0 -9
  116. data/ext/cppjieba/test/testdata/testlines.utf8 +0 -8
  117. data/ext/cppjieba/test/testdata/userdict.2.utf8 +0 -1
  118. data/ext/cppjieba/test/testdata/userdict.english +0 -2
  119. data/ext/cppjieba/test/testdata/userdict.utf8 +0 -8
  120. data/ext/cppjieba/test/testdata/weicheng.utf8 +0 -247
  121. data/ext/cppjieba/test/unittest/CMakeLists.txt +0 -24
  122. data/ext/cppjieba/test/unittest/gtest_main.cpp +0 -39
  123. data/ext/cppjieba/test/unittest/jieba_test.cpp +0 -133
  124. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +0 -79
  125. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +0 -41
  126. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +0 -43
  127. data/ext/cppjieba/test/unittest/segments_test.cpp +0 -256
  128. data/ext/cppjieba/test/unittest/textrank_test.cpp +0 -86
  129. data/ext/cppjieba/test/unittest/trie_test.cpp +0 -177
  130. data/ext/cppjieba/test/unittest/unicode_test.cpp +0 -43
data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp
@@ -1,79 +0,0 @@
- #include "cppjieba/KeywordExtractor.hpp"
- #include "gtest/gtest.h"
-
- using namespace cppjieba;
-
- TEST(KeywordExtractorTest, Test1) {
- KeywordExtractor Extractor("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8");
-
- {
- string s("你好世界世界而且而且");
- string res;
- size_t topN = 5;
-
- {
- vector<string> words;
- Extractor.Extract(s, words, topN);
- res << words;
- ASSERT_EQ(res, "[\"世界\", \"你好\"]");
- }
-
- {
- vector<pair<string, double> > words;
- Extractor.Extract(s, words, topN);
- res << words;
- ASSERT_EQ(res, "[世界:8.73506, 你好:7.95788]");
- }
-
- {
- vector<KeywordExtractor::Word> words;
- Extractor.Extract(s, words, topN);
- res << words;
- ASSERT_EQ(res, "[{\"word\": \"\xE4\xB8\x96\xE7\x95\x8C\", \"offset\": [6, 12], \"weight\": 8.73506}, {\"word\": \"\xE4\xBD\xA0\xE5\xA5\xBD\", \"offset\": [0], \"weight\": 7.95788}]");
- }
- }
-
- {
- string s("我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。");
- string res;
- vector<KeywordExtractor::Word> wordweights;
- size_t topN = 5;
- Extractor.Extract(s, wordweights, topN);
- res << wordweights;
- ASSERT_EQ(res, "[{\"word\": \"CEO\", \"offset\": [93], \"weight\": 11.7392}, {\"word\": \"\xE5\x8D\x87\xE8\x81\x8C\", \"offset\": [72], \"weight\": 10.8562}, {\"word\": \"\xE5\x8A\xA0\xE8\x96\xAA\", \"offset\": [78], \"weight\": 10.6426}, {\"word\": \"\xE6\x89\x8B\xE6\x89\xB6\xE6\x8B\x96\xE6\x8B\x89\xE6\x9C\xBA\", \"offset\": [21], \"weight\": 10.0089}, {\"word\": \"\xE5\xB7\x85\xE5\xB3\xB0\", \"offset\": [111], \"weight\": 9.49396}]");
- }
-
- {
- string s("一部iPhone6");
- string res;
- vector<KeywordExtractor::Word> wordweights;
- size_t topN = 5;
- Extractor.Extract(s, wordweights, topN);
- res << wordweights;
- ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
- }
- }
-
- TEST(KeywordExtractorTest, Test2) {
- KeywordExtractor Extractor("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8", "../test/testdata/userdict.utf8");
-
- {
- string s("蓝翔优秀毕业生");
- string res;
- vector<KeywordExtractor::Word> wordweights;
- size_t topN = 5;
- Extractor.Extract(s, wordweights, topN);
- res << wordweights;
- ASSERT_EQ(res, "[{\"word\": \"\xE8\x93\x9D\xE7\xBF\x94\", \"offset\": [0], \"weight\": 11.7392}, {\"word\": \"\xE6\xAF\x95\xE4\xB8\x9A\xE7\x94\x9F\", \"offset\": [12], \"weight\": 8.13549}, {\"word\": \"\xE4\xBC\x98\xE7\xA7\x80\", \"offset\": [6], \"weight\": 6.78347}]");
- }
-
- {
- string s("一部iPhone6");
- string res;
- vector<KeywordExtractor::Word> wordweights;
- size_t topN = 5;
- Extractor.Extract(s, wordweights, topN);
- res << wordweights;
- ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
- }
- }
data/ext/cppjieba/test/unittest/pos_tagger_test.cpp
@@ -1,41 +0,0 @@
- #include "cppjieba/MixSegment.hpp"
- #include "gtest/gtest.h"
-
- using namespace cppjieba;
-
- static const char * const QUERY_TEST1 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。";
- static const char * const ANS_TEST1 = "[我:r, 是:v, 蓝翔:x, 技工:n, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, 总经理:n, ,:x, 出任:v, CEO:eng, ,:x, 迎娶:v, 白富:x, 美:ns, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]";
- static const char * const QUERY_TEST2 = "我是蓝翔技工拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上总经理,出任CEO,迎娶白富美,走上人生巅峰。";
- static const char * const ANS_TEST2 = "[我:r, 是:v, 蓝翔:nz, 技工:n, 拖拉机:n, 学院:n, 手扶拖拉机:n, 专业:n, 的:uj, 。:x, 不用:v, 多久:m, ,:x, 我:r, 就:d, 会:v, 升职:v, 加薪:nr, ,:x, 当上:t, 总经理:n, ,:x, 出任:v, CEO:eng, ,:x, 迎娶:v, 白富:x, 美:ns, ,:x, 走上:v, 人生:n, 巅峰:n, 。:x]";
-
- static const char * const QUERY_TEST3 = "iPhone6手机的最大特点是很容易弯曲。";
- static const char * const ANS_TEST3 = "[iPhone6:eng, 手机:n, 的:uj, 最大:a, 特点:n, 是:v, 很:zg, 容易:a, 弯曲:v, 。:x]";
- //static const char * const ANS_TEST3 = "";
-
- TEST(PosTaggerTest, Test) {
- MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");
- {
- vector<pair<string, string> > res;
- tagger.Tag(QUERY_TEST1, res);
- string s;
- s << res;
- ASSERT_TRUE(s == ANS_TEST1);
- }
- }
- TEST(PosTagger, TestUserDict) {
- MixSegment tagger("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
- {
- vector<pair<string, string> > res;
- tagger.Tag(QUERY_TEST2, res);
- string s;
- s << res;
- ASSERT_EQ(s, ANS_TEST2);
- }
- {
- vector<pair<string, string> > res;
- tagger.Tag(QUERY_TEST3, res);
- string s;
- s << res;
- ASSERT_EQ(s, ANS_TEST3);
- }
- }
data/ext/cppjieba/test/unittest/pre_filter_test.cpp
@@ -1,43 +0,0 @@
- #include "gtest/gtest.h"
- #include "cppjieba/PreFilter.hpp"
- #include "limonp/StringUtil.hpp"
-
- using namespace cppjieba;
-
- TEST(PreFilterTest, Test1) {
- unordered_set<Rune> symbol;
- symbol.insert(65292u); // ","
- symbol.insert(12290u); // "。"
- string expected;
- string res;
-
- {
- string s = "你好,美丽的,世界";
- PreFilter filter(symbol, s);
- expected = "你好/,/美丽的/,/世界";
- ASSERT_TRUE(filter.HasNext());
- vector<string> words;
- while (filter.HasNext()) {
- PreFilter::Range range;
- range = filter.Next();
- words.push_back(GetStringFromRunes(s, range.begin, range.end - 1));
- }
- res = limonp::Join(words.begin(), words.end(), "/");
- ASSERT_EQ(res, expected);
- }
-
- {
- string s = "我来自北京邮电大学。。。学号123456,用AK47";
- PreFilter filter(symbol, s);
- expected = "我来自北京邮电大学/。/。/。/学号123456/,/用AK47";
- ASSERT_TRUE(filter.HasNext());
- vector<string> words;
- while (filter.HasNext()) {
- PreFilter::Range range;
- range = filter.Next();
- words.push_back(GetStringFromRunes(s, range.begin, range.end - 1));
- }
- res = limonp::Join(words.begin(), words.end(), "/");
- ASSERT_EQ(res, expected);
- }
- }
data/ext/cppjieba/test/unittest/segments_test.cpp
@@ -1,256 +0,0 @@
- #include "cppjieba/SegmentBase.hpp"
- #include "cppjieba/MixSegment.hpp"
- #include "cppjieba/MPSegment.hpp"
- #include "cppjieba/HMMSegment.hpp"
- #include "cppjieba/FullSegment.hpp"
- #include "cppjieba/QuerySegment.hpp"
- #include "gtest/gtest.h"
-
- using namespace cppjieba;
-
- TEST(MixSegmentTest, Test1) {
- MixSegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8");;
- string sentence;
- vector<string> words;
- string actual;
- string expected;
-
- {
- sentence = "我来自北京邮电大学。。。学号123456,用AK47";
- expected = "我/来自/北京邮电大学/。/。/。/学号/123456/,/用/AK47";
- segment.Cut(sentence, words);
- actual = Join(words.begin(), words.end(), "/");
- ASSERT_EQ(actual, expected);
- }
-
- {
- sentence = "B超 T恤";
- expected = "B超/ /T恤";
- segment.Cut(sentence, words);
- actual = Join(words.begin(), words.end(), "/");
- ASSERT_EQ(actual, expected);
- }
-
- {
- sentence = "他来到了网易杭研大厦";
- expected = "他/来到/了/网易/杭/研/大厦";
- segment.Cut(sentence, words, false);
- actual = Join(words.begin(), words.end(), "/");
- ASSERT_EQ(actual, expected);
- }
-
- {
- sentence = "他来到了网易杭研大厦";
- expected = "他/来到/了/网易/杭研/大厦";
- segment.Cut(sentence, words);
- actual = Join(words.begin(), words.end(), "/");
- ASSERT_EQ(actual, expected);
- }
- }
-
- TEST(MixSegmentTest, NoUserDict) {
- MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8");
- const char* str = "令狐冲是云计算方面的专家";
- vector<string> words;
- segment.Cut(str, words);
- string res;
- ASSERT_EQ("[\"令狐冲\", \"是\", \"云\", \"计算\", \"方面\", \"的\", \"专家\"]", res << words);
-
- }
- TEST(MixSegmentTest, UserDict) {
- MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/user.dict.utf8");
- {
- const char* str = "令狐冲是云计算方面的专家";
- vector<string> words;
- segment.Cut(str, words);
- string res;
- ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
- }
- {
- const char* str = "小明先就职于IBM,后在日本京都大学深造";
- vector<string> words;
- segment.Cut(str, words);
- string res;
- res << words;
- ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"IBM\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
- }
- {
- const char* str = "IBM,3.14";
- vector<string> words;
- segment.Cut(str, words);
- string res;
- res << words;
- ASSERT_EQ("[\"IBM\", \",\", \"3.14\"]", res);
- }
- }
- TEST(MixSegmentTest, TestUserDict) {
- MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8",
- "../test/testdata/userdict.utf8");
- vector<string> words;
- string res;
-
- segment.Cut("令狐冲是云计算方面的专家", words);
- ASSERT_EQ("[\"令狐冲\", \"是\", \"云计算\", \"方面\", \"的\", \"专家\"]", res << words);
-
- segment.Cut("小明先就职于IBM,后在日本京都大学深造", words);
- res << words;
- ASSERT_EQ("[\"小明\", \"先\", \"就职\", \"于\", \"I\", \"B\", \"M\", \",\", \"后\", \"在\", \"日本\", \"京都大学\", \"深造\"]", res);
-
- segment.Cut("IBM,3.14", words);
- res << words;
- ASSERT_EQ("[\"I\", \"B\", \"M\", \",\", \"3.14\"]", res);
-
- segment.Cut("忽如一夜春风来,千树万树梨花开", words);
- res = limonp::Join(words.begin(), words.end(), "/");
- ASSERT_EQ("忽如一夜春风来/,/千树/万树/梨花/开", res);
-
- // rand input
- {
- const size_t ITERATION = 16;
- const size_t MAX_LEN = 256;
- string s;
- srand(time(NULL));
-
- for (size_t i = 0; i < ITERATION; i++) {
- size_t len = rand() % MAX_LEN;
- s.resize(len);
- for (size_t j = 0; j < len; j++) {
- s[rand() % len] = rand();
- }
- segment.Cut(s, words);
- }
- }
- }
-
- TEST(MixSegmentTest, TestMultiUserDict) {
- MixSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8",
- "../test/testdata/userdict.utf8;../test/testdata/userdict.2.utf8");
- vector<string> words;
- string res;
-
- segment.Cut("忽如一夜春风来,千树万树梨花开", words);
- res = limonp::Join(words.begin(), words.end(), "/");
- ASSERT_EQ("忽如一夜春风来/,/千树万树梨花开", res);
- }
-
- TEST(MPSegmentTest, Test1) {
- MPSegment segment("../dict/jieba.dict.utf8");;
- string s;
- vector<string> words;
- segment.Cut("我来自北京邮电大学。", words);
- ASSERT_EQ("[\"我\", \"来自\", \"北京邮电大学\", \"。\"]", s << words);
-
- segment.Cut("B超 T恤", words);
- ASSERT_EQ(s << words, "[\"B超\", \" \", \"T恤\"]");
-
- segment.Cut("南京市长江大桥", words);
- ASSERT_EQ("[\"南京市\", \"长江大桥\"]", s << words);
-
- // MaxWordLen
- segment.Cut("南京市长江大桥", words, 3);
- ASSERT_EQ("[\"南京市\", \"长江\", \"大桥\"]", s << words);
-
- segment.Cut("南京市长江大桥", words, 0);
- ASSERT_EQ("[\"南\", \"京\", \"市\", \"长\", \"江\", \"大\", \"桥\"]", s << words);
-
- segment.Cut("湖南长沙市天心区", words);
- s = Join(words.begin(), words.end(), "/");
- ASSERT_EQ("湖南长沙市/天心区", s);
-
- segment.Cut("湖南长沙市天心区", words, 3);
- s = Join(words.begin(), words.end(), "/");
- ASSERT_EQ("湖南/长沙市/天心区", s);
- }
-
- TEST(HMMSegmentTest, Test1) {
- HMMSegment segment("../dict/hmm_model.utf8");;
- {
- const char* str = "我来自北京邮电大学。。。学号123456";
- const char* res[] = {"我来", "自北京", "邮电大学", "。", "。", "。", "学号", "123456"};
- vector<string> words;
- segment.Cut(str, words);
- ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
- }
-
- {
- const char* str = "IBM,1.2,123";
- const char* res[] = {"IBM", ",", "1.2", ",", "123"};
- vector<string> words;
- segment.Cut(str, words);
- ASSERT_EQ(words, vector<string>(res, res + sizeof(res)/sizeof(res[0])));
- }
- }
-
- TEST(FullSegment, Test1) {
- FullSegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8");
- vector<string> words;
- string s;
-
- segment.Cut("我来自北京邮电大学", words);
- s << words;
- ASSERT_EQ(s, "[\"我\", \"来自\", \"北京\", \"北京邮电大学\", \"邮电\", \"电大\", \"大学\"]");
-
-
- segment.Cut("上市公司CEO", words);
- s << words;
- ASSERT_EQ(s, "[\"上市\", \"公司\", \"C\", \"E\", \"O\"]");
- }
-
- TEST(QuerySegment, Test1) {
- QuerySegment segment("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "");
- vector<string> words;
- string s1, s2;
-
- segment.Cut("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", words);
- s1 = Join(words.begin(), words.end(), "/");
- s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造";
- ASSERT_EQ(s1, s2);
-
- segment.Cut("亲口交代", words);
- s1 = Join(words.begin(), words.end(), "/");
- s2 = "亲口/交代";
- ASSERT_EQ(s1, s2);
-
- segment.Cut("他心理健康", words);
- s1 = Join(words.begin(), words.end(), "/");
- s2 = "他/心理/健康/心理健康";
- ASSERT_EQ(s1, s2);
- }
-
- TEST(QuerySegment, Test2) {
- QuerySegment segment("../test/testdata/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8|../test/testdata/userdict.english");
- vector<string> words;
- string s1, s2;
-
- {
- segment.Cut("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", words);
- s1 = Join(words.begin(), words.end(), "/");
- s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/京都大学/深造";
- ASSERT_EQ(s1, s2);
- }
-
- {
- segment.Cut("小明硕士毕业于中国科学院计算所iPhone6", words);
- s1 = Join(words.begin(), words.end(), "/");
- s2 = "小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/iPhone6";
- ASSERT_EQ(s1, s2);
- }
-
- {
- segment.Cut("中国科学院", words);
- s1 = Join(words.begin(), words.end(), "/");
- s2 = "中国/科学/学院/科学院/中国科学院";
- ASSERT_EQ(s1, s2);
- }
-
- }
-
- TEST(MPSegmentTest, Unicode32) {
- string s("天气很好,🙋 我们去郊游。");
- vector<string> words;
-
- MPSegment segment("../dict/jieba.dict.utf8");;
- segment.Cut(s, words);
-
- ASSERT_EQ(Join(words.begin(), words.end(), "/"), "天气/很/好/,/🙋/ /我们/去/郊游/。");
- }
data/ext/cppjieba/test/unittest/textrank_test.cpp
@@ -1,86 +0,0 @@
- #include "cppjieba/TextRankExtractor.hpp"
- #include "gtest/gtest.h"
-
- using namespace cppjieba;
-
- TEST(TextRankExtractorTest, Test1) {
- TextRankExtractor Extractor(
- "../test/testdata/extra_dict/jieba.dict.small.utf8",
- "../dict/hmm_model.utf8",
- "../dict/stop_words.utf8");
- {
- string s("你好世界世界而且而且");
- string res;
- size_t topN = 5;
-
- {
- vector<string> words;
- Extractor.Extract(s, words, topN);
- res << words;
- ASSERT_EQ(res, "[\"世界\", \"你好\"]");
- }
-
- {
- vector<pair<string, double> > words;
- Extractor.Extract(s, words, topN);
- res << words;
- ASSERT_EQ(res, "[世界:1, 你好:0.519787]");
- }
-
- {
- vector<TextRankExtractor::Word> words;
- Extractor.Extract(s, words, topN);
- res << words;
- ASSERT_EQ(res, "[{\"word\": \"世界\", \"offset\": [6, 12], \"weight\": 1}, {\"word\": \"你好\", \"offset\": [0], \"weight\": 0.519787}]");
- }
- }
-
- {
- string s("\xe6\x88\x91\xe6\x98\xaf\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe5\xad\xa6\xe9\x99\xa2\xe6\x89\x8b\xe6\x89\xb6\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe4\xb8\x93\xe4\xb8\x9a\xe7\x9a\x84\xe3\x80\x82\xe4\xb8\x8d\xe7\x94\xa8\xe5\xa4\x9a\xe4\xb9\x85\xef\xbc\x8c\xe6\x88\x91\xe5\xb0\xb1\xe4\xbc\x9a\xe5\x8d\x87\xe8\x81\x8c\xe5\x8a\xa0\xe8\x96\xaa\xef\xbc\x8c\xe5\xbd\x93\xe4\xb8\x8a CEO\xef\xbc\x8c\xe8\xb5\xb0\xe4\xb8\x8a\xe4\xba\xba\xe7\x94\x9f\xe5\xb7\x85\xe5\xb3\xb0");
- string res;
- vector<TextRankExtractor::Word> wordweights;
- size_t topN = 5;
- Extractor.Extract(s, wordweights, topN);
- res << wordweights;
- ASSERT_EQ(res, "[{\"word\": \"当上\", \"offset\": [87], \"weight\": 1}, {\"word\": \"不用\", \"offset\": [48], \"weight\": 0.989848}, {\"word\": \"多久\", \"offset\": [54], \"weight\": 0.985126}, {\"word\": \"加薪\", \"offset\": [78], \"weight\": 0.983046}, {\"word\": \"升职\", \"offset\": [72], \"weight\": 0.980278}]");
- //ASSERT_EQ(res, "[{\"word\": \"专业\", \"offset\": [36], \"weight\": 1}, {\"word\": \"CEO\", \"offset\": [94], \"weight\": 0.95375}, {\"word\": \"手扶拖拉机\", \"offset\": [21], \"weight\": 0.801701}, {\"word\": \"当上\", \"offset\": [87], \"weight\": 0.798968}, {\"word\": \"走上\", \"offset\": [100], \"weight\": 0.775505}]");
- }
-
- {
- string s("一部iPhone6");
- string res;
- vector<TextRankExtractor::Word> wordweights;
- size_t topN = 5;
- Extractor.Extract(s, wordweights, topN);
- res << wordweights;
- ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]");
- }
- }
-
- TEST(TextRankExtractorTest, Test2) {
- TextRankExtractor Extractor(
- "../test/testdata/extra_dict/jieba.dict.small.utf8",
- "../dict/hmm_model.utf8",
- "../dict/stop_words.utf8",
- "../test/testdata/userdict.utf8");
-
- {
- string s("\xe8\x93\x9d\xe7\xbf\x94\xe4\xbc\x98\xe7\xa7\x80\xe6\xaf\x95\xe4\xb8\x9a\xe7\x94\x9f");
- string res;
- vector<TextRankExtractor::Word> wordweights;
- size_t topN = 5;
- Extractor.Extract(s, wordweights, topN);
- res << wordweights;
- ASSERT_EQ(res, "[{\"word\": \"蓝翔\", \"offset\": [0], \"weight\": 1}, {\"word\": \"毕业生\", \"offset\": [12], \"weight\": 0.996685}, {\"word\": \"优秀\", \"offset\": [6], \"weight\": 0.992994}]");
- }
-
- {
- string s("一部iPhone6");
- string res;
- vector<TextRankExtractor::Word> wordweights;
- size_t topN = 5;
- Extractor.Extract(s, wordweights, topN);
- res << wordweights;
- ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]");
- }
- }