cppjieba_rb 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (142) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +26 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +81 -0
  8. data/Rakefile +20 -0
  9. data/cppjieba_rb.gemspec +50 -0
  10. data/ext/cppjieba/.gitignore +17 -0
  11. data/ext/cppjieba/.travis.yml +22 -0
  12. data/ext/cppjieba/CMakeLists.txt +28 -0
  13. data/ext/cppjieba/ChangeLog.md +236 -0
  14. data/ext/cppjieba/README.md +285 -0
  15. data/ext/cppjieba/README_EN.md +111 -0
  16. data/ext/cppjieba/appveyor.yml +32 -0
  17. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  18. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  42. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  45. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  46. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  56. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  57. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  58. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  59. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  60. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  61. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  62. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  63. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  64. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  65. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  66. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  67. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  68. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  69. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  70. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  71. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  72. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  73. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  74. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  75. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  76. data/ext/cppjieba/dict/README.md +31 -0
  77. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  78. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  79. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  80. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  83. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  84. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  85. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  86. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  87. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  88. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  89. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  90. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  91. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  92. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  93. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  94. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  95. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  96. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  98. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
  99. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  100. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  101. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  102. data/ext/cppjieba/test/CMakeLists.txt +5 -0
  103. data/ext/cppjieba/test/demo.cpp +80 -0
  104. data/ext/cppjieba/test/load_test.cpp +54 -0
  105. data/ext/cppjieba/test/testdata/curl.res +1 -0
  106. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
  107. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
  108. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
  109. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  110. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  111. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  112. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  113. data/ext/cppjieba/test/testdata/load_test.urls +2 -0
  114. data/ext/cppjieba/test/testdata/review.100 +100 -0
  115. data/ext/cppjieba/test/testdata/review.100.res +200 -0
  116. data/ext/cppjieba/test/testdata/server.conf +19 -0
  117. data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
  118. data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
  119. data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
  120. data/ext/cppjieba/test/testdata/userdict.english +2 -0
  121. data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
  122. data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
  123. data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
  124. data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
  125. data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
  126. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
  127. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
  128. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
  129. data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
  130. data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
  131. data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
  132. data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
  133. data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
  134. data/ext/cppjieba_rb/extconf.rb +26 -0
  135. data/ext/cppjieba_rb/internal.cc +148 -0
  136. data/lib/cppjieba_rb/segment.rb +20 -0
  137. data/lib/cppjieba_rb/version.rb +3 -0
  138. data/lib/cppjieba_rb.rb +34 -0
  139. data/test/test_keyword.rb +17 -0
  140. data/test/test_segment.rb +24 -0
  141. data/test/test_tagging.rb +19 -0
  142. metadata +244 -0
@@ -0,0 +1,86 @@
1
+ #include "cppjieba/TextRankExtractor.hpp"
2
+ #include "gtest/gtest.h"
3
+
4
+ using namespace cppjieba;
5
+
6
+ TEST(TextRankExtractorTest, Test1) {
7
+ TextRankExtractor Extractor(
8
+ "../test/testdata/extra_dict/jieba.dict.small.utf8",
9
+ "../dict/hmm_model.utf8",
10
+ "../dict/stop_words.utf8");
11
+ {
12
+ string s("你好世界世界而且而且");
13
+ string res;
14
+ size_t topN = 5;
15
+
16
+ {
17
+ vector<string> words;
18
+ Extractor.Extract(s, words, topN);
19
+ res << words;
20
+ ASSERT_EQ(res, "[\"世界\", \"你好\"]");
21
+ }
22
+
23
+ {
24
+ vector<pair<string, double> > words;
25
+ Extractor.Extract(s, words, topN);
26
+ res << words;
27
+ ASSERT_EQ(res, "[世界:1, 你好:0.519787]");
28
+ }
29
+
30
+ {
31
+ vector<TextRankExtractor::Word> words;
32
+ Extractor.Extract(s, words, topN);
33
+ res << words;
34
+ ASSERT_EQ(res, "[{\"word\": \"世界\", \"offset\": [6, 12], \"weight\": 1}, {\"word\": \"你好\", \"offset\": [0], \"weight\": 0.519787}]");
35
+ }
36
+ }
37
+
38
+ {
39
+ string s("\xe6\x88\x91\xe6\x98\xaf\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe5\xad\xa6\xe9\x99\xa2\xe6\x89\x8b\xe6\x89\xb6\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe4\xb8\x93\xe4\xb8\x9a\xe7\x9a\x84\xe3\x80\x82\xe4\xb8\x8d\xe7\x94\xa8\xe5\xa4\x9a\xe4\xb9\x85\xef\xbc\x8c\xe6\x88\x91\xe5\xb0\xb1\xe4\xbc\x9a\xe5\x8d\x87\xe8\x81\x8c\xe5\x8a\xa0\xe8\x96\xaa\xef\xbc\x8c\xe5\xbd\x93\xe4\xb8\x8a CEO\xef\xbc\x8c\xe8\xb5\xb0\xe4\xb8\x8a\xe4\xba\xba\xe7\x94\x9f\xe5\xb7\x85\xe5\xb3\xb0");
40
+ string res;
41
+ vector<TextRankExtractor::Word> wordweights;
42
+ size_t topN = 5;
43
+ Extractor.Extract(s, wordweights, topN);
44
+ res << wordweights;
45
+ ASSERT_EQ(res, "[{\"word\": \"当上\", \"offset\": [87], \"weight\": 1}, {\"word\": \"不用\", \"offset\": [48], \"weight\": 0.989848}, {\"word\": \"多久\", \"offset\": [54], \"weight\": 0.985126}, {\"word\": \"加薪\", \"offset\": [78], \"weight\": 0.983046}, {\"word\": \"升职\", \"offset\": [72], \"weight\": 0.980278}]");
46
+ //ASSERT_EQ(res, "[{\"word\": \"专业\", \"offset\": [36], \"weight\": 1}, {\"word\": \"CEO\", \"offset\": [94], \"weight\": 0.95375}, {\"word\": \"手扶拖拉机\", \"offset\": [21], \"weight\": 0.801701}, {\"word\": \"当上\", \"offset\": [87], \"weight\": 0.798968}, {\"word\": \"走上\", \"offset\": [100], \"weight\": 0.775505}]");
47
+ }
48
+
49
+ {
50
+ string s("一部iPhone6");
51
+ string res;
52
+ vector<TextRankExtractor::Word> wordweights;
53
+ size_t topN = 5;
54
+ Extractor.Extract(s, wordweights, topN);
55
+ res << wordweights;
56
+ ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]");
57
+ }
58
+ }
59
+
60
+ TEST(TextRankExtractorTest, Test2) {
61
+ TextRankExtractor Extractor(
62
+ "../test/testdata/extra_dict/jieba.dict.small.utf8",
63
+ "../dict/hmm_model.utf8",
64
+ "../dict/stop_words.utf8",
65
+ "../test/testdata/userdict.utf8");
66
+
67
+ {
68
+ string s("\xe8\x93\x9d\xe7\xbf\x94\xe4\xbc\x98\xe7\xa7\x80\xe6\xaf\x95\xe4\xb8\x9a\xe7\x94\x9f");
69
+ string res;
70
+ vector<TextRankExtractor::Word> wordweights;
71
+ size_t topN = 5;
72
+ Extractor.Extract(s, wordweights, topN);
73
+ res << wordweights;
74
+ ASSERT_EQ(res, "[{\"word\": \"蓝翔\", \"offset\": [0], \"weight\": 1}, {\"word\": \"毕业生\", \"offset\": [12], \"weight\": 0.996685}, {\"word\": \"优秀\", \"offset\": [6], \"weight\": 0.992994}]");
75
+ }
76
+
77
+ {
78
+ string s("一部iPhone6");
79
+ string res;
80
+ vector<TextRankExtractor::Word> wordweights;
81
+ size_t topN = 5;
82
+ Extractor.Extract(s, wordweights, topN);
83
+ res << wordweights;
84
+ ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]");
85
+ }
86
+ }
@@ -0,0 +1,177 @@
1
+ #include "cppjieba/DictTrie.hpp"
2
+ #include "cppjieba/MPSegment.hpp"
3
+ #include "gtest/gtest.h"
4
+
5
+ using namespace cppjieba;
6
+
7
+ static const char* const DICT_FILE = "../test/testdata/extra_dict/jieba.dict.small.utf8";
8
+
9
+ TEST(TrieTest, Empty) {
10
+ vector<Unicode> keys;
11
+ vector<const DictUnit*> values;
12
+ Trie trie(keys, values);
13
+ }
14
+
15
+ TEST(TrieTest, Construct) {
16
+ vector<Unicode> keys;
17
+ vector<const DictUnit*> values;
18
+ keys.push_back(DecodeRunesInString("你"));
19
+ values.push_back((const DictUnit*)(NULL));
20
+ Trie trie(keys, values);
21
+ }
22
+
23
+ TEST(DictTrieTest, NewAndDelete) {
24
+ DictTrie * trie;
25
+ trie = new DictTrie(DICT_FILE);
26
+ delete trie;
27
+ }
28
+
29
+ TEST(DictTrieTest, Test1) {
30
+ string s1, s2;
31
+ DictTrie trie(DICT_FILE);
32
+ ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001);
33
+ string word("来到");
34
+ cppjieba::RuneStrArray uni;
35
+ ASSERT_TRUE(DecodeRunesInString(word, uni));
36
+ //DictUnit nodeInfo;
37
+ //nodeInfo.word = uni;
38
+ //nodeInfo.tag = "v";
39
+ //nodeInfo.weight = -8.87033;
40
+ //s1 << nodeInfo;
41
+ //s2 << (*trie.Find(uni.begin(), uni.end()));
42
+ const DictUnit* du = trie.Find(uni.begin(), uni.end());
43
+ ASSERT_TRUE(du != NULL);
44
+ ASSERT_EQ(2u, du->word.size());
45
+ ASSERT_EQ(26469u, du->word[0]);
46
+ ASSERT_EQ(21040u, du->word[1]);
47
+ ASSERT_EQ("v", du->tag);
48
+ ASSERT_NEAR(-8.870, du->weight, 0.001);
49
+
50
+ //EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
51
+ word = "清华大学";
52
+ LocalVector<pair<size_t, const DictUnit*> > res;
53
+ const char * words[] = {"清", "清华", "清华大学"};
54
+ for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
55
+ ASSERT_TRUE(DecodeRunesInString(words[i], uni));
56
+ res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end())));
57
+ //resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end());
58
+ }
59
+ vector<pair<size_t, const DictUnit*> > vec;
60
+ vector<struct Dag> dags;
61
+ ASSERT_TRUE(DecodeRunesInString(word, uni));
62
+ trie.Find(uni.begin(), uni.end(), dags);
63
+ ASSERT_EQ(dags.size(), uni.size());
64
+ ASSERT_NE(dags.size(), 0u);
65
+ s1 << res;
66
+ s2 << dags[0].nexts;
67
+ ASSERT_EQ(s1, s2);
68
+
69
+ }
70
+
71
+ TEST(DictTrieTest, UserDict) {
72
+ DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
73
+ string word = "云计算";
74
+ cppjieba::RuneStrArray unicode;
75
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
76
+ const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
77
+ ASSERT_TRUE(unit != NULL);
78
+ ASSERT_NEAR(unit->weight, -14.100, 0.001);
79
+
80
+ word = "蓝翔";
81
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
82
+ unit = trie.Find(unicode.begin(), unicode.end());
83
+ ASSERT_TRUE(unit != NULL);
84
+ ASSERT_EQ(unit->tag, "nz");
85
+ ASSERT_NEAR(unit->weight, -14.100, 0.001);
86
+
87
+ word = "区块链";
88
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
89
+ unit = trie.Find(unicode.begin(), unicode.end());
90
+ ASSERT_TRUE(unit != NULL);
91
+ ASSERT_EQ(unit->tag, "nz");
92
+ ASSERT_NEAR(unit->weight, -15.6478, 0.001);
93
+ }
94
+
95
+ TEST(DictTrieTest, UserDictWithMaxWeight) {
96
+ DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);
97
+ string word = "云计算";
98
+ cppjieba::RuneStrArray unicode;
99
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
100
+ const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
101
+ ASSERT_TRUE(unit);
102
+ ASSERT_NEAR(unit->weight, -2.975, 0.001);
103
+ }
104
+
105
+ TEST(DictTrieTest, Dag) {
106
+ DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
107
+
108
+ {
109
+ string word = "清华大学";
110
+ cppjieba::RuneStrArray unicode;
111
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
112
+ vector<struct Dag> res;
113
+ trie.Find(unicode.begin(), unicode.end(), res);
114
+
115
+ size_t nexts_sizes[] = {3, 2, 2, 1};
116
+ ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
117
+ for (size_t i = 0; i < res.size(); i++) {
118
+ ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
119
+ }
120
+ }
121
+
122
+ {
123
+ string word = "北京邮电大学";
124
+ cppjieba::RuneStrArray unicode;
125
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
126
+ vector<struct Dag> res;
127
+ trie.Find(unicode.begin(), unicode.end(), res);
128
+
129
+ size_t nexts_sizes[] = {3, 1, 2, 2, 2, 1};
130
+ ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
131
+ for (size_t i = 0; i < res.size(); i++) {
132
+ ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
133
+ }
134
+ }
135
+
136
+ {
137
+ string word = "长江大桥";
138
+ cppjieba::RuneStrArray unicode;
139
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
140
+ vector<struct Dag> res;
141
+ trie.Find(unicode.begin(), unicode.end(), res);
142
+
143
+ size_t nexts_sizes[] = {3, 1, 2, 1};
144
+ ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
145
+ for (size_t i = 0; i < res.size(); i++) {
146
+ ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
147
+ }
148
+ }
149
+
150
+ {
151
+ string word = "长江大桥";
152
+ cppjieba::RuneStrArray unicode;
153
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
154
+ vector<struct Dag> res;
155
+ trie.Find(unicode.begin(), unicode.end(), res, 3);
156
+
157
+ size_t nexts_sizes[] = {2, 1, 2, 1};
158
+ ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
159
+ for (size_t i = 0; i < res.size(); i++) {
160
+ ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
161
+ }
162
+ }
163
+
164
+ {
165
+ string word = "长江大桥";
166
+ cppjieba::RuneStrArray unicode;
167
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
168
+ vector<struct Dag> res;
169
+ trie.Find(unicode.begin(), unicode.end(), res, 4);
170
+
171
+ size_t nexts_sizes[] = {3, 1, 2, 1};
172
+ ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
173
+ for (size_t i = 0; i < res.size(); i++) {
174
+ ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
175
+ }
176
+ }
177
+ }
@@ -0,0 +1,43 @@
1
+ #include "cppjieba/Unicode.hpp"
2
+ #include "limonp/StdExtension.hpp"
3
+ #include "gtest/gtest.h"
4
+
5
+ using namespace cppjieba;
6
+ using namespace std;
7
+
8
+ TEST(UnicodeTest, Test1) {
9
+ string s = "你好世界";
10
+ RuneStrArray runes;
11
+ ASSERT_TRUE(DecodeRunesInString(s, runes));
12
+ string actual;
13
+ string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]";
14
+ actual << runes;
15
+ ASSERT_EQ(expected, actual);
16
+ }
17
+
18
+ TEST(UnicodeTest, Illegal) {
19
+ string s = "123\x80";
20
+ RuneStrArray runes;
21
+ ASSERT_FALSE(DecodeRunesInString(s, runes));
22
+ string actual;
23
+ string expected = "[]";
24
+ actual << runes;
25
+ ASSERT_EQ(expected, actual);
26
+ }
27
+
28
+ TEST(UnicodeTest, Rand) {
29
+ const size_t ITERATION = 1024;
30
+ const size_t MAX_LEN = 256;
31
+ string s;
32
+ srand(time(NULL));
33
+
34
+ for (size_t i = 0; i < ITERATION; i++) {
35
+ size_t len = rand() % MAX_LEN;
36
+ s.resize(len);
37
+ for (size_t j = 0; j < len; j++) {
38
+ s[rand() % len] = rand();
39
+ }
40
+ RuneStrArray runes;
41
+ DecodeRunesInString(s, runes);
42
+ }
43
+ }
@@ -0,0 +1,10 @@
1
+ #include <ruby.h>
2
+
3
+ VALUE rb_mCppjiebaRb;
4
+
5
+ void Init_cppjieba_rb()
6
+ {
7
+ rb_mCppjiebaRb = rb_define_module("CppjiebaRb");
8
+
9
+ Init_internal();
10
+ }
@@ -0,0 +1,26 @@
1
+ require "mkmf"
2
# Absolute path of the directory that holds this extconf.rb.
ext_dir = File.expand_path(File.dirname(__FILE__))

LIBDIR = RbConfig::CONFIG['libdir']
INCLUDEDIR = RbConfig::CONFIG['includedir']

# Header search path: Ruby's own include dir plus the bundled cppjieba
# headers and its bundled dependencies.
HEADER_DIRS = [
  INCLUDEDIR,
  "#{ext_dir}/../cppjieba/include",
  "#{ext_dir}/../cppjieba/deps"
]

LIB_DIRS = [LIBDIR]

dir_config('cppjieba_src', HEADER_DIRS, LIB_DIRS)

# The same flags are appended both to mkmf's CONFIG entry and to $CXXFLAGS.
CONFIG["CXXFLAGS"] = CONFIG["CXXFLAGS"] + " -std=c++11 -O3"
$CXXFLAGS = "#{$CXXFLAGS} -std=c++11 -O3"

create_makefile 'cppjieba_rb'

# respect header changes: make every object depend on the headers found in
# the current directory by appending a rule to the generated Makefile.
header_list = Dir.glob('*.{hpp,h}').join(' ')
File.open('Makefile', 'a') { |makefile| makefile.puts "\n$(OBJS): #{header_list}" }
@@ -0,0 +1,148 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include "cppjieba/Jieba.hpp"
4
+
5
+ #define GET_CPPJIEBA(_data) jieba_cpp_data* _data; \
6
+ TypedData_Get_Struct(self, jieba_cpp_data, &jieba_cpp_type, _data)
7
+
8
+ typedef struct {
9
+ cppjieba::Jieba* jieba;
10
+ } jieba_cpp_data;
11
+
12
+ // make compiler happy
13
+ typedef VALUE (ruby_method)(...);
14
+
15
+ static ID rb_sMp;
16
+ static ID rb_sMix;
17
+ static ID rb_sHmm;
18
+ static ID rb_sQuery;
19
+ static ID rb_sFull;
20
+ static rb_encoding* u8_enc;
21
+ VALUE rb_cCppjiebaRb_Internal;
22
+ extern "C" VALUE rb_mCppjiebaRb;
23
+
24
+ static void jieba_cpp_free(void* _this)
25
+ {
26
+ jieba_cpp_data* data = static_cast<jieba_cpp_data*>(_this);
27
+ delete data->jieba;
28
+ data->jieba = nullptr;
29
+ }
30
+
31
+ static size_t jieba_cpp_memsize(const void* _)
32
+ {
33
+ return sizeof(jieba_cpp_data);
34
+ }
35
+
36
+ static const rb_data_type_t jieba_cpp_type = {
37
+ "jieba/internal",
38
+ {NULL, jieba_cpp_free, jieba_cpp_memsize,},
39
+ 0, 0,
40
+ RUBY_TYPED_FREE_IMMEDIATELY,
41
+ };
42
+
43
+ extern "C" {
44
+
45
+ VALUE internal_alloc(VALUE self)
46
+ {
47
+ jieba_cpp_data* data;
48
+ return TypedData_Make_Struct(self, jieba_cpp_data, &jieba_cpp_type, data);
49
+ }
50
+
51
+ VALUE internal_initialize(VALUE self,
52
+ VALUE dict_path,
53
+ VALUE model_path,
54
+ VALUE user_dict_path,
55
+ VALUE idf_path,
56
+ VALUE stop_word_path)
57
+ {
58
+ GET_CPPJIEBA(data);
59
+ data->jieba = new cppjieba::Jieba(StringValueCStr(dict_path),
60
+ StringValueCStr(model_path),
61
+ StringValueCStr(user_dict_path),
62
+ StringValueCStr(idf_path),
63
+ StringValueCStr(stop_word_path));
64
+ }
65
+
66
+ VALUE internal_extract_keyword(VALUE self, VALUE text_rbs, VALUE topN)
67
+ {
68
+ std::string text = StringValueCStr(text_rbs);
69
+ int top_n = NUM2INT(topN);
70
+ GET_CPPJIEBA(data);
71
+
72
+ std::vector<std::pair<std::string, double> > top_words;
73
+
74
+ data->jieba->extractor.Extract(text, top_words, top_n);
75
+ VALUE arr = rb_ary_new2(top_words.size());
76
+ for (auto iter = top_words.begin(); iter != top_words.end(); iter++) {
77
+ VALUE inner_arr = rb_ary_new2(2);
78
+ rb_ary_push(inner_arr, rb_enc_str_new(iter->first.c_str(), iter->first.length(), u8_enc));
79
+ rb_ary_push(inner_arr, rb_float_new(iter->second));
80
+ rb_ary_push(arr, inner_arr);
81
+ }
82
+ return arr;
83
+ }
84
+
85
+ static VALUE internal_segment(VALUE self, VALUE text_rbs, VALUE mode, VALUE max_length_, VALUE hmm_)
86
+ {
87
+ std::string text = StringValueCStr(text_rbs);
88
+ size_t max_length = NUM2UINT(max_length_);
89
+ int hmm = (hmm_ == Qtrue ? 1 : 0);
90
+ GET_CPPJIEBA(data);
91
+ ID rb_sMode = SYM2ID(mode);
92
+ std::vector<std::string> words;
93
+
94
+ if (rb_sMode == rb_sMp) {
95
+ data->jieba->CutSmall(text, words, max_length);
96
+ } else if (rb_sMode == rb_sMix) {
97
+ data->jieba->Cut(text, words, hmm);
98
+ } else if (rb_sMode == rb_sHmm) {
99
+ data->jieba->CutHMM(text, words);
100
+ } else if (rb_sMode == rb_sQuery) {
101
+ data->jieba->CutForSearch(text, words, hmm);
102
+ } else if (rb_sMode == rb_sFull) {
103
+ data->jieba->CutAll(text, words);
104
+ }
105
+
106
+ VALUE arr = rb_ary_new2(words.size());
107
+ for (auto iter = words.begin(); iter != words.end(); iter++) {
108
+ rb_ary_push(arr, rb_enc_str_new(iter->c_str(), iter->length(), u8_enc));
109
+ }
110
+ return arr;
111
+ }
112
+
113
+ static VALUE internal_segment_tag(VALUE self, VALUE text_rbs)
114
+ {
115
+ std::string text = StringValueCStr(text_rbs);
116
+ GET_CPPJIEBA(data);
117
+
118
+ std::vector<std::pair<std::string, std::string>> words;
119
+ data->jieba->Tag(text, words);
120
+
121
+ VALUE result = rb_hash_new();
122
+ for (auto iter = words.begin(); iter != words.end(); iter++) {
123
+ rb_hash_aset(result,
124
+ rb_enc_str_new(iter->first.c_str(), iter->first.length(), u8_enc),
125
+ rb_enc_str_new(iter->second.c_str(), iter->second.length(), u8_enc));
126
+ }
127
+ return result;
128
+ }
129
+
130
+
131
+ void Init_internal()
132
+ {
133
+ rb_sMp = rb_intern("mp");
134
+ rb_sMix = rb_intern("mix");
135
+ rb_sHmm = rb_intern("hmm");
136
+ rb_sQuery = rb_intern("query");
137
+ rb_sFull = rb_intern("full");
138
+ u8_enc = rb_utf8_encoding();
139
+
140
+ rb_cCppjiebaRb_Internal = rb_define_class_under(rb_mCppjiebaRb, "Internal", rb_cData);
141
+ rb_define_alloc_func(rb_cCppjiebaRb_Internal, internal_alloc);
142
+ rb_define_method(rb_cCppjiebaRb_Internal, "initialize", (ruby_method*) &internal_initialize, 5);
143
+ rb_define_method(rb_cCppjiebaRb_Internal, "extract_keyword", (ruby_method*) &internal_extract_keyword, 2);
144
+ rb_define_method(rb_cCppjiebaRb_Internal, "segment", (ruby_method*) &internal_segment, 4);
145
+ rb_define_method(rb_cCppjiebaRb_Internal, "segment_tag", (ruby_method*) &internal_segment_tag, 1);
146
+ }
147
+
148
+ }
@@ -0,0 +1,20 @@
1
module CppjiebaRb
  # Holds segmentation options and forwards the actual work to
  # CppjiebaRb.internal.segment.
  class Segment
    # Segmentation modes accepted for the :mode option.
    VALID_MODES = %i[mix hmm mp query full].freeze

    # opts - optional Hash (nil is treated as an empty Hash):
    #   :mode            - one of VALID_MODES, defaults to :mix
    #   :max_word_length - defaults to 8
    #   :hmm             - defaults to true; pass false to disable it
    #
    # Raises ArgumentError when :mode is given but is not in VALID_MODES.
    def initialize(opts = nil)
      opts ||= {}
      mode = opts[:mode]
      unless mode.nil? || VALID_MODES.include?(mode)
        # List every valid mode; the old message omitted :query and :full.
        expected = VALID_MODES.map(&:inspect).join(' ')
        raise ArgumentError, "The mode is #{mode}. It should be one of #{expected}"
      end

      @mode = mode || :mix
      @max_word_length = opts[:max_word_length] || 8
      # `opts[:hmm] || true` was always truthy, so `hmm: false` was ignored.
      # Only fall back to the default when the option is absent (or nil).
      @hmm = opts[:hmm].nil? ? true : opts[:hmm]
    end

    # Segments str with the configured mode, maximum word length and hmm flag.
    def segment(str)
      CppjiebaRb.internal.segment(str, @mode, @max_word_length, @hmm)
    end
  end
end
@@ -0,0 +1,3 @@
1
module CppjiebaRb
  # Version string of the gem.
  VERSION = "0.2.1"
end
@@ -0,0 +1,34 @@
1
+ require 'cppjieba_rb/cppjieba_rb'
2
+ require 'cppjieba_rb/version'
3
+ require 'cppjieba_rb/segment'
4
+
5
module CppjiebaRb
  # Directory holding the bundled dictionaries, relative to this file.
  EXT_BASE = File.join(File.dirname(__FILE__), '..', 'ext', 'cppjieba', 'dict')

  # Dictionary and model files handed to the native backend.
  DICT_PATH = File.join(EXT_BASE, 'jieba.dict.utf8')
  HMM_DICT_PATH = File.join(EXT_BASE, 'hmm_model.utf8')
  USER_DICT = File.join(EXT_BASE, 'user.dict.utf8')
  IDF_PATH = File.join(EXT_BASE, 'idf.utf8')
  STOP_WORD_PATH = File.join(EXT_BASE, 'stop_words.utf8')

  class << self
    # Forwards to the backend's extract_keyword with str and top_n.
    def extract_keyword(str, top_n)
      internal.extract_keyword(str, top_n)
    end

    # Builds a Segment from opts and segments str with it.
    def segment(str, opts = nil)
      segmenter = CppjiebaRb::Segment.new(opts)
      segmenter.segment(str)
    end

    # Forwards to the backend's segment_tag with str.
    def segment_tag(str)
      internal.segment_tag(str)
    end

    # Memoized native backend, created on first use from the bundled
    # dictionary paths above.
    def internal
      @backend || (@backend = CppjiebaRb::Internal.new(
        DICT_PATH, HMM_DICT_PATH, USER_DICT, IDF_PATH, STOP_WORD_PATH
      ))
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # coding: utf-8
2
+ require 'minitest/autorun'
3
+ require 'cppjieba_rb'
4
+
5
+ class JiebaTest < Minitest::Test
6
+ def test_keywords
7
+ results = CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 5
8
+
9
+ assert_equal [["CEO",
10
+ 11.739204307083542],
11
+ ["升职", 10.8561552143],
12
+ ["加薪", 10.642581114],
13
+ ["手扶拖拉机", 10.0088573539],
14
+ ["巅峰", 9.49395840471]], results
15
+
16
+ end
17
+ end
@@ -0,0 +1,24 @@
1
+ # coding: utf-8
2
+ require 'minitest/autorun'
3
+ require 'cppjieba_rb'
4
+
5
+ class JiebaTest < Minitest::Test
6
+ def test_mix_segment
7
+ words = CppjiebaRb.segment "我来到南京市长江大桥"
8
+ assert_equal %w(我 来到 南京市 长江大桥), words
9
+
10
+ words = CppjiebaRb.segment "令狐冲是云计算行业的专家"
11
+ assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
12
+ end
13
+
14
+ def test_hmm_segment
15
+ words = CppjiebaRb.segment "令狐冲是云计算行业的专家", mode: :hmm
16
+ assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
17
+ end
18
+
19
+ def test_max_prob_segment
20
+ words = CppjiebaRb.segment "令狐冲是云计算行业的专家", mode: :mp
21
+ assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
22
+ end
23
+
24
+ end
@@ -0,0 +1,19 @@
1
+ # coding: utf-8
2
+ require 'minitest/autorun'
3
+ require 'cppjieba_rb'
4
+ class JiebaTest < Minitest::Test
5
+ def test_tagging
6
+ pairs = CppjiebaRb.segment_tag '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
7
+ assert_equal({ '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
8
+ '拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
9
+ '的' => 'uj', '。' => 'x' }, pairs)
10
+ end
11
+
12
+ def test_tagging_with_user_dict
13
+ pairs = CppjiebaRb.segment_tag '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
14
+ assert_equal({ '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
15
+ '拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
16
+ '的' => 'uj', '。' => 'x' }, pairs)
17
+ end
18
+
19
+ end