cppjieba_rb 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +26 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +81 -0
  8. data/Rakefile +20 -0
  9. data/cppjieba_rb.gemspec +50 -0
  10. data/ext/cppjieba/.gitignore +17 -0
  11. data/ext/cppjieba/.travis.yml +22 -0
  12. data/ext/cppjieba/CMakeLists.txt +28 -0
  13. data/ext/cppjieba/ChangeLog.md +236 -0
  14. data/ext/cppjieba/README.md +285 -0
  15. data/ext/cppjieba/README_EN.md +111 -0
  16. data/ext/cppjieba/appveyor.yml +32 -0
  17. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  18. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  42. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  45. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  46. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  56. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  57. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  58. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  59. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  60. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  61. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  62. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  63. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  64. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  65. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  66. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  67. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  68. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  69. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  70. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  71. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  72. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  73. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  74. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  75. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  76. data/ext/cppjieba/dict/README.md +31 -0
  77. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  78. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  79. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  80. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  83. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  84. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  85. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  86. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  87. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  88. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  89. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  90. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  91. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  92. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  93. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  94. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  95. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  96. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  98. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
  99. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  100. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  101. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  102. data/ext/cppjieba/test/CMakeLists.txt +5 -0
  103. data/ext/cppjieba/test/demo.cpp +80 -0
  104. data/ext/cppjieba/test/load_test.cpp +54 -0
  105. data/ext/cppjieba/test/testdata/curl.res +1 -0
  106. data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
  107. data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
  108. data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
  109. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  110. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  111. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  112. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  113. data/ext/cppjieba/test/testdata/load_test.urls +2 -0
  114. data/ext/cppjieba/test/testdata/review.100 +100 -0
  115. data/ext/cppjieba/test/testdata/review.100.res +200 -0
  116. data/ext/cppjieba/test/testdata/server.conf +19 -0
  117. data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
  118. data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
  119. data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
  120. data/ext/cppjieba/test/testdata/userdict.english +2 -0
  121. data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
  122. data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
  123. data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
  124. data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
  125. data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
  126. data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
  127. data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
  128. data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
  129. data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
  130. data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
  131. data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
  132. data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
  133. data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
  134. data/ext/cppjieba_rb/extconf.rb +26 -0
  135. data/ext/cppjieba_rb/internal.cc +148 -0
  136. data/lib/cppjieba_rb/segment.rb +20 -0
  137. data/lib/cppjieba_rb/version.rb +3 -0
  138. data/lib/cppjieba_rb.rb +34 -0
  139. data/test/test_keyword.rb +17 -0
  140. data/test/test_segment.rb +24 -0
  141. data/test/test_tagging.rb +19 -0
  142. metadata +244 -0
@@ -0,0 +1,86 @@
1
+ #include "cppjieba/TextRankExtractor.hpp"
2
+ #include "gtest/gtest.h"
3
+
4
+ using namespace cppjieba;
5
+
6
+ TEST(TextRankExtractorTest, Test1) {
7
+ TextRankExtractor Extractor(
8
+ "../test/testdata/extra_dict/jieba.dict.small.utf8",
9
+ "../dict/hmm_model.utf8",
10
+ "../dict/stop_words.utf8");
11
+ {
12
+ string s("你好世界世界而且而且");
13
+ string res;
14
+ size_t topN = 5;
15
+
16
+ {
17
+ vector<string> words;
18
+ Extractor.Extract(s, words, topN);
19
+ res << words;
20
+ ASSERT_EQ(res, "[\"世界\", \"你好\"]");
21
+ }
22
+
23
+ {
24
+ vector<pair<string, double> > words;
25
+ Extractor.Extract(s, words, topN);
26
+ res << words;
27
+ ASSERT_EQ(res, "[世界:1, 你好:0.519787]");
28
+ }
29
+
30
+ {
31
+ vector<TextRankExtractor::Word> words;
32
+ Extractor.Extract(s, words, topN);
33
+ res << words;
34
+ ASSERT_EQ(res, "[{\"word\": \"世界\", \"offset\": [6, 12], \"weight\": 1}, {\"word\": \"你好\", \"offset\": [0], \"weight\": 0.519787}]");
35
+ }
36
+ }
37
+
38
+ {
39
+ string s("\xe6\x88\x91\xe6\x98\xaf\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe5\xad\xa6\xe9\x99\xa2\xe6\x89\x8b\xe6\x89\xb6\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe4\xb8\x93\xe4\xb8\x9a\xe7\x9a\x84\xe3\x80\x82\xe4\xb8\x8d\xe7\x94\xa8\xe5\xa4\x9a\xe4\xb9\x85\xef\xbc\x8c\xe6\x88\x91\xe5\xb0\xb1\xe4\xbc\x9a\xe5\x8d\x87\xe8\x81\x8c\xe5\x8a\xa0\xe8\x96\xaa\xef\xbc\x8c\xe5\xbd\x93\xe4\xb8\x8a CEO\xef\xbc\x8c\xe8\xb5\xb0\xe4\xb8\x8a\xe4\xba\xba\xe7\x94\x9f\xe5\xb7\x85\xe5\xb3\xb0");
40
+ string res;
41
+ vector<TextRankExtractor::Word> wordweights;
42
+ size_t topN = 5;
43
+ Extractor.Extract(s, wordweights, topN);
44
+ res << wordweights;
45
+ ASSERT_EQ(res, "[{\"word\": \"当上\", \"offset\": [87], \"weight\": 1}, {\"word\": \"不用\", \"offset\": [48], \"weight\": 0.989848}, {\"word\": \"多久\", \"offset\": [54], \"weight\": 0.985126}, {\"word\": \"加薪\", \"offset\": [78], \"weight\": 0.983046}, {\"word\": \"升职\", \"offset\": [72], \"weight\": 0.980278}]");
46
+ //ASSERT_EQ(res, "[{\"word\": \"专业\", \"offset\": [36], \"weight\": 1}, {\"word\": \"CEO\", \"offset\": [94], \"weight\": 0.95375}, {\"word\": \"手扶拖拉机\", \"offset\": [21], \"weight\": 0.801701}, {\"word\": \"当上\", \"offset\": [87], \"weight\": 0.798968}, {\"word\": \"走上\", \"offset\": [100], \"weight\": 0.775505}]");
47
+ }
48
+
49
+ {
50
+ string s("一部iPhone6");
51
+ string res;
52
+ vector<TextRankExtractor::Word> wordweights;
53
+ size_t topN = 5;
54
+ Extractor.Extract(s, wordweights, topN);
55
+ res << wordweights;
56
+ ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]");
57
+ }
58
+ }
59
+
60
+ TEST(TextRankExtractorTest, Test2) {
61
+ TextRankExtractor Extractor(
62
+ "../test/testdata/extra_dict/jieba.dict.small.utf8",
63
+ "../dict/hmm_model.utf8",
64
+ "../dict/stop_words.utf8",
65
+ "../test/testdata/userdict.utf8");
66
+
67
+ {
68
+ string s("\xe8\x93\x9d\xe7\xbf\x94\xe4\xbc\x98\xe7\xa7\x80\xe6\xaf\x95\xe4\xb8\x9a\xe7\x94\x9f");
69
+ string res;
70
+ vector<TextRankExtractor::Word> wordweights;
71
+ size_t topN = 5;
72
+ Extractor.Extract(s, wordweights, topN);
73
+ res << wordweights;
74
+ ASSERT_EQ(res, "[{\"word\": \"蓝翔\", \"offset\": [0], \"weight\": 1}, {\"word\": \"毕业生\", \"offset\": [12], \"weight\": 0.996685}, {\"word\": \"优秀\", \"offset\": [6], \"weight\": 0.992994}]");
75
+ }
76
+
77
+ {
78
+ string s("一部iPhone6");
79
+ string res;
80
+ vector<TextRankExtractor::Word> wordweights;
81
+ size_t topN = 5;
82
+ Extractor.Extract(s, wordweights, topN);
83
+ res << wordweights;
84
+ ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]");
85
+ }
86
+ }
@@ -0,0 +1,177 @@
1
+ #include "cppjieba/DictTrie.hpp"
2
+ #include "cppjieba/MPSegment.hpp"
3
+ #include "gtest/gtest.h"
4
+
5
+ using namespace cppjieba;
6
+
7
+ static const char* const DICT_FILE = "../test/testdata/extra_dict/jieba.dict.small.utf8";
8
+
9
+ TEST(TrieTest, Empty) {
10
+ vector<Unicode> keys;
11
+ vector<const DictUnit*> values;
12
+ Trie trie(keys, values);
13
+ }
14
+
15
+ TEST(TrieTest, Construct) {
16
+ vector<Unicode> keys;
17
+ vector<const DictUnit*> values;
18
+ keys.push_back(DecodeRunesInString("你"));
19
+ values.push_back((const DictUnit*)(NULL));
20
+ Trie trie(keys, values);
21
+ }
22
+
23
+ TEST(DictTrieTest, NewAndDelete) {
24
+ DictTrie * trie;
25
+ trie = new DictTrie(DICT_FILE);
26
+ delete trie;
27
+ }
28
+
29
+ TEST(DictTrieTest, Test1) {
30
+ string s1, s2;
31
+ DictTrie trie(DICT_FILE);
32
+ ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001);
33
+ string word("来到");
34
+ cppjieba::RuneStrArray uni;
35
+ ASSERT_TRUE(DecodeRunesInString(word, uni));
36
+ //DictUnit nodeInfo;
37
+ //nodeInfo.word = uni;
38
+ //nodeInfo.tag = "v";
39
+ //nodeInfo.weight = -8.87033;
40
+ //s1 << nodeInfo;
41
+ //s2 << (*trie.Find(uni.begin(), uni.end()));
42
+ const DictUnit* du = trie.Find(uni.begin(), uni.end());
43
+ ASSERT_TRUE(du != NULL);
44
+ ASSERT_EQ(2u, du->word.size());
45
+ ASSERT_EQ(26469u, du->word[0]);
46
+ ASSERT_EQ(21040u, du->word[1]);
47
+ ASSERT_EQ("v", du->tag);
48
+ ASSERT_NEAR(-8.870, du->weight, 0.001);
49
+
50
+ //EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);
51
+ word = "清华大学";
52
+ LocalVector<pair<size_t, const DictUnit*> > res;
53
+ const char * words[] = {"清", "清华", "清华大学"};
54
+ for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
55
+ ASSERT_TRUE(DecodeRunesInString(words[i], uni));
56
+ res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end())));
57
+ //resMap[uni.size() - 1] = trie.Find(uni.begin(), uni.end());
58
+ }
59
+ vector<pair<size_t, const DictUnit*> > vec;
60
+ vector<struct Dag> dags;
61
+ ASSERT_TRUE(DecodeRunesInString(word, uni));
62
+ trie.Find(uni.begin(), uni.end(), dags);
63
+ ASSERT_EQ(dags.size(), uni.size());
64
+ ASSERT_NE(dags.size(), 0u);
65
+ s1 << res;
66
+ s2 << dags[0].nexts;
67
+ ASSERT_EQ(s1, s2);
68
+
69
+ }
70
+
71
+ TEST(DictTrieTest, UserDict) {
72
+ DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
73
+ string word = "云计算";
74
+ cppjieba::RuneStrArray unicode;
75
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
76
+ const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
77
+ ASSERT_TRUE(unit != NULL);
78
+ ASSERT_NEAR(unit->weight, -14.100, 0.001);
79
+
80
+ word = "蓝翔";
81
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
82
+ unit = trie.Find(unicode.begin(), unicode.end());
83
+ ASSERT_TRUE(unit != NULL);
84
+ ASSERT_EQ(unit->tag, "nz");
85
+ ASSERT_NEAR(unit->weight, -14.100, 0.001);
86
+
87
+ word = "区块链";
88
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
89
+ unit = trie.Find(unicode.begin(), unicode.end());
90
+ ASSERT_TRUE(unit != NULL);
91
+ ASSERT_EQ(unit->tag, "nz");
92
+ ASSERT_NEAR(unit->weight, -15.6478, 0.001);
93
+ }
94
+
95
+ TEST(DictTrieTest, UserDictWithMaxWeight) {
96
+ DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);
97
+ string word = "云计算";
98
+ cppjieba::RuneStrArray unicode;
99
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
100
+ const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
101
+ ASSERT_TRUE(unit);
102
+ ASSERT_NEAR(unit->weight, -2.975, 0.001);
103
+ }
104
+
105
+ TEST(DictTrieTest, Dag) {
106
+ DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
107
+
108
+ {
109
+ string word = "清华大学";
110
+ cppjieba::RuneStrArray unicode;
111
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
112
+ vector<struct Dag> res;
113
+ trie.Find(unicode.begin(), unicode.end(), res);
114
+
115
+ size_t nexts_sizes[] = {3, 2, 2, 1};
116
+ ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
117
+ for (size_t i = 0; i < res.size(); i++) {
118
+ ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
119
+ }
120
+ }
121
+
122
+ {
123
+ string word = "北京邮电大学";
124
+ cppjieba::RuneStrArray unicode;
125
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
126
+ vector<struct Dag> res;
127
+ trie.Find(unicode.begin(), unicode.end(), res);
128
+
129
+ size_t nexts_sizes[] = {3, 1, 2, 2, 2, 1};
130
+ ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
131
+ for (size_t i = 0; i < res.size(); i++) {
132
+ ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
133
+ }
134
+ }
135
+
136
+ {
137
+ string word = "长江大桥";
138
+ cppjieba::RuneStrArray unicode;
139
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
140
+ vector<struct Dag> res;
141
+ trie.Find(unicode.begin(), unicode.end(), res);
142
+
143
+ size_t nexts_sizes[] = {3, 1, 2, 1};
144
+ ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
145
+ for (size_t i = 0; i < res.size(); i++) {
146
+ ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
147
+ }
148
+ }
149
+
150
+ {
151
+ string word = "长江大桥";
152
+ cppjieba::RuneStrArray unicode;
153
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
154
+ vector<struct Dag> res;
155
+ trie.Find(unicode.begin(), unicode.end(), res, 3);
156
+
157
+ size_t nexts_sizes[] = {2, 1, 2, 1};
158
+ ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
159
+ for (size_t i = 0; i < res.size(); i++) {
160
+ ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
161
+ }
162
+ }
163
+
164
+ {
165
+ string word = "长江大桥";
166
+ cppjieba::RuneStrArray unicode;
167
+ ASSERT_TRUE(DecodeRunesInString(word, unicode));
168
+ vector<struct Dag> res;
169
+ trie.Find(unicode.begin(), unicode.end(), res, 4);
170
+
171
+ size_t nexts_sizes[] = {3, 1, 2, 1};
172
+ ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
173
+ for (size_t i = 0; i < res.size(); i++) {
174
+ ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
175
+ }
176
+ }
177
+ }
@@ -0,0 +1,43 @@
1
+ #include "cppjieba/Unicode.hpp"
2
+ #include "limonp/StdExtension.hpp"
3
+ #include "gtest/gtest.h"
4
+
5
+ using namespace cppjieba;
6
+ using namespace std;
7
+
8
+ TEST(UnicodeTest, Test1) {
9
+ string s = "你好世界";
10
+ RuneStrArray runes;
11
+ ASSERT_TRUE(DecodeRunesInString(s, runes));
12
+ string actual;
13
+ string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]";
14
+ actual << runes;
15
+ ASSERT_EQ(expected, actual);
16
+ }
17
+
18
+ TEST(UnicodeTest, Illegal) {
19
+ string s = "123\x80";
20
+ RuneStrArray runes;
21
+ ASSERT_FALSE(DecodeRunesInString(s, runes));
22
+ string actual;
23
+ string expected = "[]";
24
+ actual << runes;
25
+ ASSERT_EQ(expected, actual);
26
+ }
27
+
28
+ TEST(UnicodeTest, Rand) {
29
+ const size_t ITERATION = 1024;
30
+ const size_t MAX_LEN = 256;
31
+ string s;
32
+ srand(time(NULL));
33
+
34
+ for (size_t i = 0; i < ITERATION; i++) {
35
+ size_t len = rand() % MAX_LEN;
36
+ s.resize(len);
37
+ for (size_t j = 0; j < len; j++) {
38
+ s[rand() % len] = rand();
39
+ }
40
+ RuneStrArray runes;
41
+ DecodeRunesInString(s, runes);
42
+ }
43
+ }
@@ -0,0 +1,10 @@
1
+ #include <ruby.h>
2
+
3
+ VALUE rb_mCppjiebaRb;
4
+
5
+ void Init_cppjieba_rb()
6
+ {
7
+ rb_mCppjiebaRb = rb_define_module("CppjiebaRb");
8
+
9
+ Init_internal();
10
+ }
@@ -0,0 +1,26 @@
1
require "mkmf"

# Absolute directory of this extconf.rb; the vendored cppjieba sources are
# addressed relative to it.
ext_dir = File.expand_path(File.dirname(__FILE__))

LIBDIR = RbConfig::CONFIG['libdir']
INCLUDEDIR = RbConfig::CONFIG['includedir']

# Ruby's include dir, cppjieba's headers, and cppjieba's bundled deps.
HEADER_DIRS = [INCLUDEDIR] + %w[include deps].map { |sub| "#{ext_dir}/../cppjieba/#{sub}" }

LIB_DIRS = [LIBDIR]

dir_config('cppjieba_src', HEADER_DIRS, LIB_DIRS)

# The same flags are appended both to mkmf's CONFIG entry and to $CXXFLAGS.
CONFIG["CXXFLAGS"] = CONFIG["CXXFLAGS"] + " -std=c++11 -O3"
$CXXFLAGS = [$CXXFLAGS, "-std=c++11 -O3"].join(' ')

create_makefile('cppjieba_rb')

# respect header changes: make every object depend on the headers found in
# the current directory
headers = Dir.glob('*.{hpp,h}').join(' ')
File.open('Makefile', 'a') { |makefile| makefile.puts "\n$(OBJS): #{headers}" }
@@ -0,0 +1,148 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include "cppjieba/Jieba.hpp"
4
+
5
+ #define GET_CPPJIEBA(_data) jieba_cpp_data* _data; \
6
+ TypedData_Get_Struct(self, jieba_cpp_data, &jieba_cpp_type, _data)
7
+
8
+ typedef struct {
9
+ cppjieba::Jieba* jieba;
10
+ } jieba_cpp_data;
11
+
12
+ // make compiler happy
13
+ typedef VALUE (ruby_method)(...);
14
+
15
+ static ID rb_sMp;
16
+ static ID rb_sMix;
17
+ static ID rb_sHmm;
18
+ static ID rb_sQuery;
19
+ static ID rb_sFull;
20
+ static rb_encoding* u8_enc;
21
+ VALUE rb_cCppjiebaRb_Internal;
22
+ extern "C" VALUE rb_mCppjiebaRb;
23
+
24
+ static void jieba_cpp_free(void* _this)
25
+ {
26
+ jieba_cpp_data* data = static_cast<jieba_cpp_data*>(_this);
27
+ delete data->jieba;
28
+ data->jieba = nullptr;
29
+ }
30
+
31
+ static size_t jieba_cpp_memsize(const void* _)
32
+ {
33
+ return sizeof(jieba_cpp_data);
34
+ }
35
+
36
+ static const rb_data_type_t jieba_cpp_type = {
37
+ "jieba/internal",
38
+ {NULL, jieba_cpp_free, jieba_cpp_memsize,},
39
+ 0, 0,
40
+ RUBY_TYPED_FREE_IMMEDIATELY,
41
+ };
42
+
43
+ extern "C" {
44
+
45
+ VALUE internal_alloc(VALUE self)
46
+ {
47
+ jieba_cpp_data* data;
48
+ return TypedData_Make_Struct(self, jieba_cpp_data, &jieba_cpp_type, data);
49
+ }
50
+
51
+ VALUE internal_initialize(VALUE self,
52
+ VALUE dict_path,
53
+ VALUE model_path,
54
+ VALUE user_dict_path,
55
+ VALUE idf_path,
56
+ VALUE stop_word_path)
57
+ {
58
+ GET_CPPJIEBA(data);
59
+ data->jieba = new cppjieba::Jieba(StringValueCStr(dict_path),
60
+ StringValueCStr(model_path),
61
+ StringValueCStr(user_dict_path),
62
+ StringValueCStr(idf_path),
63
+ StringValueCStr(stop_word_path));
64
+ }
65
+
66
+ VALUE internal_extract_keyword(VALUE self, VALUE text_rbs, VALUE topN)
67
+ {
68
+ std::string text = StringValueCStr(text_rbs);
69
+ int top_n = NUM2INT(topN);
70
+ GET_CPPJIEBA(data);
71
+
72
+ std::vector<std::pair<std::string, double> > top_words;
73
+
74
+ data->jieba->extractor.Extract(text, top_words, top_n);
75
+ VALUE arr = rb_ary_new2(top_words.size());
76
+ for (auto iter = top_words.begin(); iter != top_words.end(); iter++) {
77
+ VALUE inner_arr = rb_ary_new2(2);
78
+ rb_ary_push(inner_arr, rb_enc_str_new(iter->first.c_str(), iter->first.length(), u8_enc));
79
+ rb_ary_push(inner_arr, rb_float_new(iter->second));
80
+ rb_ary_push(arr, inner_arr);
81
+ }
82
+ return arr;
83
+ }
84
+
85
+ static VALUE internal_segment(VALUE self, VALUE text_rbs, VALUE mode, VALUE max_length_, VALUE hmm_)
86
+ {
87
+ std::string text = StringValueCStr(text_rbs);
88
+ size_t max_length = NUM2UINT(max_length_);
89
+ int hmm = (hmm_ == Qtrue ? 1 : 0);
90
+ GET_CPPJIEBA(data);
91
+ ID rb_sMode = SYM2ID(mode);
92
+ std::vector<std::string> words;
93
+
94
+ if (rb_sMode == rb_sMp) {
95
+ data->jieba->CutSmall(text, words, max_length);
96
+ } else if (rb_sMode == rb_sMix) {
97
+ data->jieba->Cut(text, words, hmm);
98
+ } else if (rb_sMode == rb_sHmm) {
99
+ data->jieba->CutHMM(text, words);
100
+ } else if (rb_sMode == rb_sQuery) {
101
+ data->jieba->CutForSearch(text, words, hmm);
102
+ } else if (rb_sMode == rb_sFull) {
103
+ data->jieba->CutAll(text, words);
104
+ }
105
+
106
+ VALUE arr = rb_ary_new2(words.size());
107
+ for (auto iter = words.begin(); iter != words.end(); iter++) {
108
+ rb_ary_push(arr, rb_enc_str_new(iter->c_str(), iter->length(), u8_enc));
109
+ }
110
+ return arr;
111
+ }
112
+
113
+ static VALUE internal_segment_tag(VALUE self, VALUE text_rbs)
114
+ {
115
+ std::string text = StringValueCStr(text_rbs);
116
+ GET_CPPJIEBA(data);
117
+
118
+ std::vector<std::pair<std::string, std::string>> words;
119
+ data->jieba->Tag(text, words);
120
+
121
+ VALUE result = rb_hash_new();
122
+ for (auto iter = words.begin(); iter != words.end(); iter++) {
123
+ rb_hash_aset(result,
124
+ rb_enc_str_new(iter->first.c_str(), iter->first.length(), u8_enc),
125
+ rb_enc_str_new(iter->second.c_str(), iter->second.length(), u8_enc));
126
+ }
127
+ return result;
128
+ }
129
+
130
+
131
+ void Init_internal()
132
+ {
133
+ rb_sMp = rb_intern("mp");
134
+ rb_sMix = rb_intern("mix");
135
+ rb_sHmm = rb_intern("hmm");
136
+ rb_sQuery = rb_intern("query");
137
+ rb_sFull = rb_intern("full");
138
+ u8_enc = rb_utf8_encoding();
139
+
140
+ rb_cCppjiebaRb_Internal = rb_define_class_under(rb_mCppjiebaRb, "Internal", rb_cData);
141
+ rb_define_alloc_func(rb_cCppjiebaRb_Internal, internal_alloc);
142
+ rb_define_method(rb_cCppjiebaRb_Internal, "initialize", (ruby_method*) &internal_initialize, 5);
143
+ rb_define_method(rb_cCppjiebaRb_Internal, "extract_keyword", (ruby_method*) &internal_extract_keyword, 2);
144
+ rb_define_method(rb_cCppjiebaRb_Internal, "segment", (ruby_method*) &internal_segment, 4);
145
+ rb_define_method(rb_cCppjiebaRb_Internal, "segment_tag", (ruby_method*) &internal_segment_tag, 1);
146
+ }
147
+
148
+ }
@@ -0,0 +1,20 @@
1
module CppjiebaRb
  # Holds segmentation options and forwards text to the native backend.
  class Segment
    VALID_MODES = %i[mix hmm mp query full].freeze

    # @param opts [Hash, nil] optional settings:
    #   :mode            one of VALID_MODES (default :mix)
    #   :max_word_length Integer passed to the backend (default 8)
    #   :hmm             true/false passed to the backend (default true)
    # @raise [ArgumentError] if :mode is given but is not in VALID_MODES
    def initialize(opts = nil)
      opts ||= {}
      mode = opts[:mode]
      unless mode.nil? || VALID_MODES.include?(mode)
        allowed = VALID_MODES.map(&:inspect).join(' ')
        raise ArgumentError, "The mode is #{mode}. It should be one of #{allowed}"
      end

      @mode = mode || :mix
      @max_word_length = opts[:max_word_length] || 8
      # `opts[:hmm] || true` is always true, which made `hmm: false`
      # impossible to request. Fall back to true only when no value is given.
      @hmm = opts[:hmm].nil? ? true : opts[:hmm]
    end

    # @param str [String] text to segment
    # @return whatever CppjiebaRb.internal.segment returns for the stored options
    def segment(str)
      CppjiebaRb.internal.segment(str, @mode, @max_word_length, @hmm)
    end
  end
end
@@ -0,0 +1,3 @@
1
module CppjiebaRb
  # Gem version string. Frozen so the constant cannot be mutated in place.
  VERSION = '0.2.1'.freeze
end
@@ -0,0 +1,34 @@
1
+ require 'cppjieba_rb/cppjieba_rb'
2
+ require 'cppjieba_rb/version'
3
+ require 'cppjieba_rb/segment'
4
+
5
# Public entry points of the gem. Each call is delegated to one lazily
# created CppjiebaRb::Internal instance built from the bundled dictionaries.
module CppjiebaRb
  # Dictionary directory of the vendored cppjieba sources, relative to this file.
  EXT_BASE = File.join(File.dirname(__FILE__), '..', 'ext', 'cppjieba', 'dict')
  DICT_PATH = File.join(EXT_BASE, 'jieba.dict.utf8')
  HMM_DICT_PATH = File.join(EXT_BASE, 'hmm_model.utf8')
  USER_DICT = File.join(EXT_BASE, 'user.dict.utf8')
  IDF_PATH = File.join(EXT_BASE, 'idf.utf8')
  STOP_WORD_PATH = File.join(EXT_BASE, 'stop_words.utf8')

  class << self
    # Forwards to Internal#extract_keyword.
    def extract_keyword(str, top_n)
      internal.extract_keyword(str, top_n)
    end

    # Builds a Segment from `opts` and segments `str` with it.
    def segment(str, opts = nil)
      segmenter = CppjiebaRb::Segment.new(opts)
      segmenter.segment(str)
    end

    # Forwards to Internal#segment_tag.
    def segment_tag(str)
      internal.segment_tag(str)
    end

    # The shared backend; created on first use and then reused.
    def internal
      return @backend if @backend

      dictionaries = [DICT_PATH, HMM_DICT_PATH, USER_DICT, IDF_PATH, STOP_WORD_PATH]
      @backend = CppjiebaRb::Internal.new(*dictionaries)
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # coding: utf-8
2
+ require 'minitest/autorun'
3
+ require 'cppjieba_rb'
4
+
5
class JiebaTest < Minitest::Test
  # Keyword extraction should return the expected five words in order, each
  # with the expected weight. Words are compared exactly; weights are
  # compared with a small tolerance so the test does not depend on
  # bit-identical floating-point results.
  def test_keywords
    results = CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 5

    expected = [["CEO", 11.739204307083542],
                ["升职", 10.8561552143],
                ["加薪", 10.642581114],
                ["手扶拖拉机", 10.0088573539],
                ["巅峰", 9.49395840471]]

    # A differing word list (including a differing length) fails here first.
    assert_equal expected.map(&:first), results.map(&:first)

    expected.zip(results).each do |(word, weight), (_, actual_weight)|
      assert_in_delta weight, actual_weight, 1e-9, "unexpected weight for #{word}"
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # coding: utf-8
2
+ require 'minitest/autorun'
3
+ require 'cppjieba_rb'
4
+
5
class JiebaTest < Minitest::Test
  # Default options (no mode given).
  def test_mix_segment
    assert_equal %w[我 来到 南京市 长江大桥], CppjiebaRb.segment('我来到南京市长江大桥')
    assert_equal %w[令狐冲 是 云计算 行业 的 专家], CppjiebaRb.segment('令狐冲是云计算行业的专家')
  end

  # Explicit :hmm mode.
  def test_hmm_segment
    segmented = CppjiebaRb.segment('令狐冲是云计算行业的专家', mode: :hmm)
    assert_equal %w[令狐冲 是 云计算 行业 的 专家], segmented
  end

  # Explicit :mp mode.
  def test_max_prob_segment
    segmented = CppjiebaRb.segment('令狐冲是云计算行业的专家', mode: :mp)
    assert_equal %w[令狐冲 是 云计算 行业 的 专家], segmented
  end
end
@@ -0,0 +1,19 @@
1
+ # coding: utf-8
2
+ require 'minitest/autorun'
3
+ require 'cppjieba_rb'
4
class JiebaTest < Minitest::Test
  # Tagging the sample sentence yields the expected word => tag Hash.
  def test_tagging
    assert_equal(expected_sample_tags, CppjiebaRb.segment_tag(tagging_sample_sentence))
  end

  # Performs the same call and expectation as test_tagging.
  def test_tagging_with_user_dict
    assert_equal(expected_sample_tags, CppjiebaRb.segment_tag(tagging_sample_sentence))
  end

  private

  # Sentence shared by both tagging tests.
  def tagging_sample_sentence
    '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
  end

  # Expected word => tag pairs for tagging_sample_sentence.
  def expected_sample_tags
    {
      '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
      '拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
      '的' => 'uj', '。' => 'x'
    }
  end
end