RubyGems - cppjieba_rb - Versions diffs - 0.2.1 - Mend

cppjieba_rb 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (142) hide show

checksums.yaml +7 -0
data/.gitignore +18 -0
data/.gitmodules +3 -0
data/.travis.yml +26 -0
data/Gemfile +3 -0
data/LICENSE.txt +22 -0
data/README.md +81 -0
data/Rakefile +20 -0
data/cppjieba_rb.gemspec +50 -0
data/ext/cppjieba/.gitignore +17 -0
data/ext/cppjieba/.travis.yml +22 -0
data/ext/cppjieba/CMakeLists.txt +28 -0
data/ext/cppjieba/ChangeLog.md +236 -0
data/ext/cppjieba/README.md +285 -0
data/ext/cppjieba/README_EN.md +111 -0
data/ext/cppjieba/appveyor.yml +32 -0
data/ext/cppjieba/deps/CMakeLists.txt +1 -0
data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
data/ext/cppjieba/dict/README.md +31 -0
data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
data/ext/cppjieba/dict/idf.utf8 +258826 -0
data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
data/ext/cppjieba/dict/user.dict.utf8 +4 -0
data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
data/ext/cppjieba/test/CMakeLists.txt +5 -0
data/ext/cppjieba/test/demo.cpp +80 -0
data/ext/cppjieba/test/load_test.cpp +54 -0
data/ext/cppjieba/test/testdata/curl.res +1 -0
data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
data/ext/cppjieba/test/testdata/load_test.urls +2 -0
data/ext/cppjieba/test/testdata/review.100 +100 -0
data/ext/cppjieba/test/testdata/review.100.res +200 -0
data/ext/cppjieba/test/testdata/server.conf +19 -0
data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
data/ext/cppjieba/test/testdata/userdict.english +2 -0
data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
data/ext/cppjieba_rb/extconf.rb +26 -0
data/ext/cppjieba_rb/internal.cc +148 -0
data/lib/cppjieba_rb/segment.rb +20 -0
data/lib/cppjieba_rb/version.rb +3 -0
data/lib/cppjieba_rb.rb +34 -0
data/test/test_keyword.rb +17 -0
data/test/test_segment.rb +24 -0
data/test/test_tagging.rb +19 -0
metadata +244 -0

data/ext/cppjieba/test/unittest/textrank_test.cpp ADDED Viewed

@@ -0,0 +1,86 @@
+#include "cppjieba/TextRankExtractor.hpp"
+#include "gtest/gtest.h"
+using namespace cppjieba;
+TEST(TextRankExtractorTest, Test1) { // Keyword extraction with dictionary, HMM model and stop words only (no user dictionary).
+  TextRankExtractor Extractor(
+    "../test/testdata/extra_dict/jieba.dict.small.utf8",
+    "../dict/hmm_model.utf8",
+    "../dict/stop_words.utf8"); // relative paths: the test depends on the working directory it is run from
+  {
+    string s("你好世界世界而且而且");
+    string res;
+    size_t topN = 5;
+    { // Extract overload filling a vector of keyword strings
+      vector<string> words;
+      Extractor.Extract(s, words, topN);
+      res << words; // serializes the container into res; presumably limonp's operator<< for strings -- TODO confirm
+      ASSERT_EQ(res, "[\"世界\", \"你好\"]");
+    }
+    { // Extract overload filling (keyword, weight) pairs
+      vector<pair<string, double> > words;
+      Extractor.Extract(s, words, topN);
+      res << words;
+      ASSERT_EQ(res, "[世界:1, 你好:0.519787]");
+    }
+    { // Extract overload filling TextRankExtractor::Word records (word, offsets, weight)
+      vector<TextRankExtractor::Word> words;
+      Extractor.Extract(s, words, topN);
+      res << words;
+      ASSERT_EQ(res, "[{\"word\": \"世界\", \"offset\": [6, 12], \"weight\": 1}, {\"word\": \"你好\", \"offset\": [0], \"weight\": 0.519787}]"); // offsets 6 and 12 match UTF-8 byte positions of the two occurrences
+    }
+  }
+  {
+    string s("\xe6\x88\x91\xe6\x98\xaf\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe5\xad\xa6\xe9\x99\xa2\xe6\x89\x8b\xe6\x89\xb6\xe6\x8b\x96\xe6\x8b\x89\xe6\x9c\xba\xe4\xb8\x93\xe4\xb8\x9a\xe7\x9a\x84\xe3\x80\x82\xe4\xb8\x8d\xe7\x94\xa8\xe5\xa4\x9a\xe4\xb9\x85\xef\xbc\x8c\xe6\x88\x91\xe5\xb0\xb1\xe4\xbc\x9a\xe5\x8d\x87\xe8\x81\x8c\xe5\x8a\xa0\xe8\x96\xaa\xef\xbc\x8c\xe5\xbd\x93\xe4\xb8\x8a CEO\xef\xbc\x8c\xe8\xb5\xb0\xe4\xb8\x8a\xe4\xba\xba\xe7\x94\x9f\xe5\xb7\x85\xe5\xb3\xb0"); // input given as escaped UTF-8 bytes
+    string res;
+    vector<TextRankExtractor::Word> wordweights;
+    size_t topN = 5;
+    Extractor.Extract(s, wordweights, topN);
+    res << wordweights;
+    ASSERT_EQ(res, "[{\"word\": \"当上\", \"offset\": [87], \"weight\": 1}, {\"word\": \"不用\", \"offset\": [48], \"weight\": 0.989848}, {\"word\": \"多久\", \"offset\": [54], \"weight\": 0.985126}, {\"word\": \"加薪\", \"offset\": [78], \"weight\": 0.983046}, {\"word\": \"升职\", \"offset\": [72], \"weight\": 0.980278}]");
+    //ASSERT_EQ(res, "[{\"word\": \"专业\", \"offset\": [36], \"weight\": 1}, {\"word\": \"CEO\", \"offset\": [94], \"weight\": 0.95375}, {\"word\": \"手扶拖拉机\", \"offset\": [21], \"weight\": 0.801701}, {\"word\": \"当上\", \"offset\": [87], \"weight\": 0.798968}, {\"word\": \"走上\", \"offset\": [100], \"weight\": 0.775505}]");
+  }
+  { // mixed Chinese / ASCII input
+    string s("一部iPhone6");
+    string res;
+    vector<TextRankExtractor::Word> wordweights;
+    size_t topN = 5;
+    Extractor.Extract(s, wordweights, topN);
+    res << wordweights;
+    ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]");
+  }
+}
+TEST(TextRankExtractorTest, Test2) { // Same extractor, constructed with an additional user dictionary path.
+  TextRankExtractor Extractor(
+    "../test/testdata/extra_dict/jieba.dict.small.utf8",
+    "../dict/hmm_model.utf8",
+    "../dict/stop_words.utf8",
+    "../test/testdata/userdict.utf8"); // fourth argument: user dictionary
+  {
+    string s("\xe8\x93\x9d\xe7\xbf\x94\xe4\xbc\x98\xe7\xa7\x80\xe6\xaf\x95\xe4\xb8\x9a\xe7\x94\x9f"); // input given as escaped UTF-8 bytes
+    string res;
+    vector<TextRankExtractor::Word> wordweights;
+    size_t topN = 5;
+    Extractor.Extract(s, wordweights, topN);
+    res << wordweights;
+    ASSERT_EQ(res, "[{\"word\": \"蓝翔\", \"offset\": [0], \"weight\": 1}, {\"word\": \"毕业生\", \"offset\": [12], \"weight\": 0.996685}, {\"word\": \"优秀\", \"offset\": [6], \"weight\": 0.992994}]");
+  }
+  { // same input and expected output as the last case of Test1
+    string s("一部iPhone6");
+    string res;
+    vector<TextRankExtractor::Word> wordweights;
+    size_t topN = 5;
+    Extractor.Extract(s, wordweights, topN);
+    res << wordweights;
+    ASSERT_EQ(res, "[{\"word\": \"一部\", \"offset\": [0], \"weight\": 1}, {\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 0.996126}]");
+  }
+}

data/ext/cppjieba/test/unittest/trie_test.cpp ADDED Viewed

@@ -0,0 +1,177 @@
+#include "cppjieba/DictTrie.hpp"
+#include "cppjieba/MPSegment.hpp"
+#include "gtest/gtest.h"
+using namespace cppjieba;
+static const char* const DICT_FILE = "../test/testdata/extra_dict/jieba.dict.small.utf8";
+TEST(TrieTest, Empty) { // Constructing a Trie from empty key and value lists.
+  vector<Unicode> keys;
+  vector<const DictUnit*> values;
+  Trie trie(keys, values); // no assertions: the test passes if construction and destruction complete
+}
+TEST(TrieTest, Construct) { // Constructing a Trie from a single key whose value pointer is NULL.
+  vector<Unicode> keys;
+  vector<const DictUnit*> values;
+  keys.push_back(DecodeRunesInString("你"));
+  values.push_back((const DictUnit*)(NULL));
+  Trie trie(keys, values); // no assertions: the test passes if construction and destruction complete
+}
+TEST(DictTrieTest, NewAndDelete) { // Heap-allocates a DictTrie from DICT_FILE and deletes it again.
+  DictTrie * trie;
+  trie = new DictTrie(DICT_FILE);
+  delete trie; // no assertions: the test passes if load and teardown complete
+}
+// Loads DICT_FILE, checks the minimum weight, looks up one word and verifies
+// its DictUnit fields, then checks that the DAG entry for the first rune of
+// "清华大学" lists the same (end index, DictUnit*) pairs as individual lookups
+// of each dictionary prefix.
+TEST(DictTrieTest, Test1) {
+  string s1, s2;
+  DictTrie trie(DICT_FILE);
+  // Two-sided check. The former ASSERT_LT(min + 15.6479, 0.001) also passed
+  // for any weight smaller than the expected one.
+  ASSERT_NEAR(-15.6479, trie.GetMinWeight(), 0.001);
+  string word("来到");
+  cppjieba::RuneStrArray uni;
+  ASSERT_TRUE(DecodeRunesInString(word, uni));
+  const DictUnit* du = trie.Find(uni.begin(), uni.end());
+  ASSERT_TRUE(du != NULL);
+  ASSERT_EQ(2u, du->word.size());
+  ASSERT_EQ(26469u, du->word[0]);
+  ASSERT_EQ(21040u, du->word[1]);
+  ASSERT_EQ("v", du->tag);
+  ASSERT_NEAR(-8.870, du->weight, 0.001);
+  word = "清华大学";
+  // Expected DAG edges from position 0: one entry per prefix, keyed by the
+  // index of the prefix's last rune.
+  LocalVector<pair<size_t, const DictUnit*> > res;
+  const char * words[] = {"清", "清华", "清华大学"};
+  for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
+    ASSERT_TRUE(DecodeRunesInString(words[i], uni));
+    res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end())));
+  }
+  vector<struct Dag> dags;
+  ASSERT_TRUE(DecodeRunesInString(word, uni));
+  trie.Find(uni.begin(), uni.end(), dags);
+  ASSERT_EQ(dags.size(), uni.size());
+  ASSERT_NE(dags.size(), 0u);
+  // Compare through the string serialization of both containers.
+  s1 << res;
+  s2 << dags[0].nexts;
+  ASSERT_EQ(s1, s2);
+}
+TEST(DictTrieTest, UserDict) { // Words supplied through the user dictionary must be findable with the expected tag and weight.
+  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8"); // third constructor argument left at its default
+  string word = "云计算";
+  cppjieba::RuneStrArray unicode;
+  ASSERT_TRUE(DecodeRunesInString(word, unicode));
+  const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
+  ASSERT_TRUE(unit != NULL);
+  ASSERT_NEAR(unit->weight, -14.100, 0.001);
+  word = "蓝翔";
+  ASSERT_TRUE(DecodeRunesInString(word, unicode));
+  unit = trie.Find(unicode.begin(), unicode.end());
+  ASSERT_TRUE(unit != NULL);
+  ASSERT_EQ(unit->tag, "nz");
+  ASSERT_NEAR(unit->weight, -14.100, 0.001); // same weight as the previous word
+  word = "区块链";
+  ASSERT_TRUE(DecodeRunesInString(word, unicode));
+  unit = trie.Find(unicode.begin(), unicode.end());
+  ASSERT_TRUE(unit != NULL);
+  ASSERT_EQ(unit->tag, "nz");
+  ASSERT_NEAR(unit->weight, -15.6478, 0.001); // differs from the two words above; presumably this entry carries its own frequency -- TODO confirm against userdict.utf8
+}
+// Loads the user dictionary with DictTrie::WordWeightMax and checks the weight
+// assigned to a user word under that option.
+TEST(DictTrieTest, UserDictWithMaxWeight) {
+  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);
+  string word = "云计算";
+  cppjieba::RuneStrArray unicode;
+  ASSERT_TRUE(DecodeRunesInString(word, unicode));
+  const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
+  // Explicit NULL comparison, matching the other DictTrie tests in this file.
+  ASSERT_TRUE(unit != NULL);
+  ASSERT_NEAR(unit->weight, -2.975, 0.001);
+}
+TEST(DictTrieTest, Dag) { // For several inputs, checks how many outgoing edges (nexts) the DAG holds at each rune position.
+  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");
+  {
+    string word = "清华大学";
+    cppjieba::RuneStrArray unicode;
+    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    vector<struct Dag> res;
+    trie.Find(unicode.begin(), unicode.end(), res);
+    size_t nexts_sizes[] = {3, 2, 2, 1}; // expected nexts.size() for each rune position
+    ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0])); // one Dag per rune
+    for (size_t i = 0; i < res.size(); i++) {
+      ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
+    }
+  }
+  {
+    string word = "北京邮电大学";
+    cppjieba::RuneStrArray unicode;
+    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    vector<struct Dag> res;
+    trie.Find(unicode.begin(), unicode.end(), res);
+    size_t nexts_sizes[] = {3, 1, 2, 2, 2, 1};
+    ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
+    for (size_t i = 0; i < res.size(); i++) {
+      ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
+    }
+  }
+  {
+    string word = "长江大桥";
+    cppjieba::RuneStrArray unicode;
+    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    vector<struct Dag> res;
+    trie.Find(unicode.begin(), unicode.end(), res); // no fourth argument: baseline for the two cases below
+    size_t nexts_sizes[] = {3, 1, 2, 1};
+    ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
+    for (size_t i = 0; i < res.size(); i++) {
+      ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
+    }
+  }
+  {
+    string word = "长江大桥";
+    cppjieba::RuneStrArray unicode;
+    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    vector<struct Dag> res;
+    trie.Find(unicode.begin(), unicode.end(), res, 3); // fourth argument 3: presumably a maximum word length in runes -- TODO confirm
+    size_t nexts_sizes[] = {2, 1, 2, 1}; // first position has one edge fewer than the baseline
+    ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
+    for (size_t i = 0; i < res.size(); i++) {
+      ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
+    }
+  }
+  {
+    string word = "长江大桥";
+    cppjieba::RuneStrArray unicode;
+    ASSERT_TRUE(DecodeRunesInString(word, unicode));
+    vector<struct Dag> res;
+    trie.Find(unicode.begin(), unicode.end(), res, 4); // fourth argument 4: same expectations as the baseline
+    size_t nexts_sizes[] = {3, 1, 2, 1};
+    ASSERT_EQ(res.size(), sizeof(nexts_sizes)/sizeof(nexts_sizes[0]));
+    for (size_t i = 0; i < res.size(); i++) {
+      ASSERT_EQ(res[i].nexts.size(), nexts_sizes[i]);
+    }
+  }
+}

data/ext/cppjieba/test/unittest/unicode_test.cpp ADDED Viewed

@@ -0,0 +1,43 @@
+#include "cppjieba/Unicode.hpp"
+#include "limonp/StdExtension.hpp"
+#include "gtest/gtest.h"
+using namespace cppjieba;
+using namespace std;
+TEST(UnicodeTest, Test1) { // Decoding a four-character UTF-8 string yields four runes with byte offset and length.
+  string s = "你好世界";
+  RuneStrArray runes;
+  ASSERT_TRUE(DecodeRunesInString(s, runes));
+  string actual;
+  string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]"; // each character occupies 3 bytes, so offsets advance by 3
+  actual << runes; // serializes the rune array into a string for comparison
+  ASSERT_EQ(expected, actual);
+}
+TEST(UnicodeTest, Illegal) { // Input ending in a stray 0x80 byte must be rejected.
+  string s = "123\x80";
+  RuneStrArray runes;
+  ASSERT_FALSE(DecodeRunesInString(s, runes));
+  string actual;
+  string expected = "[]"; // the output array is expected to be empty after the failed decode, even though "123" is valid
+  actual << runes;
+  ASSERT_EQ(expected, actual);
+}
+TEST(UnicodeTest, Rand) { // Smoke test: decoding random byte strings must not crash; the result is not inspected.
+  const size_t ITERATION = 1024;
+  const size_t MAX_LEN = 256;
+  string s;
+  srand(time(NULL)); // time-based seed: inputs differ between runs, so a failure is not reproducible from the log alone
+  for (size_t i = 0; i < ITERATION; i++) {
+    size_t len = rand() % MAX_LEN; // may be 0; the inner loop is then skipped, so rand() % len is never evaluated with len == 0
+    s.resize(len);
+    for (size_t j = 0; j < len; j++) {
+      s[rand() % len] = rand(); // writes a random position, not s[j]: some bytes keep their previous value
+    }
+    RuneStrArray runes;
+    DecodeRunesInString(s, runes); // return value deliberately ignored
+  }
+}

data/ext/cppjieba_rb/cppjieba_rb.c ADDED Viewed

@@ -0,0 +1,10 @@
+#include <ruby.h>
+/* The CppjiebaRb module object; internal.cc refers to it via an extern declaration. */
+VALUE rb_mCppjiebaRb;
+/* Defined in internal.cc inside an extern "C" block. Declared here because
+ * calling an undeclared function is an error in C99 and later compilers. */
+void Init_internal(void);
+/* Extension entry point: defines the CppjiebaRb module, then registers the
+ * Internal class through Init_internal(). */
+void Init_cppjieba_rb(void)
+{
+    rb_mCppjiebaRb = rb_define_module("CppjiebaRb");
+    Init_internal();
+}

data/ext/cppjieba_rb/extconf.rb ADDED Viewed

@@ -0,0 +1,26 @@
+require "mkmf"
+abs = File.expand_path File.dirname(__FILE__)
+LIBDIR      = RbConfig::CONFIG['libdir']
+INCLUDEDIR  = RbConfig::CONFIG['includedir']
+HEADER_DIRS = [
+  INCLUDEDIR,
+  "#{abs}/../cppjieba/include",
+  "#{abs}/../cppjieba/deps"
+]
+LIB_DIRS = [
+  LIBDIR
+]
+dir_config('cppjieba_src', HEADER_DIRS, LIB_DIRS)
+CONFIG["CXXFLAGS"] += " -std=c++11 -O3"
+$CXXFLAGS = "#{$CXXFLAGS} -std=c++11 -O3"
+create_makefile 'cppjieba_rb'
+# respect header changes
+headers = Dir.glob('*.{hpp,h}').join ' '
+File.open 'Makefile', 'a' do |f|
+  f.puts "\n$(OBJS): #{headers}"
+end

data/ext/cppjieba_rb/internal.cc ADDED Viewed

@@ -0,0 +1,148 @@
+#include <ruby.h>
+#include <ruby/encoding.h>
+#include "cppjieba/Jieba.hpp"
+#define GET_CPPJIEBA(_data) jieba_cpp_data* _data; \
+                            TypedData_Get_Struct(self, jieba_cpp_data, &jieba_cpp_type, _data)
+typedef struct { // payload wrapped by each CppjiebaRb::Internal object
+    cppjieba::Jieba* jieba; // set in internal_initialize, deleted in jieba_cpp_free
+} jieba_cpp_data;
+// make compiler happy
+typedef VALUE (ruby_method)(...); // function type used to cast the callbacks passed to rb_define_method
+static ID rb_sMp; // the five IDs below are interned in Init_internal and compared against the mode symbol in internal_segment
+static ID rb_sMix;
+static ID rb_sHmm;
+static ID rb_sQuery;
+static ID rb_sFull;
+static rb_encoding* u8_enc; // UTF-8 encoding, set in Init_internal and applied to every returned string
+VALUE rb_cCppjiebaRb_Internal; // the CppjiebaRb::Internal class object
+extern "C" VALUE rb_mCppjiebaRb; // module object defined in cppjieba_rb.c
+// Free callback registered in jieba_cpp_type: destroys the wrapped
+// cppjieba::Jieba and then releases the wrapper struct itself.
+static void jieba_cpp_free(void* _this)
+{
+    jieba_cpp_data* data = static_cast<jieba_cpp_data*>(_this);
+    delete data->jieba;
+    data->jieba = nullptr;
+    // The struct was allocated by TypedData_Make_Struct; with a custom free
+    // callback nobody else releases it, so it leaked once per object.
+    xfree(data);
+}
+static size_t jieba_cpp_memsize(const void* _) // size callback registered in jieba_cpp_type; the argument is unused
+{
+    return sizeof(jieba_cpp_data); // only the wrapper struct is counted, not the Jieba instance it points to
+}
+static const rb_data_type_t jieba_cpp_type = { // typed-data descriptor used by TypedData_Make_Struct / TypedData_Get_Struct in this file
+    "jieba/internal", // type name
+    {NULL, jieba_cpp_free, jieba_cpp_memsize,}, // no mark callback, then free and size callbacks
+    0, 0,
+    RUBY_TYPED_FREE_IMMEDIATELY,
+};
+extern "C" {
+VALUE internal_alloc(VALUE self) // allocator registered via rb_define_alloc_func; self is the class being instantiated
+{
+    jieba_cpp_data* data; // receives the pointer to the new struct; not used further here
+    return TypedData_Make_Struct(self, jieba_cpp_data, &jieba_cpp_type, data); // the jieba pointer is filled in later by internal_initialize
+}
+// Internal#initialize(dict_path, model_path, user_dict_path, idf_path, stop_word_path)
+// Builds the cppjieba::Jieba instance from the five file paths and stores it
+// in the object's jieba_cpp_data. Each argument must be a String (or respond
+// to to_str) without embedded NUL bytes; StringValueCStr raises otherwise.
+// Returns self.
+VALUE internal_initialize(VALUE self,
+                          VALUE dict_path,
+                          VALUE model_path,
+                          VALUE user_dict_path,
+                          VALUE idf_path,
+                          VALUE stop_word_path)
+{
+    GET_CPPJIEBA(data);
+    // Convert every argument before allocating, so a conversion error cannot
+    // leave a half-constructed Jieba behind.
+    const char* dict = StringValueCStr(dict_path);
+    const char* model = StringValueCStr(model_path);
+    const char* user_dict = StringValueCStr(user_dict_path);
+    const char* idf = StringValueCStr(idf_path);
+    const char* stop_word = StringValueCStr(stop_word_path);
+    cppjieba::Jieba* fresh = new cppjieba::Jieba(dict, model, user_dict, idf, stop_word);
+    // initialize can be invoked again on a live object; drop the old instance
+    // instead of leaking it (deleting a null pointer is a no-op).
+    delete data->jieba;
+    data->jieba = fresh;
+    // The function is declared to return VALUE; falling off the end was
+    // undefined behaviour.
+    return self;
+}
+// Internal#extract_keyword(text, top_n)
+// Runs the Jieba keyword extractor on text and returns an Array of
+// [word, weight] pairs (UTF-8 String, Float) holding at most top_n entries as
+// produced by the extractor. Raises RuntimeError when the object was never
+// initialized.
+VALUE internal_extract_keyword(VALUE self, VALUE text_rbs, VALUE topN)
+{
+    GET_CPPJIEBA(data);
+    if (data->jieba == nullptr) {
+        rb_raise(rb_eRuntimeError, "CppjiebaRb::Internal has not been initialized");
+    }
+    // Do every conversion that can raise before any C++ object exists: a Ruby
+    // exception unwinds with longjmp and would skip their destructors.
+    const int top_n = NUM2INT(topN);
+    const char* raw_text = StringValueCStr(text_rbs);
+    std::string text(raw_text);
+    std::vector<std::pair<std::string, double> > top_words;
+    data->jieba->extractor.Extract(text, top_words, top_n);
+    VALUE arr = rb_ary_new2(top_words.size());
+    for (const auto& entry : top_words) {
+        VALUE inner_arr = rb_ary_new2(2);
+        rb_ary_push(inner_arr, rb_enc_str_new(entry.first.c_str(), entry.first.length(), u8_enc));
+        rb_ary_push(inner_arr, rb_float_new(entry.second));
+        rb_ary_push(arr, inner_arr);
+    }
+    return arr;
+}
+// Internal#segment(text, mode, max_length, hmm)
+// Splits text into words and returns them as an Array of UTF-8 Strings.
+//   mode       - Symbol, one of :mp, :mix, :hmm, :query, :full
+//   max_length - Integer passed to CutSmall; only used in :mp mode
+//   hmm        - any Ruby value; its truthiness is passed to Cut (:mix) and
+//                CutForSearch (:query)
+// Raises RuntimeError when the object was never initialized, TypeError when
+// mode is not a Symbol and ArgumentError for an unknown mode.
+static VALUE internal_segment(VALUE self, VALUE text_rbs, VALUE mode, VALUE max_length_, VALUE hmm_)
+{
+    GET_CPPJIEBA(data);
+    if (data->jieba == nullptr) {
+        rb_raise(rb_eRuntimeError, "CppjiebaRb::Internal has not been initialized");
+    }
+    // Validate and convert everything that can raise before any C++ object
+    // exists: a Ruby exception unwinds with longjmp and skips destructors.
+    Check_Type(mode, T_SYMBOL);
+    const ID rb_sMode = SYM2ID(mode);
+    if (rb_sMode != rb_sMp && rb_sMode != rb_sMix && rb_sMode != rb_sHmm &&
+        rb_sMode != rb_sQuery && rb_sMode != rb_sFull) {
+        // Previously an unknown mode silently produced an empty array.
+        rb_raise(rb_eArgError, "unknown segment mode: %" PRIsVALUE, mode);
+    }
+    const size_t max_length = NUM2UINT(max_length_);
+    // Ruby truthiness; the former `hmm_ == Qtrue` treated values such as 1 as false.
+    const bool hmm = RTEST(hmm_);
+    const char* raw_text = StringValueCStr(text_rbs);
+    std::string text(raw_text);
+    std::vector<std::string> words;
+    if (rb_sMode == rb_sMp) {
+        data->jieba->CutSmall(text, words, max_length);
+    } else if (rb_sMode == rb_sMix) {
+        data->jieba->Cut(text, words, hmm);
+    } else if (rb_sMode == rb_sHmm) {
+        data->jieba->CutHMM(text, words);
+    } else if (rb_sMode == rb_sQuery) {
+        data->jieba->CutForSearch(text, words, hmm);
+    } else {
+        data->jieba->CutAll(text, words);
+    }
+    VALUE arr = rb_ary_new2(words.size());
+    for (const auto& word : words) {
+        rb_ary_push(arr, rb_enc_str_new(word.c_str(), word.length(), u8_enc));
+    }
+    return arr;
+}
+// Internal#segment_tag(text)
+// Tags text with Jieba::Tag and returns a Hash mapping each word to its tag
+// (both UTF-8 Strings). Because the result is a Hash, a word that occurs more
+// than once appears only once, holding the tag stored last. Raises
+// RuntimeError when the object was never initialized.
+static VALUE internal_segment_tag(VALUE self, VALUE text_rbs)
+{
+    GET_CPPJIEBA(data);
+    if (data->jieba == nullptr) {
+        rb_raise(rb_eRuntimeError, "CppjiebaRb::Internal has not been initialized");
+    }
+    // Convert before constructing C++ objects so a raised exception cannot
+    // skip their destructors.
+    const char* raw_text = StringValueCStr(text_rbs);
+    std::string text(raw_text);
+    std::vector<std::pair<std::string, std::string> > words;
+    data->jieba->Tag(text, words);
+    VALUE result = rb_hash_new();
+    for (const auto& tagged : words) {
+        rb_hash_aset(result,
+            rb_enc_str_new(tagged.first.c_str(), tagged.first.length(), u8_enc),
+            rb_enc_str_new(tagged.second.c_str(), tagged.second.length(), u8_enc));
+    }
+    return result;
+}
+// Registers CppjiebaRb::Internal: interns the mode symbols, caches the UTF-8
+// encoding, defines the class with its allocator and binds the four methods.
+// Called from Init_cppjieba_rb after the CppjiebaRb module exists.
+void Init_internal()
+{
+    rb_sMp = rb_intern("mp");
+    rb_sMix = rb_intern("mix");
+    rb_sHmm = rb_intern("hmm");
+    rb_sQuery = rb_intern("query");
+    rb_sFull = rb_intern("full");
+    u8_enc = rb_utf8_encoding();
+    // rb_cData was deprecated in Ruby 3.0 and removed in 3.2. A class with its
+    // own allocator inherits from Object.
+    rb_cCppjiebaRb_Internal = rb_define_class_under(rb_mCppjiebaRb, "Internal", rb_cObject);
+    rb_define_alloc_func(rb_cCppjiebaRb_Internal, internal_alloc);
+    rb_define_method(rb_cCppjiebaRb_Internal, "initialize", (ruby_method*) &internal_initialize, 5);
+    rb_define_method(rb_cCppjiebaRb_Internal, "extract_keyword", (ruby_method*) &internal_extract_keyword, 2);
+    rb_define_method(rb_cCppjiebaRb_Internal, "segment", (ruby_method*) &internal_segment, 4);
+    rb_define_method(rb_cCppjiebaRb_Internal, "segment_tag", (ruby_method*) &internal_segment_tag, 1);
+}
+}

data/lib/cppjieba_rb/segment.rb ADDED Viewed

@@ -0,0 +1,20 @@
+module CppjiebaRb
+  class Segment
+    VALID_MODES = %i[mix hmm mp query full].freeze
+    def initialize(opts = nil)
+      opts ||= {}
+      unless opts[:mode].nil? || VALID_MODES.include?(opts[:mode])
+        raise ArgumentError, "The mode is #{opts[:mode]}. It should be one of :mix :hmm :mp"
+      end
+      @mode = opts[:mode] || :mix
+      @max_word_length = opts[:max_word_length] || 8
+      @hmm = opts[:hmm] || true
+    end
+    def segment(str)
+      CppjiebaRb.internal.segment(str, @mode, @max_word_length, @hmm)
+    end
+  end
+end

data/lib/cppjieba_rb/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module CppjiebaRb
+  VERSION = '0.2.1'
+end

data/lib/cppjieba_rb.rb ADDED Viewed

@@ -0,0 +1,34 @@
+require 'cppjieba_rb/cppjieba_rb'
+require 'cppjieba_rb/version'
+require 'cppjieba_rb/segment'
+module CppjiebaRb
+  EXT_BASE = File.join(File.dirname(__FILE__), '..', 'ext', 'cppjieba', 'dict')
+  DICT_PATH = File.join(EXT_BASE, 'jieba.dict.utf8')
+  HMM_DICT_PATH = File.join(EXT_BASE, 'hmm_model.utf8')
+  USER_DICT = File.join(EXT_BASE, 'user.dict.utf8')
+  IDF_PATH = File.join(EXT_BASE, 'idf.utf8')
+  STOP_WORD_PATH = File.join(EXT_BASE, 'stop_words.utf8')
+  def self.extract_keyword(str, top_n)
+    internal.extract_keyword(str, top_n)
+  end
+  def self.segment(str, opts = nil)
+    CppjiebaRb::Segment.new(opts).segment(str)
+  end
+  def self.segment_tag(str)
+    internal.segment_tag(str)
+  end
+  class << self
+    def internal
+      @backend ||= CppjiebaRb::Internal.new(DICT_PATH,
+                                            HMM_DICT_PATH,
+                                            USER_DICT,
+                                            IDF_PATH,
+                                            STOP_WORD_PATH)
+    end
+  end
+end

data/test/test_keyword.rb ADDED Viewed

@@ -0,0 +1,17 @@
+# coding: utf-8
+require 'minitest/autorun'
+require 'cppjieba_rb'
+class JiebaTest < Minitest::Test
+  def test_keywords
+    results = CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不用多久，我就会升职加薪，当上CEO，走上人生巅峰。", 5
+    assert_equal [["CEO",
+                   11.739204307083542],
+                  ["升职", 10.8561552143],
+                  ["加薪", 10.642581114],
+                  ["手扶拖拉机", 10.0088573539],
+                  ["巅峰", 9.49395840471]], results
+  end
+end

data/test/test_segment.rb ADDED Viewed

@@ -0,0 +1,24 @@
+# coding: utf-8
+require 'minitest/autorun'
+require 'cppjieba_rb'
+class JiebaTest < Minitest::Test
+  def test_mix_segment
+    words = CppjiebaRb.segment "我来到南京市长江大桥"
+    assert_equal %w(我 来到 南京市 长江大桥), words
+    words = CppjiebaRb.segment "令狐冲是云计算行业的专家"
+    assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
+  end
+  def test_hmm_segment
+    words = CppjiebaRb.segment "令狐冲是云计算行业的专家", mode: :hmm
+    assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
+  end
+  def test_max_prob_segment
+    words = CppjiebaRb.segment "令狐冲是云计算行业的专家", mode: :mp
+    assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
+  end
+end

data/test/test_tagging.rb ADDED Viewed

@@ -0,0 +1,19 @@
+# coding: utf-8
+require 'minitest/autorun'
+require 'cppjieba_rb'
+class JiebaTest < Minitest::Test
+  def test_tagging
+    pairs = CppjiebaRb.segment_tag '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
+    assert_equal({ '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
+                   '拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
+                   '的' => 'uj', '。' => 'x' }, pairs)
+  end
+  def test_tagging_with_user_dict
+    pairs = CppjiebaRb.segment_tag '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
+    assert_equal({ '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
+                   '拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
+                   '的' => 'uj', '。' => 'x' }, pairs)
+  end
+end