cppjieba_rb 0.4.1 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/.editorconfig +21 -0
  3. data/.github/workflows/linting.yml +30 -0
  4. data/.github/workflows/release.yml +42 -0
  5. data/.github/workflows/tests.yml +47 -0
  6. data/.gitignore +1 -0
  7. data/.rubocop.yml +45 -0
  8. data/.ruby-version +1 -0
  9. data/.yamllint +35 -0
  10. data/CHANGELOG.md +17 -0
  11. data/Gemfile +11 -0
  12. data/README.md +5 -5
  13. data/Rakefile +16 -7
  14. data/cppjieba_rb.gemspec +46 -33
  15. data/ext/cppjieba/.github/workflows/cmake.yml +52 -0
  16. data/ext/cppjieba/.github/workflows/stale-issues.yml +24 -0
  17. data/ext/cppjieba/.gitmodules +3 -0
  18. data/ext/cppjieba/{ChangeLog.md → CHANGELOG.md} +50 -1
  19. data/ext/cppjieba/CMakeLists.txt +11 -14
  20. data/ext/cppjieba/LICENSE +20 -0
  21. data/ext/cppjieba/README.md +9 -18
  22. data/ext/cppjieba/deps/limonp/.github/workflows/cmake.yml +43 -0
  23. data/ext/cppjieba/deps/limonp/.gitignore +9 -0
  24. data/ext/cppjieba/deps/limonp/CHANGELOG.md +160 -0
  25. data/ext/cppjieba/deps/limonp/CMakeLists.txt +61 -0
  26. data/ext/cppjieba/deps/limonp/LICENSE +20 -0
  27. data/ext/cppjieba/deps/limonp/README.md +38 -0
  28. data/ext/cppjieba/deps/limonp/{LocalVector.hpp → include/limonp/LocalVector.hpp} +3 -3
  29. data/ext/cppjieba/deps/limonp/{Logging.hpp → include/limonp/Logging.hpp} +17 -3
  30. data/ext/cppjieba/deps/limonp/{StringUtil.hpp → include/limonp/StringUtil.hpp} +31 -10
  31. data/ext/cppjieba/deps/limonp/test/CMakeLists.txt +8 -0
  32. data/ext/cppjieba/deps/limonp/test/demo.cpp +40 -0
  33. data/ext/cppjieba/deps/limonp/test/testdata/1.conf +5 -0
  34. data/ext/cppjieba/deps/limonp/test/testdata/StdExtension.data +3 -0
  35. data/ext/cppjieba/deps/limonp/test/testdata/dict.gbk +50 -0
  36. data/ext/cppjieba/deps/limonp/test/testdata/dict.utf8 +50 -0
  37. data/ext/cppjieba/deps/limonp/test/testdata/io_testfile +2 -0
  38. data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.0.1.utf8 +93 -0
  39. data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.0.utf8 +93 -0
  40. data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.1.utf8 +67 -0
  41. data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.2.utf8 +64 -0
  42. data/ext/cppjieba/deps/limonp/test/unittest/CMakeLists.txt +30 -0
  43. data/ext/cppjieba/deps/limonp/test/unittest/TArgvContext.cpp +16 -0
  44. data/ext/cppjieba/deps/limonp/test/unittest/TCastFloat.cpp +19 -0
  45. data/ext/cppjieba/deps/limonp/test/unittest/TClosure.cpp +85 -0
  46. data/ext/cppjieba/deps/limonp/test/unittest/TColorPrint.cpp +20 -0
  47. data/ext/cppjieba/deps/limonp/test/unittest/TConfig.cpp +17 -0
  48. data/ext/cppjieba/deps/limonp/test/unittest/TLocalVector.cpp +41 -0
  49. data/ext/cppjieba/deps/limonp/test/unittest/TLogging.cpp +12 -0
  50. data/ext/cppjieba/deps/limonp/test/unittest/TStdExtension.cpp +95 -0
  51. data/ext/cppjieba/deps/limonp/test/unittest/TStringUtil.cpp +183 -0
  52. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +9 -0
  53. data/ext/cppjieba/include/cppjieba/Jieba.hpp +4 -0
  54. data/ext/cppjieba/include/cppjieba/Trie.hpp +27 -1
  55. data/ext/cppjieba/test/CMakeLists.txt +4 -3
  56. data/ext/cppjieba/test/unittest/CMakeLists.txt +16 -7
  57. data/ext/cppjieba_rb/extconf.rb +11 -6
  58. data/ext/cppjieba_rb/internal.cc +1 -1
  59. data/lib/cppjieba_rb/segment.rb +4 -1
  60. data/lib/cppjieba_rb/version.rb +3 -1
  61. data/lib/cppjieba_rb.rb +12 -5
  62. data/test/test_keyword.rb +8 -8
  63. data/test/test_segment.rb +14 -10
  64. data/test/test_stop_word_filter.rb +5 -3
  65. data/test/test_tagging.rb +5 -2
  66. metadata +63 -140
  67. data/.travis.yml +0 -30
  68. data/ext/cppjieba/.travis.yml +0 -21
  69. data/ext/cppjieba/README_EN.md +0 -115
  70. data/ext/cppjieba/appveyor.yml +0 -32
  71. data/ext/cppjieba/deps/CMakeLists.txt +0 -1
  72. data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
  73. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
  74. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
  75. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
  76. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
  77. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
  78. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
  79. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
  80. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
  81. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
  82. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
  83. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
  84. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
  85. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
  86. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
  87. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
  88. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
  89. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
  90. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
  91. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
  92. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
  93. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
  94. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
  95. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
  96. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
  97. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
  98. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
  99. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  100. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
  101. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
  102. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
  103. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
  104. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
  105. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
  106. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
  107. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
  108. data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
  109. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
  110. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
  111. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
  112. data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
  113. data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
  114. data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
  115. data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
  116. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
  117. data/ext/cppjieba/test/demo.cpp +0 -80
  118. /data/ext/cppjieba/deps/{gtest/src/.deps/.dirstamp → limonp/.gitmodules} +0 -0
  119. /data/ext/cppjieba/deps/limonp/{ArgvContext.hpp → include/limonp/ArgvContext.hpp} +0 -0
  120. /data/ext/cppjieba/deps/limonp/{Closure.hpp → include/limonp/Closure.hpp} +0 -0
  121. /data/ext/cppjieba/deps/limonp/{Colors.hpp → include/limonp/Colors.hpp} +0 -0
  122. /data/ext/cppjieba/deps/limonp/{Condition.hpp → include/limonp/Condition.hpp} +0 -0
  123. /data/ext/cppjieba/deps/limonp/{Config.hpp → include/limonp/Config.hpp} +0 -0
  124. /data/ext/cppjieba/deps/limonp/{ForcePublic.hpp → include/limonp/ForcePublic.hpp} +0 -0
  125. /data/ext/cppjieba/deps/limonp/{NonCopyable.hpp → include/limonp/NonCopyable.hpp} +0 -0
  126. /data/ext/cppjieba/deps/limonp/{StdExtension.hpp → include/limonp/StdExtension.hpp} +0 -0
  127. /data/ext/cppjieba/deps/{gtest/src/gtest_main.cc → limonp/test/unittest/gtest_main.cpp} +0 -0
@@ -0,0 +1,183 @@
1
+ #include "limonp/StringUtil.hpp"
2
+ #include "gtest/gtest.h"
3
+ using namespace limonp;
4
+
5
+ TEST(StringUtilTest, Test1) {
6
+ vector<string> vec;
7
+ string s;
8
+ Split("\t1\t3\t4\t", vec, "\t");
9
+ ASSERT_EQ(s << vec, "[\"\", \"1\", \"3\", \"4\"]");
10
+ s = " \t\n ni hao ad \r\n";
11
+ ASSERT_EQ("ni hao ad", Trim(s));
12
+ ASSERT_EQ("select * from table1 limit 1;" ,StringFormat("select %s from %s %s;", "*","table1","limit 1"));
13
+ s = StringFormat("select %s from %s %s;", "*","table1","limit 1");
14
+ ASSERT_EQ("select * from table1 limit 1;" ,s);
15
+ vec.clear();
16
+ vec.push_back("1");
17
+ vec.push_back("2");
18
+ vec.push_back("3");
19
+ s.clear();
20
+ Join(vec.begin(), vec.end(), s,",");
21
+ ASSERT_EQ("1,2,3",s);
22
+ s = Join(vec.begin(), vec.end(), "..");
23
+ ASSERT_EQ("1..2..3", s);
24
+ const char* arr[] = {"2","3","5"};
25
+ ASSERT_EQ("2,3,5", Join(arr, arr + sizeof(arr)/sizeof(arr[0]), ","));
26
+ map<string , int> mp;
27
+ mp["key1"] =2;
28
+ ASSERT_EQ("{key1:2}", s << mp);
29
+ std::unordered_map<int,int> hmp;
30
+ hmp[1]=2;
31
+ ASSERT_EQ("{1:2}", s << hmp);
32
+ }
33
+
34
+ TEST(StringUtilTest, Test2) {
35
+ string s, gbks;
36
+ ifstream ifs("../test/testdata/dict.gbk");
37
+ ASSERT_TRUE(!!ifs);
38
+
39
+ vector<uint16_t> uni;
40
+ while(getline(ifs, s)) {
41
+ GBKTrans(s, uni);
42
+ GBKTrans(uni.begin(), uni.end(), gbks);
43
+ ASSERT_EQ(s, gbks);
44
+ }
45
+ }
46
+
47
+ TEST(StringUtilTest, Test3) {
48
+ string s, utf8;
49
+ ifstream ifs("../test/testdata/dict.utf8");
50
+ ASSERT_TRUE(!!ifs);
51
+
52
+ vector<uint16_t> uni;
53
+ while(getline(ifs, s)) {
54
+ ASSERT_TRUE(Utf8ToUnicode(s, uni));
55
+ UnicodeToUtf8(uni.begin(), uni.end(), utf8);
56
+ ASSERT_EQ(s, utf8);
57
+ }
58
+ }
59
+
60
+ TEST(StringUtilTest, Test4) {
61
+ //ASSERT_TRUE(StartsWith("--help",NULL));
62
+ ASSERT_TRUE(StartsWith("--help","--"));
63
+ ASSERT_TRUE(StartsWith("--help","-"));
64
+ ASSERT_FALSE(StartsWith("--help","he"));
65
+ ASSERT_TRUE(StartsWith("help","help"));
66
+ ASSERT_FALSE(StartsWith("","help"));
67
+ ASSERT_TRUE(StartsWith("hel",""));
68
+ ASSERT_TRUE(EndsWith("hel",""));
69
+ ASSERT_TRUE(EndsWith("hel","el"));
70
+ }
71
+
72
+ TEST(StringUtilTest, Test5) {
73
+ const char* str = "1,2,3,4";
74
+ vector<string> vec;
75
+ string res;
76
+ Split(str, vec, ",");
77
+ ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
78
+ Split("1,2,3,4,", vec, ",");
79
+ ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
80
+ Split(str, vec, ",", 3);
81
+ ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
82
+
83
+ Split("1", vec, ",");
84
+ ASSERT_EQ("[\"1\"]", res << vec);
85
+
86
+ Split(str, vec, ",", 1);
87
+ ASSERT_EQ("[\"1\", \"2,3,4\"]", res << vec);
88
+
89
+ Split("", vec, ",");
90
+ ASSERT_EQ("[]", res << vec);
91
+
92
+ Split("1, 2", vec, ",");
93
+ ASSERT_EQ("[\"1\", \" 2\"]", res << vec);
94
+
95
+ Split("1==2", vec, "==");
96
+ ASSERT_EQ("[\"1\", \"\", \"2\"]", res << vec);
97
+
98
+ Split("1,", vec, ",");
99
+ ASSERT_EQ("[\"1\"]", res << vec);
100
+
101
+ Split(",1,", vec, ",");
102
+ ASSERT_EQ("[\"\", \"1\"]", res << vec);
103
+
104
+ Split("1, ", vec, ",");
105
+ ASSERT_EQ("[\"1\", \" \"]", res << vec);
106
+
107
+ res << Split("1|2,3", "|,");
108
+ ASSERT_EQ("[\"1\", \"2\", \"3\"]", res);
109
+ }
110
+
111
+ TEST(StringUtilTest, Trim) {
112
+ string s;
113
+ s = "xxxyyyxx";
114
+ ASSERT_EQ(RTrim(s, 'x'), "xxxyyy");
115
+ ASSERT_EQ(LTrim(s, 'x'), "yyy");
116
+ s = "xxxyyyxx";
117
+ ASSERT_EQ(Trim(s, 'x'), "yyy");
118
+
119
+ s = " x y ";
120
+ ASSERT_EQ(Trim(s), "x y");
121
+
122
+ // check whether it core dumps when using isalpha
123
+ wchar_t w = 1000024;
124
+ ASSERT_FALSE(IsSpace(w));
125
+ w = 0x20;
126
+ ASSERT_TRUE(IsSpace(w));
127
+ }
128
+
129
+ TEST(StringUtilTest, GetTime) {
130
+ string s;
131
+ GetTime("%Y-%m-%d %H:%M:%S", s);
132
+ //print(s);
133
+ }
134
+
135
+ TEST(StringUtilTest, PathJoin) {
136
+ const char * path1 = "/home/foo/dir";
137
+ const char * path2 = "file";
138
+ const char * path3 = "/home/foo/dir/";
139
+ const char * path4 = "file";
140
+ const char * answer = "/home/foo/dir/file";
141
+
142
+ ASSERT_EQ(answer, PathJoin(path1, path2));
143
+ ASSERT_EQ(answer, PathJoin(path3, path4));
144
+ }
145
+
146
+ TEST(StringUtilTest, JapaneseUnicode) {
147
+ // Japanese
148
+ const char* s = "がんば";
149
+ vector<uint16_t> unicode;
150
+ ASSERT_TRUE(Utf8ToUnicode(s, unicode));
151
+ ASSERT_EQ(3u, unicode.size());
152
+ }
153
+
154
+ TEST(StringUtilTest, RareChinese) {
155
+ //U+10000 – U+10FFFF
156
+ const char* s = "𪚥";
157
+ vector<uint16_t> unicode;
158
+ ASSERT_FALSE(Utf8ToUnicode(s, unicode));
159
+ ASSERT_EQ(0u, unicode.size());
160
+ }
161
+
162
+ TEST(StringUtilTest, RareChineseUnicode32) {
163
+ //U+10000 – U+10FFFF
164
+ const char* s = "𪚥";
165
+ vector<uint32_t> unicode;
166
+ ASSERT_TRUE(Utf8ToUnicode32(s, unicode));
167
+ ASSERT_EQ(1u, unicode.size());
168
+
169
+ string s2;
170
+ Unicode32ToUtf8(unicode.begin(), unicode.end(), s2);
171
+ ASSERT_EQ(s2, s);
172
+ }
173
+
174
+ TEST(StringUtilTest, Unicode32) {
175
+ const char* s = "1+1=2你好世界,。";
176
+ vector<uint32_t> unicode;
177
+ ASSERT_TRUE(Utf8ToUnicode32(s, unicode));
178
+ ASSERT_EQ(unicode.size(), 11u);
179
+
180
+ string s2;
181
+ Unicode32ToUtf8(unicode.begin(), unicode.end(), s2);
182
+ ASSERT_EQ(s2, s);
183
+ }
@@ -61,6 +61,15 @@ class DictTrie {
61
61
  return true;
62
62
  }
63
63
 
64
+ bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
65
+ DictUnit node_info;
66
+ if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
67
+ return false;
68
+ }
69
+ trie_->DeleteNode(node_info.word, &node_info);
70
+ return true;
71
+ }
72
+
64
73
  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
65
74
  return trie_->Find(begin, end);
66
75
  }
@@ -76,6 +76,10 @@ class Jieba {
76
76
  return dict_trie_.InsertUserWord(word,freq, tag);
77
77
  }
78
78
 
79
+ bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
80
+ return dict_trie_.DeleteUserWord(word, tag);
81
+ }
82
+
79
83
  bool Find(const string& word)
80
84
  {
81
85
  return dict_trie_.Find(word);
@@ -141,7 +141,33 @@ class Trie {
141
141
  assert(ptNode != NULL);
142
142
  ptNode->ptValue = ptValue;
143
143
  }
144
-
144
+ void DeleteNode(const Unicode& key, const DictUnit* ptValue) {
145
+ if (key.begin() == key.end()) {
146
+ return;
147
+ }
148
+ //定义一个NextMap迭代器
149
+ TrieNode::NextMap::const_iterator kmIter;
150
+ //定义一个指向root的TrieNode指针
151
+ TrieNode *ptNode = root_;
152
+ for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
153
+ //链表不存在元素
154
+ if (NULL == ptNode->next) {
155
+ return;
156
+ }
157
+ kmIter = ptNode->next->find(*citer);
158
+ //如果map中不存在,跳出循环
159
+ if (ptNode->next->end() == kmIter) {
160
+ break;
161
+ }
162
+ //从unordered_map中擦除该项
163
+ ptNode->next->erase(*citer);
164
+ //删除该node
165
+ ptNode = kmIter->second;
166
+ delete ptNode;
167
+ break;
168
+ }
169
+ return;
170
+ }
145
171
  private:
146
172
  void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
147
173
  if (valuePointers.empty() || keys.empty()) {
@@ -1,5 +1,6 @@
1
1
  SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})
2
2
 
3
- ADD_EXECUTABLE(demo demo.cpp)
4
- ADD_EXECUTABLE(load_test load_test.cpp)
5
- ADD_SUBDIRECTORY(unittest)
3
+ if(NOT MSVC)
4
+ ADD_EXECUTABLE(load_test load_test.cpp)
5
+ ADD_SUBDIRECTORY(unittest)
6
+ endif()
@@ -1,8 +1,21 @@
1
+ if (MSVC)
2
+ set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL")
3
+ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
4
+ endif()
5
+
6
+ include(FetchContent)
7
+
8
+ FetchContent_Declare(
9
+ googletest
10
+ GIT_REPOSITORY https://github.com/google/googletest.git
11
+ GIT_TAG release-1.11.0
12
+ )
13
+ FetchContent_MakeAvailable(googletest)
14
+
15
+
1
16
  SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test)
2
17
  SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
3
18
 
4
- INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/deps/gtest/include)
5
-
6
19
  ADD_DEFINITIONS(-DLOGGING_LEVEL=LL_WARNING)
7
20
 
8
21
  ADD_EXECUTABLE(test.run
@@ -17,8 +30,4 @@ ADD_EXECUTABLE(test.run
17
30
  textrank_test.cpp
18
31
  )
19
32
 
20
- if(MSVC)
21
- TARGET_LINK_LIBRARIES(test.run gtest)
22
- else()
23
- TARGET_LINK_LIBRARIES(test.run gtest pthread)
24
- endif()
33
+ TARGET_LINK_LIBRARIES(test.run gtest)
@@ -1,5 +1,7 @@
1
- require "mkmf"
2
- abs = File.expand_path File.dirname(__FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ require 'mkmf'
4
+ abs = __dir__
3
5
 
4
6
  LIBDIR = RbConfig::CONFIG['libdir']
5
7
  INCLUDEDIR = RbConfig::CONFIG['includedir']
@@ -7,17 +9,20 @@ INCLUDEDIR = RbConfig::CONFIG['includedir']
7
9
  HEADER_DIRS = [
8
10
  INCLUDEDIR,
9
11
  "#{abs}/../cppjieba/include",
10
- "#{abs}/../cppjieba/deps"
11
- ]
12
+ "#{abs}/../cppjieba/deps/limonp/include"
13
+ ].freeze
12
14
 
13
15
  LIB_DIRS = [
14
16
  LIBDIR
15
- ]
17
+ ].freeze
16
18
 
17
19
  dir_config('cppjieba_rb', HEADER_DIRS, LIB_DIRS)
18
20
 
19
- CONFIG["CXXFLAGS"] += " -std=c++11 -O3"
21
+ # rubocop:disable Style/GlobalVars
22
+ CONFIG['CXXFLAGS'] += ' -std=c++11 -O3'
20
23
  $CXXFLAGS = "#{$CXXFLAGS} -std=c++11 -O3"
24
+ # rubocop:enable Style/GlobalVars
25
+
21
26
  create_makefile 'cppjieba_rb/cppjieba_rb'
22
27
  # respect header changes
23
28
  headers = Dir.glob('*.{hpp,h}').join ' '
@@ -163,7 +163,7 @@ void Init_internal()
163
163
  rb_sFull = rb_intern("full");
164
164
  u8_enc = rb_utf8_encoding();
165
165
 
166
- rb_cCppjiebaRb_Internal = rb_define_class_under(rb_mCppjiebaRb, "Internal", rb_cData);
166
+ rb_cCppjiebaRb_Internal = rb_define_class_under(rb_mCppjiebaRb, "Internal", rb_cObject);
167
167
  rb_define_alloc_func(rb_cCppjiebaRb_Internal, internal_alloc);
168
168
  rb_define_method(rb_cCppjiebaRb_Internal, "initialize", (ruby_method*) &internal_initialize, 5);
169
169
  rb_define_method(rb_cCppjiebaRb_Internal, "extract_keyword", (ruby_method*) &internal_extract_keyword, 2);
@@ -1,4 +1,7 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module CppjiebaRb
4
+ # Sentence segmentation
2
5
  class Segment
3
6
  VALID_MODES = %i[mix hmm mp query full].freeze
4
7
 
@@ -17,4 +20,4 @@ module CppjiebaRb
17
20
  CppjiebaRb.internal.segment(str, @mode, @max_word_length, @hmm)
18
21
  end
19
22
  end
20
- end
23
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module CppjiebaRb
2
- VERSION = '0.4.1'
4
+ VERSION = '0.4.4'
3
5
  end
data/lib/cppjieba_rb.rb CHANGED
@@ -1,7 +1,14 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'cppjieba_rb/cppjieba_rb'
2
4
  require 'cppjieba_rb/version'
3
5
  require 'cppjieba_rb/segment'
4
6
 
7
+ # CppjiebaRb segments a Chinese sentence into words.
8
+ #
9
+ # Available segmentation methods include HMM, MP, and mix mode.
10
+ # Dictionaries play a strong part in CppjiebaRb's accuracy.
11
+ # Read more at https://github.com/yanyiwu/cppjieba
5
12
  module CppjiebaRb
6
13
  EXT_BASE = File.join(File.dirname(__FILE__), '..', 'ext', 'cppjieba', 'dict')
7
14
  DICT_PATH = File.join(EXT_BASE, 'jieba.dict.utf8')
@@ -28,11 +35,11 @@ module CppjiebaRb
28
35
 
29
36
  class << self
30
37
  def internal
31
- @backend ||= CppjiebaRb::Internal.new(DICT_PATH,
32
- HMM_DICT_PATH,
33
- USER_DICT,
34
- IDF_PATH,
35
- STOP_WORD_PATH)
38
+ @internal ||= CppjiebaRb::Internal.new(DICT_PATH,
39
+ HMM_DICT_PATH,
40
+ USER_DICT,
41
+ IDF_PATH,
42
+ STOP_WORD_PATH)
36
43
  end
37
44
  end
38
45
  end
data/test/test_keyword.rb CHANGED
@@ -1,17 +1,17 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  require 'minitest/autorun'
3
4
  require 'cppjieba_rb'
4
5
 
5
6
  class JiebaTest < Minitest::Test
6
7
  def test_keywords
7
- results = CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 5
8
+ results = CppjiebaRb.extract_keyword '我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。', 5
8
9
 
9
- assert_equal [["CEO",
10
+ assert_equal [['CEO',
10
11
  11.739204307083542],
11
- ["升职", 10.8561552143],
12
- ["加薪", 10.642581114],
13
- ["手扶拖拉机", 10.0088573539],
14
- ["巅峰", 9.49395840471]], results
15
-
12
+ ['升职', 10.8561552143],
13
+ ['加薪', 10.642581114],
14
+ ['手扶拖拉机', 10.0088573539],
15
+ ['巅峰', 9.49395840471]], results
16
16
  end
17
17
  end
data/test/test_segment.rb CHANGED
@@ -1,24 +1,28 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  require 'minitest/autorun'
3
4
  require 'cppjieba_rb'
4
5
 
5
6
  class JiebaTest < Minitest::Test
6
7
  def test_mix_segment
7
- words = CppjiebaRb.segment "我来到南京市长江大桥"
8
- assert_equal %w(我 来到 南京市 长江大桥), words
8
+ words = CppjiebaRb.segment '我来到南京市长江大桥'
9
+
10
+ assert_equal %w[我 来到 南京市 长江大桥], words
11
+
12
+ words = CppjiebaRb.segment '令狐冲是云计算行业的专家'
9
13
 
10
- words = CppjiebaRb.segment "令狐冲是云计算行业的专家"
11
- assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
14
+ assert_equal %w[令狐冲 云计算 行业 的 专家], words
12
15
  end
13
16
 
14
17
  def test_hmm_segment
15
- words = CppjiebaRb.segment "令狐冲是云计算行业的专家", mode: :hmm
16
- assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
18
+ words = CppjiebaRb.segment '令狐冲是云计算行业的专家', mode: :hmm
19
+
20
+ assert_equal %w[令狐冲 是 云计算 行业 的 专家], words
17
21
  end
18
22
 
19
23
  def test_max_prob_segment
20
- words = CppjiebaRb.segment "令狐冲是云计算行业的专家", mode: :mp
21
- assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
22
- end
24
+ words = CppjiebaRb.segment '令狐冲是云计算行业的专家', mode: :mp
23
25
 
26
+ assert_equal %w[令狐冲 是 云计算 行业 的 专家], words
27
+ end
24
28
  end
@@ -1,10 +1,12 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  require 'minitest/autorun'
3
4
  require 'cppjieba_rb'
4
5
 
5
6
  class JiebaTest < Minitest::Test
6
7
  def test_filter
7
- words = CppjiebaRb.filter_stop_word %w(令狐冲 是 云计算 行业 的 专家)
8
- assert_equal %w(令狐冲 云计算 行业 专家), words
8
+ words = CppjiebaRb.filter_stop_word %w[令狐冲 是 云计算 行业 的 专家]
9
+
10
+ assert_equal %w[令狐冲 云计算 行业 专家], words
9
11
  end
10
12
  end
data/test/test_tagging.rb CHANGED
@@ -1,9 +1,12 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  require 'minitest/autorun'
3
4
  require 'cppjieba_rb'
5
+
4
6
  class JiebaTest < Minitest::Test
5
7
  def test_tagging
6
8
  pairs = CppjiebaRb.segment_tag '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
9
+
7
10
  assert_equal({ '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
8
11
  '拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
9
12
  '的' => 'uj', '。' => 'x' }, pairs)
@@ -11,9 +14,9 @@ class JiebaTest < Minitest::Test
11
14
 
12
15
  def test_tagging_with_user_dict
13
16
  pairs = CppjiebaRb.segment_tag '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
17
+
14
18
  assert_equal({ '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
15
19
  '拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
16
20
  '的' => 'uj', '。' => 'x' }, pairs)
17
21
  end
18
-
19
22
  end