cppjieba_rb 0.4.1 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/.editorconfig +21 -0
  3. data/.github/workflows/linting.yml +30 -0
  4. data/.github/workflows/release.yml +42 -0
  5. data/.github/workflows/tests.yml +47 -0
  6. data/.gitignore +1 -0
  7. data/.rubocop.yml +45 -0
  8. data/.ruby-version +1 -0
  9. data/.yamllint +35 -0
  10. data/CHANGELOG.md +17 -0
  11. data/Gemfile +11 -0
  12. data/README.md +5 -5
  13. data/Rakefile +16 -7
  14. data/cppjieba_rb.gemspec +46 -33
  15. data/ext/cppjieba/.github/workflows/cmake.yml +52 -0
  16. data/ext/cppjieba/.github/workflows/stale-issues.yml +24 -0
  17. data/ext/cppjieba/.gitmodules +3 -0
  18. data/ext/cppjieba/{ChangeLog.md → CHANGELOG.md} +50 -1
  19. data/ext/cppjieba/CMakeLists.txt +11 -14
  20. data/ext/cppjieba/LICENSE +20 -0
  21. data/ext/cppjieba/README.md +9 -18
  22. data/ext/cppjieba/deps/limonp/.github/workflows/cmake.yml +43 -0
  23. data/ext/cppjieba/deps/limonp/.gitignore +9 -0
  24. data/ext/cppjieba/deps/limonp/CHANGELOG.md +160 -0
  25. data/ext/cppjieba/deps/limonp/CMakeLists.txt +61 -0
  26. data/ext/cppjieba/deps/limonp/LICENSE +20 -0
  27. data/ext/cppjieba/deps/limonp/README.md +38 -0
  28. data/ext/cppjieba/deps/limonp/{LocalVector.hpp → include/limonp/LocalVector.hpp} +3 -3
  29. data/ext/cppjieba/deps/limonp/{Logging.hpp → include/limonp/Logging.hpp} +17 -3
  30. data/ext/cppjieba/deps/limonp/{StringUtil.hpp → include/limonp/StringUtil.hpp} +31 -10
  31. data/ext/cppjieba/deps/limonp/test/CMakeLists.txt +8 -0
  32. data/ext/cppjieba/deps/limonp/test/demo.cpp +40 -0
  33. data/ext/cppjieba/deps/limonp/test/testdata/1.conf +5 -0
  34. data/ext/cppjieba/deps/limonp/test/testdata/StdExtension.data +3 -0
  35. data/ext/cppjieba/deps/limonp/test/testdata/dict.gbk +50 -0
  36. data/ext/cppjieba/deps/limonp/test/testdata/dict.utf8 +50 -0
  37. data/ext/cppjieba/deps/limonp/test/testdata/io_testfile +2 -0
  38. data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.0.1.utf8 +93 -0
  39. data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.0.utf8 +93 -0
  40. data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.1.utf8 +67 -0
  41. data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.2.utf8 +64 -0
  42. data/ext/cppjieba/deps/limonp/test/unittest/CMakeLists.txt +30 -0
  43. data/ext/cppjieba/deps/limonp/test/unittest/TArgvContext.cpp +16 -0
  44. data/ext/cppjieba/deps/limonp/test/unittest/TCastFloat.cpp +19 -0
  45. data/ext/cppjieba/deps/limonp/test/unittest/TClosure.cpp +85 -0
  46. data/ext/cppjieba/deps/limonp/test/unittest/TColorPrint.cpp +20 -0
  47. data/ext/cppjieba/deps/limonp/test/unittest/TConfig.cpp +17 -0
  48. data/ext/cppjieba/deps/limonp/test/unittest/TLocalVector.cpp +41 -0
  49. data/ext/cppjieba/deps/limonp/test/unittest/TLogging.cpp +12 -0
  50. data/ext/cppjieba/deps/limonp/test/unittest/TStdExtension.cpp +95 -0
  51. data/ext/cppjieba/deps/limonp/test/unittest/TStringUtil.cpp +183 -0
  52. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +9 -0
  53. data/ext/cppjieba/include/cppjieba/Jieba.hpp +4 -0
  54. data/ext/cppjieba/include/cppjieba/Trie.hpp +27 -1
  55. data/ext/cppjieba/test/CMakeLists.txt +4 -3
  56. data/ext/cppjieba/test/unittest/CMakeLists.txt +16 -7
  57. data/ext/cppjieba_rb/extconf.rb +11 -6
  58. data/ext/cppjieba_rb/internal.cc +1 -1
  59. data/lib/cppjieba_rb/segment.rb +4 -1
  60. data/lib/cppjieba_rb/version.rb +3 -1
  61. data/lib/cppjieba_rb.rb +12 -5
  62. data/test/test_keyword.rb +8 -8
  63. data/test/test_segment.rb +14 -10
  64. data/test/test_stop_word_filter.rb +5 -3
  65. data/test/test_tagging.rb +5 -2
  66. metadata +63 -140
  67. data/.travis.yml +0 -30
  68. data/ext/cppjieba/.travis.yml +0 -21
  69. data/ext/cppjieba/README_EN.md +0 -115
  70. data/ext/cppjieba/appveyor.yml +0 -32
  71. data/ext/cppjieba/deps/CMakeLists.txt +0 -1
  72. data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
  73. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
  74. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
  75. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
  76. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
  77. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
  78. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
  79. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
  80. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
  81. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
  82. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
  83. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
  84. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
  85. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
  86. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
  87. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
  88. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
  89. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
  90. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
  91. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
  92. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
  93. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
  94. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
  95. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
  96. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
  97. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
  98. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
  99. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  100. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
  101. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
  102. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
  103. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
  104. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
  105. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
  106. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
  107. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
  108. data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
  109. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
  110. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
  111. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
  112. data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
  113. data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
  114. data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
  115. data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
  116. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
  117. data/ext/cppjieba/test/demo.cpp +0 -80
  118. /data/ext/cppjieba/deps/{gtest/src/.deps/.dirstamp → limonp/.gitmodules} +0 -0
  119. /data/ext/cppjieba/deps/limonp/{ArgvContext.hpp → include/limonp/ArgvContext.hpp} +0 -0
  120. /data/ext/cppjieba/deps/limonp/{Closure.hpp → include/limonp/Closure.hpp} +0 -0
  121. /data/ext/cppjieba/deps/limonp/{Colors.hpp → include/limonp/Colors.hpp} +0 -0
  122. /data/ext/cppjieba/deps/limonp/{Condition.hpp → include/limonp/Condition.hpp} +0 -0
  123. /data/ext/cppjieba/deps/limonp/{Config.hpp → include/limonp/Config.hpp} +0 -0
  124. /data/ext/cppjieba/deps/limonp/{ForcePublic.hpp → include/limonp/ForcePublic.hpp} +0 -0
  125. /data/ext/cppjieba/deps/limonp/{NonCopyable.hpp → include/limonp/NonCopyable.hpp} +0 -0
  126. /data/ext/cppjieba/deps/limonp/{StdExtension.hpp → include/limonp/StdExtension.hpp} +0 -0
  127. /data/ext/cppjieba/deps/{gtest/src/gtest_main.cc → limonp/test/unittest/gtest_main.cpp} +0 -0
@@ -0,0 +1,183 @@
1
+ #include "limonp/StringUtil.hpp"
2
+ #include "gtest/gtest.h"
3
+ using namespace limonp;
4
+
5
+ TEST(StringUtilTest, Test1) {
6
+ vector<string> vec;
7
+ string s;
8
+ Split("\t1\t3\t4\t", vec, "\t");
9
+ ASSERT_EQ(s << vec, "[\"\", \"1\", \"3\", \"4\"]");
10
+ s = " \t\n ni hao ad \r\n";
11
+ ASSERT_EQ("ni hao ad", Trim(s));
12
+ ASSERT_EQ("select * from table1 limit 1;" ,StringFormat("select %s from %s %s;", "*","table1","limit 1"));
13
+ s = StringFormat("select %s from %s %s;", "*","table1","limit 1");
14
+ ASSERT_EQ("select * from table1 limit 1;" ,s);
15
+ vec.clear();
16
+ vec.push_back("1");
17
+ vec.push_back("2");
18
+ vec.push_back("3");
19
+ s.clear();
20
+ Join(vec.begin(), vec.end(), s,",");
21
+ ASSERT_EQ("1,2,3",s);
22
+ s = Join(vec.begin(), vec.end(), "..");
23
+ ASSERT_EQ("1..2..3", s);
24
+ const char* arr[] = {"2","3","5"};
25
+ ASSERT_EQ("2,3,5", Join(arr, arr + sizeof(arr)/sizeof(arr[0]), ","));
26
+ map<string , int> mp;
27
+ mp["key1"] =2;
28
+ ASSERT_EQ("{key1:2}", s << mp);
29
+ std::unordered_map<int,int> hmp;
30
+ hmp[1]=2;
31
+ ASSERT_EQ("{1:2}", s << hmp);
32
+ }
33
+
34
+ TEST(StringUtilTest, Test2) {
35
+ string s, gbks;
36
+ ifstream ifs("../test/testdata/dict.gbk");
37
+ ASSERT_TRUE(!!ifs);
38
+
39
+ vector<uint16_t> uni;
40
+ while(getline(ifs, s)) {
41
+ GBKTrans(s, uni);
42
+ GBKTrans(uni.begin(), uni.end(), gbks);
43
+ ASSERT_EQ(s, gbks);
44
+ }
45
+ }
46
+
47
+ TEST(StringUtilTest, Test3) {
48
+ string s, utf8;
49
+ ifstream ifs("../test/testdata/dict.utf8");
50
+ ASSERT_TRUE(!!ifs);
51
+
52
+ vector<uint16_t> uni;
53
+ while(getline(ifs, s)) {
54
+ ASSERT_TRUE(Utf8ToUnicode(s, uni));
55
+ UnicodeToUtf8(uni.begin(), uni.end(), utf8);
56
+ ASSERT_EQ(s, utf8);
57
+ }
58
+ }
59
+
60
+ TEST(StringUtilTest, Test4) {
61
+ //ASSERT_TRUE(StartsWith("--help",NULL));
62
+ ASSERT_TRUE(StartsWith("--help","--"));
63
+ ASSERT_TRUE(StartsWith("--help","-"));
64
+ ASSERT_FALSE(StartsWith("--help","he"));
65
+ ASSERT_TRUE(StartsWith("help","help"));
66
+ ASSERT_FALSE(StartsWith("","help"));
67
+ ASSERT_TRUE(StartsWith("hel",""));
68
+ ASSERT_TRUE(EndsWith("hel",""));
69
+ ASSERT_TRUE(EndsWith("hel","el"));
70
+ }
71
+
72
+ TEST(StringUtilTest, Test5) {
73
+ const char* str = "1,2,3,4";
74
+ vector<string> vec;
75
+ string res;
76
+ Split(str, vec, ",");
77
+ ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
78
+ Split("1,2,3,4,", vec, ",");
79
+ ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
80
+ Split(str, vec, ",", 3);
81
+ ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
82
+
83
+ Split("1", vec, ",");
84
+ ASSERT_EQ("[\"1\"]", res << vec);
85
+
86
+ Split(str, vec, ",", 1);
87
+ ASSERT_EQ("[\"1\", \"2,3,4\"]", res << vec);
88
+
89
+ Split("", vec, ",");
90
+ ASSERT_EQ("[]", res << vec);
91
+
92
+ Split("1, 2", vec, ",");
93
+ ASSERT_EQ("[\"1\", \" 2\"]", res << vec);
94
+
95
+ Split("1==2", vec, "==");
96
+ ASSERT_EQ("[\"1\", \"\", \"2\"]", res << vec);
97
+
98
+ Split("1,", vec, ",");
99
+ ASSERT_EQ("[\"1\"]", res << vec);
100
+
101
+ Split(",1,", vec, ",");
102
+ ASSERT_EQ("[\"\", \"1\"]", res << vec);
103
+
104
+ Split("1, ", vec, ",");
105
+ ASSERT_EQ("[\"1\", \" \"]", res << vec);
106
+
107
+ res << Split("1|2,3", "|,");
108
+ ASSERT_EQ("[\"1\", \"2\", \"3\"]", res);
109
+ }
110
+
111
+ TEST(StringUtilTest, Trim) {
112
+ string s;
113
+ s = "xxxyyyxx";
114
+ ASSERT_EQ(RTrim(s, 'x'), "xxxyyy");
115
+ ASSERT_EQ(LTrim(s, 'x'), "yyy");
116
+ s = "xxxyyyxx";
117
+ ASSERT_EQ(Trim(s, 'x'), "yyy");
118
+
119
+ s = " x y ";
120
+ ASSERT_EQ(Trim(s), "x y");
121
+
122
+ // check if it core dump when using isalpha
123
+ wchar_t w = 1000024;
124
+ ASSERT_FALSE(IsSpace(w));
125
+ w = 0x20;
126
+ ASSERT_TRUE(IsSpace(w));
127
+ }
128
+
129
+ TEST(StringUtilTest, GetTime) {
130
+ string s;
131
+ GetTime("%Y-%m-%d %H:%M:%S", s);
132
+ //print(s);
133
+ }
134
+
135
+ TEST(StringUtilTest, PathJoin) {
136
+ const char * path1 = "/home/foo/dir";
137
+ const char * path2 = "file";
138
+ const char * path3 = "/home/foo/dir/";
139
+ const char * path4 = "file";
140
+ const char * answer = "/home/foo/dir/file";
141
+
142
+ ASSERT_EQ(answer, PathJoin(path1, path2));
143
+ ASSERT_EQ(answer, PathJoin(path3, path4));
144
+ }
145
+
146
+ TEST(StringUtilTest, JapaneseUnicode) {
147
+ // Japanese
148
+ const char* s = "がんば";
149
+ vector<uint16_t> unicode;
150
+ ASSERT_TRUE(Utf8ToUnicode(s, unicode));
151
+ ASSERT_EQ(3u, unicode.size());
152
+ }
153
+
154
+ TEST(StringUtilTest, RareChinese) {
155
+ //U+10000 – U+10FFFF
156
+ const char* s = "𪚥";
157
+ vector<uint16_t> unicode;
158
+ ASSERT_FALSE(Utf8ToUnicode(s, unicode));
159
+ ASSERT_EQ(0u, unicode.size());
160
+ }
161
+
162
+ TEST(StringUtilTest, RareChineseUnicode32) {
163
+ //U+10000 – U+10FFFF
164
+ const char* s = "𪚥";
165
+ vector<uint32_t> unicode;
166
+ ASSERT_TRUE(Utf8ToUnicode32(s, unicode));
167
+ ASSERT_EQ(1u, unicode.size());
168
+
169
+ string s2;
170
+ Unicode32ToUtf8(unicode.begin(), unicode.end(), s2);
171
+ ASSERT_EQ(s2, s);
172
+ }
173
+
174
+ TEST(StringUtilTest, Unicode32) {
175
+ const char* s = "1+1=2你好世界,。";
176
+ vector<uint32_t> unicode;
177
+ ASSERT_TRUE(Utf8ToUnicode32(s, unicode));
178
+ ASSERT_EQ(unicode.size(), 11u);
179
+
180
+ string s2;
181
+ Unicode32ToUtf8(unicode.begin(), unicode.end(), s2);
182
+ ASSERT_EQ(s2, s);
183
+ }
@@ -61,6 +61,15 @@ class DictTrie {
61
61
  return true;
62
62
  }
63
63
 
64
+ bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
65
+ DictUnit node_info;
66
+ if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
67
+ return false;
68
+ }
69
+ trie_->DeleteNode(node_info.word, &node_info);
70
+ return true;
71
+ }
72
+
64
73
  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
65
74
  return trie_->Find(begin, end);
66
75
  }
@@ -76,6 +76,10 @@ class Jieba {
76
76
  return dict_trie_.InsertUserWord(word,freq, tag);
77
77
  }
78
78
 
79
+ bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
80
+ return dict_trie_.DeleteUserWord(word, tag);
81
+ }
82
+
79
83
  bool Find(const string& word)
80
84
  {
81
85
  return dict_trie_.Find(word);
@@ -141,7 +141,33 @@ class Trie {
141
141
  assert(ptNode != NULL);
142
142
  ptNode->ptValue = ptValue;
143
143
  }
144
-
144
+ void DeleteNode(const Unicode& key, const DictUnit* ptValue) {
145
+ if (key.begin() == key.end()) {
146
+ return;
147
+ }
148
+ //定义一个NextMap迭代器
149
+ TrieNode::NextMap::const_iterator kmIter;
150
+ //定义一个指向root的TrieNode指针
151
+ TrieNode *ptNode = root_;
152
+ for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
153
+ //链表不存在元素
154
+ if (NULL == ptNode->next) {
155
+ return;
156
+ }
157
+ kmIter = ptNode->next->find(*citer);
158
+ //如果map中不存在,跳出循环
159
+ if (ptNode->next->end() == kmIter) {
160
+ break;
161
+ }
162
+ //从unordered_map中擦除该项
163
+ ptNode->next->erase(*citer);
164
+ //删除该node
165
+ ptNode = kmIter->second;
166
+ delete ptNode;
167
+ break;
168
+ }
169
+ return;
170
+ }
145
171
  private:
146
172
  void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
147
173
  if (valuePointers.empty() || keys.empty()) {
@@ -1,5 +1,6 @@
1
1
  SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})
2
2
 
3
- ADD_EXECUTABLE(demo demo.cpp)
4
- ADD_EXECUTABLE(load_test load_test.cpp)
5
- ADD_SUBDIRECTORY(unittest)
3
+ if(NOT MSVC)
4
+ ADD_EXECUTABLE(load_test load_test.cpp)
5
+ ADD_SUBDIRECTORY(unittest)
6
+ endif()
@@ -1,8 +1,21 @@
1
+ if (MSVC)
2
+ set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL")
3
+ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
4
+ endif()
5
+
6
+ include(FetchContent)
7
+
8
+ FetchContent_Declare(
9
+ googletest
10
+ GIT_REPOSITORY https://github.com/google/googletest.git
11
+ GIT_TAG release-1.11.0
12
+ )
13
+ FetchContent_MakeAvailable(googletest)
14
+
15
+
1
16
  SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test)
2
17
  SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
3
18
 
4
- INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/deps/gtest/include)
5
-
6
19
  ADD_DEFINITIONS(-DLOGGING_LEVEL=LL_WARNING)
7
20
 
8
21
  ADD_EXECUTABLE(test.run
@@ -17,8 +30,4 @@ ADD_EXECUTABLE(test.run
17
30
  textrank_test.cpp
18
31
  )
19
32
 
20
- if(MSVC)
21
- TARGET_LINK_LIBRARIES(test.run gtest)
22
- else()
23
- TARGET_LINK_LIBRARIES(test.run gtest pthread)
24
- endif()
33
+ TARGET_LINK_LIBRARIES(test.run gtest)
@@ -1,5 +1,7 @@
1
- require "mkmf"
2
- abs = File.expand_path File.dirname(__FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ require 'mkmf'
4
+ abs = __dir__
3
5
 
4
6
  LIBDIR = RbConfig::CONFIG['libdir']
5
7
  INCLUDEDIR = RbConfig::CONFIG['includedir']
@@ -7,17 +9,20 @@ INCLUDEDIR = RbConfig::CONFIG['includedir']
7
9
  HEADER_DIRS = [
8
10
  INCLUDEDIR,
9
11
  "#{abs}/../cppjieba/include",
10
- "#{abs}/../cppjieba/deps"
11
- ]
12
+ "#{abs}/../cppjieba/deps/limonp/include"
13
+ ].freeze
12
14
 
13
15
  LIB_DIRS = [
14
16
  LIBDIR
15
- ]
17
+ ].freeze
16
18
 
17
19
  dir_config('cppjieba_rb', HEADER_DIRS, LIB_DIRS)
18
20
 
19
- CONFIG["CXXFLAGS"] += " -std=c++11 -O3"
21
+ # rubocop:disable Style/GlobalVars
22
+ CONFIG['CXXFLAGS'] += ' -std=c++11 -O3'
20
23
  $CXXFLAGS = "#{$CXXFLAGS} -std=c++11 -O3"
24
+ # rubocop:enable Style/GlobalVars
25
+
21
26
  create_makefile 'cppjieba_rb/cppjieba_rb'
22
27
  # respect header changes
23
28
  headers = Dir.glob('*.{hpp,h}').join ' '
@@ -163,7 +163,7 @@ void Init_internal()
163
163
  rb_sFull = rb_intern("full");
164
164
  u8_enc = rb_utf8_encoding();
165
165
 
166
- rb_cCppjiebaRb_Internal = rb_define_class_under(rb_mCppjiebaRb, "Internal", rb_cData);
166
+ rb_cCppjiebaRb_Internal = rb_define_class_under(rb_mCppjiebaRb, "Internal", rb_cObject);
167
167
  rb_define_alloc_func(rb_cCppjiebaRb_Internal, internal_alloc);
168
168
  rb_define_method(rb_cCppjiebaRb_Internal, "initialize", (ruby_method*) &internal_initialize, 5);
169
169
  rb_define_method(rb_cCppjiebaRb_Internal, "extract_keyword", (ruby_method*) &internal_extract_keyword, 2);
@@ -1,4 +1,7 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module CppjiebaRb
4
+ # Sentence segmentation
2
5
  class Segment
3
6
  VALID_MODES = %i[mix hmm mp query full].freeze
4
7
 
@@ -17,4 +20,4 @@ module CppjiebaRb
17
20
  CppjiebaRb.internal.segment(str, @mode, @max_word_length, @hmm)
18
21
  end
19
22
  end
20
- end
23
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module CppjiebaRb
2
- VERSION = '0.4.1'
4
+ VERSION = '0.4.4'
3
5
  end
data/lib/cppjieba_rb.rb CHANGED
@@ -1,7 +1,14 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'cppjieba_rb/cppjieba_rb'
2
4
  require 'cppjieba_rb/version'
3
5
  require 'cppjieba_rb/segment'
4
6
 
7
+ # CppjiebaRb segments a Chinese sentence into words.
8
+ #
9
+ # Available segmentation methods include HMM, MP, and mix mode.
10
+ # Dictionaries play a major part in CppjiebaRb's accuracy.
11
+ # Read more https://github.com/yanyiwu/cppjieba
5
12
  module CppjiebaRb
6
13
  EXT_BASE = File.join(File.dirname(__FILE__), '..', 'ext', 'cppjieba', 'dict')
7
14
  DICT_PATH = File.join(EXT_BASE, 'jieba.dict.utf8')
@@ -28,11 +35,11 @@ module CppjiebaRb
28
35
 
29
36
  class << self
30
37
  def internal
31
- @backend ||= CppjiebaRb::Internal.new(DICT_PATH,
32
- HMM_DICT_PATH,
33
- USER_DICT,
34
- IDF_PATH,
35
- STOP_WORD_PATH)
38
+ @internal ||= CppjiebaRb::Internal.new(DICT_PATH,
39
+ HMM_DICT_PATH,
40
+ USER_DICT,
41
+ IDF_PATH,
42
+ STOP_WORD_PATH)
36
43
  end
37
44
  end
38
45
  end
data/test/test_keyword.rb CHANGED
@@ -1,17 +1,17 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  require 'minitest/autorun'
3
4
  require 'cppjieba_rb'
4
5
 
5
6
  class JiebaTest < Minitest::Test
6
7
  def test_keywords
7
- results = CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 5
8
+ results = CppjiebaRb.extract_keyword '我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。', 5
8
9
 
9
- assert_equal [["CEO",
10
+ assert_equal [['CEO',
10
11
  11.739204307083542],
11
- ["升职", 10.8561552143],
12
- ["加薪", 10.642581114],
13
- ["手扶拖拉机", 10.0088573539],
14
- ["巅峰", 9.49395840471]], results
15
-
12
+ ['升职', 10.8561552143],
13
+ ['加薪', 10.642581114],
14
+ ['手扶拖拉机', 10.0088573539],
15
+ ['巅峰', 9.49395840471]], results
16
16
  end
17
17
  end
data/test/test_segment.rb CHANGED
@@ -1,24 +1,28 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  require 'minitest/autorun'
3
4
  require 'cppjieba_rb'
4
5
 
5
6
  class JiebaTest < Minitest::Test
6
7
  def test_mix_segment
7
- words = CppjiebaRb.segment "我来到南京市长江大桥"
8
- assert_equal %w(我 来到 南京市 长江大桥), words
8
+ words = CppjiebaRb.segment '我来到南京市长江大桥'
9
+
10
+ assert_equal %w[我 来到 南京市 长江大桥], words
11
+
12
+ words = CppjiebaRb.segment '令狐冲是云计算行业的专家'
9
13
 
10
- words = CppjiebaRb.segment "令狐冲是云计算行业的专家"
11
- assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
14
+ assert_equal %w[令狐冲 云计算 行业 的 专家], words
12
15
  end
13
16
 
14
17
  def test_hmm_segment
15
- words = CppjiebaRb.segment "令狐冲是云计算行业的专家", mode: :hmm
16
- assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
18
+ words = CppjiebaRb.segment '令狐冲是云计算行业的专家', mode: :hmm
19
+
20
+ assert_equal %w[令狐冲 是 云计算 行业 的 专家], words
17
21
  end
18
22
 
19
23
  def test_max_prob_segment
20
- words = CppjiebaRb.segment "令狐冲是云计算行业的专家", mode: :mp
21
- assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
22
- end
24
+ words = CppjiebaRb.segment '令狐冲是云计算行业的专家', mode: :mp
23
25
 
26
+ assert_equal %w[令狐冲 是 云计算 行业 的 专家], words
27
+ end
24
28
  end
@@ -1,10 +1,12 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  require 'minitest/autorun'
3
4
  require 'cppjieba_rb'
4
5
 
5
6
  class JiebaTest < Minitest::Test
6
7
  def test_filter
7
- words = CppjiebaRb.filter_stop_word %w(令狐冲 是 云计算 行业 的 专家)
8
- assert_equal %w(令狐冲 云计算 行业 专家), words
8
+ words = CppjiebaRb.filter_stop_word %w[令狐冲 是 云计算 行业 的 专家]
9
+
10
+ assert_equal %w[令狐冲 云计算 行业 专家], words
9
11
  end
10
12
  end
data/test/test_tagging.rb CHANGED
@@ -1,9 +1,12 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  require 'minitest/autorun'
3
4
  require 'cppjieba_rb'
5
+
4
6
  class JiebaTest < Minitest::Test
5
7
  def test_tagging
6
8
  pairs = CppjiebaRb.segment_tag '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
9
+
7
10
  assert_equal({ '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
8
11
  '拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
9
12
  '的' => 'uj', '。' => 'x' }, pairs)
@@ -11,9 +14,9 @@ class JiebaTest < Minitest::Test
11
14
 
12
15
  def test_tagging_with_user_dict
13
16
  pairs = CppjiebaRb.segment_tag '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
17
+
14
18
  assert_equal({ '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
15
19
  '拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
16
20
  '的' => 'uj', '。' => 'x' }, pairs)
17
21
  end
18
-
19
22
  end