cppjieba_rb 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. checksums.yaml +4 -4
  2. data/.editorconfig +21 -0
  3. data/.github/workflows/linting.yml +30 -0
  4. data/.github/workflows/release.yml +42 -0
  5. data/.github/workflows/tests.yml +47 -0
  6. data/.gitignore +1 -0
  7. data/.rubocop.yml +45 -0
  8. data/.ruby-version +1 -0
  9. data/.yamllint +35 -0
  10. data/CHANGELOG.md +17 -0
  11. data/Gemfile +11 -0
  12. data/README.md +5 -5
  13. data/Rakefile +16 -7
  14. data/cppjieba_rb.gemspec +46 -33
  15. data/ext/cppjieba/.github/workflows/cmake.yml +52 -0
  16. data/ext/cppjieba/.github/workflows/stale-issues.yml +24 -0
  17. data/ext/cppjieba/.gitmodules +3 -0
  18. data/ext/cppjieba/{ChangeLog.md → CHANGELOG.md} +50 -1
  19. data/ext/cppjieba/CMakeLists.txt +11 -14
  20. data/ext/cppjieba/LICENSE +20 -0
  21. data/ext/cppjieba/README.md +9 -18
  22. data/ext/cppjieba/deps/limonp/.github/workflows/cmake.yml +43 -0
  23. data/ext/cppjieba/deps/limonp/.gitignore +9 -0
  24. data/ext/cppjieba/deps/limonp/CHANGELOG.md +160 -0
  25. data/ext/cppjieba/deps/limonp/CMakeLists.txt +61 -0
  26. data/ext/cppjieba/deps/limonp/LICENSE +20 -0
  27. data/ext/cppjieba/deps/limonp/README.md +38 -0
  28. data/ext/cppjieba/deps/limonp/{LocalVector.hpp → include/limonp/LocalVector.hpp} +3 -3
  29. data/ext/cppjieba/deps/limonp/{Logging.hpp → include/limonp/Logging.hpp} +17 -3
  30. data/ext/cppjieba/deps/limonp/{StringUtil.hpp → include/limonp/StringUtil.hpp} +31 -10
  31. data/ext/cppjieba/deps/limonp/test/CMakeLists.txt +8 -0
  32. data/ext/cppjieba/deps/limonp/test/demo.cpp +40 -0
  33. data/ext/cppjieba/deps/limonp/test/testdata/1.conf +5 -0
  34. data/ext/cppjieba/deps/limonp/test/testdata/StdExtension.data +3 -0
  35. data/ext/cppjieba/deps/limonp/test/testdata/dict.gbk +50 -0
  36. data/ext/cppjieba/deps/limonp/test/testdata/dict.utf8 +50 -0
  37. data/ext/cppjieba/deps/limonp/test/testdata/io_testfile +2 -0
  38. data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.0.1.utf8 +93 -0
  39. data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.0.utf8 +93 -0
  40. data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.1.utf8 +67 -0
  41. data/ext/cppjieba/deps/limonp/test/testdata/jieba.dict.2.utf8 +64 -0
  42. data/ext/cppjieba/deps/limonp/test/unittest/CMakeLists.txt +30 -0
  43. data/ext/cppjieba/deps/limonp/test/unittest/TArgvContext.cpp +16 -0
  44. data/ext/cppjieba/deps/limonp/test/unittest/TCastFloat.cpp +19 -0
  45. data/ext/cppjieba/deps/limonp/test/unittest/TClosure.cpp +85 -0
  46. data/ext/cppjieba/deps/limonp/test/unittest/TColorPrint.cpp +20 -0
  47. data/ext/cppjieba/deps/limonp/test/unittest/TConfig.cpp +17 -0
  48. data/ext/cppjieba/deps/limonp/test/unittest/TLocalVector.cpp +41 -0
  49. data/ext/cppjieba/deps/limonp/test/unittest/TLogging.cpp +12 -0
  50. data/ext/cppjieba/deps/limonp/test/unittest/TStdExtension.cpp +95 -0
  51. data/ext/cppjieba/deps/limonp/test/unittest/TStringUtil.cpp +183 -0
  52. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +9 -0
  53. data/ext/cppjieba/include/cppjieba/Jieba.hpp +4 -0
  54. data/ext/cppjieba/include/cppjieba/Trie.hpp +27 -1
  55. data/ext/cppjieba/test/CMakeLists.txt +4 -3
  56. data/ext/cppjieba/test/unittest/CMakeLists.txt +16 -7
  57. data/ext/cppjieba_rb/extconf.rb +11 -6
  58. data/lib/cppjieba_rb/segment.rb +4 -1
  59. data/lib/cppjieba_rb/version.rb +3 -1
  60. data/lib/cppjieba_rb.rb +12 -5
  61. data/test/test_keyword.rb +8 -8
  62. data/test/test_segment.rb +14 -10
  63. data/test/test_stop_word_filter.rb +5 -3
  64. data/test/test_tagging.rb +5 -2
  65. metadata +63 -140
  66. data/.travis.yml +0 -30
  67. data/ext/cppjieba/.travis.yml +0 -21
  68. data/ext/cppjieba/README_EN.md +0 -115
  69. data/ext/cppjieba/appveyor.yml +0 -32
  70. data/ext/cppjieba/deps/CMakeLists.txt +0 -1
  71. data/ext/cppjieba/deps/gtest/CMakeLists.txt +0 -5
  72. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +0 -283
  73. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +0 -230
  74. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +0 -1421
  75. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +0 -487
  76. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +0 -796
  77. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +0 -232
  78. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +0 -176
  79. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +0 -259
  80. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +0 -2155
  81. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +0 -358
  82. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +0 -58
  83. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +0 -308
  84. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +0 -210
  85. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +0 -1226
  86. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +0 -233
  87. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +0 -4822
  88. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +0 -301
  89. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +0 -619
  90. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +0 -1788
  91. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +0 -350
  92. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +0 -968
  93. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +0 -336
  94. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +0 -3330
  95. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +0 -296
  96. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +0 -681
  97. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +0 -509
  98. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  99. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +0 -48
  100. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +0 -1234
  101. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +0 -380
  102. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +0 -1038
  103. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +0 -746
  104. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +0 -356
  105. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +0 -110
  106. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +0 -110
  107. data/ext/cppjieba/deps/gtest/src/gtest.cc +0 -4898
  108. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +0 -49
  109. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +0 -67
  110. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +0 -65
  111. data/ext/cppjieba/deps/limonp/FileLock.hpp +0 -74
  112. data/ext/cppjieba/deps/limonp/Md5.hpp +0 -411
  113. data/ext/cppjieba/deps/limonp/MutexLock.hpp +0 -51
  114. data/ext/cppjieba/deps/limonp/Thread.hpp +0 -44
  115. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +0 -86
  116. data/ext/cppjieba/test/demo.cpp +0 -80
  117. /data/ext/cppjieba/deps/{gtest/src/.deps/.dirstamp → limonp/.gitmodules} +0 -0
  118. /data/ext/cppjieba/deps/limonp/{ArgvContext.hpp → include/limonp/ArgvContext.hpp} +0 -0
  119. /data/ext/cppjieba/deps/limonp/{Closure.hpp → include/limonp/Closure.hpp} +0 -0
  120. /data/ext/cppjieba/deps/limonp/{Colors.hpp → include/limonp/Colors.hpp} +0 -0
  121. /data/ext/cppjieba/deps/limonp/{Condition.hpp → include/limonp/Condition.hpp} +0 -0
  122. /data/ext/cppjieba/deps/limonp/{Config.hpp → include/limonp/Config.hpp} +0 -0
  123. /data/ext/cppjieba/deps/limonp/{ForcePublic.hpp → include/limonp/ForcePublic.hpp} +0 -0
  124. /data/ext/cppjieba/deps/limonp/{NonCopyable.hpp → include/limonp/NonCopyable.hpp} +0 -0
  125. /data/ext/cppjieba/deps/limonp/{StdExtension.hpp → include/limonp/StdExtension.hpp} +0 -0
  126. /data/ext/cppjieba/deps/{gtest/src/gtest_main.cc → limonp/test/unittest/gtest_main.cpp} +0 -0
@@ -0,0 +1,183 @@
1
+ #include "limonp/StringUtil.hpp"
2
+ #include "gtest/gtest.h"
3
+ using namespace limonp;
4
+
5
+ TEST(StringUtilTest, Test1) { // Exercises Split, Trim, StringFormat, Join and operator<< formatting of vector/map/unordered_map into a string.
6
+ vector<string> vec;
7
+ string s;
8
+ Split("\t1\t3\t4\t", vec, "\t");
9
+ ASSERT_EQ(s << vec, "[\"\", \"1\", \"3\", \"4\"]"); // leading empty field is kept, trailing one is not
10
+ s = " \t\n ni hao ad \r\n";
11
+ ASSERT_EQ("ni hao ad", Trim(s));
12
+ ASSERT_EQ("select * from table1 limit 1;" ,StringFormat("select %s from %s %s;", "*","table1","limit 1"));
13
+ s = StringFormat("select %s from %s %s;", "*","table1","limit 1");
14
+ ASSERT_EQ("select * from table1 limit 1;" ,s);
15
+ vec.clear();
16
+ vec.push_back("1");
17
+ vec.push_back("2");
18
+ vec.push_back("3");
19
+ s.clear();
20
+ Join(vec.begin(), vec.end(), s,","); // overload that writes into an output string
21
+ ASSERT_EQ("1,2,3",s);
22
+ s = Join(vec.begin(), vec.end(), ".."); // overload that returns the joined string
23
+ ASSERT_EQ("1..2..3", s);
24
+ const char* arr[] = {"2","3","5"};
25
+ ASSERT_EQ("2,3,5", Join(arr, arr + sizeof(arr)/sizeof(arr[0]), ","));
26
+ map<string , int> mp;
27
+ mp["key1"] =2;
28
+ ASSERT_EQ("{key1:2}", s << mp);
29
+ std::unordered_map<int,int> hmp;
30
+ hmp[1]=2;
31
+ ASSERT_EQ("{1:2}", s << hmp);
32
+ }
33
+
34
+ TEST(StringUtilTest, Test2) { // Round-trips every line of dict.gbk through both GBKTrans overloads and expects the input back.
35
+ string s, gbks;
36
+ ifstream ifs("../test/testdata/dict.gbk");
37
+ ASSERT_TRUE(!!ifs); // the relative path must resolve from the working directory, otherwise the test stops here
38
+
39
+ vector<uint16_t> uni;
40
+ while(getline(ifs, s)) {
41
+ GBKTrans(s, uni); // string -> 16-bit units
42
+ GBKTrans(uni.begin(), uni.end(), gbks); // 16-bit units -> string
43
+ ASSERT_EQ(s, gbks);
44
+ }
45
+ }
46
+
47
+ TEST(StringUtilTest, Test3) { // Round-trips every line of dict.utf8 through Utf8ToUnicode and UnicodeToUtf8 and expects the input back.
48
+ string s, utf8;
49
+ ifstream ifs("../test/testdata/dict.utf8");
50
+ ASSERT_TRUE(!!ifs); // the relative path must resolve from the working directory, otherwise the test stops here
51
+
52
+ vector<uint16_t> uni;
53
+ while(getline(ifs, s)) {
54
+ ASSERT_TRUE(Utf8ToUnicode(s, uni));
55
+ UnicodeToUtf8(uni.begin(), uni.end(), utf8);
56
+ ASSERT_EQ(s, utf8);
57
+ }
58
+ }
59
+
60
+ TEST(StringUtilTest, Test4) { // Checks StartsWith and EndsWith, including an empty prefix/suffix and an empty subject string.
61
+ //ASSERT_TRUE(StartsWith("--help",NULL));
62
+ ASSERT_TRUE(StartsWith("--help","--"));
63
+ ASSERT_TRUE(StartsWith("--help","-"));
64
+ ASSERT_FALSE(StartsWith("--help","he"));
65
+ ASSERT_TRUE(StartsWith("help","help"));
66
+ ASSERT_FALSE(StartsWith("","help"));
67
+ ASSERT_TRUE(StartsWith("hel","")); // an empty prefix matches
68
+ ASSERT_TRUE(EndsWith("hel","")); // an empty suffix matches
69
+ ASSERT_TRUE(EndsWith("hel","el"));
70
+ }
71
+
72
+ TEST(StringUtilTest, Test5) { // Checks Split on trailing/leading delimiters, empty input, multi-character patterns and the optional fourth argument.
73
+ const char* str = "1,2,3,4";
74
+ vector<string> vec;
75
+ string res;
76
+ Split(str, vec, ",");
77
+ ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
78
+ Split("1,2,3,4,", vec, ","); // a trailing delimiter adds no empty field
79
+ ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
80
+ Split(str, vec, ",", 3); // fourth argument appears to cap the number of splits - TODO confirm against StringUtil.hpp
81
+ ASSERT_EQ("[\"1\", \"2\", \"3\", \"4\"]", res << vec);
82
+
83
+ Split("1", vec, ",");
84
+ ASSERT_EQ("[\"1\"]", res << vec);
85
+
86
+ Split(str, vec, ",", 1);
87
+ ASSERT_EQ("[\"1\", \"2,3,4\"]", res << vec);
88
+
89
+ Split("", vec, ",");
90
+ ASSERT_EQ("[]", res << vec);
91
+
92
+ Split("1, 2", vec, ","); // whitespace inside fields is preserved
93
+ ASSERT_EQ("[\"1\", \" 2\"]", res << vec);
94
+
95
+ Split("1==2", vec, "=="); // expected result shows each pattern character acting as its own delimiter
96
+ ASSERT_EQ("[\"1\", \"\", \"2\"]", res << vec);
97
+
98
+ Split("1,", vec, ",");
99
+ ASSERT_EQ("[\"1\"]", res << vec);
100
+
101
+ Split(",1,", vec, ","); // a leading delimiter yields a leading empty field
102
+ ASSERT_EQ("[\"\", \"1\"]", res << vec);
103
+
104
+ Split("1, ", vec, ",");
105
+ ASSERT_EQ("[\"1\", \" \"]", res << vec);
106
+
107
+ res << Split("1|2,3", "|,"); // two-argument overload that returns the pieces
108
+ ASSERT_EQ("[\"1\", \"2\", \"3\"]", res);
109
+ }
110
+
111
+ TEST(StringUtilTest, Trim) { // Checks RTrim/LTrim/Trim with an explicit character, Trim with its default, and IsSpace on wchar_t values.
112
+ string s;
113
+ s = "xxxyyyxx";
114
+ ASSERT_EQ(RTrim(s, 'x'), "xxxyyy");
115
+ ASSERT_EQ(LTrim(s, 'x'), "yyy"); // this expectation only holds if the RTrim call above modified s in place
116
+ s = "xxxyyyxx";
117
+ ASSERT_EQ(Trim(s, 'x'), "yyy");
118
+
119
+ s = " x y ";
120
+ ASSERT_EQ(Trim(s), "x y");
121
+
122
+ // Check that IsSpace does not crash on a wchar_t value far outside the ASCII range (the original note named isalpha; presumably the character-class call inside IsSpace - not visible here).
123
+ wchar_t w = 1000024;
124
+ ASSERT_FALSE(IsSpace(w));
125
+ w = 0x20;
126
+ ASSERT_TRUE(IsSpace(w));
127
+ }
128
+
129
+ TEST(StringUtilTest, GetTime) { // Only checks that GetTime can be called with this format string; the produced string is never asserted.
130
+ string s;
131
+ GetTime("%Y-%m-%d %H:%M:%S", s);
132
+ //print(s);
133
+ }
134
+
135
+ TEST(StringUtilTest, PathJoin) { // PathJoin must give the same path whether or not the directory ends with '/'.
136
+ const char * path1 = "/home/foo/dir";
137
+ const char * path2 = "file";
138
+ const char * path3 = "/home/foo/dir/";
139
+ const char * path4 = "file";
140
+ const char * answer = "/home/foo/dir/file";
141
+
142
+ ASSERT_EQ(answer, PathJoin(path1, path2)); // directory without trailing slash
143
+ ASSERT_EQ(answer, PathJoin(path3, path4)); // directory with trailing slash
144
+ }
145
+
146
+ TEST(StringUtilTest, JapaneseUnicode) { // Three hiragana characters must decode into exactly three 16-bit units.
147
+ // Japanese
148
+ const char* s = "がんば";
149
+ vector<uint16_t> unicode;
150
+ ASSERT_TRUE(Utf8ToUnicode(s, unicode));
151
+ ASSERT_EQ(3u, unicode.size());
152
+ }
153
+
154
+ TEST(StringUtilTest, RareChinese) { // Decoding into uint16_t must fail for this character and leave the output vector empty.
155
+ // A code point in the range U+10000 – U+10FFFF, i.e. one that does not fit in a single 16-bit unit.
156
+ const char* s = "𪚥";
157
+ vector<uint16_t> unicode;
158
+ ASSERT_FALSE(Utf8ToUnicode(s, unicode));
159
+ ASSERT_EQ(0u, unicode.size());
160
+ }
161
+
162
+ TEST(StringUtilTest, RareChineseUnicode32) { // The same character decodes to one uint32_t and re-encodes to the original UTF-8.
163
+ // A code point in the range U+10000 – U+10FFFF, i.e. one that does not fit in a single 16-bit unit.
164
+ const char* s = "𪚥";
165
+ vector<uint32_t> unicode;
166
+ ASSERT_TRUE(Utf8ToUnicode32(s, unicode));
167
+ ASSERT_EQ(1u, unicode.size());
168
+
169
+ string s2;
170
+ Unicode32ToUtf8(unicode.begin(), unicode.end(), s2);
171
+ ASSERT_EQ(s2, s);
172
+ }
173
+
174
+ TEST(StringUtilTest, Unicode32) { // Mixed ASCII, Chinese and full-width punctuation: 11 characters decode to 11 uint32_t values and round-trip.
175
+ const char* s = "1+1=2你好世界,。";
176
+ vector<uint32_t> unicode;
177
+ ASSERT_TRUE(Utf8ToUnicode32(s, unicode));
178
+ ASSERT_EQ(unicode.size(), 11u);
179
+
180
+ string s2;
181
+ Unicode32ToUtf8(unicode.begin(), unicode.end(), s2);
182
+ ASSERT_EQ(s2, s);
183
+ }
@@ -61,6 +61,15 @@ class DictTrie {
61
61
  return true;
62
62
  }
63
63
 
64
+ bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) { // Asks the trie to delete `word`; returns false only when MakeNodeInfo fails, true otherwise.
65
+ DictUnit node_info;
66
+ if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
67
+ return false;
68
+ }
69
+ trie_->DeleteNode(node_info.word, &node_info); // passes the address of a local; NOTE(review): confirm DeleteNode never stores this pointer
70
+ return true; // returned whether or not the trie actually contained the word
71
+ }
72
+
64
73
  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
65
74
  return trie_->Find(begin, end);
66
75
  }
@@ -76,6 +76,10 @@ class Jieba {
76
76
  return dict_trie_.InsertUserWord(word,freq, tag);
77
77
  }
78
78
 
79
+ bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) { // Forwards to dict_trie_.DeleteUserWord and returns its result.
80
+ return dict_trie_.DeleteUserWord(word, tag);
81
+ }
82
+
79
83
  bool Find(const string& word)
80
84
  {
81
85
  return dict_trie_.Find(word);
@@ -141,7 +141,33 @@ class Trie {
141
141
  assert(ptNode != NULL);
142
142
  ptNode->ptValue = ptValue;
143
143
  }
144
-
144
+ void DeleteNode(const Unicode& key, const DictUnit* ptValue) {
145
+ if (key.begin() == key.end()) {
146
+ return;
147
+ }
148
+ // Unlinks and frees the child of root_ matching the first rune of key; no-op for an empty key, a missing child map or a missing rune. ptValue is not read here. kmIter iterates the current node's child map.
149
+ TrieNode::NextMap::const_iterator kmIter;
150
+ // Start at the root node.
151
+ TrieNode *ptNode = root_;
152
+ for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
153
+ // The current node has no child map, so there is nothing to delete.
154
+ if (NULL == ptNode->next) {
155
+ return;
156
+ }
157
+ kmIter = ptNode->next->find(*citer);
158
+ // The rune is not among the children: leave the loop.
159
+ if (ptNode->next->end() == kmIter) {
160
+ break;
161
+ }
162
+ // Read the child pointer BEFORE erasing: erase() invalidates kmIter, so it must not be dereferenced afterwards.
163
+ TrieNode *ptChild = kmIter->second;
164
+ // Unlink the child, then free it. NOTE(review): the loop always stops after the first rune, so every key sharing that rune is dropped, and whether the child's descendants are freed depends on TrieNode's destructor (not visible here) - confirm.
165
+ ptNode->next->erase(*citer);
166
+ delete ptChild;
167
+ break;
168
+ }
169
+ return;
170
+ }
145
171
  private:
146
172
  void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
147
173
  if (valuePointers.empty() || keys.empty()) {
@@ -1,5 +1,6 @@
1
1
  SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR})
2
2
 
3
- ADD_EXECUTABLE(demo demo.cpp)
4
- ADD_EXECUTABLE(load_test load_test.cpp)
5
- ADD_SUBDIRECTORY(unittest)
3
+ if(NOT MSVC)
4
+ ADD_EXECUTABLE(load_test load_test.cpp)
5
+ ADD_SUBDIRECTORY(unittest)
6
+ endif()
@@ -1,8 +1,21 @@
1
+ if (MSVC)
2
+ set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL")
3
+ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
4
+ endif()
5
+
6
+ include(FetchContent)
7
+
8
+ FetchContent_Declare(
9
+ googletest
10
+ GIT_REPOSITORY https://github.com/google/googletest.git
11
+ GIT_TAG release-1.11.0
12
+ )
13
+ FetchContent_MakeAvailable(googletest)
14
+
15
+
1
16
  SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/test)
2
17
  SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
3
18
 
4
- INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/deps/gtest/include)
5
-
6
19
  ADD_DEFINITIONS(-DLOGGING_LEVEL=LL_WARNING)
7
20
 
8
21
  ADD_EXECUTABLE(test.run
@@ -17,8 +30,4 @@ ADD_EXECUTABLE(test.run
17
30
  textrank_test.cpp
18
31
  )
19
32
 
20
- if(MSVC)
21
- TARGET_LINK_LIBRARIES(test.run gtest)
22
- else()
23
- TARGET_LINK_LIBRARIES(test.run gtest pthread)
24
- endif()
33
+ TARGET_LINK_LIBRARIES(test.run gtest)
@@ -1,5 +1,7 @@
1
- require "mkmf"
2
- abs = File.expand_path File.dirname(__FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ require 'mkmf'
4
+ abs = __dir__
3
5
 
4
6
  LIBDIR = RbConfig::CONFIG['libdir']
5
7
  INCLUDEDIR = RbConfig::CONFIG['includedir']
@@ -7,17 +9,20 @@ INCLUDEDIR = RbConfig::CONFIG['includedir']
7
9
  HEADER_DIRS = [
8
10
  INCLUDEDIR,
9
11
  "#{abs}/../cppjieba/include",
10
- "#{abs}/../cppjieba/deps"
11
- ]
12
+ "#{abs}/../cppjieba/deps/limonp/include"
13
+ ].freeze
12
14
 
13
15
  LIB_DIRS = [
14
16
  LIBDIR
15
- ]
17
+ ].freeze
16
18
 
17
19
  dir_config('cppjieba_rb', HEADER_DIRS, LIB_DIRS)
18
20
 
19
- CONFIG["CXXFLAGS"] += " -std=c++11 -O3"
21
+ # rubocop:disable Style/GlobalVars
22
+ CONFIG['CXXFLAGS'] += ' -std=c++11 -O3'
20
23
  $CXXFLAGS = "#{$CXXFLAGS} -std=c++11 -O3"
24
+ # rubocop:enable Style/GlobalVars
25
+
21
26
  create_makefile 'cppjieba_rb/cppjieba_rb'
22
27
  # respect header changes
23
28
  headers = Dir.glob('*.{hpp,h}').join ' '
@@ -1,4 +1,7 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module CppjiebaRb
4
+ # Sentence segmentation
2
5
  class Segment
3
6
  VALID_MODES = %i[mix hmm mp query full].freeze
4
7
 
@@ -17,4 +20,4 @@ module CppjiebaRb
17
20
  CppjiebaRb.internal.segment(str, @mode, @max_word_length, @hmm)
18
21
  end
19
22
  end
20
- end
23
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module CppjiebaRb
2
- VERSION = '0.4.2'
4
+ VERSION = '0.4.4'
3
5
  end
data/lib/cppjieba_rb.rb CHANGED
@@ -1,7 +1,14 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'cppjieba_rb/cppjieba_rb'
2
4
  require 'cppjieba_rb/version'
3
5
  require 'cppjieba_rb/segment'
4
6
 
7
+ # CppjiebaRb segments a Chinese sentence into words.
8
+ #
9
+ # Available segmentation methods include HMM, MP, and mix mode.
10
+ # Dictionaries play a major part in CppjiebaRb's accuracy.
11
+ # Read more https://github.com/yanyiwu/cppjieba
5
12
  module CppjiebaRb
6
13
  EXT_BASE = File.join(File.dirname(__FILE__), '..', 'ext', 'cppjieba', 'dict')
7
14
  DICT_PATH = File.join(EXT_BASE, 'jieba.dict.utf8')
@@ -28,11 +35,11 @@ module CppjiebaRb
28
35
 
29
36
  class << self
30
37
  def internal
31
- @backend ||= CppjiebaRb::Internal.new(DICT_PATH,
32
- HMM_DICT_PATH,
33
- USER_DICT,
34
- IDF_PATH,
35
- STOP_WORD_PATH)
38
+ @internal ||= CppjiebaRb::Internal.new(DICT_PATH,
39
+ HMM_DICT_PATH,
40
+ USER_DICT,
41
+ IDF_PATH,
42
+ STOP_WORD_PATH)
36
43
  end
37
44
  end
38
45
  end
data/test/test_keyword.rb CHANGED
@@ -1,17 +1,17 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  require 'minitest/autorun'
3
4
  require 'cppjieba_rb'
4
5
 
5
6
  class JiebaTest < Minitest::Test
6
7
  def test_keywords
7
- results = CppjiebaRb.extract_keyword "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 5
8
+ results = CppjiebaRb.extract_keyword '我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。', 5
8
9
 
9
- assert_equal [["CEO",
10
+ assert_equal [['CEO',
10
11
  11.739204307083542],
11
- ["升职", 10.8561552143],
12
- ["加薪", 10.642581114],
13
- ["手扶拖拉机", 10.0088573539],
14
- ["巅峰", 9.49395840471]], results
15
-
12
+ ['升职', 10.8561552143],
13
+ ['加薪', 10.642581114],
14
+ ['手扶拖拉机', 10.0088573539],
15
+ ['巅峰', 9.49395840471]], results
16
16
  end
17
17
  end
data/test/test_segment.rb CHANGED
@@ -1,24 +1,28 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  require 'minitest/autorun'
3
4
  require 'cppjieba_rb'
4
5
 
5
6
  class JiebaTest < Minitest::Test
6
7
  def test_mix_segment
7
- words = CppjiebaRb.segment "我来到南京市长江大桥"
8
- assert_equal %w(我 来到 南京市 长江大桥), words
8
+ words = CppjiebaRb.segment '我来到南京市长江大桥'
9
+
10
+ assert_equal %w[我 来到 南京市 长江大桥], words
11
+
12
+ words = CppjiebaRb.segment '令狐冲是云计算行业的专家'
9
13
 
10
- words = CppjiebaRb.segment "令狐冲是云计算行业的专家"
11
- assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
14
+ assert_equal %w[令狐冲 云计算 行业 的 专家], words
12
15
  end
13
16
 
14
17
  def test_hmm_segment
15
- words = CppjiebaRb.segment "令狐冲是云计算行业的专家", mode: :hmm
16
- assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
18
+ words = CppjiebaRb.segment '令狐冲是云计算行业的专家', mode: :hmm
19
+
20
+ assert_equal %w[令狐冲 是 云计算 行业 的 专家], words
17
21
  end
18
22
 
19
23
  def test_max_prob_segment
20
- words = CppjiebaRb.segment "令狐冲是云计算行业的专家", mode: :mp
21
- assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
22
- end
24
+ words = CppjiebaRb.segment '令狐冲是云计算行业的专家', mode: :mp
23
25
 
26
+ assert_equal %w[令狐冲 是 云计算 行业 的 专家], words
27
+ end
24
28
  end
@@ -1,10 +1,12 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  require 'minitest/autorun'
3
4
  require 'cppjieba_rb'
4
5
 
5
6
  class JiebaTest < Minitest::Test
6
7
  def test_filter
7
- words = CppjiebaRb.filter_stop_word %w(令狐冲 是 云计算 行业 的 专家)
8
- assert_equal %w(令狐冲 云计算 行业 专家), words
8
+ words = CppjiebaRb.filter_stop_word %w[令狐冲 是 云计算 行业 的 专家]
9
+
10
+ assert_equal %w[令狐冲 云计算 行业 专家], words
9
11
  end
10
12
  end
data/test/test_tagging.rb CHANGED
@@ -1,9 +1,12 @@
1
- # coding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  require 'minitest/autorun'
3
4
  require 'cppjieba_rb'
5
+
4
6
  class JiebaTest < Minitest::Test
5
7
  def test_tagging
6
8
  pairs = CppjiebaRb.segment_tag '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
9
+
7
10
  assert_equal({ '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
8
11
  '拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
9
12
  '的' => 'uj', '。' => 'x' }, pairs)
@@ -11,9 +14,9 @@ class JiebaTest < Minitest::Test
11
14
 
12
15
  def test_tagging_with_user_dict
13
16
  pairs = CppjiebaRb.segment_tag '我是蓝翔技工拖拉机学院手扶拖拉机专业的。'
17
+
14
18
  assert_equal({ '我' => 'r', '是' => 'v', '蓝翔' => 'nz', '技工' => 'n',
15
19
  '拖拉机' => 'n', '学院' => 'n', '手扶拖拉机' => 'n', '专业' => 'n',
16
20
  '的' => 'uj', '。' => 'x' }, pairs)
17
21
  end
18
-
19
22
  end