nodejieba-plus 3.5.13 → 3.5.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/nodejieba.cpp CHANGED
@@ -7,6 +7,7 @@
7
7
  #include "cppjieba/TextRankExtractor.hpp"
8
8
 
9
9
  #include <sstream>
10
+ #include <cctype>
10
11
 
11
12
  NodeJieba::NodeJieba(Napi::Env env, Napi::Object exports) {
12
13
  DefineAddon(exports, {
@@ -20,7 +21,8 @@ NodeJieba::NodeJieba(Napi::Env env, Napi::Object exports) {
20
21
  InstanceMethod("extract", &NodeJieba::extract),
21
22
  InstanceMethod("textRankExtract", &NodeJieba::textRankExtract),
22
23
  InstanceMethod("insertWord", &NodeJieba::insertWord),
23
- InstanceMethod("loadUserDict", &NodeJieba::loadUserDict)
24
+ InstanceMethod("loadUserDict", &NodeJieba::loadUserDict),
25
+ InstanceMethod("setIdf", &NodeJieba::setIdf)
24
26
  });
25
27
  }
26
28
 
@@ -52,6 +54,44 @@ Napi::Value NodeJieba::load(const Napi::CallbackInfo& info) {
52
54
  return Napi::Boolean::New(info.Env(), true);
53
55
  }
54
56
 
57
+ Napi::Value NodeJieba::setIdf(const Napi::CallbackInfo& info) {
58
+ if (info.Length() < 1) {
59
+ return Napi::Boolean::New(info.Env(), false);
60
+ }
61
+
62
+ if (!_jieba_handle) {
63
+ Napi::Error::New(info.Env(), "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
64
+ return Napi::Boolean::New(info.Env(), false);
65
+ }
66
+
67
+ std::string word;
68
+ double idf = 0.0;
69
+ double multiplier = 2.0;
70
+
71
+ if (info[0].IsString()) {
72
+ word = info[0].As<Napi::String>().Utf8Value();
73
+ } else {
74
+ return Napi::Boolean::New(info.Env(), false);
75
+ }
76
+
77
+ if (info.Length() >= 2) {
78
+ if (info[1].IsNumber()) {
79
+ idf = info[1].As<Napi::Number>().DoubleValue();
80
+ _jieba_handle->SetIdfForWord(word, idf);
81
+ return Napi::Boolean::New(info.Env(), true);
82
+ }
83
+ }
84
+
85
+ if (info.Length() >= 3) {
86
+ if (info[2].IsNumber()) {
87
+ multiplier = info[2].As<Napi::Number>().DoubleValue();
88
+ }
89
+ }
90
+
91
+ _jieba_handle->SetIdfForWordWithMultiplier(word, multiplier);
92
+ return Napi::Boolean::New(info.Env(), true);
93
+ }
94
+
55
95
  Napi::Value NodeJieba::insertWord(const Napi::CallbackInfo& info) {
56
96
  if(info.Length() < 1) {
57
97
  return Napi::Boolean::New(info.Env(), false);
@@ -229,43 +269,135 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
229
269
  Napi::Error::New(info.Env(), "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
230
270
  }
231
271
 
232
- // 支持传入字符串数组、单个字符串或 Buffer
272
+ auto isBlankString = [](const std::string& str) -> bool {
273
+ for (char c : str) {
274
+ if (!std::isspace(static_cast<unsigned char>(c))) {
275
+ return false;
276
+ }
277
+ }
278
+ return true;
279
+ };
280
+
281
+ auto trimString = [](std::string& str) -> void {
282
+ size_t start = 0;
283
+ size_t end = str.length();
284
+
285
+ while (start < end && std::isspace(static_cast<unsigned char>(str[start]))) {
286
+ start++;
287
+ }
288
+
289
+ while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
290
+ end--;
291
+ }
292
+
293
+ str = str.substr(start, end - start);
294
+ };
295
+
296
+ auto extractKeyword = [](const std::string& line) -> std::string {
297
+ std::istringstream iss(line);
298
+ std::vector<std::string> parts;
299
+ std::string part;
300
+
301
+ while (iss >> part) {
302
+ parts.push_back(part);
303
+ }
304
+
305
+ if (parts.empty()) {
306
+ return "";
307
+ }
308
+
309
+ if (parts.size() == 1) {
310
+ return parts[0];
311
+ }
312
+
313
+ if (parts.size() == 2) {
314
+ size_t pos;
315
+ try {
316
+ std::stoi(parts[1], &pos);
317
+ if (pos == parts[1].length()) {
318
+ return parts[0];
319
+ }
320
+ } catch (...) {
321
+ }
322
+ return parts[0];
323
+ }
324
+
325
+ if (parts.size() >= 3) {
326
+ size_t pos;
327
+ try {
328
+ std::stoi(parts[parts.size() - 2], &pos);
329
+ if (pos == parts[parts.size() - 2].length()) {
330
+ std::string keyword;
331
+ for (size_t i = 0; i < parts.size() - 2; i++) {
332
+ if (i > 0) keyword += " ";
333
+ keyword += parts[i];
334
+ }
335
+ return keyword;
336
+ }
337
+ } catch (...) {
338
+ }
339
+
340
+ std::string keyword;
341
+ for (size_t i = 0; i < parts.size() - 1; i++) {
342
+ if (i > 0) keyword += " ";
343
+ keyword += parts[i];
344
+ }
345
+ return keyword;
346
+ }
347
+
348
+ return parts[0];
349
+ };
350
+
351
+ auto setDefaultIdf = [&](const std::vector<std::string>& dictLines) {
352
+ for (const auto& line : dictLines) {
353
+ std::string keyword = extractKeyword(line);
354
+ if (!keyword.empty()) {
355
+ _jieba_handle->SetIdfForWordWithMultiplier(keyword, 1.3);
356
+ }
357
+ }
358
+ };
359
+
233
360
  if (info[0].IsArray()) {
234
361
  Napi::Array arr = info[0].As<Napi::Array>();
235
362
  std::vector<std::string> buf;
236
363
  for (size_t i = 0; i < arr.Length(); i++) {
237
364
  Napi::Value val = arr[i];
238
- if (val.IsString()) {
239
- std::string line = val.As<Napi::String>().Utf8Value();
240
- // 过滤空字符串,避免断言失败
241
- if (!line.empty()) {
242
- buf.push_back(line);
243
- }
365
+ if (!val.IsString()) {
366
+ Napi::TypeError::New(info.Env(), "Array elements must be strings")
367
+ .ThrowAsJavaScriptException();
368
+ return Napi::Boolean::New(info.Env(), false);
369
+ }
370
+ std::string line = val.As<Napi::String>().Utf8Value();
371
+ trimString(line);
372
+ if (!line.empty() && !isBlankString(line)) {
373
+ buf.push_back(line);
244
374
  }
245
375
  }
246
376
  _jieba_handle->LoadUserDict(buf);
377
+ setDefaultIdf(buf);
247
378
  } else if (info[0].IsString()) {
248
- // 支持传入单个词典条目字符串
249
379
  std::string line = info[0].As<Napi::String>().Utf8Value();
380
+ trimString(line);
250
381
  std::vector<std::string> buf;
251
- // 过滤空字符串
252
- if (!line.empty()) {
382
+ if (!line.empty() && !isBlankString(line)) {
253
383
  buf.push_back(line);
254
384
  _jieba_handle->LoadUserDict(buf);
385
+ setDefaultIdf(buf);
255
386
  }
256
387
  } else if (info[0].IsBuffer()) {
257
- // 支持传入 Buffer,将其转换为字符串并按行分割
258
388
  Napi::Buffer<char> buffer = info[0].As<Napi::Buffer<char>>();
259
389
  std::string content(buffer.Data(), buffer.Length());
260
390
  std::vector<std::string> buf;
261
391
  std::istringstream iss(content);
262
392
  std::string line;
263
393
  while (std::getline(iss, line)) {
264
- if (!line.empty()) {
394
+ trimString(line);
395
+ if (!line.empty() && !isBlankString(line)) {
265
396
  buf.push_back(line);
266
397
  }
267
398
  }
268
399
  _jieba_handle->LoadUserDict(buf);
400
+ setDefaultIdf(buf);
269
401
  } else {
270
402
  return Napi::Boolean::New(info.Env(), false);
271
403
  }
package/lib/nodejieba.h CHANGED
@@ -21,6 +21,7 @@ private:
21
21
  Napi::Value textRankExtract(const Napi::CallbackInfo& info);
22
22
  Napi::Value insertWord(const Napi::CallbackInfo& info);
23
23
  Napi::Value loadUserDict(const Napi::CallbackInfo& info);
24
+ Napi::Value setIdf(const Napi::CallbackInfo& info);
24
25
 
25
26
  cppjieba::Jieba* _jieba_handle{nullptr};
26
27
  cppjieba::TextRankExtractor* _text_rank_extractor_handle{nullptr};
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "nodejieba-plus",
3
3
  "description": "chinese word segmentation for node",
4
- "version": "3.5.13",
4
+ "version": "3.5.17",
5
5
  "author": "Yanyi Wu <wuyanyi09@foxmail.com>",
6
6
  "maintainers": [
7
7
  "Yanyi Wu <wuyanyi09@foxmail.com>"
@@ -10,6 +10,7 @@
10
10
  #include <stdint.h>
11
11
  #include <cmath>
12
12
  #include <limits>
13
+ #include <algorithm>
13
14
  #include "limonp/StringUtil.hpp"
14
15
  #include "limonp/Logging.hpp"
15
16
  #include "Unicode.hpp"
@@ -32,7 +33,7 @@ class DictTrie {
32
33
  WordWeightMax,
33
34
  }; // enum UserWordWeightOption
34
35
 
35
- DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
36
+ DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) : trie_(NULL) {
36
37
  Init(dict_path, user_dict_paths, user_word_weight_opt);
37
38
  }
38
39
 
@@ -41,23 +42,84 @@ class DictTrie {
41
42
  }
42
43
 
43
44
  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
44
- DictUnit node_info;
45
- if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
46
- return false;
45
+ std::set<string> insertedWords;
46
+ insertedWords.insert(word);
47
+
48
+ bool hasSpace = (word.find(' ') != string::npos);
49
+ if (hasSpace) {
50
+ string wordNoSpace = word;
51
+ wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
52
+ if (!wordNoSpace.empty() && wordNoSpace != word) {
53
+ insertedWords.insert(wordNoSpace);
54
+ }
55
+ }
56
+
57
+ string wordLower = ToLowerString(word);
58
+ if (wordLower != word) {
59
+ insertedWords.insert(wordLower);
60
+ }
61
+
62
+ if (hasSpace) {
63
+ string wordNoSpace = word;
64
+ wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
65
+ if (!wordNoSpace.empty()) {
66
+ string wordNoSpaceLower = ToLowerString(wordNoSpace);
67
+ if (wordNoSpaceLower != wordNoSpace) {
68
+ insertedWords.insert(wordNoSpaceLower);
69
+ }
70
+ }
71
+ }
72
+
73
+ for (std::set<string>::const_iterator it = insertedWords.begin(); it != insertedWords.end(); ++it) {
74
+ DictUnit node_info;
75
+ if (!MakeNodeInfo(node_info, *it, user_word_default_weight_, tag)) {
76
+ continue;
77
+ }
78
+ active_node_infos_.push_back(node_info);
79
+ trie_->InsertNode(node_info.word, &active_node_infos_.back());
47
80
  }
48
- active_node_infos_.push_back(node_info);
49
- trie_->InsertNode(node_info.word, &active_node_infos_.back());
50
81
  return true;
51
82
  }
52
83
 
53
84
  bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
54
- DictUnit node_info;
55
- double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
56
- if (!MakeNodeInfo(node_info, word, weight , tag)) {
57
- return false;
85
+ double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_;
86
+
87
+ std::set<string> insertedWords;
88
+ insertedWords.insert(word);
89
+
90
+ bool hasSpace = (word.find(' ') != string::npos);
91
+ if (hasSpace) {
92
+ string wordNoSpace = word;
93
+ wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
94
+ if (!wordNoSpace.empty() && wordNoSpace != word) {
95
+ insertedWords.insert(wordNoSpace);
96
+ }
97
+ }
98
+
99
+ string wordLower = ToLowerString(word);
100
+ if (wordLower != word) {
101
+ insertedWords.insert(wordLower);
102
+ }
103
+
104
+ if (hasSpace) {
105
+ string wordNoSpace = word;
106
+ wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
107
+ if (!wordNoSpace.empty()) {
108
+ string wordNoSpaceLower = ToLowerString(wordNoSpace);
109
+ if (wordNoSpaceLower != wordNoSpace) {
110
+ insertedWords.insert(wordNoSpaceLower);
111
+ }
112
+ }
113
+ }
114
+
115
+ for (std::set<string>::const_iterator it = insertedWords.begin(); it != insertedWords.end(); ++it) {
116
+ DictUnit node_info;
117
+ if (!MakeNodeInfo(node_info, *it, weight, tag)) {
118
+ continue;
119
+ }
120
+ active_node_infos_.push_back(node_info);
121
+ trie_->InsertNode(node_info.word, &active_node_infos_.back());
58
122
  }
59
- active_node_infos_.push_back(node_info);
60
- trie_->InsertNode(node_info.word, &active_node_infos_.back());
61
123
  return true;
62
124
  }
63
125
 
@@ -112,26 +174,93 @@ class DictTrie {
112
174
  vector<string> buf;
113
175
  DictUnit node_info;
114
176
  Split(line, buf, " ");
115
- if(buf.size() == 1){
116
- MakeNodeInfo(node_info,
117
- buf[0],
118
- user_word_default_weight_,
119
- UNKNOWN_TAG);
120
- } else if (buf.size() == 2) {
121
- MakeNodeInfo(node_info,
122
- buf[0],
123
- user_word_default_weight_,
124
- buf[1]);
125
- } else if (buf.size() == 3) {
126
- int freq = atoi(buf[1].c_str());
127
- assert(freq_sum_ > 0.0);
128
- double weight = log(1.0 * freq / freq_sum_);
129
- MakeNodeInfo(node_info, buf[0], weight, buf[2]);
177
+
178
+ string word;
179
+ string tag = UNKNOWN_TAG;
180
+ double weight = user_word_default_weight_;
181
+ bool hasSpace = false;
182
+
183
+ if (buf.size() == 1) {
184
+ word = buf[0];
185
+ } else if (buf.size() == 2) {
186
+ int freq = atoi(buf[1].c_str());
187
+ if (freq > 0) {
188
+ assert(freq_sum_ > 0.0);
189
+ weight = log(1.0 * freq / freq_sum_);
190
+ word = buf[0];
191
+ } else {
192
+ word = line;
193
+ }
194
+ } else if (buf.size() >= 3) {
195
+ bool isFreq = true;
196
+ for (char c : buf[buf.size() - 2]) {
197
+ if (!isdigit(c)) {
198
+ isFreq = false;
199
+ break;
130
200
  }
131
- static_node_infos_.push_back(node_info);
132
- if (node_info.word.size() == 1) {
133
- user_dict_single_chinese_word_.insert(node_info.word[0]);
201
+ }
202
+
203
+ if (isFreq) {
204
+ int freq = atoi(buf[buf.size() - 2].c_str());
205
+ assert(freq_sum_ > 0.0);
206
+ weight = log(1.0 * freq / freq_sum_);
207
+ for (size_t i = 0; i < buf.size() - 2; ++i) {
208
+ if (i > 0) word += " ";
209
+ word += buf[i];
210
+ }
211
+ tag = buf[buf.size() - 1];
212
+ } else {
213
+ word = line;
214
+ }
215
+ }
216
+
217
+ hasSpace = (word.find(' ') != string::npos);
218
+
219
+ std::set<string> insertedWords;
220
+
221
+ insertedWords.insert(word);
222
+
223
+ if (hasSpace) {
224
+ string wordNoSpace = word;
225
+ wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
226
+ if (!wordNoSpace.empty() && wordNoSpace != word) {
227
+ insertedWords.insert(wordNoSpace);
228
+ }
229
+ }
230
+
231
+ string wordLower = ToLowerString(word);
232
+ if (wordLower != word) {
233
+ insertedWords.insert(wordLower);
234
+ }
235
+
236
+ if (hasSpace) {
237
+ string wordNoSpace = word;
238
+ wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
239
+ if (!wordNoSpace.empty()) {
240
+ string wordNoSpaceLower = ToLowerString(wordNoSpace);
241
+ if (wordNoSpaceLower != wordNoSpace) {
242
+ insertedWords.insert(wordNoSpaceLower);
243
+ }
244
+ }
245
+ }
246
+
247
+ for (std::set<string>::const_iterator it = insertedWords.begin(); it != insertedWords.end(); ++it) {
248
+ DictUnit temp_node_info;
249
+ if (MakeNodeInfo(temp_node_info, *it, weight, tag)) {
250
+ if (trie_) {
251
+ active_node_infos_.push_back(temp_node_info);
252
+ trie_->InsertNode(active_node_infos_.back().word, &active_node_infos_.back());
253
+ if (active_node_infos_.back().word.size() == 1) {
254
+ user_dict_single_chinese_word_.insert(active_node_infos_.back().word[0]);
255
+ }
256
+ } else {
257
+ static_node_infos_.push_back(temp_node_info);
258
+ if (temp_node_info.word.size() == 1) {
259
+ user_dict_single_chinese_word_.insert(temp_node_info.word[0]);
260
+ }
134
261
  }
262
+ }
263
+ }
135
264
  }
136
265
 
137
266
  void LoadUserDict(const vector<string>& buf) {
@@ -206,6 +335,16 @@ class DictTrie {
206
335
  return true;
207
336
  }
208
337
 
338
+ bool MakeNodeInfo(DictUnit& node_info,
339
+ const Unicode& word,
340
+ double weight,
341
+ const string& tag) {
342
+ node_info.word = word;
343
+ node_info.weight = weight;
344
+ node_info.tag = tag;
345
+ return true;
346
+ }
347
+
209
348
  void LoadDict(const string& filePath) {
210
349
  ifstream ifs(filePath.c_str());
211
350
  XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
@@ -116,6 +116,14 @@ class Jieba {
116
116
  dict_trie_.LoadUserDict(path);
117
117
  }
118
118
 
119
+ void SetIdfForWord(const string& word, double idf) {
120
+ extractor.SetIdf(word, idf);
121
+ }
122
+
123
+ void SetIdfForWordWithMultiplier(const string& word, double multiplier = 2.0) {
124
+ extractor.SetIdfWithMultiplier(word, multiplier);
125
+ }
126
+
119
127
  private:
120
128
  static string pathJoin(const string& dir, const string& filename) {
121
129
  if (dir.empty()) {
@@ -39,6 +39,19 @@ class KeywordExtractor {
39
39
  ~KeywordExtractor() {
40
40
  }
41
41
 
42
+ void SetIdf(const string& word, double idf) {
43
+ idfMap_[word] = idf;
44
+ }
45
+
46
+ void SetIdfWithMultiplier(const string& word, double multiplier = 2.0) {
47
+ unordered_map<string, double>::const_iterator cit = idfMap_.find(word);
48
+ if (cit != idfMap_.end()) {
49
+ idfMap_[word] = cit->second * multiplier;
50
+ } else {
51
+ idfMap_[word] = idfAverage_ * multiplier;
52
+ }
53
+ }
54
+
42
55
  void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
43
56
  vector<Word> topWords;
44
57
  Extract(sentence, topWords, topN);
@@ -96,25 +109,33 @@ class KeywordExtractor {
96
109
  ifstream ifs(idfPath.c_str());
97
110
  XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
98
111
  string line ;
99
- vector<string> buf;
100
112
  double idf = 0.0;
101
113
  double idfSum = 0.0;
102
114
  size_t lineno = 0;
103
115
  for (; getline(ifs, line); lineno++) {
104
- buf.clear();
105
116
  if (line.empty()) {
106
117
  XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
107
118
  continue;
108
119
  }
109
- Split(line, buf, " ");
110
- if (buf.size() != 2) {
111
- XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
120
+
121
+ size_t lastSpace = line.find_last_of(" \t");
122
+ if (lastSpace == string::npos) {
123
+ XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " format error. skipped.";
124
+ continue;
125
+ }
126
+
127
+ string word = line.substr(0, lastSpace);
128
+ string idfStr = line.substr(lastSpace + 1);
129
+
130
+ char* endptr;
131
+ idf = strtod(idfStr.c_str(), &endptr);
132
+ if (endptr == idfStr.c_str()) {
133
+ XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " idf format error. skipped.";
112
134
  continue;
113
135
  }
114
- idf = atof(buf[1].c_str());
115
- idfMap_[buf[0]] = idf;
136
+
137
+ idfMap_[word] = idf;
116
138
  idfSum += idf;
117
-
118
139
  }
119
140
 
120
141
  assert(lineno);
@@ -8,7 +8,7 @@
8
8
 
9
9
  namespace cppjieba {
10
10
 
11
- const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
11
+ const char* const SPECIAL_SEPARATORS = "\t\n\xEF\xBC\x8C\xE3\x80\x82";
12
12
 
13
13
  using namespace limonp;
14
14
 
@@ -69,7 +69,8 @@ class Trie {
69
69
  if (NULL == ptNode->next) {
70
70
  return NULL;
71
71
  }
72
- citer = ptNode->next->find(it->rune);
72
+ Rune searchRune = ToLowerRune(it->rune);
73
+ citer = ptNode->next->find(searchRune);
73
74
  if (ptNode->next->end() == citer) {
74
75
  return NULL;
75
76
  }
@@ -90,7 +91,7 @@ class Trie {
90
91
  for (size_t i = 0; i < size_t(end - begin); i++) {
91
92
  res[i].runestr = *(begin + i);
92
93
 
93
- if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
94
+ if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(ToLowerRune(res[i].runestr.rune)))) {
94
95
  ptNode = citer->second;
95
96
  } else {
96
97
  ptNode = NULL;
@@ -105,7 +106,7 @@ class Trie {
105
106
  if (ptNode == NULL || ptNode->next == NULL) {
106
107
  break;
107
108
  }
108
- citer = ptNode->next->find((begin + j)->rune);
109
+ citer = ptNode->next->find(ToLowerRune((begin + j)->rune));
109
110
  if (ptNode->next->end() == citer) {
110
111
  break;
111
112
  }
@@ -128,11 +129,12 @@ class Trie {
128
129
  if (NULL == ptNode->next) {
129
130
  ptNode->next = new TrieNode::NextMap;
130
131
  }
131
- kmIter = ptNode->next->find(*citer);
132
+ Rune insertRune = ToLowerRune(*citer);
133
+ kmIter = ptNode->next->find(insertRune);
132
134
  if (ptNode->next->end() == kmIter) {
133
135
  TrieNode *nextNode = new TrieNode;
134
136
 
135
- ptNode->next->insert(make_pair(*citer, nextNode));
137
+ ptNode->next->insert(make_pair(insertRune, nextNode));
136
138
  ptNode = nextNode;
137
139
  } else {
138
140
  ptNode = kmIter->second;
@@ -145,23 +147,18 @@ class Trie {
145
147
  if (key.begin() == key.end()) {
146
148
  return;
147
149
  }
148
- //定义一个NextMap迭代器
149
150
  TrieNode::NextMap::const_iterator kmIter;
150
- //定义一个指向root的TrieNode指针
151
151
  TrieNode *ptNode = root_;
152
152
  for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
153
- //链表不存在元素
154
153
  if (NULL == ptNode->next) {
155
154
  return;
156
155
  }
157
- kmIter = ptNode->next->find(*citer);
158
- //如果map中不存在,跳出循环
156
+ Rune deleteRune = ToLowerRune(*citer);
157
+ kmIter = ptNode->next->find(deleteRune);
159
158
  if (ptNode->next->end() == kmIter) {
160
159
  break;
161
160
  }
162
- //从unordered_map中擦除该项
163
- ptNode->next->erase(*citer);
164
- //删除该node
161
+ ptNode->next->erase(deleteRune);
165
162
  ptNode = kmIter->second;
166
163
  delete ptNode;
167
164
  break;
@@ -222,6 +222,58 @@ inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs)
222
222
  }
223
223
  }
224
224
 
225
// Maps an ASCII uppercase rune ('A'..'Z') to its lowercase counterpart; every
// other rune is returned unchanged.
inline Rune ToLowerRune(Rune r) {
  const bool is_ascii_upper = (r >= 'A' && r <= 'Z');
  return is_ascii_upper ? r + ('a' - 'A') : r;
}
231
+
232
// Maps an ASCII lowercase rune ('a'..'z') to its uppercase counterpart; every
// other rune is returned unchanged.
inline Rune ToUpperRune(Rune r) {
  const bool is_ascii_lower = (r >= 'a' && r <= 'z');
  return is_ascii_lower ? r - ('a' - 'A') : r;
}
238
+
239
+ inline Unicode ToLowerUnicode(const Unicode& unicode) {
240
+ Unicode result;
241
+ result.reserve(unicode.size());
242
+ for (size_t i = 0; i < unicode.size(); i++) {
243
+ result.push_back(ToLowerRune(unicode[i]));
244
+ }
245
+ return result;
246
+ }
247
+
248
+ inline Unicode ToUpperUnicode(const Unicode& unicode) {
249
+ Unicode result;
250
+ result.reserve(unicode.size());
251
+ for (size_t i = 0; i < unicode.size(); i++) {
252
+ result.push_back(ToUpperRune(unicode[i]));
253
+ }
254
+ return result;
255
+ }
256
+
257
// Returns a copy of `s` with the bytes 'A'..'Z' lowercased. All other bytes,
// including those of multi-byte UTF-8 sequences, are copied unchanged.
inline string ToLowerString(const string& s) {
  string lowered;
  lowered.reserve(s.size());
  for (string::const_iterator it = s.begin(); it != s.end(); ++it) {
    const char ch = *it;
    const bool is_ascii_upper = (ch >= 'A' && ch <= 'Z');
    lowered.push_back(is_ascii_upper ? static_cast<char>(ch + ('a' - 'A')) : ch);
  }
  return lowered;
}
266
+
267
// Returns a copy of `s` with the bytes 'a'..'z' uppercased. All other bytes,
// including those of multi-byte UTF-8 sequences, are copied unchanged.
inline string ToUpperString(const string& s) {
  string raised;
  raised.reserve(s.size());
  for (string::const_iterator it = s.begin(); it != s.end(); ++it) {
    const char ch = *it;
    const bool is_ascii_lower = (ch >= 'a' && ch <= 'z');
    raised.push_back(is_ascii_lower ? static_cast<char>(ch - ('a' - 'A')) : ch);
  }
  return raised;
}
276
+
225
277
  } // namespace cppjieba
226
278
 
227
279
  #endif // CPPJIEBA_UNICODE_H