nodejieba-plus 3.5.12 → 3.5.16

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
package/README.md CHANGED
@@ -1,38 +1,38 @@
- [![Build Status](https://github.com/yanyiwu/nodejieba/actions/workflows/test.yml/badge.svg)](https://github.com/yanyiwu/nodejieba/actions/workflows/test.yml)
- [![Financial Contributors on Open Collective](https://opencollective.com/nodejieba/all/badge.svg?label=financial+contributors)](https://opencollective.com/nodejieba) [![Author](https://img.shields.io/badge/author-@yanyiwu-blue.svg?style=flat)](https://github.com/yanyiwu/)
- [![Platform](https://img.shields.io/badge/platform-Linux,macOS,Windows-green.svg?style=flat)](https://github.com/yanyiwu/nodejieba)
- [![Performance](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat)](https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-06-14-jieba-series-performance-test.md)
- [![License](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat)](http://yanyiwu.mit-license.org)
- [![NpmDownload Status](http://img.shields.io/npm/dm/nodejieba.svg)](https://www.npmjs.org/package/nodejieba)
- [![NPM Version](https://img.shields.io/npm/v/nodejieba.svg?style=flat)](https://www.npmjs.org/package/nodejieba)
- [![Code Climate](https://codeclimate.com/github/yanyiwu/nodejieba/badges/gpa.svg)](https://codeclimate.com/github/yanyiwu/nodejieba)
+ [!\[Build Status\](https://github.com/yanyiwu/nodejieba/actions/workflows/test.yml/badge.svg null)](https://github.com/yanyiwu/nodejieba/actions/workflows/test.yml)
+ [!\[Financial Contributors on Open Collective\](https://opencollective.com/nodejieba/all/badge.svg?label=financial+contributors null)](https://opencollective.com/nodejieba) [!\[Author\](https://img.shields.io/badge/author-@yanyiwu-blue.svg?style=flat null)](https://github.com/yanyiwu/)
+ [!\[Platform\](https://img.shields.io/badge/platform-Linux,macOS,Windows-green.svg?style=flat null)](https://github.com/yanyiwu/nodejieba)
+ [!\[Performance\](https://img.shields.io/badge/performance-excellent-brightgreen.svg?style=flat null)](https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-06-14-jieba-series-performance-test.md)
+ [!\[License\](https://img.shields.io/badge/license-MIT-yellow.svg?style=flat null)](http://yanyiwu.mit-license.org)
+ [!\[NpmDownload Status\](http://img.shields.io/npm/dm/nodejieba.svg null)](https://www.npmjs.org/package/nodejieba)
+ [!\[NPM Version\](https://img.shields.io/npm/v/nodejieba.svg?style=flat null)](https://www.npmjs.org/package/nodejieba)
+ [!\[Code Climate\](https://codeclimate.com/github/yanyiwu/nodejieba/badges/gpa.svg null)](https://codeclimate.com/github/yanyiwu/nodejieba)
 
- - - -
+ ***
 
  # NodeJieba: Node.js version of the "Jieba" Chinese word segmenter
 
- ## Introduction
+ ## Introduction
 
  `NodeJieba` is the Node.js implementation of the "Jieba" Chinese word segmenter;
- the underlying segmentation algorithms are provided by [CppJieba],
+ the underlying segmentation algorithms are provided by [CppJieba](https://github.com/yanyiwu/cppjieba.git),
  making it a Node.js Chinese segmentation component that combines high performance with ease of use.
 
  ## Features
 
- + Flexible dictionary loading: usable without configuring any dictionary path, and custom dictionary paths can be configured when needed.
- + The underlying algorithms are implemented in C++ and perform well.
- + Multiple segmentation algorithms are supported; see the README.md of [CppJieba] for details.
- + Supports adding words to the dictionary at runtime.
- + Supports TypeScript, with complete type definitions.
- + **Supports keywords that contain spaces** (e.g. "Open Claw").
- + **Supports matching the no-space variant** (e.g. "OpenClaw" can match "Open Claw").
- + **Supports case-insensitive matching for English** (e.g. "open claw" and "OPEN CLAW" both match "Open Claw").
- + **Supports batch-loading user dictionaries** (string array, single string, or Buffer).
+ - Flexible dictionary loading: usable without configuring any dictionary path, and custom dictionary paths can be configured when needed.
+ - The underlying algorithms are implemented in C++ and perform well.
+ - Multiple segmentation algorithms are supported; see the README.md of [CppJieba](https://github.com/yanyiwu/cppjieba.git) for details.
+ - Supports adding words to the dictionary at runtime.
+ - Supports TypeScript, with complete type definitions.
+ - **Supports keywords that contain spaces** (e.g. "Open Claw").
+ - **Supports matching the no-space variant** (e.g. "Open Claw" can match "OpenClaw").
+ - **Supports case-insensitive matching for English** (e.g. "open claw" and "OPEN CLAW" both match "Open Claw").
+ - **Supports batch-loading user dictionaries** (string array, single string, or Buffer).
 
  For implementation details, see these blog posts:
 
- + [Node.js的C++扩展初体验之NodeJieba]
- + [由NodeJieba谈谈Node.js异步实现]
+ - [Node.js的C++扩展初体验之NodeJieba](https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2014-02-22-nodejs-cpp-addon-nodejieba.md)
+ - [由NodeJieba谈谈Node.js异步实现](https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-03-21-nodejs-asynchronous-insight.md)
 
  ## Installation
 
@@ -88,11 +88,11 @@ nodejieba.load({
 
  #### Dictionaries
 
- + dict: the main dictionary, with weights and part-of-speech tags; the default dictionary is recommended.
- + hmmDict: the hidden Markov model; the default dictionary is recommended.
- + userDict: the user dictionary; customize it as needed.
- + idfDict: the IDF data required for keyword extraction.
- + stopWordDict: the stop-word list required for keyword extraction.
+ - dict: the main dictionary, with weights and part-of-speech tags; the default dictionary is recommended.
+ - hmmDict: the hidden Markov model; the default dictionary is recommended.
+ - userDict: the user dictionary; customize it as needed.
+ - idfDict: the IDF data required for keyword extraction.
+ - stopWordDict: the stop-word list required for keyword extraction.
 
  ## API documentation
 
@@ -211,10 +211,12 @@ nodejieba.loadUserDict(dictSet);
  // Option 3: a single string
  nodejieba.loadUserDict("区块链");
 
- // Option 4: a Buffer
+ // Option 4: a Buffer (must be UTF-8 encoded)
  const dictBuffer = Buffer.from("新词1\n新词2 100 n\n新词3 nz");
  nodejieba.loadUserDict(dictBuffer);
 
+ // Note: the Buffer must be UTF-8; other encodings may produce garbled text or fail to load
+
  // Words from the user dictionary are recognized during segmentation
  var result = nodejieba.cut("云计算和大数据是人工智能的基础");
  console.log(result); // ['云计算', '和', '大数据', '是', '人工智能', '的', '基础']
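The native Buffer path in this release trims each line and drops blank ones before loading. That normalization can be sketched in plain JavaScript; the helper name `bufferToDictLines` is hypothetical and not part of the package API:

```javascript
// Decode a dictionary Buffer as UTF-8, split into lines,
// trim each line, and drop blank entries (mirrors the native behavior).
function bufferToDictLines(buf) {
  return buf
    .toString("utf8") // the native side also assumes UTF-8
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
}

const lines = bufferToDictLines(Buffer.from("新词1\n\n  \n新词2 100 n\n"));
// lines: ['新词1', '新词2 100 n']
```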
@@ -242,76 +244,72 @@ console.log(result); // ['云计算', '和', '大数据', '是', '人工智能',
 
  Custom dictionaries support keywords that contain spaces, with no-space-variant matching and case-insensitive matching.
 
+ **Note**: this release removes the space character from the default separators, so a keyword that contains spaces matches the corresponding text without being split.
+
  #### User dictionary format
 
  The user dictionary supports the following formats:
 
  ```
- # keyword only
+ # keyword only (contains spaces)
  Open Claw
+ Game Master
 
- # keyword + part-of-speech tag
- Open Claw n
+ # keyword + frequency (single keywords only; a space-containing keyword plus frequency is not supported)
+ 人工智能 1000
 
- # keyword + frequency + part-of-speech tag
+ # keyword + frequency + part-of-speech tag (space-containing keywords are supported)
  Open Claw 100 n
+ Machine Learning 200 n
 
  # keywords containing multiple spaces
- Machine Learning 200 n
  Artificial Intelligence 300 n
+ Deep Learning 400 n
  ```
 
+ **Format rules**:
+
+ - When a line contains only a keyword (e.g. `Open Claw`), the whole string is the keyword
+ - When a line contains a frequency (e.g. `人工智能 1000`), the first field is the keyword and the second is the frequency
+ - When a line contains three or more fields and the second-to-last is numeric (e.g. `Open Claw 100 n`), the leading fields joined together form the keyword, followed by the frequency and the part-of-speech tag
+
276
  #### 使用示例
265
277
 
266
278
  ```js
267
279
  var nodejieba = require("nodejieba");
268
- var fs = require('fs');
269
- var path = require('path');
270
-
271
- // 创建包含空格关键词的用户词典
272
- var dictContent = `Open Claw 100 n
273
- Machine Learning 200 n
274
- Artificial Intelligence 300 n
275
- `;
280
+ nodejieba.load();
276
281
 
277
- var testDictPath = path.join(__dirname, 'user_dict.utf8');
278
- fs.writeFileSync(testDictPath, dictContent);
282
+ // 方式1:使用 loadUserDict 加载包含空格的关键词
283
+ nodejieba.loadUserDict(["Open Claw 100 n", "Game Master"]);
279
284
 
280
- // 加载词典
281
- nodejieba.load({
282
- userDict: testDictPath,
283
- });
285
+ // 方式2:使用 insertWord 添加包含空格的关键词
286
+ nodejieba.insertWord("Deep Learning");
284
287
 
285
288
  // 测试1: 包含空格的关键词匹配
286
- console.log(nodejieba.cut("I want to use Open Claw tool"));
287
- // 输出包含: ['Open Claw']
289
+ console.log(nodejieba.cut("I like Open Claw game"));
290
+ // 输出: ['I', ' ', 'l', 'i', 'k', 'e', ' ', 'Open Claw', ' ', 'g', 'a', 'm', 'e']
288
291
 
289
- // 测试2: 大小写不敏感匹配
292
+ // 测试2: 在中文句子中匹配
293
+ console.log(nodejieba.cut("Open Claw和Game Master都是好游戏"));
294
+ // 输出: ['Open Claw', '和', 'Game Master', '都', '是', '好', '游戏']
295
+
296
+ // 测试3: 大小写不敏感匹配
290
297
  console.log(nodejieba.cut("open claw")); // 匹配 Open Claw
291
298
  console.log(nodejieba.cut("OPEN CLAW")); // 匹配 Open Claw
292
299
  console.log(nodejieba.cut("Open Claw")); // 匹配 Open Claw
293
300
 
294
- // 测试3: 无空格版本匹配
301
+ // 测试4: 无空格版本匹配
295
302
  console.log(nodejieba.cut("OpenClaw")); // 匹配 Open Claw
296
303
  console.log(nodejieba.cut("openclaw")); // 匹配 Open Claw
297
304
  console.log(nodejieba.cut("OPENCLAW")); // 匹配 Open Claw
298
-
299
- // 测试4: 其他包含空格的关键词
300
- console.log(nodejieba.cut("Machine Learning is great"));
301
- // 输出包含: ['Machine Learning']
302
-
303
- console.log(nodejieba.cut("Artificial Intelligence will change the world"));
304
- // 输出包含: ['Artificial Intelligence']
305
-
306
- // 清理测试文件
307
- fs.unlinkSync(testDictPath);
308
305
  ```
309
306
 
310
307
  #### 功能说明
311
308
 
312
- 1. **包含空格的关键词**: 词典中的 "Open Claw" 可以匹配文本中的 "Open Claw"
309
+ 1. **包含空格的关键词**: 词典中的 "Open Claw" 可以匹配文本中的 "Open Claw"(不会被分割)
313
310
  2. **无空格版本匹配**: 词典中的 "Open Claw" 也可以匹配文本中的 "OpenClaw"
314
311
  3. **大小写不敏感**: 词典中的 "Open Claw" 可以匹配 "open claw"、"OPEN CLAW"、"Open Claw" 等任意大小写组合
312
+ 4. **自动生成变体**: 添加包含空格的关键词时,会自动生成无空格版本和小写版本,确保各种变体都能匹配
315
313
 
316
314
  More Detals in [demo](https://github.com/yanyiwu/nodejieba-demo)
317
315
 
@@ -347,37 +345,23 @@ npm test
 
  ## Applications
 
- + gitbook plugin with Chinese search support: [gitbook-plugin-search-pro]
- + Chinese-characters-to-pinyin conversion tool: [pinyin]
+ - gitbook plugin with Chinese search support: [gitbook-plugin-search-pro](https://plugins.gitbook.com/plugin/search-pro)
+ - Chinese-characters-to-pinyin conversion tool: [pinyin](https://github.com/hotoo/pinyin)
 
  ## Benchmarks
 
  Probably the best-performing Chinese word segmentation library for Node.js at the moment;
- see [Jieba中文分词系列性能评测] for details.
-
-
- [由NodeJieba谈谈Node.js异步实现]:https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-03-21-nodejs-asynchronous-insight.md
- [Node.js的C++扩展初体验之NodeJieba]:https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2014-02-22-nodejs-cpp-addon-nodejieba.md
- [CppJieba]:https://github.com/yanyiwu/cppjieba.git
- [cnpm]:http://cnpmjs.org
- [Jieba中文分词]:https://github.com/fxsjy/jieba
-
- [Jieba中文分词系列性能评测]:https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-06-14-jieba-series-performance-test.md
- [contributors]:https://github.com/yanyiwu/nodejieba/graphs/contributors
- [YanyiWu]:http://github.com/yanyiwu
- [gitbook-plugin-search-pro]:https://plugins.gitbook.com/plugin/search-pro
- [pinyin]:https://github.com/hotoo/pinyin
+ see [Jieba中文分词系列性能评测](https://github.com/yanyiwu/blog/blob/posts2023archive/_posts/2015-06-14-jieba-series-performance-test.md) for details.
 
  ## Contributors
 
  ### Code Contributors
 
- This project exists thanks to all the people who contribute. [[Contribute](CONTRIBUTING.md)].
- <a href="https://github.com/yanyiwu/nodejieba/graphs/contributors"><img src="https://opencollective.com/nodejieba/contributors.svg?width=890&button=false" /></a>
+ This project exists thanks to all the people who contribute. \[[Contribute](CONTRIBUTING.md)]. <a href="https://github.com/yanyiwu/nodejieba/graphs/contributors"><img src="https://opencollective.com/nodejieba/contributors.svg?width=890&button=false" /></a>
 
  ### Financial Contributors
 
- Become a financial contributor and help us sustain our community. [[Contribute](https://opencollective.com/nodejieba/contribute)]
+ Become a financial contributor and help us sustain our community. \[[Contribute](https://opencollective.com/nodejieba/contribute)]
 
  #### Individuals
 
@@ -385,15 +369,6 @@ Become a financial contributor and help us sustain our community. [[Contribute](
 
  #### Organizations
 
- Support this project with your organization. Your logo will show up here with a link to your website. [[Contribute](https://opencollective.com/nodejieba/contribute)]
-
- <a href="https://opencollective.com/nodejieba/organization/0/website"><img src="https://opencollective.com/nodejieba/organization/0/avatar.svg"></a>
- <a href="https://opencollective.com/nodejieba/organization/1/website"><img src="https://opencollective.com/nodejieba/organization/1/avatar.svg"></a>
- <a href="https://opencollective.com/nodejieba/organization/2/website"><img src="https://opencollective.com/nodejieba/organization/2/avatar.svg"></a>
- <a href="https://opencollective.com/nodejieba/organization/3/website"><img src="https://opencollective.com/nodejieba/organization/3/avatar.svg"></a>
- <a href="https://opencollective.com/nodejieba/organization/4/website"><img src="https://opencollective.com/nodejieba/organization/4/avatar.svg"></a>
- <a href="https://opencollective.com/nodejieba/organization/5/website"><img src="https://opencollective.com/nodejieba/organization/5/avatar.svg"></a>
- <a href="https://opencollective.com/nodejieba/organization/6/website"><img src="https://opencollective.com/nodejieba/organization/6/avatar.svg"></a>
- <a href="https://opencollective.com/nodejieba/organization/7/website"><img src="https://opencollective.com/nodejieba/organization/7/avatar.svg"></a>
- <a href="https://opencollective.com/nodejieba/organization/8/website"><img src="https://opencollective.com/nodejieba/organization/8/avatar.svg"></a>
- <a href="https://opencollective.com/nodejieba/organization/9/website"><img src="https://opencollective.com/nodejieba/organization/9/avatar.svg"></a>
+ Support this project with your organization. Your logo will show up here with a link to your website. \[[Contribute](https://opencollective.com/nodejieba/contribute)]
+
+ <a href="https://opencollective.com/nodejieba/organization/0/website"><img src="https://opencollective.com/nodejieba/organization/0/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/1/website"><img src="https://opencollective.com/nodejieba/organization/1/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/2/website"><img src="https://opencollective.com/nodejieba/organization/2/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/3/website"><img src="https://opencollective.com/nodejieba/organization/3/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/4/website"><img src="https://opencollective.com/nodejieba/organization/4/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/5/website"><img src="https://opencollective.com/nodejieba/organization/5/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/6/website"><img src="https://opencollective.com/nodejieba/organization/6/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/7/website"><img src="https://opencollective.com/nodejieba/organization/7/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/8/website"><img src="https://opencollective.com/nodejieba/organization/8/avatar.svg"></a> <a href="https://opencollective.com/nodejieba/organization/9/website"><img src="https://opencollective.com/nodejieba/organization/9/avatar.svg"></a>
Binary file
package/index.js CHANGED
@@ -84,11 +84,18 @@ exports.loadUserDict = function (dict) {
    exports.load();
  }
 
- // If it is a Set, convert it to an array
+ if (dict === null || dict === undefined) {
+   return false;
+ }
+
  if (dict instanceof Set) {
    dict = Array.from(dict);
  }
 
+ if (typeof dict !== 'string' && !Array.isArray(dict) && !Buffer.isBuffer(dict)) {
+   throw new TypeError('dict must be string, string[], Set<string>, or Buffer');
+ }
+
  return _loadUserDict.call(this, dict);
 };
 
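The guard logic this hunk adds can be exercised in isolation; `normalizeDictArg` below is a hypothetical standalone version of the wrapper's checks, not a package export:

```javascript
// Reject null/undefined, convert Set -> Array, and type-check everything else,
// matching the accepted input types of loadUserDict in this release.
function normalizeDictArg(dict) {
  if (dict === null || dict === undefined) return null; // caller returns false
  if (dict instanceof Set) dict = Array.from(dict); // Set<string> -> string[]
  if (typeof dict !== "string" && !Array.isArray(dict) && !Buffer.isBuffer(dict)) {
    throw new TypeError("dict must be string, string[], Set<string>, or Buffer");
  }
  return dict;
}
```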
package/lib/nodejieba.cpp CHANGED
@@ -7,6 +7,7 @@
  #include "cppjieba/TextRankExtractor.hpp"
 
  #include <sstream>
+ #include <cctype>
 
  NodeJieba::NodeJieba(Napi::Env env, Napi::Object exports) {
    DefineAddon(exports, {
@@ -229,32 +230,64 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
    Napi::Error::New(info.Env(), "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
  }
 
- // Accept a string array, a single string, or a Buffer
+ auto isBlankString = [](const std::string& str) -> bool {
+   for (char c : str) {
+     if (!std::isspace(static_cast<unsigned char>(c))) {
+       return false;
+     }
+   }
+   return true;
+ };
+
+ auto trimString = [](std::string& str) -> void {
+   size_t start = 0;
+   size_t end = str.length();
+
+   while (start < end && std::isspace(static_cast<unsigned char>(str[start]))) {
+     start++;
+   }
+
+   while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
+     end--;
+   }
+
+   str = str.substr(start, end - start);
+ };
+
  if (info[0].IsArray()) {
    Napi::Array arr = info[0].As<Napi::Array>();
    std::vector<std::string> buf;
    for (size_t i = 0; i < arr.Length(); i++) {
      Napi::Value val = arr[i];
-     if (val.IsString()) {
-       buf.push_back(val.As<Napi::String>().Utf8Value());
+     if (!val.IsString()) {
+       Napi::TypeError::New(info.Env(), "Array elements must be strings")
+         .ThrowAsJavaScriptException();
+       return Napi::Boolean::New(info.Env(), false);
+     }
+     std::string line = val.As<Napi::String>().Utf8Value();
+     trimString(line);
+     if (!line.empty() && !isBlankString(line)) {
+       buf.push_back(line);
      }
    }
    _jieba_handle->LoadUserDict(buf);
  } else if (info[0].IsString()) {
-   // Accept a single dictionary entry as a string
    std::string line = info[0].As<Napi::String>().Utf8Value();
+   trimString(line);
    std::vector<std::string> buf;
-   buf.push_back(line);
-   _jieba_handle->LoadUserDict(buf);
+   if (!line.empty() && !isBlankString(line)) {
+     buf.push_back(line);
+     _jieba_handle->LoadUserDict(buf);
+   }
  } else if (info[0].IsBuffer()) {
-   // Accept a Buffer: convert it to a string and split it into lines
    Napi::Buffer<char> buffer = info[0].As<Napi::Buffer<char>>();
    std::string content(buffer.Data(), buffer.Length());
    std::vector<std::string> buf;
    std::istringstream iss(content);
    std::string line;
    while (std::getline(iss, line)) {
-     if (!line.empty()) {
+     trimString(line);
+     if (!line.empty() && !isBlankString(line)) {
        buf.push_back(line);
      }
    }
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
    "name": "nodejieba-plus",
    "description": "chinese word segmentation for node",
-   "version": "3.5.12",
+   "version": "3.5.16",
    "author": "Yanyi Wu <wuyanyi09@foxmail.com>",
    "maintainers": [
      "Yanyi Wu <wuyanyi09@foxmail.com>"
@@ -10,6 +10,7 @@
  #include <stdint.h>
  #include <cmath>
  #include <limits>
+ #include <algorithm>
  #include "limonp/StringUtil.hpp"
  #include "limonp/Logging.hpp"
  #include "Unicode.hpp"
@@ -32,7 +33,7 @@ class DictTrie {
    WordWeightMax,
  }; // enum UserWordWeightOption
 
- DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
+ DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) : trie_(NULL) {
    Init(dict_path, user_dict_paths, user_word_weight_opt);
  }
 
@@ -41,23 +42,84 @@ class DictTrie {
  }
 
  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
-   DictUnit node_info;
-   if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
-     return false;
-   }
-   active_node_infos_.push_back(node_info);
-   trie_->InsertNode(node_info.word, &active_node_infos_.back());
+   std::set<string> insertedWords;
+   insertedWords.insert(word);
+
+   bool hasSpace = (word.find(' ') != string::npos);
+   if (hasSpace) {
+     string wordNoSpace = word;
+     wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
+     if (!wordNoSpace.empty() && wordNoSpace != word) {
+       insertedWords.insert(wordNoSpace);
+     }
+   }
+
+   string wordLower = ToLowerString(word);
+   if (wordLower != word) {
+     insertedWords.insert(wordLower);
+   }
+
+   if (hasSpace) {
+     string wordNoSpace = word;
+     wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
+     if (!wordNoSpace.empty()) {
+       string wordNoSpaceLower = ToLowerString(wordNoSpace);
+       if (wordNoSpaceLower != wordNoSpace) {
+         insertedWords.insert(wordNoSpaceLower);
+       }
+     }
+   }
+
+   for (std::set<string>::const_iterator it = insertedWords.begin(); it != insertedWords.end(); ++it) {
+     DictUnit node_info;
+     if (!MakeNodeInfo(node_info, *it, user_word_default_weight_, tag)) {
+       continue;
+     }
+     active_node_infos_.push_back(node_info);
+     trie_->InsertNode(node_info.word, &active_node_infos_.back());
+   }
    return true;
  }
 
  bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
-   DictUnit node_info;
-   double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
-   if (!MakeNodeInfo(node_info, word, weight , tag)) {
-     return false;
-   }
-   active_node_infos_.push_back(node_info);
-   trie_->InsertNode(node_info.word, &active_node_infos_.back());
+   double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_;
+
+   std::set<string> insertedWords;
+   insertedWords.insert(word);
+
+   bool hasSpace = (word.find(' ') != string::npos);
+   if (hasSpace) {
+     string wordNoSpace = word;
+     wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
+     if (!wordNoSpace.empty() && wordNoSpace != word) {
+       insertedWords.insert(wordNoSpace);
+     }
+   }
+
+   string wordLower = ToLowerString(word);
+   if (wordLower != word) {
+     insertedWords.insert(wordLower);
+   }
+
+   if (hasSpace) {
+     string wordNoSpace = word;
+     wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
+     if (!wordNoSpace.empty()) {
+       string wordNoSpaceLower = ToLowerString(wordNoSpace);
+       if (wordNoSpaceLower != wordNoSpace) {
+         insertedWords.insert(wordNoSpaceLower);
+       }
+     }
+   }
+
+   for (std::set<string>::const_iterator it = insertedWords.begin(); it != insertedWords.end(); ++it) {
+     DictUnit node_info;
+     if (!MakeNodeInfo(node_info, *it, weight, tag)) {
+       continue;
+     }
+     active_node_infos_.push_back(node_info);
+     trie_->InsertNode(node_info.word, &active_node_infos_.back());
+   }
    return true;
  }
 
@@ -112,26 +174,93 @@ class DictTrie {
    vector<string> buf;
    DictUnit node_info;
    Split(line, buf, " ");
-   if(buf.size() == 1){
-     MakeNodeInfo(node_info,
-                  buf[0],
-                  user_word_default_weight_,
-                  UNKNOWN_TAG);
-   } else if (buf.size() == 2) {
-     MakeNodeInfo(node_info,
-                  buf[0],
-                  user_word_default_weight_,
-                  buf[1]);
-   } else if (buf.size() == 3) {
-     int freq = atoi(buf[1].c_str());
-     assert(freq_sum_ > 0.0);
-     double weight = log(1.0 * freq / freq_sum_);
-     MakeNodeInfo(node_info, buf[0], weight, buf[2]);
-   }
-   static_node_infos_.push_back(node_info);
-   if (node_info.word.size() == 1) {
-     user_dict_single_chinese_word_.insert(node_info.word[0]);
-   }
+
+   string word;
+   string tag = UNKNOWN_TAG;
+   double weight = user_word_default_weight_;
+   bool hasSpace = false;
+
+   if (buf.size() == 1) {
+     word = buf[0];
+   } else if (buf.size() == 2) {
+     int freq = atoi(buf[1].c_str());
+     if (freq > 0) {
+       assert(freq_sum_ > 0.0);
+       weight = log(1.0 * freq / freq_sum_);
+       word = buf[0];
+     } else {
+       word = line;
+     }
+   } else if (buf.size() >= 3) {
+     bool isFreq = true;
+     for (char c : buf[buf.size() - 2]) {
+       if (!isdigit(c)) {
+         isFreq = false;
+         break;
+       }
+     }
+
+     if (isFreq) {
+       int freq = atoi(buf[buf.size() - 2].c_str());
+       assert(freq_sum_ > 0.0);
+       weight = log(1.0 * freq / freq_sum_);
+       for (size_t i = 0; i < buf.size() - 2; ++i) {
+         if (i > 0) word += " ";
+         word += buf[i];
+       }
+       tag = buf[buf.size() - 1];
+     } else {
+       word = line;
+     }
+   }
+
+   hasSpace = (word.find(' ') != string::npos);
+
+   std::set<string> insertedWords;
+
+   insertedWords.insert(word);
+
+   if (hasSpace) {
+     string wordNoSpace = word;
+     wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
+     if (!wordNoSpace.empty() && wordNoSpace != word) {
+       insertedWords.insert(wordNoSpace);
+     }
+   }
+
+   string wordLower = ToLowerString(word);
+   if (wordLower != word) {
+     insertedWords.insert(wordLower);
+   }
+
+   if (hasSpace) {
+     string wordNoSpace = word;
+     wordNoSpace.erase(remove(wordNoSpace.begin(), wordNoSpace.end(), ' '), wordNoSpace.end());
+     if (!wordNoSpace.empty()) {
+       string wordNoSpaceLower = ToLowerString(wordNoSpace);
+       if (wordNoSpaceLower != wordNoSpace) {
+         insertedWords.insert(wordNoSpaceLower);
+       }
+     }
+   }
+
+   for (std::set<string>::const_iterator it = insertedWords.begin(); it != insertedWords.end(); ++it) {
+     DictUnit temp_node_info;
+     if (MakeNodeInfo(temp_node_info, *it, weight, tag)) {
+       if (trie_) {
+         active_node_infos_.push_back(temp_node_info);
+         trie_->InsertNode(active_node_infos_.back().word, &active_node_infos_.back());
+         if (active_node_infos_.back().word.size() == 1) {
+           user_dict_single_chinese_word_.insert(active_node_infos_.back().word[0]);
+         }
+       } else {
+         static_node_infos_.push_back(temp_node_info);
+         if (temp_node_info.word.size() == 1) {
+           user_dict_single_chinese_word_.insert(temp_node_info.word[0]);
+         }
+       }
+     }
+   }
  }
 
  void LoadUserDict(const vector<string>& buf) {
@@ -206,6 +335,16 @@ class DictTrie {
    return true;
  }
 
+ bool MakeNodeInfo(DictUnit& node_info,
+                   const Unicode& word,
+                   double weight,
+                   const string& tag) {
+   node_info.word = word;
+   node_info.weight = weight;
+   node_info.tag = tag;
+   return true;
+ }
+
  void LoadDict(const string& filePath) {
    ifstream ifs(filePath.c_str());
    XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
@@ -8,7 +8,7 @@
 
  namespace cppjieba {
 
- const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
+ const char* const SPECIAL_SEPARATORS = "\t\n\xEF\xBC\x8C\xE3\x80\x82";
 
  using namespace limonp;
 
@@ -69,7 +69,8 @@ class Trie {
    if (NULL == ptNode->next) {
      return NULL;
    }
-   citer = ptNode->next->find(it->rune);
+   Rune searchRune = ToLowerRune(it->rune);
+   citer = ptNode->next->find(searchRune);
    if (ptNode->next->end() == citer) {
      return NULL;
    }
@@ -90,7 +91,7 @@ class Trie {
    for (size_t i = 0; i < size_t(end - begin); i++) {
      res[i].runestr = *(begin + i);
 
-     if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
+     if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(ToLowerRune(res[i].runestr.rune)))) {
        ptNode = citer->second;
      } else {
        ptNode = NULL;
@@ -105,7 +106,7 @@ class Trie {
      if (ptNode == NULL || ptNode->next == NULL) {
        break;
      }
-     citer = ptNode->next->find((begin + j)->rune);
+     citer = ptNode->next->find(ToLowerRune((begin + j)->rune));
      if (ptNode->next->end() == citer) {
        break;
      }
@@ -128,11 +129,12 @@ class Trie {
    if (NULL == ptNode->next) {
      ptNode->next = new TrieNode::NextMap;
    }
-   kmIter = ptNode->next->find(*citer);
+   Rune insertRune = ToLowerRune(*citer);
+   kmIter = ptNode->next->find(insertRune);
    if (ptNode->next->end() == kmIter) {
      TrieNode *nextNode = new TrieNode;
 
-     ptNode->next->insert(make_pair(*citer, nextNode));
+     ptNode->next->insert(make_pair(insertRune, nextNode));
      ptNode = nextNode;
    } else {
      ptNode = kmIter->second;
@@ -145,23 +147,18 @@ class Trie {
    if (key.begin() == key.end()) {
      return;
    }
-   // declare a NextMap iterator
    TrieNode::NextMap::const_iterator kmIter;
-   // a TrieNode pointer starting at the root
    TrieNode *ptNode = root_;
    for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
-     // the node has no children
      if (NULL == ptNode->next) {
        return;
      }
-     kmIter = ptNode->next->find(*citer);
-     // not found in the map; break out of the loop
+     Rune deleteRune = ToLowerRune(*citer);
+     kmIter = ptNode->next->find(deleteRune);
      if (ptNode->next->end() == kmIter) {
        break;
      }
-     // erase the entry from the unordered_map
-     ptNode->next->erase(*citer);
-     // delete the node
+     ptNode->next->erase(deleteRune);
      ptNode = kmIter->second;
      delete ptNode;
      break;
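In isolation, the case-folded insert/find pattern these hunks implement looks roughly like the following `Map`-based toy trie. The names are illustrative, and only ASCII case is folded, as in the C++:

```javascript
// Fold ASCII case for one character; everything else passes through.
const asciiLower = (ch) => (ch >= "A" && ch <= "Z" ? ch.toLowerCase() : ch);

// Insert a word under its case-folded path, remembering the canonical form.
function trieInsert(root, word) {
  let node = root;
  for (const ch of word) {
    const key = asciiLower(ch);
    if (!node.has(key)) node.set(key, new Map());
    node = node.get(key);
  }
  node.set("$end", word);
}

// Look up text along the same case-folded path.
function trieFind(root, text) {
  let node = root;
  for (const ch of text) {
    node = node.get(asciiLower(ch));
    if (!node) return null;
  }
  return node.get("$end") || null;
}

const root = new Map();
trieInsert(root, "Open Claw");
trieFind(root, "OPEN CLAW"); // → "Open Claw"
```

Because both insertion and lookup fold case, a single stored entry matches any casing without duplicating nodes.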
@@ -222,6 +222,58 @@ inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs)
    }
  }
 
+ inline Rune ToLowerRune(Rune r) {
+   if (r >= 'A' && r <= 'Z') {
+     return r + ('a' - 'A');
+   }
+   return r;
+ }
+
+ inline Rune ToUpperRune(Rune r) {
+   if (r >= 'a' && r <= 'z') {
+     return r - ('a' - 'A');
+   }
+   return r;
+ }
+
+ inline Unicode ToLowerUnicode(const Unicode& unicode) {
+   Unicode result;
+   result.reserve(unicode.size());
+   for (size_t i = 0; i < unicode.size(); i++) {
+     result.push_back(ToLowerRune(unicode[i]));
+   }
+   return result;
+ }
+
+ inline Unicode ToUpperUnicode(const Unicode& unicode) {
+   Unicode result;
+   result.reserve(unicode.size());
+   for (size_t i = 0; i < unicode.size(); i++) {
+     result.push_back(ToUpperRune(unicode[i]));
+   }
+   return result;
+ }
+
+ inline string ToLowerString(const string& s) {
+   string result = s;
+   for (size_t i = 0; i < result.size(); i++) {
+     if (result[i] >= 'A' && result[i] <= 'Z') {
+       result[i] = result[i] + ('a' - 'A');
+     }
+   }
+   return result;
+ }
+
+ inline string ToUpperString(const string& s) {
+   string result = s;
+   for (size_t i = 0; i < result.size(); i++) {
+     if (result[i] >= 'a' && result[i] <= 'z') {
+       result[i] = result[i] - ('a' - 'A');
+     }
+   }
+   return result;
+ }
+
 } // namespace cppjieba
 
 #endif // CPPJIEBA_UNICODE_H
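These helpers fold case only in the ASCII range, so multibyte runes (Chinese characters, accented letters) pass through unchanged. An equivalent sketch on JavaScript code points; `toLowerRune` mirrors the C++ helper and is not a package API:

```javascript
// ASCII-only case fold on a Unicode code point, as in the C++ ToLowerRune.
function toLowerRune(cp) {
  return cp >= 0x41 && cp <= 0x5a ? cp + 0x20 : cp; // 'A'..'Z' -> 'a'..'z'
}

// Apply the fold code point by code point (surrogate-pair safe via iteration).
function toLowerCodePoints(s) {
  return [...s]
    .map((ch) => String.fromCodePoint(toLowerRune(ch.codePointAt(0))))
    .join("");
}

toLowerCodePoints("Open Claw 结巴"); // → "open claw 结巴"
```

Restricting the fold to ASCII avoids locale-dependent behavior and keeps all Chinese dictionary entries byte-for-byte intact.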
@@ -74,4 +74,61 @@ describe("nodejieba.loadUserDict", function() {
    var loadResult = nodejieba.loadUserDict(dictSet);
    loadResult.should.eql(true);
  });
+
+ it("nodejieba.loadUserDict should filter empty strings", function() {
+   var dictLines = [
+     "有效词1",
+     "",
+     "有效词2",
+     "",
+     " ",
+     "\t",
+     "\n",
+     "有效词3"
+   ];
+   var loadResult = nodejieba.loadUserDict(dictLines);
+   loadResult.should.eql(true);
+ });
+
+ it("nodejieba.loadUserDict with space-containing keywords", function() {
+   var dictLines = [
+     "深度 学习",
+     "机器 学习 200 n",
+     "人工 智能 300 nz"
+   ];
+   var loadResult = nodejieba.loadUserDict(dictLines);
+   loadResult.should.eql(true);
+ });
+
+ it("nodejieba.loadUserDict should throw error for non-string array elements", function() {
+   var dictLines = [
+     "有效词",
+     123,
+     "另一个有效词"
+   ];
+
+   (function() {
+     nodejieba.loadUserDict(dictLines);
+   }).should.throw();
+ });
+
+ it("nodejieba.loadUserDict should return false for null", function() {
+   var loadResult = nodejieba.loadUserDict(null);
+   loadResult.should.eql(false);
+ });
+
+ it("nodejieba.loadUserDict should return false for undefined", function() {
+   var loadResult = nodejieba.loadUserDict(undefined);
+   loadResult.should.eql(false);
+ });
+
+ it("nodejieba.loadUserDict should throw TypeError for invalid type", function() {
+   (function() {
+     nodejieba.loadUserDict(123);
+   }).should.throw(TypeError);
+
+   (function() {
+     nodejieba.loadUserDict({});
+   }).should.throw(TypeError);
+ });
  });
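The contract these tests assert (null/undefined rejected softly, wrong types rejected with a `TypeError`, whitespace-only entries dropped) can be sketched as a standalone helper. This is an illustration of the expected behavior, not the package's internal implementation:

```javascript
// Illustrative validator mirroring the test contract above:
// null/undefined -> false, non-arrays -> TypeError,
// non-string elements -> TypeError, blank entries filtered out.
function validateAndFilter(lines) {
  if (lines === null || lines === undefined) return false;
  if (!Array.isArray(lines)) throw new TypeError("expected an array of strings");
  var kept = [];
  for (var line of lines) {
    if (typeof line !== "string") throw new TypeError("entries must be strings");
    if (line.trim().length > 0) kept.push(line);
  }
  return kept;
}

console.log(validateAndFilter(["有效词1", "", " ", "\t", "有效词2"]));
// [ '有效词1', '有效词2' ]
console.log(validateAndFilter(null)); // false
```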
@@ -0,0 +1,60 @@
+ var nodejieba = require("./index.js");
+
+ nodejieba.load();
+
+ console.log("Test 1: load dictionary entries containing whitespace");
+ try {
+   var result = nodejieba.loadUserDict([
+     "有效词1",
+     "",
+     " ",
+     "\t",
+     "\n",
+     "有效词2",
+     " 测试词 ",
+     " 空格词 "
+   ]);
+   console.log("✅ loaded:", result);
+ } catch (e) {
+   console.log("❌ load failed:", e.message);
+ }
+
+ console.log("\nTest 2: segment text using the whitespace-laden dictionary");
+ try {
+   var result = nodejieba.cut("有效词1和有效词2以及测试词");
+   console.log("✅ segmented:", result);
+   if (result.includes("有效词1") && result.includes("有效词2") && result.includes("测试词")) {
+     console.log("✅ dictionary entries recognized correctly");
+   }
+ } catch (e) {
+   console.log("❌ segmentation failed:", e.message);
+ }
+
+ console.log("\nTest 3: load a large dictionary full of whitespace entries");
+ try {
+   var largeDict = [];
+   for (var i = 0; i < 100; i++) {
+     largeDict.push("词" + i);
+     largeDict.push("");
+     largeDict.push(" ");
+     largeDict.push("\t\n");
+   }
+   var result = nodejieba.loadUserDict(largeDict);
+   console.log("✅ large dictionary loaded:", result);
+ } catch (e) {
+   console.log("❌ large dictionary load failed:", e.message);
+ }
+
+ console.log("\nTest 4: Buffer containing blank lines");
+ try {
+   var bufferContent = "词A\n\n \n\t\n词B\n 词C \n";
+   var result = nodejieba.loadUserDict(Buffer.from(bufferContent));
+   console.log("✅ Buffer loaded:", result);
+
+   var cutResult = nodejieba.cut("词A和词B以及词C");
+   console.log("✅ Buffer dictionary segmentation:", cutResult);
+ } catch (e) {
+   console.log("❌ Buffer load failed:", e.message);
+ }
+
+ console.log("\n✅ All tests finished; the assertion error is fixed!");
package/test_simple.js ADDED
@@ -0,0 +1,17 @@
+ var nodejieba = require("./index.js");
+
+ console.log("=== Test start ===\n");
+
+ try {
+   console.log("calling load()...");
+   nodejieba.load();
+   console.log("load() done");
+
+   console.log("\nsegmenting...");
+   var result = nodejieba.cut("测试");
+   console.log("segmentation result:", result);
+ } catch (e) {
+   console.error("error:", e);
+ }
+
+ console.log("\n=== Test end ===");
@@ -0,0 +1,66 @@
+ var nodejieba = require("./index.js");
+
+ console.log("=== Testing space-containing keyword matching ===\n");
+
+ // Load the dictionaries
+ nodejieba.load();
+
+ // Test 1: load a space-containing keyword (with frequency and POS tag)
+ console.log("Test 1: load 'Open Claw 2 n'");
+ nodejieba.loadUserDict(["Open Claw 2 n"]);
+
+ var testCases1 = [
+   "Open Claw",
+   "OpenClaw",
+   "Openclaw",
+   "OPENCLAW",
+   "open claw",
+   "OPEN CLAW"
+ ];
+
+ console.log("testing case variants:");
+ testCases1.forEach(function(testText) {
+   var result = nodejieba.cut(testText);
+   console.log("  '" + testText + "' ->", result);
+ });
+
+ console.log("\n");
+
+ // Test 2: load a space-containing keyword (keyword only)
+ console.log("Test 2: load 'Game Master' (keyword only)");
+ nodejieba.loadUserDict("Game Master");
+
+ var testCases2 = [
+   "Game Master",
+   "GameMaster",
+   "gamemaster",
+   "GAMEMASTER",
+   "GAME MASTER"
+ ];
+
+ console.log("testing case variants:");
+ testCases2.forEach(function(testText) {
+   var result = nodejieba.cut(testText);
+   console.log("  '" + testText + "' ->", result);
+ });
+
+ console.log("\n");
+
+ // Test 3: match inside sentences
+ console.log("Test 3: match space-containing keywords inside sentences");
+ var sentence1 = "I like Open Claw game very much";
+ var result1 = nodejieba.cut(sentence1);
+ console.log("  sentence: '" + sentence1 + "'");
+ console.log("  segmentation:", result1);
+
+ var sentence2 = "Open Claw和Game Master都是好游戏";
+ var result2 = nodejieba.cut(sentence2);
+ console.log("  sentence: '" + sentence2 + "'");
+ console.log("  segmentation:", result2);
+
+ var sentence3 = "OPENCLAW和gamemaster都是好游戏";
+ var result3 = nodejieba.cut(sentence3);
+ console.log("  sentence: '" + sentence3 + "'");
+ console.log("  segmentation:", result3);
+
+ console.log("\n=== Test finished ===");
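All the variants exercised above (`Open Claw`, `OpenClaw`, `OPENCLAW`, `open claw`, …) collapse onto a single lookup key under lowercasing plus whitespace removal. A hypothetical sketch of that normalization idea (not the package's actual matching code):

```javascript
// Hypothetical normalization that maps every tested variant of a
// space-containing keyword onto one canonical key.
function normalizeKeyword(s) {
  return s.toLowerCase().replace(/\s+/g, "");
}

["Open Claw", "OpenClaw", "Openclaw", "OPENCLAW", "open claw", "OPEN CLAW"]
  .forEach(function(v) {
    console.log(normalizeKeyword(v)); // each prints "openclaw"
  });
```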