npm - nodejieba-plus - Versions diffs - 3.5.17 → 3.5.18 - Mend

nodejieba-plus 3.5.17 → 3.5.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/README.md +109 -4
package/build/Release/nodejieba.node +0 -0
package/index.js +25 -0
package/lib/nodejieba.cpp +174 -1
package/lib/nodejieba.h +1 -0
package/package.json +1 -1
package/submodules/cppjieba/include/cppjieba/Jieba.hpp +4 -0
package/submodules/cppjieba/include/cppjieba/KeywordExtractor.hpp +35 -0
package/test_2x_weight.js +39 -0
package/test_load_idf_dict.js +98 -0
package/types/index.d.ts +1 -0

package/README.md CHANGED Viewed

@@ -231,7 +231,7 @@ console.log(keywords); // 用户词典中的词排名会显著提升
 从 v3.5.16 开始，加载用户词典时会自动为词典中的词设置更高的 IDF 权重，确保在关键词提取时获得更高的排名：
 **自动权重提升**：
-- 加载用户词典后，词典中的词会自动获得 **1.3 倍 IDF 权重**
+- 加载用户词典后，词典中的词会自动获得 **2 倍 IDF 权重**
 - 这意味着用户词典中的词在关键词提取时会优先显示
 **手动设置权重**：
@@ -239,11 +239,11 @@ console.log(keywords); // 用户词典中的词排名会显著提升
 // 方式1：设置具体的 IDF 值
 nodejieba.setIdf("Open Claw", 30.0);
-// 方式2：使用倍数提升权重（默认1.3倍）
-nodejieba.setIdf("Open Claw");  // 1.3倍权重
+// 方式2：使用倍数提升权重（默认2倍）
+nodejieba.setIdf("Open Claw");  // 2倍权重
 // 方式3：自定义倍数
-nodejieba.setIdf("Open Claw", null, 2.0);  // 2倍权重
+nodejieba.setIdf("Open Claw", null, 3.0);  // 3倍权重
 ```
 #### IDF 词典支持空格关键词（新功能）
@@ -302,6 +302,111 @@ console.log(keywords);
 大数据 500 n
 ```
+### 批量加载IDF词典（新功能）
+支持通过字符串数组、Set、单个字符串或 Buffer 批量加载 IDF 词典，用于关键词提取时设置词语的权重。
+**功能特点**：
+- 支持多种输入格式：字符串数组、Set、单个字符串、Buffer
+- 支持包含空格的词组（如 "Deep Learning"）
+- 自动适配不含空格的版本（"Deep Learning" 也可匹配 "DeepLearning"）
+- 英文大小写不敏感：加载时除原始写法外，还会自动注册全小写和全大写两种写法（"MachineLearning" 可匹配 "machinelearning"、"MACHINELEARNING"）
+- 同时添加到分词词典，确保分词时能正确识别
+**使用方法**：
+```js
+var nodejieba = require("nodejieba");
+nodejieba.load();
+// 方式1：使用字符串数组
+nodejieba.loadIdfDict(['自定义词1 15.0', '自定义词2 12.5', '测试词汇 10.0']);
+// 方式2：使用 Set 集合
+const idfSet = new Set(['集合词汇1 14.0', '集合词汇2 13.0']);
+nodejieba.loadIdfDict(idfSet);
+// 方式3：使用单个字符串（整个字符串作为一个条目，不会按换行拆分）
+nodejieba.loadIdfDict('单个词汇 20.0');
+// 方式4：使用 Buffer（必须是 UTF-8 编码，按行解析，每行一个条目）
+const idfBuffer = Buffer.from('缓冲词汇1 18.0\n缓冲词汇2 16.5');
+nodejieba.loadIdfDict(idfBuffer);
+// 关键词提取时会使用设置的 IDF 权重
+var keywords = nodejieba.extract('这是一个自定义词1和自定义词2的测试词汇', 3);
+console.log(keywords);
+// 输出: [
+//   { word: '自定义词1', weight: 15 },
+//   { word: '自定义词2', weight: 12.5 },
+//   { word: '测试词汇', weight: 10 }
+// ]
+```
+**支持包含空格的词组**：
+```js
+// 加载包含空格的词组
+nodejieba.loadIdfDict(['人工智能 技术 25.0', '机器 学习 22.0']);
+// 分词时会识别这些词组
+console.log(nodejieba.cut('人工智能技术的发展和机器学习的应用'));
+// 输出: ['人工智能技术', '的', '发展', '和', '机器学习', '的', '应用']
+// 关键词提取时使用设置的权重
+var keywords = nodejieba.extract('人工智能技术的发展和机器学习的应用', 5);
+console.log(keywords);
+// 输出: [
+//   { word: '人工智能技术', weight: 25 },
+//   { word: '机器学习', weight: 22 },
+//   ...
+// ]
+```
+**英文大小写不敏感**：
+```js
+// 加载英文 IDF 词组
+nodejieba.loadIdfDict(['MachineLearning 30.0', 'Deep Learning 28.0', 'AI Technology 26.0']);
+// 各种大小写都能正确匹配
+console.log(nodejieba.extract('MachineLearning is important', 2));
+// 输出: [{ word: 'MachineLearning', weight: 30 }, ...]
+console.log(nodejieba.extract('machinelearning is important', 2));
+// 输出: [{ word: 'machinelearning', weight: 30 }, ...]
+console.log(nodejieba.extract('MACHINELEARNING is important', 2));
+// 输出: [{ word: 'MACHINELEARNING', weight: 30 }, ...]
+// 包含空格的英文词组也能正确匹配
+console.log(nodejieba.extract('Deep Learning and AI Technology', 3));
+// 输出: [{ word: 'Deep Learning', weight: 28 }, { word: 'AI Technology', weight: 26 }, ...]
+// 不含空格的版本也能匹配
+console.log(nodejieba.extract('DeepLearning and AITechnology', 3));
+// 输出: [{ word: 'DeepLearning', weight: 28 }, { word: 'AITechnology', weight: 26 }, ...]
+```
+**IDF 词典条目格式**：
+```
+# 词语 IDF权重
+自定义词 15.0
+# 包含空格的词组
+Deep Learning 28.0
+Machine Learning 25.0
+# 中文词组
+人工智能 技术 25.0
+```
+**注意事项**：
+- IDF 值越大，词语在关键词提取时的权重越高
+- 建议根据词语的重要性设置合理的 IDF 值（通常在 5-30 之间）；IDF 为 0 或无法解析为数字的条目会被直接忽略
+- 加载 IDF 词典时会自动将词语添加到分词词典，无需额外调用 `loadUserDict`
 ### 包含空格的关键词（新功能）
 支持在自定义词典中使用包含空格的关键词，且支持无空格版本匹配和大小写不敏感匹配。

package/build/Release/nodejieba.node CHANGED Viewed

Binary file

package/index.js CHANGED Viewed

@@ -74,6 +74,7 @@ wrapWithDictLoad("extract");
 wrapWithDictLoad("textRankExtract");
 wrapWithDictLoad("insertWord");
 wrapWithDictLoad("loadUserDict");
+wrapWithDictLoad("loadIdfDict");
 wrapWithDictLoad("setIdf");
 // 保存原始的 loadUserDict 函数
@@ -100,4 +101,28 @@ exports.loadUserDict = function (dict) {
   return _loadUserDict.call(this, dict);
 };
+// 保存原始的 loadIdfDict 函数
+var _loadIdfDict = exports.loadIdfDict;
+// 重写 loadIdfDict 以支持 Set 格式
+exports.loadIdfDict = function (dict) {
+  if (!isDictLoaded) {
+    exports.load();
+  }
+  if (dict === null || dict === undefined) {
+    return false;
+  }
+  if (dict instanceof Set) {
+    dict = Array.from(dict);
+  }
+  if (typeof dict !== 'string' && !Array.isArray(dict) && !Buffer.isBuffer(dict)) {
+    throw new TypeError('dict must be string, string[], Set<string>, or Buffer');
+  }
+  return _loadIdfDict.call(this, dict);
+};
 module.exports = exports;

package/lib/nodejieba.cpp CHANGED Viewed

@@ -22,6 +22,7 @@ NodeJieba::NodeJieba(Napi::Env env, Napi::Object exports) {
     InstanceMethod("textRankExtract", &NodeJieba::textRankExtract),
     InstanceMethod("insertWord", &NodeJieba::insertWord),
     InstanceMethod("loadUserDict", &NodeJieba::loadUserDict),
+    InstanceMethod("loadIdfDict", &NodeJieba::loadIdfDict),
     InstanceMethod("setIdf", &NodeJieba::setIdf)
   });
 }
@@ -54,6 +55,178 @@ Napi::Value NodeJieba::load(const Napi::CallbackInfo& info) {
   return Napi::Boolean::New(info.Env(), true);
 }
+Napi::Value NodeJieba::loadIdfDict(const Napi::CallbackInfo& info) {
+  if (info.Length() < 1) {
+    return Napi::Boolean::New(info.Env(), false);
+  }
+  if( !_jieba_handle ){
+    Napi::Error::New(info.Env(), "Before calling any other function you have to call load() first").ThrowAsJavaScriptException();
+  }
+  auto isBlankString = [](const std::string& str) -> bool {
+    for (char c : str) {
+      if (!std::isspace(static_cast<unsigned char>(c))) {
+        return false;
+      }
+    }
+    return true;
+  };
+  auto trimString = [](std::string& str) -> void {
+    size_t start = 0;
+    size_t end = str.length();
+    while (start < end && std::isspace(static_cast<unsigned char>(str[start]))) {
+      start++;
+    }
+    while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) {
+      end--;
+    }
+    str = str.substr(start, end - start);
+  };
+  auto toUpper = [](const std::string& str) -> std::string {
+    std::string result = str;
+    for (size_t i = 0; i < result.length(); i++) {
+      result[i] = std::toupper(static_cast<unsigned char>(result[i]));
+    }
+    return result;
+  };
+  auto toLower = [](const std::string& str) -> std::string {
+    std::string result = str;
+    for (size_t i = 0; i < result.length(); i++) {
+      result[i] = std::tolower(static_cast<unsigned char>(result[i]));
+    }
+    return result;
+  };
+  auto hasEnglishChars = [](const std::string& str) -> bool {
+    for (char c : str) {
+      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
+        return true;
+      }
+    }
+    return false;
+  };
+  auto extractWordAndIdf = [](const std::string& line) -> std::pair<std::string, double> {
+    size_t lastSpace = line.find_last_of(" \t");
+    if (lastSpace == std::string::npos || lastSpace == 0) {
+      return std::make_pair("", 0.0);
+    }
+    std::string word = line.substr(0, lastSpace);
+    std::string idfStr = line.substr(lastSpace + 1);
+    char* endptr;
+    double idf = strtod(idfStr.c_str(), &endptr);
+    if (endptr == idfStr.c_str()) {
+      return std::make_pair("", 0.0);
+    }
+    return std::make_pair(word, idf);
+  };
+  auto removeSpaces = [](const std::string& str) -> std::string {
+    std::string result;
+    for (char c : str) {
+      if (!std::isspace(static_cast<unsigned char>(c))) {
+        result += c;
+      }
+    }
+    return result;
+  };
+  auto processIdfLine = [&](const std::string& line) {
+    std::pair<std::string, double> parsed = extractWordAndIdf(line);
+    std::string word = parsed.first;
+    double idf = parsed.second;
+    if (word.empty() || idf == 0.0) {
+      return;
+    }
+    _jieba_handle->SetIdfForWord(word, idf);
+    _jieba_handle->InsertUserWord(word, "x");
+    std::string wordNoSpace = removeSpaces(word);
+    if (wordNoSpace != word) {
+      _jieba_handle->SetIdfForWord(wordNoSpace, idf);
+      _jieba_handle->InsertUserWord(wordNoSpace, "x");
+    }
+    if (hasEnglishChars(word)) {
+      std::string wordLower = toLower(word);
+      std::string wordUpper = toUpper(word);
+      if (wordLower != word) {
+        _jieba_handle->SetIdfForWord(wordLower, idf);
+        _jieba_handle->InsertUserWord(wordLower, "x");
+      }
+      if (wordUpper != word) {
+        _jieba_handle->SetIdfForWord(wordUpper, idf);
+        _jieba_handle->InsertUserWord(wordUpper, "x");
+      }
+      std::string wordLowerNoSpace = toLower(wordNoSpace);
+      std::string wordUpperNoSpace = toUpper(wordNoSpace);
+      if (wordLowerNoSpace != wordNoSpace && wordLowerNoSpace != wordLower) {
+        _jieba_handle->SetIdfForWord(wordLowerNoSpace, idf);
+        _jieba_handle->InsertUserWord(wordLowerNoSpace, "x");
+      }
+      if (wordUpperNoSpace != wordNoSpace && wordUpperNoSpace != wordUpper) {
+        _jieba_handle->SetIdfForWord(wordUpperNoSpace, idf);
+        _jieba_handle->InsertUserWord(wordUpperNoSpace, "x");
+      }
+    }
+  };
+  if (info[0].IsArray()) {
+    Napi::Array arr = info[0].As<Napi::Array>();
+    for (size_t i = 0; i < arr.Length(); i++) {
+      Napi::Value val = arr[i];
+      if (!val.IsString()) {
+        Napi::TypeError::New(info.Env(), "Array elements must be strings")
+          .ThrowAsJavaScriptException();
+        return Napi::Boolean::New(info.Env(), false);
+      }
+      std::string line = val.As<Napi::String>().Utf8Value();
+      trimString(line);
+      if (!line.empty() && !isBlankString(line)) {
+        processIdfLine(line);
+      }
+    }
+  } else if (info[0].IsString()) {
+    std::string line = info[0].As<Napi::String>().Utf8Value();
+    trimString(line);
+    if (!line.empty() && !isBlankString(line)) {
+      processIdfLine(line);
+    }
+  } else if (info[0].IsBuffer()) {
+    Napi::Buffer<char> buffer = info[0].As<Napi::Buffer<char>>();
+    std::string content(buffer.Data(), buffer.Length());
+    std::istringstream iss(content);
+    std::string line;
+    while (std::getline(iss, line)) {
+      trimString(line);
+      if (!line.empty() && !isBlankString(line)) {
+        processIdfLine(line);
+      }
+    }
+  } else {
+    return Napi::Boolean::New(info.Env(), false);
+  }
+  return Napi::Boolean::New(info.Env(), true);
+}
 Napi::Value NodeJieba::setIdf(const Napi::CallbackInfo& info) {
   if (info.Length() < 1) {
     return Napi::Boolean::New(info.Env(), false);
@@ -352,7 +525,7 @@ Napi::Value NodeJieba::loadUserDict(const Napi::CallbackInfo& info) {
     for (const auto& line : dictLines) {
       std::string keyword = extractKeyword(line);
       if (!keyword.empty()) {
-        _jieba_handle->SetIdfForWordWithMultiplier(keyword, 1.3);
+        _jieba_handle->SetIdfForWordWithMultiplier(keyword, 2.0);
       }
     }
   };

package/lib/nodejieba.h CHANGED Viewed

@@ -21,6 +21,7 @@ private:
   Napi::Value textRankExtract(const Napi::CallbackInfo& info);
   Napi::Value insertWord(const Napi::CallbackInfo& info);
   Napi::Value loadUserDict(const Napi::CallbackInfo& info);
+  Napi::Value loadIdfDict(const Napi::CallbackInfo& info);
   Napi::Value setIdf(const Napi::CallbackInfo& info);
   cppjieba::Jieba* _jieba_handle{nullptr};

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "nodejieba-plus",
   "description": "chinese word segmentation for node",
-  "version": "3.5.17",
+  "version": "3.5.18",
   "author": "Yanyi Wu <wuyanyi09@foxmail.com>",
   "maintainers": [
     "Yanyi Wu <wuyanyi09@foxmail.com>"

package/submodules/cppjieba/include/cppjieba/Jieba.hpp CHANGED Viewed

@@ -124,6 +124,10 @@ class Jieba {
     extractor.SetIdfWithMultiplier(word, multiplier);
   }
+  void LoadIdfDict(const vector<string>& buf) {
+    extractor.LoadIdfDict(buf);
+  }
  private:
   static string pathJoin(const string& dir, const string& filename) {
     if (dir.empty()) {

package/submodules/cppjieba/include/cppjieba/KeywordExtractor.hpp CHANGED Viewed

@@ -52,6 +52,41 @@ class KeywordExtractor {
     }
   }
+  void LoadIdfDict(const vector<string>& buf) {
+    double idf = 0.0;
+    double idfSum = 0.0;
+    size_t validCount = 0;
+    for (size_t i = 0; i < buf.size(); i++) {
+      const string& line = buf[i];
+      if (line.empty()) {
+        continue;
+      }
+      size_t lastSpace = line.find_last_of(" \t");
+      if (lastSpace == string::npos) {
+        continue;
+      }
+      string word = line.substr(0, lastSpace);
+      string idfStr = line.substr(lastSpace + 1);
+      char* endptr;
+      idf = strtod(idfStr.c_str(), &endptr);
+      if (endptr == idfStr.c_str()) {
+        continue;
+      }
+      idfMap_[word] = idf;
+      idfSum += idf;
+      validCount++;
+    }
+    if (validCount > 0) {
+      idfAverage_ = idfSum / validCount;
+    }
+  }
   void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
     vector<Word> topWords;
     Extract(sentence, topWords, topN);

package/test_2x_weight.js ADDED Viewed

@@ -0,0 +1,39 @@
+var nodejieba = require("./index.js");
+console.log("=== 测试 2.0 倍 IDF 权重 ===\n");
+const content = "疯狂动物城 疯狂动物城 疯狂动物城 这是一个二次开发的项目，整合了原版的动画及Open Claw打包制作了MAC安装包，它可以出Open Claw现在你的系统桌面的任何地方，也会随互动有特定的动作，还蛮有意思的项目地址：https://github.com/justaLoli/VPet-Mac云盘：https://pan.quark.cn/s/62596470429a功能：✅开始、关闭、正常效果的动画播放✅拖动效果✅「互动」菜单里的互动，即睡觉、学习、工作等（带计时器，但没有经验、金钱加成）✅自动事件（发呆、待机、睡觉等）✅桌宠自动移动✅摸头预览";
+console.log("【测试 1: 未加载用户词典】");
+nodejieba.load();
+var result1 = nodejieba.extract(content, 10);
+console.log("关键词及权重:");
+result1.forEach((item, i) => {
+  console.log(`  ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
+});
+console.log("\n【测试 2: 加载用户词典（自动 2.0 倍权重）】");
+nodejieba.loadUserDict("Open Claw 10 n");
+var result2 = nodejieba.extract(content, 10);
+console.log("关键词及权重:");
+result2.forEach((item, i) => {
+  console.log(`  ${i + 1}. ${item.word}: ${item.weight.toFixed(2)}`);
+});
+console.log("\n【验证 2.0 倍权重】");
+var openClawWeight1 = result1.find(r => r.word === "Open" || r.word === "Claw");
+var openClawWeight2 = result2.find(r => r.word === "Open Claw");
+if (openClawWeight1 && openClawWeight2) {
+  console.log(`未加载时权重: ${openClawWeight1.weight.toFixed(2)}`);
+  console.log(`加载后权重: ${openClawWeight2.weight.toFixed(2)}`);
+  console.log(`权重倍数: ${(openClawWeight2.weight / openClawWeight1.weight).toFixed(2)}`);
+  var ratio = openClawWeight2.weight / openClawWeight1.weight;
+  if (Math.abs(ratio - 2.0) < 0.01) {
+    console.log("✅ 2.0 倍权重验证成功！");
+  } else {
+    console.log(`❌ 权重倍数不符合预期（期望 2.0，实际 ${ratio.toFixed(2)}）`);
+  }
+}
+console.log("\n=== 测试完成 ===");

package/test_load_idf_dict.js ADDED Viewed

@@ -0,0 +1,98 @@
+var nodejieba = require('./index.js');
+console.log('=== 测试 loadIdfDict 功能 ===\n');
+// 初始化
+nodejieba.load();
+// 测试1: 使用数组加载IDF
+console.log('测试1: 使用数组加载IDF');
+var idfArray = [
+  '自定义词1 15.0',
+  '自定义词2 12.5',
+  '测试词汇 10.0'
+];
+var result1 = nodejieba.loadIdfDict(idfArray);
+console.log('数组加载结果:', result1);
+// 测试关键词提取
+var keywords1 = nodejieba.extract('这是一个自定义词1和自定义词2的测试词汇', 3);
+console.log('提取的关键词:', keywords1);
+console.log();
+// 测试2: 使用字符串加载单个IDF
+console.log('测试2: 使用字符串加载单个IDF');
+var result2 = nodejieba.loadIdfDict('单个词汇 20.0');
+console.log('字符串加载结果:', result2);
+var keywords2 = nodejieba.extract('这是一个单个词汇的测试', 3);
+console.log('提取的关键词:', keywords2);
+console.log();
+// 测试3: 使用Buffer加载IDF
+console.log('测试3: 使用Buffer加载IDF');
+var idfBuffer = Buffer.from('缓冲词汇1 18.0\n缓冲词汇2 16.5');
+var result3 = nodejieba.loadIdfDict(idfBuffer);
+console.log('Buffer加载结果:', result3);
+var keywords3 = nodejieba.extract('缓冲词汇1和缓冲词汇2的测试', 3);
+console.log('提取的关键词:', keywords3);
+console.log();
+// 测试4: 使用Set加载IDF
+console.log('测试4: 使用Set加载IDF');
+var idfSet = new Set(['集合词汇1 14.0', '集合词汇2 13.0']);
+var result4 = nodejieba.loadIdfDict(idfSet);
+console.log('Set加载结果:', result4);
+var keywords4 = nodejieba.extract('集合词汇1和集合词汇2的测试', 3);
+console.log('提取的关键词:', keywords4);
+console.log();
+// 测试5: 支持包含空格的词组
+console.log('测试5: 支持包含空格的词组');
+var idfWithSpace = ['人工智能 技术 25.0', '机器 学习 22.0'];
+var result5 = nodejieba.loadIdfDict(idfWithSpace);
+console.log('包含空格词组加载结果:', result5);
+// 测试提取包含空格的词组
+var keywords5a = nodejieba.extract('人工智能技术的发展和机器学习的应用', 5);
+console.log('提取的关键词(含空格):', keywords5a);
+// 测试不含空格的版本也能匹配
+var keywords5b = nodejieba.extract('人工智能技术发展和机器学习应用', 5);
+console.log('提取的关键词(不含空格):', keywords5b);
+console.log();
+// 测试6: 英文大小写不敏感
+console.log('测试6: 英文大小写不敏感');
+var idfEnglish = ['MachineLearning 30.0', 'Deep Learning 28.0', 'AI Technology 26.0'];
+var result6 = nodejieba.loadIdfDict(idfEnglish);
+console.log('英文IDF加载结果:', result6);
+// 测试与词典写法一致的原始大小写匹配
+var keywords6a = nodejieba.extract('MachineLearning is important', 3);
+console.log('大写匹配:', keywords6a);
+// 测试小写匹配
+var keywords6b = nodejieba.extract('machinelearning is important', 3);
+console.log('小写匹配:', keywords6b);
+// 测试全大写匹配
+var keywords6c = nodejieba.extract('MACHINELEARNING is important', 3);
+console.log('全大写匹配:', keywords6c);
+// 测试包含空格的英文词组
+var keywords6d = nodejieba.extract('Deep Learning and AI Technology', 3);
+console.log('包含空格英文词组:', keywords6d);
+// 测试不含空格的英文词组
+var keywords6e = nodejieba.extract('DeepLearning and AITechnology', 3);
+console.log('不含空格英文词组:', keywords6e);
+// 测试小写版本的包含空格英文词组
+var keywords6f = nodejieba.extract('deep learning and ai technology', 3);
+console.log('小写包含空格英文词组:', keywords6f);
+console.log();
+console.log('=== 所有测试完成 ===');

package/types/index.d.ts CHANGED Viewed

@@ -28,5 +28,6 @@ declare module "nodejieba" {
   export function insertWord(word: string, tag?: string): boolean;
   export function cutSmall(sentence: string, small: number): string[];
   export function loadUserDict(dict: string | string[] | Set<string> | Buffer): boolean;
+  export function loadIdfDict(dict: string | string[] | Set<string> | Buffer): boolean;
   export function setIdf(word: string, idf?: number, multiplier?: number): boolean;
 }