RubyGems - pluskid-rmmseg-cpp - Versions diffs - 0.2.2 → 0.2.3 - Mend

pluskid-rmmseg-cpp 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/data/words.dic CHANGED Viewed

@@ -2602,7 +2602,6 @@
 4 犬牙相临
 2 变参
 2 大典
-3 #的话
 2 标准
 2 守护
 2 过失
@@ -15090,7 +15089,6 @@
 2 分开
 4 中外观众
 2 只只
-4 #的意思
 4 悔过自责
 2 为重
 4 面带笑容
@@ -17189,7 +17187,6 @@
 4 神怡心醉
 4 沉吟不语
 2 全岛
-3 #的卡
 3 统计员
 2 中师
 2 不尽
@@ -20816,7 +20813,6 @@
 2 极佳
 2 夜闭
 3 立体镜
-3 #的士
 2 冥冥
 4 拨号网络
 2 水师
@@ -20838,7 +20834,6 @@
 2 半透
 2 或许
 2 邻水
-3 #我刚
 2 短局
 3 近卫军
 2 调卷
@@ -26193,7 +26188,6 @@
 3 印刷体
 3 饰头巾
 2 参预
-4 #的确良
 2 预设
 2 一着
 4 反战运动
@@ -27457,7 +27451,6 @@
 2 家姓
 2 绝妙
 2 触网
-4 #的方向
 3 厘米波
 2 赌马
 4 涅而不缁
@@ -29576,7 +29569,6 @@
 2 阅读
 3 超短波
 2 论件
-3 #的当
 4 鼎足之势
 5 少壮不努力
 4 八方呼应
@@ -36413,7 +36405,6 @@
 2 午狮
 2 人学
 2 舒兰
-4 #的中部
 2 弄弄
 2 幼教
 2 强行
@@ -41703,7 +41694,6 @@
 3 受精卵
 4 强拉硬扯
 2 福特
-3 #的呢
 2 舒卷
 4 凌杂米盐
 2 跃至
@@ -43930,7 +43920,6 @@
 4 狂轰滥炸
 2 火神
 2 结节
-4 #的结果
 2 铆钉
 2 名目
 4 甘言厚币
@@ -57228,7 +57217,6 @@
 2 榆木
 4 首恶必办
 2 河池
-4 #的下方
 2 滑动
 4 克爱克威
 4 言出患入
@@ -61452,7 +61440,6 @@
 2 敲定
 2 改动
 2 复摆
-3 #而不
 2 治乱
 2 门外
 3 粗制品
@@ -67612,7 +67599,6 @@
 2 云块
 2 氮族
 2 基值
-4 #的样子
 2 举手
 5 从群众中来
 3 触发器
@@ -74495,7 +74481,6 @@
 3 八一队
 3 讽刺文
 2 桅帆
-3 #的心
 2 茶房
 2 要塌
 2 附识
@@ -78987,7 +78972,6 @@
 4 不复存在
 3 减速器
 3 触摸者
-4 #的年代
 2 唇形
 4 势难两全
 2 恕不
@@ -79343,7 +79327,6 @@
 4 晚生后学
 4 不经一事
 2 刺激
-3 #的人
 2 空检
 2 代职
 4 既往不咎
@@ -95875,7 +95858,6 @@
 2 好生
 4 抽样调查
 4 闻雷失箸
-4 #的时候
 4 抱蔓摘瓜
 2 变干
 2 监测
@@ -102186,7 +102168,6 @@
 2 英史
 3 适配器
 4 长驱直入
-3 #的死
 2 鸥类
 4 遁入空门
 4 言听计从
@@ -111611,7 +111592,6 @@
 4 摩天大厦
 2 代营
 2 相辅
-4 #的意义
 2 钢管
 2 行贿
 4 雕章镂句
@@ -114283,7 +114263,6 @@
 4 神经纤维
 2 暗补
 2 工矿
-3 #的确
 4 新硎初试
 2 猛醒
 3 指定者
@@ -115249,7 +115228,6 @@
 2 厚板
 4 拥兵玩寇
 2 互通
-4 #的一端
 2 词法
 4 互为表里
 3 该记住

data/ext/rmmseg/rmmseg.cpp CHANGED Viewed

@@ -68,6 +68,7 @@ extern "C" {
     static void tk_mark(Token *t)
     {
+        // start and end are Fixnums, no need to mark
         rb_gc_mark(t->text);
     }
     static void tk_free(Token *t)

data/lib/rmmseg/dictionary.rb CHANGED Viewed

@@ -31,7 +31,9 @@ module RMMSeg
       #
       # except the first number is not the frequency, but the number of
       # characters (not number of bytes) in the word.
-      #
+      #
+      # There's a script (convert.rb) in the tools directory that can be used
+      # to convert and normalize dictionaries.
       attr_accessor :dictionaries
       # Add a user defined dictionary, +type+ can be

data/lib/rmmseg/ferret.rb CHANGED Viewed

@@ -20,7 +20,7 @@ module RMMSeg
       end
       def token_stream(field, text)
-        t = PunctuationFilter.new(Tokenizer.new(text))
+        t = Tokenizer.new(text)
         if @brk
           @brk.call(t)
         else
@@ -61,56 +61,5 @@ module RMMSeg
         @algor = Algorithm.new(@text)
       end
     end
-    # PunctuationFilter filter out the stand alone Chinese
-    # punctuation tokens.
-    class PunctuationFilter < ::Ferret::Analysis::TokenStream
-      # The punctuation dictionary.
-      class Dictionary
-        include Singleton
-        DIC_FILE = File.join(File.dirname(__FILE__),
-                             "..",
-                             "..",
-                             "data",
-                             "punctuation.dic")
-        def initialize
-          @dic = Hash.new
-          File.open(DIC_FILE, "r") do |f|
-            f.each_line { |line|
-              @dic[line.chomp.freeze] = nil
-            }
-          end
-        end
-        def include?(str)
-          @dic.has_key?(str)
-        end
-      end
-      def initialize(stream)
-        @stream = stream
-      end
-      # Get next token, skip stand alone Chinese punctuations.
-      def next
-        token = @stream.next
-        dic = Dictionary.instance
-        until token.nil? || !(dic.include? token.text)
-          token = @stream.next
-        end
-        token
-      end
-      def text
-        @stream.text
-      end
-      def text=(str)
-        @stream.text = str
-      end
-    end
   end
 end

data/tools/convert.rb ADDED Viewed

@@ -0,0 +1,114 @@
+#!/usr/bin/ruby
+# A utility used to convert the old RMMSeg dictionary
+# to rmmseg-cpp format.
+# There are several constrains for the new rmmseg-cpp
+# dictionary format.
+#  - length of word should be specified in the dict
+#  - number and string should be separated by ONE space
+#  - there should be a newline at the end of file
+$KCODE='u'
+require 'jcode'
+def usage(msg=nil)
+  puts "***ERROR: #{msg}\n\n" if msg
+  puts <<EOT
+Usage:
+#{$0} action type input.dic output.dic
+  action: either 'convert' or 'normalize'
+           - 'convert' is used to convert the dict from
+             old RMMSeg format.
+           - 'normalize' is used to normalize an existing
+             rmmseg-cpp dict.
+  type:   either 'words' or 'chars'
+EOT
+  exit(0)
+end
+usage if ARGV.size != 4
+usage("unknown action #{ARGV[0]}") if ! ['convert', 'normalize'].include? ARGV[0]
+usage("unknown type #{ARGV[1]}") if ! ['words', 'chars'].include? ARGV[1]
+def output(data)
+  File.open(ARGV[3], "w") do |f|
+    data.each do |num, word|
+      f.puts "#{num} #{word}" if word
+    end
+  end
+end
+def read_RMMSeg_chars
+  max = 0
+  File.readlines(ARGV[2]).map do |line|
+    if line =~ /^(.)\s+(\d+)$/
+      n = $2.to_i
+      max = n if n > max
+      [n, $1]
+    else
+      [nil, nil]
+    end
+  end.map do |num, word|
+    if word
+      [num*65535/max, word]
+    else
+      [nil, nil]
+    end
+  end
+end
+def read_RMMSeg_words
+  File.readlines(ARGV[2]).map do |line|
+    line.chomp!
+    if !line.empty?
+      [line.jlength, line]
+    else
+      [nil, nil]
+    end
+  end
+end
+def read_rmmseg_cpp_chars
+  max = 0
+  File.readlines(ARGV[2]).map do |line|
+    if line =~ /^(\d+)\s+(.)$/
+      n = $1.to_i
+      max = n if n > max
+      [n, $2]
+    else
+      [nil, nil]
+    end
+  end.map do |num, word|
+    if word
+      [num*65535/max, word]
+    else
+      [nil, nil]
+    end
+  end
+end
+def read_rmmseg_cpp_words
+  File.readlines(ARGV[2]).map do |line|
+    if line =~ /^(\d+)\s+(\w+)$/
+      [$1, $2]
+    else
+      [nil, nil]
+    end
+  end
+end
+case ARGV[0,2]
+when ['convert', 'chars']
+  output(read_RMMSeg_chars)
+when ['convert', 'words']
+  output(read_RMMSeg_words)
+when ['normalize', 'chars']
+  output(read_rmmseg_cpp_chars)
+when ['normalize', 'words']
+  output(read_rmmseg_cpp_words)
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pluskid-rmmseg-cpp
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.2.3
 platform: ruby
 authors:
 - pluskid
@@ -36,9 +36,9 @@ files:
 - ext/rmmseg/word.h
 - ext/rmmseg/chunk.h
 - ext/rmmseg/memory.h
-- data/punctuation.dic
 - data/words.dic
 - data/chars.dic
+- tools/convert.rb
 - README
 - ext/rmmseg/extconf.rb
 has_rdoc: true

data/data/punctuation.dic DELETED Viewed

@@ -1,79 +0,0 @@
-｛
-×
-π
-）
-〖
-；
-〗
-＜
-°
-“
-＋
-◆
-♀
-＝
-±
-←
-｝
-，
-”
-㎡
-◇
-＞
-↑
-～
-△
-？
-♂
-‰
-——
-→
-■
-￥
-－
-＠
-≈
-↓
-□
-〈
-′
-〉
-／
-★
-《
-○
-″
-☆
-》
-·
-∶
-！
-『
-§
-●
-』
-…
-【
-℃
-＃
-÷
-】
-№
-＄
-※
-≤
-‖
-％
-≥
-＆
-、
-‘
-〔
-。
-’
-√
-（
-〕
-￡
-：