RubyGems - kokugo_tagger - Versions diffs - 0.0.6 → 1.0.3 - Mend

kokugo_tagger 0.0.6 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/README.md +36 -11
data/bin/kokugo_tagger +23 -2
data/lib/kokugo_tagger/cabocha.rb +75 -0
data/lib/kokugo_tagger/kokugo.model +0 -0
data/lib/kokugo_tagger/tagger.rb +222 -132
data/lib/kokugo_tagger/version.rb +1 -1
data/lib/kokugo_tagger.rb +2 -2
metadata +5 -5
data/lib/kokugo_tagger/parser.rb +0 -47

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 07d91224845f955eb5e2433d059361d99f2de2f4
-  data.tar.gz: fbe4d1d40cd9f533f745c28bcacc3954aee77f58
+  metadata.gz: 572d91b93afea39ab298c267f0068436e62670c1
+  data.tar.gz: a5ffe79141ca346aa6c346b7514dab584eb234eb
 SHA512:
-  metadata.gz: 700d2dcf6d0dab285c7036ea32ba3e88ae3f5f51edb5434dd0a32c61745a6455ef0aad9a67074452c82a6036485136c021c8ecd05a50c30393351430e9e9a157
-  data.tar.gz: 713b0e7ff1872541a8fc5fd6dcd681cb58fb8d2d988cef6694650ee18d53f6a0529946cac4f4fce117ae4630286ba86015a923f901966553e6dea0c2ad9cda95
+  metadata.gz: 57414c4a4ec004c4f8a1483118ef638d58d744e1b0b71a9e7cb43a1f89e660de3fd9fe4408dc350901f5afb7c3094a2ad83b8461c406766b8b554b9345c0f411
+  data.tar.gz: 459cc09789690d93f943d027af74481aecce5ebd3283181c77c809c1f83f0e6fef56db379c10da69c3bc7a313a9c0711a3ce35fe5a04957601bbc9a24ddcf861

data/README.md CHANGED Viewed

@@ -1,26 +1,51 @@
 # KokugoTagger
-TODO: Write a gem description
+cabocha形式のファイルに対して、学校文法に準拠した係り受けラベルを付与します。
 ## Installation
-Add this line to your application's Gemfile:
+事前に以下のツールをインストールし、パスを通しておく必要があります。
-```ruby
-gem 'kokugo_tagger'
-```
+- Ruby
+- YamCha
-And then execute:
-    $ bundle
-Or install it yourself as:
+コマンドラインから以下のように入力し、インストールしてください。
     $ gem install kokugo_tagger
 ## Usage
-TODO: Write usage instructions here
+UTF-8/UniDicのCaboCha形式データにのみ対応しています。CaboCha形式のファイルにラベルを付与する場合は、次のように実行してください。
+    $ cat neko.cabocha | kokugo_tagger > output.cabocha
+プレーンテキストの場合は、次のようにCaboChaと組み合わせて下さい。CaboChaは別途インストールしてください。
+    $ cat neko.txt | cabocha -f1 | kokugo_tagger > output.cabocha
+コマンドライン上で対話的に実行することもできます。
+    $ cabocha -f1 | kokugo_tagger
+    吾輩は猫である
+    * 0 1S 0/1 0.000000
+    吾輩	代名詞,*,*,*,*,*,ワガハイ,我が輩,吾輩,ワガハイ,吾輩,ワガハイ,混,*,*,*,*,ワガハイ,ワガハイ,ワガハイ,ワガハイ,*,*,0,*,*	O
+    は	助詞,係助詞,*,*,*,*,ハ,は,は,ワ,は,ワ,和,*,*,*,*,ハ,ハ,ハ,ハ,*,*,*,"動詞%F2@0,名詞%F1,形容詞%F2@-1",*	O
+    * 1 -1X 2/2 0.000000
+    猫	名詞,普通名詞,一般,*,*,*,ネコ,猫,猫,ネコ,猫,ネコ,和,*,*,*,*,ネコ,ネコ,ネコ,ネコ,*,*,1,C4,*	O
+    で	助動詞,*,*,*,助動詞-ダ,連用形-一般,ダ,だ,で,デ,だ,ダ,和,*,*,*,*,デ,ダ,デ,ダ,*,*,*,名詞%F1,*	O
+    ある	動詞,非自立可能,*,*,五段-ラ行,終止形-一般,アル,有る,ある,アル,ある,アル,和,*,*,*,*,アル,アル,アル,アル,*,*,1,C3,*	O
+    EOS
+係り受けラベルは以下の8種類です。
+- S: 主語
+- R: 連用修飾語
+- T: 連体修飾語
+- Z: 接続語
+- D: 独立語
+- H: 並立の関係
+- J: 補助の関係
+- X: その他(文末など)
 ## Contributing

data/bin/kokugo_tagger CHANGED Viewed

@@ -1,6 +1,27 @@
 #!/usr/bin/env ruby
+require 'optparse'
 require 'kokugo_tagger'
-Encoding.default_external = 'UTF-8'
-KokugoTagger.annotate ARGF
+$enc = "UTF-8"
+$model = nil
+$learn = nil
+$valid = nil
+OptionParser.new do |opt|
+	Version = "0.1.0"
+	opt.on('-e STR', '--encoding STR', String, 'encoding'){|str| $enc = str }
+	opt.on('-m FILE', '--model FILE', String, 'model file'){|file| $model = file }
+	opt.on('-l DIR', '--learn DIR', String, 'train corpus directory'){|dir| $learn = dir }
+	opt.on('-v DIR', '--valid DIR', String, 'K-fold cross-validation'){|dir| $learn = dir; $valid = 3 }
+	opt.parse!
+end
+# $stdin.set_encoding 'UTF-8'
+if $learn and $valid
+	KokugoTagger.validation $learn, $enc, $valid
+elsif $learn
+	KokugoTagger.learn $learn, $model, $enc
+else
+	KokugoTagger.annotate $stdin, $model, $enc
+end

data/lib/kokugo_tagger/cabocha.rb ADDED Viewed

@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+module KokugoTagger::Cabocha
+	module_function
+	def parse(file = @source)
+		document = KokugoTagger::Cabocha::Document.new
+		sentence, chunk = nil, nil
+		file.each_line do |line|
+			sentence ||= KokugoTagger::Cabocha::Sentence.new
+			case line
+			when /^EOS/
+				sentence.each{|chunk| chunk.detect_structure} # sem_headとsyn_headの独自判定
+				document << sentence
+				sentence, chunk = nil, nil
+			when /^\*/
+				chunk = KokugoTagger::Cabocha::Chunk.new(line)
+				sentence << chunk
+				chunk.sentence = sentence
+			when /^#/
+				# nothing
+			else
+				token = KokugoTagger::Cabocha::Token.new(line)
+				chunk.tokens << token
+			end
+		end
+		return document
+	end
+end
+class KokugoTagger::Cabocha::Document < Array
+end
+class KokugoTagger::Cabocha::Sentence < Array
+end
+class KokugoTagger::Cabocha::Chunk
+	attr_accessor :info, :id, :link, :rel, :sem_head_id, :syn_head_id, :ext, :tokens, :sentence
+	def initialize(line)
+		@info = line.chomp.split(/\s/)[1..-1]
+		@id = @info[0].to_i
+		@link = @info[1].to_i
+		@rel = @info[1].delete("-0-9")
+		@sem_head_id = @info[2].split('/')[0].to_i
+		@syn_head_id = @info[2].split('/')[1].to_i
+		@ext = @info[4]
+		@tokens = []
+	end
+	def detect_structure
+		sem_head_id, syn_head_id = 0, 0
+		@tokens.each_with_index do |token, num|
+			if token.pos =~ /^(助詞|助動詞)/
+				syn_head_id = num
+			elsif token.pos !~ /^(補助記号|空白)/
+				sem_head_id = num if sem_head_id == syn_head_id
+				syn_head_id = num
+			end
+		end
+		@sem_head_id, @syn_head_id = sem_head_id, syn_head_id
+	end
+	def text
+		@tokens.map{|token| token.text}.join
+	end
+end
+class KokugoTagger::Cabocha::Token
+	attr_accessor :info, :text, :pos, :ctype, :cform
+	def initialize(line)
+		text, info = line.chomp.split("\t")
+		@info = info.split(",")
+		@text = text
+		@pos = @info[0, 4].delete_if{|s| s == '*'}.join('-')
+		@ctype = @info[4]
+		@cform = @info[5]
+	end
+end

data/lib/kokugo_tagger/kokugo.model ADDED Viewed

Binary file

data/lib/kokugo_tagger/tagger.rb CHANGED Viewed

@@ -1,151 +1,241 @@
 # -*- coding: utf-8 -*-
-require 'csv'
 module KokugoTagger
 	module_function
-	def annotate(file)
-		file.each_line do |line|
-			next unless data = CabochaParser.parse(line)
-			method_name = data[:type].downcase.to_sym
-			method(method_name).call(data) if methods.include?(method_name)
-			puts line
+	def annotate(source, model, enc)
+		enc ||= "UTF-8"
+		source.set_encoding enc
+		source_0, source_1 = KokugoTagger::Duplicator.duplicate(source, 2)
+		converter = KokugoTagger::Converter.connect(source_0)
+		yamcha = KokugoTagger::Yamcha.connect(converter, model)
+		merger = KokugoTagger::Merger.connect(yamcha, source_1)
+		merger.each_line{|line| puts line }
+	end
+	def convert(source)
+		converter = KokugoTagger::Converter.connect(source)
+		converter.each_line{|line| puts line }
+	end
+	def learn(source_dir, model, enc)
+		model ||= "kokugo"
+		enc ||= "UTF-8"
+		model = File.basename(model, ".model")
+		KokugoTagger::Learner.learn source_dir, model, enc
+	end
+	def validation(source_dir, enc, k)
+		enc ||= "UTF-8"
+		k_num ||= 3
+		KokugoTagger::Learner.validation source_dir, enc, k
+	end
+end
+module KokugoTagger::Duplicator
+	module_function
+	def duplicate(source, number = 2)
+		pipes = number.times.map{ IO.pipe("UTF-8") }
+		Thread.fork do
+			source.each_line { |line| pipes.each{|r, w| w.puts line } }
+			pipes.each{|r, w| w.close}
+		end.abort_on_exception = true
+		return pipes.map{|r, w| r }
+	end
+end
+module KokugoTagger::Converter
+	module_function
+	def connect(source)
+		read, write = IO.pipe("UTF-8")
+		Thread.fork do
+			self.process source, write
+			write.close
+		end.abort_on_exception = true
+		return read
+	end
+	def process(source, output)
+		buffer = ""
+		source.each_line do |line|
+			buffer << line
+			self.flush(buffer, output) if line.chomp == "EOS"
 		end
 	end
-	def chunk(data)
-		@chunks ||= []
-		@chunks << @chunk = data
-		@lpos ||= 0
-		@chunk.update start:@lpos, end:@lpos, text:'', pos:nil, pred:nil, conj:nil
-	end
-	def token(data)
-		@lpos += data[:text].size
-		@chunk[:end] = @lpos
-		@chunk[:text] += data[:text]
-		pos data
-		cform data
-	end
-	def segment_s(data)
-		@segments ||= []
-		@segments << data
-		@last_item = data
-	end
-	def group_s(data)
-		@groups ||= []
-		@groups << data
-		@last_item = data
-	end
-	def attr(data)
-		@last_item[:attributes] ||= []
-		@last_item[:attributes] << data
-	end
-	def eos(data)
-		return unless @chunks
-		before_eos
-		@chunks.each do |chunk|
-			puts '#! SEGMENT_S bccwj-kok:Bnst %d %d "%s"' % [chunk[:start], chunk[:end], chunk[:text]]
-			puts '#! ATTR bccwj-kok:pred "%s述語"' % chunk[:pos] if chunk[:pred]
-			puts '#! ATTR bccwj-kok:conj "%s"' % chunk[:conj] if chunk[:conj]
+	def flush(buffer, output)
+		document = KokugoTagger::Cabocha.parse(buffer)
+		document.each do |sentence|
+			sentence.each { |chunk| output.puts chunk_features(chunk).join(" ") }
+			output.puts
 		end
-		@chunks, @chunk, @lpos, @segments, @groups = nil
-	end
-	def pos(token)
-		case token[:pos]
-		when /^(名詞|代名詞|接尾辞-名詞的)/
-			@chunk.update pos:'名詞', pred:nil, conj:nil
-		when /^(形状詞|接尾辞-形状詞的)/
-			@chunk.update pos:'形状詞', pred:nil, conj:nil
-		when /^連体詞/
-			@chunk.update pos:'連体詞', pred:nil, conj:'修飾(連体)'
-		when /^副詞/
-			@chunk.update pos:'副詞', pred:nil, conj:'修飾(連用)'
-		when /^接続詞/
-			@chunk.update pos:'接続詞', pred:nil, conj:'接続'
-		when /^感動詞/
-			@chunk.update pos:'感動詞', pred:nil, conj:'独立'
-		when /^(動詞|接尾辞-動詞的)/
-			@chunk.update pos:'動詞', pred:true, conj:nil
-		when /^(形容詞|接尾辞-形容詞的)/
-			@chunk.update pos:'形容詞', pred:true, conj:nil
-		when /^助動詞/
-			@chunk.update pred:true, conj:nil
-		when /^助詞-格助詞/
-			case token[:text]
-			when 'が'
-				@chunk.update conj:'主語'
-			when 'を', 'に'
-				@chunk.update conj:'補語'
-			when 'の', 'との', 'という', 'といった'
-				@chunk.update conj:'修飾(連体)'
-			else
-				@chunk.update conj:'修飾(連用)'
+		buffer.clear
+	end
+	def token_features(token)
+		return %w(* * * *) unless token
+		text, pos, cform = token.text, token.pos.split('-'), token.cform.split('-')
+		features = [text, pos[0], pos[1], cform[0]].map{|f| f || '*'}
+		return features
+	end
+	def chunk_features(chunk)
+		tokens = [sem_head(chunk), case_marker(chunk), syn_head(chunk), punct(chunk), sem_head(link_to(chunk))]
+		features = tokens.map{|token| token_features(token)}
+		return [chunk.text, features, chunk.rel].flatten
+		# return [chunk.text, features, chunk.ext].flatten
+	end
+	def sem_head(chunk)
+		return nil unless chunk
+		chunk.tokens[chunk.sem_head_id]
+	end
+	def case_marker(chunk)
+		return nil unless chunk
+		chunk.tokens.find{|token| token.pos.split('-')[1] == '格助詞'}
+	end
+	def syn_head(chunk)
+		return nil unless chunk
+		chunk.tokens[chunk.syn_head_id]
+	end
+	def punct(chunk)
+		return nil unless chunk
+		chunk.tokens[chunk.syn_head_id + 1]
+	end
+	def link_to(chunk)
+		chunk.sentence[chunk.link] if chunk.link != -1
+	end
+end
+module KokugoTagger::Yamcha
+	module_function
+	def connect(source, model)
+		model ||= File.dirname(__FILE__) + "/kokugo.model"
+		io = IO.popen("yamcha -m \"#{model}\"", "r+", encoding: "UTF-8")
+		Thread.fork {
+			source.each_line{|line| io.puts line }
+			io.close_write
+		}.abort_on_exception = true
+		return io
+	end
+end
+module KokugoTagger::Merger
+	module_function
+	def connect(yamcha, cabocha)
+		read, write = IO.pipe("UTF-8")
+		Thread.fork do
+			self.process yamcha, cabocha, write
+			write.close
+		end.abort_on_exception = true
+		return read
+	end
+	def process(yamcha, cabocha, output)
+		cabocha.each_line do |line|
+			if line[0] == "*"
+				record = yamcha.gets
+				record = yamcha.gets until record.chomp != ""
+				letter = record.chomp.split("\t").last.upcase
+				line.sub! /[A-Z]+/, letter
 			end
-		when /^(助詞-副助詞|助詞-係助詞)/
-			@chunk.update conj:'修飾(連用)'
-		when /^助詞-接続詞/
-			@chunk.update pred:true, conj:'接続'
-		when /^助詞-終助詞/
-			@chunk.update pred:true, conj:nil
-		when /^助詞-準体助詞/
-			@chunk.update conj:nil
+			output.puts line
 		end
 	end
-	def cform(token)
-		case token[:cform]
-		when /^語幹/
-		when /^(未然形|連用形|仮定形|已然形)/
-			@chunk.update conj:'接続'
-		when /^(意志推量形|連体形)/
-			@chunk.update conj:'修飾(連体)'
-		when /^(終止形|命令形)/
-			@chunk.update conj:nil
-		end
-	end
-	def before_eos
-		# BCCWJ-DepPara
-		@chunks.each do |chunk|
-			chunk[:conj] = [chunk[:conj], '断片'].compact.join(':') if chunk[:rel] == 'F'
-			chunk[:conj] = [chunk[:conj], '文節内'].compact.join(':') if chunk[:rel] == 'B'
-			chunk[:conj] = '文末' if chunk[:rel] == 'Z'
+end
+module KokugoTagger::Learner
+	module_function
+	def learn(source_dir, model, enc)
+		model ||= "kokugo"
+		convert source_dir, "train.data", enc
+		yamcha_learn "train.data", model
+	end
+	def validation(source_dir, enc, k)
+		convert source_dir, "train.data", enc
+		filenames = split("train.data", enc, k)
+		filenames.each_with_index do |filename, n|
+			others = filenames - [filename]
+			concat others, "temp.data"
+			model = "test"
+			yamcha_learn "temp.data", model
+			result = "result.#{n}.data"
+			system "cat #{filename} | yamcha -m test.model > #{result}"
 		end
-		# 並列・同格関係
-		@groups ||= []
-		@segments ||= []
-		@groups.each do |group|
-			next unless group[:name] =~ /^(Parallel|Apposition)$/
-			members = group[:member].map{|n| n.to_i}
-			members = @segments.values_at(*members)
-			chunk_ids = members.map do |segment|
-				_end = segment[:end].to_i
-				chunk = @chunks.find{|c| c[:start] < _end and c[:end] >= _end}
-				chunk[:id].to_i if chunk
-			end
-			chunk_ids = chunk_ids.compact.uniq.sort
-			if chunk_ids.size > 1
-				conj = {'Parallel' => '並立', 'Apposition' => '同格'}[group[:name]]
-				chunk_ids[0..-2].each{|cid| @chunks[cid][:conj] = conj}
+		data_set = []
+		filenames = Array.new(k){|n| "result.#{n}.data"}
+		filenames.each do |filename|
+			sentence = 0
+			chunk = 0
+			accuracy = 0
+			label_data = Hash.new{|h, k| h[k] = Hash.new(0)}
+			File.foreach(filename, encoding: enc) do |line|
+				line.chomp!
+				if line.empty?
+					sentence += 1
+				else
+					t, p = *line.split(/\s/)[-2, 2]
+					acc = (t == p)
+					chunk += 1
+					accuracy += 1 if acc
+					label_data[t][[true, acc]] += 1
+					label_data[p][[acc, true]] += 1
+				end
 			end
+			data_set << [sentence, chunk, accuracy, label_data]
 		end
-		# 属性を付与できなかった文節に対して、係り受けを利用して属性を補完
-		# 連用成分を受ける文節を述語とみなす
-		@chunks.each do |chunk|
-			chunk[:pred] ||= @chunks.any?{|_chunk| _chunk[:link] == chunk[:id] && _chunk[:conj] =~ /^(主語|補語|修飾\(連用\)|接続)$/}
+		report = open("validation.txt", "w:UTF-8")
+		report.puts "# #{k}-fold cross-validation"
+		report.puts
+		report.puts "## test files"
+		k.times do |n|
+			sentence, chunk = *data_set[n]
+			report.puts "train.#{n}.data: #{sentence} sentences. #{chunk} chunks."
 		end
-		# 述語にかかる文節を修飾(連用)とみなす
-		@chunks.each do |chunk|
-			chunk[:conj] = '修飾(連用)' if chunk[:conj] == nil && @chunks.any?{|_chunk| _chunk[:id] == chunk[:link] && _chunk[:pred]}
+		report.puts
+		k.times do |n|
+			sentence, chunk, accuracy, label_data = *data_set[n]
+			report.puts "## train.#{n}.data"
+			report.puts "accuracy: #{accuracy.to_f / chunk}"
+			report.puts "labels:"
+			report.puts %w(label tp tn fp fn recall precision f-score accuracy).join("\t")
+			label_data.sort.each do |label, data|
+				tp = data[[true, true]]
+				tn = data[[true, false]]
+				fp = data[[false, true]]
+				fn = chunk - (tp + tn + fp)
+				recall = tp.to_f / (tp + tn)
+				precision = tp.to_f / (tp + fp)
+				f = 2 * precision * recall / (precision + recall)
+				acc = (tp + fn).to_f / chunk
+				report.puts %w(%s %d %d %d %d %.2f %.2f %.2f %.2f).join("\t") % [label, tp, tn, fp, fn, recall, precision, f, acc]
+			end
+			report.puts
 		end
-		# 述語項構造が付与されている文節を補語にする
-		@chunks.each do |chunk|
-			next if chunk[:link] == '-1' || chunk[:arg] == nil
-			next unless chunk[:conj] == nil || chunk[:conj] == '修飾(連用)'
-			pred = @chunks[chunk[:link].to_i]
-			if chunk[:arg] == 'Ga' and pred[:passive] == nil
-				chunk[:conj] = '主語'
-			elsif chunk[:arg] == 'O' and pred[:passive] == '直接'
-				chunk[:conj] = '主語'
-			else
-				chunk[:conj] = '補語'
+	end
+	def convert(source_dir, target_filename, enc)
+		target = open(target_filename, "w:#{enc}")
+		source_filenames = Dir.glob(source_dir + "/*.cabocha")
+		# source_filenames = source_filenames[0, 2] # debug
+		source_filenames.each do |filename|
+			source = open(filename, encoding: enc)
+			converter = KokugoTagger::Converter.connect(source)
+			converter.each_line{|line| target.puts line }
+			source.close
+		end
+		target.close
+	end
+	def split(source_filename, enc, k)
+		basename = File.basename(source_filename, ".data")
+		target_filenames = Array.new(k){|n| "#{basename}.#{n}.data"}
+		targets = target_filenames.map{|filename| open(filename, "w:#{enc}")}
+		index = 0
+		File.foreach(source_filename, encoding: enc) do |line|
+			targets[index].puts line
+			if line.chomp.empty?
+				index += 1
+				index = 0 if index == k
 			end
 		end
+		targets.each{|f| f.close}
+		return target_filenames
+	end
+	def concat(source_filenames, target_filename)
+		system "cat #{source_filenames.join(" ")} > #{target_filename}"
+	end
+	def yamcha_learn(train_data, model)
+		libexecdir = `yamcha-config --libexecdir`.chomp
+		system "cp #{libexecdir}/Makefile ."
+		system "make CORPUS=#{train_data} MODEL=#{model} FEATURE=\"F:0..0:1..\" SVM_PARAM=\"-t 1 -d 2 -c 1\" MULTI_CLASS=1 train"
 	end
 end

data/lib/kokugo_tagger/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module KokugoTagger
-  VERSION = "0.0.6"
+  VERSION = "1.0.3"
 end

data/lib/kokugo_tagger.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 require "kokugo_tagger/version"
-require "kokugo_tagger/parser"
+require "kokugo_tagger/cabocha"
 require "kokugo_tagger/tagger"
 module KokugoTagger
   # Your code goes here...
-end
+end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kokugo_tagger
 version: !ruby/object:Gem::Version
-  version: 0.0.6
+  version: 1.0.3
 platform: ruby
 authors:
 - Mizuho IMADA
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-08-31 00:00:00.000000000 Z
+date: 2016-02-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -54,7 +54,8 @@ files:
 - bin/kokugo_tagger
 - kokugo_tagger.gemspec
 - lib/kokugo_tagger.rb
-- lib/kokugo_tagger/parser.rb
+- lib/kokugo_tagger/cabocha.rb
+- lib/kokugo_tagger/kokugo.model
 - lib/kokugo_tagger/tagger.rb
 - lib/kokugo_tagger/version.rb
 homepage: ''
@@ -77,9 +78,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.5
+rubygems_version: 2.5.2
 signing_key:
 specification_version: 4
 summary: Write a short summary. Required.
 test_files: []
-has_rdoc:

data/lib/kokugo_tagger/parser.rb DELETED Viewed

@@ -1,47 +0,0 @@
-# -*- coding: utf-8 -*-
-require 'csv'
-module CabochaParser
-	def parse(line)
-		case line.chomp
-		when /^#/
-			return parse_excab(line)
-		when /^\*/
-			return parse_chunk(line)
-		when 'EOS'
-			return {type: 'EOS'}
-		when ''
-			return nil
-		else
-			return parse_token(line)
-		end
-	end
-	def parse_excab(line)
-		line = line.gsub('\"', '""')
-		null, type, *data = CSV.parse_line(line.chomp, col_sep:' ')
-		case type
-		when 'SEGMENT', 'SEGMENT_S', 'LINK', 'LINK_S'
-			excab = {type: type, name: data[0], start: data[1].to_i, end: data[2].to_i, comment: data[3]}
-		when 'GROUP', 'GROUP_S'
-			excab = {type: type, name: data[0], member: data[1..-2], comment: data[-1]}
-		when 'ATTR'
-			excab = {type: type, name: data[0], value: data[1]}
-		end
-		return excab
-	end
-	def parse_chunk(line)
-		null, id, dep, part, score = line.chomp.split("\s")
-		link, rel = dep[0..-2], dep[-1]
-		head, func = part.split('/')
-		chunk = {type: 'CHUNK', id: id, link: link, rel: rel, head: head, func: func, score: score}
-		return chunk
-	end
-	def parse_token(line)
-		text, attrs, ne = line.chomp.split("\t")
-		attrs = CSV.parse_line(attrs, col_sep:',')
-		pos = attrs[0, 4].delete_if{|item| item.empty?}.join('-')
-		token = {type: 'TOKEN', text: text, ne: ne, pos: pos, ctype: attrs[4], cform: attrs[5]}
-		return token
-	end
-	module_function :parse, :parse_excab, :parse_chunk, :parse_token
-end