sosowa 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +4 -0
- data/README.md +2 -1
- data/lib/sosowa/scheme.rb +7 -6
- data/lib/sosowa/version.rb +1 -1
- data/lib/sosowa.rb +4 -0
- data/samples/tf-idf.rb +83 -0
- metadata +5 -4
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
data/lib/sosowa/scheme.rb
CHANGED
@@ -2,6 +2,10 @@ module Sosowa
|
|
2
2
|
class Scheme
|
3
3
|
protected
|
4
4
|
|
5
|
+
def initialize(element)
|
6
|
+
@element = element
|
7
|
+
end
|
8
|
+
|
5
9
|
def method_missing(action, *args)
|
6
10
|
return @element[action.to_s.to_sym] rescue nil
|
7
11
|
end
|
@@ -24,6 +28,7 @@ module Sosowa
|
|
24
28
|
def fetch(log, key)
|
25
29
|
params = Sosowa.serialize_parameter({:mode => :read, :log => log, :key => key})
|
26
30
|
@page = @agent.get(URI.join(Sosowa::BASE_URL, params))
|
31
|
+
title = (@page/%{div[@class="header"] > h1})[0].inner_html.to_s.toutf8.strip
|
27
32
|
tags = (@page/%{dl[@class="info"][1] > dd > a}).map{|t| t.inner_html.to_s.toutf8 }
|
28
33
|
text = (@page/%{div[@class="contents ss"]})[0].inner_html.to_s.toutf8
|
29
34
|
ps = (@page/%{div[@class="aft"]})[0].inner_html.to_s.toutf8
|
@@ -48,6 +53,7 @@ module Sosowa
|
|
48
53
|
comments << comment
|
49
54
|
end
|
50
55
|
novel = {
|
56
|
+
:title => title,
|
51
57
|
:text => text,
|
52
58
|
:ps => ps,
|
53
59
|
:author => author,
|
@@ -93,12 +99,7 @@ module Sosowa
|
|
93
99
|
|
94
100
|
end
|
95
101
|
|
96
|
-
class Index < Scheme
|
97
|
-
def initialize(element)
|
98
|
-
super(@element)
|
99
|
-
@element = element
|
100
|
-
end
|
101
|
-
|
102
|
+
class Index < Scheme
|
102
103
|
def fetch
|
103
104
|
Novel.new(:log => @element[:log], :key => @element[:key])
|
104
105
|
end
|
data/lib/sosowa/version.rb
CHANGED
data/lib/sosowa.rb
CHANGED
@@ -10,6 +10,8 @@ require "sosowa/parser"
|
|
10
10
|
|
11
11
|
module Sosowa
|
12
12
|
BASE_URL = "http://coolier.sytes.net:8080/sosowa/ssw_l/"
|
13
|
+
|
14
|
+
protected
|
13
15
|
|
14
16
|
# @param [Hash] parameter
|
15
17
|
# @return [String] URL Serialized parameters
|
@@ -22,6 +24,8 @@ module Sosowa
|
|
22
24
|
param = ant.inject(""){|k,v|k+"&#{v[0]}=#{URI.escape(v[1])}"}.sub!(/^&/,"?")
|
23
25
|
return param ? param : ""
|
24
26
|
end
|
27
|
+
|
28
|
+
public
|
25
29
|
|
26
30
|
def self.get(args={})
|
27
31
|
args[:log] ||= 0
|
data/samples/tf-idf.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
# 創想話の最新版から適当なSSのテキストを取得してMeCab(+ 東方MeCab辞書)を用いて代表キーワード候補を名詞限定で選出し、TF-IDF法による特徴語抽出を行います。
|
4
|
+
# 注意: ugigi gemが必要です
|
5
|
+
|
6
|
+
require "MeCab"
|
7
|
+
require "kconv"
|
8
|
+
require "sosowa"
|
9
|
+
require "ugigi"
|
10
|
+
|
11
|
+
module MeCab
|
12
|
+
class Tagger
|
13
|
+
alias_method :parseToNode_org, :parseToNode
|
14
|
+
private :parseToNode_org
|
15
|
+
|
16
|
+
def parseToNode(*args)
|
17
|
+
node = parseToNode_org(*args)
|
18
|
+
nodes = []
|
19
|
+
while node
|
20
|
+
nodes.push(node)
|
21
|
+
node = node.next
|
22
|
+
end
|
23
|
+
return nodes[1, nodes.size - 2]
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
class Node
|
28
|
+
alias_method :feature_org, :feature
|
29
|
+
alias_method :surface_org, :surface
|
30
|
+
private :feature_org
|
31
|
+
private :surface_org
|
32
|
+
|
33
|
+
def feature ; feature_org.toutf8 end
|
34
|
+
def surface ; surface_org.toutf8 end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
puts "東方MeCab辞書をダウンロード中..."
|
39
|
+
system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic")
|
40
|
+
|
41
|
+
puts "完了. MeCab::Taggerを初期化します"
|
42
|
+
mecab = MeCab::Tagger.new("-u thdic-mecab.dic")
|
43
|
+
|
44
|
+
novel = Sosowa.get.sample.fetch
|
45
|
+
puts "-"*30
|
46
|
+
puts novel.title
|
47
|
+
puts "作者: #{novel.author.name}"
|
48
|
+
puts "-"*30
|
49
|
+
text = novel.plain
|
50
|
+
tf = {}
|
51
|
+
n = 15646.0
|
52
|
+
puts "代表キーワード候補を抽出中..."
|
53
|
+
tokens = mecab.parseToNode(text)
|
54
|
+
tokens.each do |token|
|
55
|
+
next unless token.feature =~ /名詞/
|
56
|
+
tf[token.surface] ||= 0
|
57
|
+
tf[token.surface] += 1
|
58
|
+
end
|
59
|
+
|
60
|
+
puts "代表キーワード候補数: #{tf.size}"
|
61
|
+
|
62
|
+
tfidf_list = []
|
63
|
+
tf.each do |e|
|
64
|
+
print "TF: #{e[0]} ... \t"
|
65
|
+
df = Ugigi.total_count(:free => e[0], :sswp => 0, :compe => 0)
|
66
|
+
if df == 0
|
67
|
+
print "N/A\n"
|
68
|
+
tfidf_list << [e[0], 0]
|
69
|
+
next
|
70
|
+
end
|
71
|
+
print "DF: #{df} \t"
|
72
|
+
tfidf = e[1] * Math.log(n/df)
|
73
|
+
print "TF-IDF: #{tfidf}\n"
|
74
|
+
tfidf_list << [e[0], tfidf]
|
75
|
+
end
|
76
|
+
tfidf_list = tfidf_list.sort{|a, b| b[1] <=> a[1]}
|
77
|
+
|
78
|
+
puts "集計終わり!"
|
79
|
+
|
80
|
+
10.times do |n|
|
81
|
+
l = tfidf_list[n]
|
82
|
+
puts "#{n+1}. #{l[0]} \tTF-IDF: #{l[1]}"
|
83
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sosowa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.1'
|
4
|
+
version: '0.2'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-11 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70146581283880 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70146581283880
|
25
25
|
description: Sosowa Parser for Ruby
|
26
26
|
email:
|
27
27
|
- oame@oameya.com
|
@@ -39,6 +39,7 @@ files:
|
|
39
39
|
- lib/sosowa/parser.rb
|
40
40
|
- lib/sosowa/scheme.rb
|
41
41
|
- lib/sosowa/version.rb
|
42
|
+
- samples/tf-idf.rb
|
42
43
|
- samples/token_segment.rb
|
43
44
|
- sosowa.gemspec
|
44
45
|
homepage: ''
|