sosowa 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +4 -0
- data/README.md +2 -1
- data/lib/sosowa/scheme.rb +7 -6
- data/lib/sosowa/version.rb +1 -1
- data/lib/sosowa.rb +4 -0
- data/samples/tf-idf.rb +83 -0
- metadata +5 -4
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
data/lib/sosowa/scheme.rb
CHANGED
@@ -2,6 +2,10 @@ module Sosowa
|
|
2
2
|
class Scheme
|
3
3
|
protected
|
4
4
|
|
5
|
+
def initialize(element)
|
6
|
+
@element = element
|
7
|
+
end
|
8
|
+
|
5
9
|
def method_missing(action, *args)
|
6
10
|
return @element[action.to_s.to_sym] rescue nil
|
7
11
|
end
|
@@ -24,6 +28,7 @@ module Sosowa
|
|
24
28
|
def fetch(log, key)
|
25
29
|
params = Sosowa.serialize_parameter({:mode => :read, :log => log, :key => key})
|
26
30
|
@page = @agent.get(URI.join(Sosowa::BASE_URL, params))
|
31
|
+
title = (@page/%{div[@class="header"] > h1})[0].inner_html.to_s.toutf8.strip
|
27
32
|
tags = (@page/%{dl[@class="info"][1] > dd > a}).map{|t| t.inner_html.to_s.toutf8 }
|
28
33
|
text = (@page/%{div[@class="contents ss"]})[0].inner_html.to_s.toutf8
|
29
34
|
ps = (@page/%{div[@class="aft"]})[0].inner_html.to_s.toutf8
|
@@ -48,6 +53,7 @@ module Sosowa
|
|
48
53
|
comments << comment
|
49
54
|
end
|
50
55
|
novel = {
|
56
|
+
:title => title,
|
51
57
|
:text => text,
|
52
58
|
:ps => ps,
|
53
59
|
:author => author,
|
@@ -93,12 +99,7 @@ module Sosowa
|
|
93
99
|
|
94
100
|
end
|
95
101
|
|
96
|
-
class Index < Scheme
|
97
|
-
def initialize(element)
|
98
|
-
super(@element)
|
99
|
-
@element = element
|
100
|
-
end
|
101
|
-
|
102
|
+
class Index < Scheme
|
102
103
|
def fetch
|
103
104
|
Novel.new(:log => @element[:log], :key => @element[:key])
|
104
105
|
end
|
data/lib/sosowa/version.rb
CHANGED
data/lib/sosowa.rb
CHANGED
@@ -10,6 +10,8 @@ require "sosowa/parser"
|
|
10
10
|
|
11
11
|
module Sosowa
|
12
12
|
BASE_URL = "http://coolier.sytes.net:8080/sosowa/ssw_l/"
|
13
|
+
|
14
|
+
protected
|
13
15
|
|
14
16
|
# @param [Hash] parameter
|
15
17
|
# @return [String] URL Serialized parameters
|
@@ -22,6 +24,8 @@ module Sosowa
|
|
22
24
|
param = ant.inject(""){|k,v|k+"&#{v[0]}=#{URI.escape(v[1])}"}.sub!(/^&/,"?")
|
23
25
|
return param ? param : ""
|
24
26
|
end
|
27
|
+
|
28
|
+
public
|
25
29
|
|
26
30
|
def self.get(args={})
|
27
31
|
args[:log] ||= 0
|
data/samples/tf-idf.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
# 創想話の最新版から適当なSSのテキストを取得してMeCab(+ 東方MeCab辞書)を用いて代表キーワード候補を名詞限定で選出し、TF-IDF法による特徴語抽出を行います。
|
4
|
+
# 注意: ugigi gemが必要です
|
5
|
+
|
6
|
+
require "MeCab"
|
7
|
+
require "kconv"
|
8
|
+
require "sosowa"
|
9
|
+
require "ugigi"
|
10
|
+
|
11
|
+
module MeCab
|
12
|
+
class Tagger
|
13
|
+
alias_method :parseToNode_org, :parseToNode
|
14
|
+
private :parseToNode_org
|
15
|
+
|
16
|
+
def parseToNode(*args)
|
17
|
+
node = parseToNode_org(*args)
|
18
|
+
nodes = []
|
19
|
+
while node
|
20
|
+
nodes.push(node)
|
21
|
+
node = node.next
|
22
|
+
end
|
23
|
+
return nodes[1, nodes.size - 2]
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
class Node
|
28
|
+
alias_method :feature_org, :feature
|
29
|
+
alias_method :surface_org, :surface
|
30
|
+
private :feature_org
|
31
|
+
private :surface_org
|
32
|
+
|
33
|
+
def feature ; feature_org.toutf8 end
|
34
|
+
def surface ; surface_org.toutf8 end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
puts "東方MeCab辞書をダウンロード中..."
|
39
|
+
system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic")
|
40
|
+
|
41
|
+
puts "完了. MeCab::Taggerを初期化します"
|
42
|
+
mecab = MeCab::Tagger.new("-u thdic-mecab.dic")
|
43
|
+
|
44
|
+
novel = Sosowa.get.sample.fetch
|
45
|
+
puts "-"*30
|
46
|
+
puts novel.title
|
47
|
+
puts "作者: #{novel.author.name}"
|
48
|
+
puts "-"*30
|
49
|
+
text = novel.plain
|
50
|
+
tf = {}
|
51
|
+
n = 15646.0
|
52
|
+
puts "代表キーワード候補を抽出中..."
|
53
|
+
tokens = mecab.parseToNode(text)
|
54
|
+
tokens.each do |token|
|
55
|
+
next unless token.feature =~ /名詞/
|
56
|
+
tf[token.surface] ||= 0
|
57
|
+
tf[token.surface] += 1
|
58
|
+
end
|
59
|
+
|
60
|
+
puts "代表キーワード候補数: #{tf.size}"
|
61
|
+
|
62
|
+
tfidf_list = []
|
63
|
+
tf.each do |e|
|
64
|
+
print "TF: #{e[0]} ... \t"
|
65
|
+
df = Ugigi.total_count(:free => e[0], :sswp => 0, :compe => 0)
|
66
|
+
if df == 0
|
67
|
+
print "N/A\n"
|
68
|
+
tfidf_list << [e[0], 0]
|
69
|
+
next
|
70
|
+
end
|
71
|
+
print "DF: #{df} \t"
|
72
|
+
tfidf = e[1] * Math.log(n/df)
|
73
|
+
print "TF-IDF: #{tfidf}\n"
|
74
|
+
tfidf_list << [e[0], tfidf]
|
75
|
+
end
|
76
|
+
tfidf_list = tfidf_list.sort{|a, b| b[1] <=> a[1]}
|
77
|
+
|
78
|
+
puts "集計終わり!"
|
79
|
+
|
80
|
+
10.times do |n|
|
81
|
+
l = tfidf_list[n]
|
82
|
+
puts "#{n+1}. #{l[0]} \tTF-IDF: #{l[1]}"
|
83
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sosowa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.1'
|
4
|
+
version: '0.2'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-11 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70146581283880 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70146581283880
|
25
25
|
description: Sosowa Parser for Ruby
|
26
26
|
email:
|
27
27
|
- oame@oameya.com
|
@@ -39,6 +39,7 @@ files:
|
|
39
39
|
- lib/sosowa/parser.rb
|
40
40
|
- lib/sosowa/scheme.rb
|
41
41
|
- lib/sosowa/version.rb
|
42
|
+
- samples/tf-idf.rb
|
42
43
|
- samples/token_segment.rb
|
43
44
|
- sosowa.gemspec
|
44
45
|
homepage: ''
|