sosowa 0.2 → 0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +6 -0
- data/README.md +1 -1
- data/lib/sosowa/parser.rb +17 -4
- data/lib/sosowa/scheme.rb +38 -14
- data/lib/sosowa/version.rb +1 -1
- data/samples/chara_recognize.rb +37 -0
- data/samples/feature-0.3.rb +17 -0
- data/samples/tf-idf.rb +3 -30
- data/samples/token_segment.rb +4 -20
- metadata +6 -4
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
## 0.3
|
2
|
+
* Sosowa#Logを追加。作品集単位で抽象化出来るようになりました。殆どの場合、このクラスはArrayとして振る舞います。
|
3
|
+
* Sosowa#Log.logで絶対作品集番号を得ることが出来ます。このメソッドは最新作品集であっても0では無く実際の番号が割り振られます。
|
4
|
+
* Sosowa#Log.next_page, Sosowa#Log.prev_pageが追加されました。前後のページを取得してSosowa#Logを返します。
|
5
|
+
* 0.3のサンプルはtest/feature-0.3.rbで確認することが出来ます。
|
6
|
+
|
1
7
|
## 0.2
|
2
8
|
* Sosowa::Novel.titleを追加。むしろどうして今まで無かった
|
3
9
|
* Sosowa::Author, Sosowa::Commentが取得出来ないバグを修正
|
data/README.md
CHANGED
data/lib/sosowa/parser.rb
CHANGED
@@ -8,16 +8,29 @@ module Sosowa
|
|
8
8
|
|
9
9
|
def search(query, args={})
|
10
10
|
params = Sosowa.serialize_parameter({:mode => :search, :type => (args[:type] ? args[:type] : :insubject), :query => query.tosjis})
|
11
|
-
parse_index(URI.join(Sosowa::BASE_URL, params))
|
11
|
+
parse_index(@agent.get(URI.join(Sosowa::BASE_URL, params)))
|
12
12
|
end
|
13
13
|
|
14
14
|
def fetch_index(log)
|
15
15
|
params = Sosowa.serialize_parameter({:log => log})
|
16
|
-
|
16
|
+
page = @agent.get(URI.join(Sosowa::BASE_URL, params))
|
17
|
+
indexes = parse_index(page)
|
18
|
+
abs_log_num = parse_absolute_log_number(page)
|
19
|
+
Log.new(indexes, abs_log_num)
|
20
|
+
end
|
21
|
+
|
22
|
+
def parse_absolute_log_number(page)
|
23
|
+
li = page.search(%{ul[@id="pages"] li > *})
|
24
|
+
log = li.size
|
25
|
+
li.each do |l|
|
26
|
+
if l.attributes["id"] && l.attributes["id"].value == "selectedPage"
|
27
|
+
return log
|
28
|
+
end
|
29
|
+
log -= 1
|
30
|
+
end
|
17
31
|
end
|
18
32
|
|
19
|
-
def parse_index(url)
|
20
|
-
page = @agent.get(url)
|
33
|
+
def parse_index(page)
|
21
34
|
indexes = []
|
22
35
|
tr = page.search("tr")
|
23
36
|
tr = tr[1, tr.size-1]
|
data/lib/sosowa/scheme.rb
CHANGED
@@ -39,18 +39,20 @@ module Sosowa
|
|
39
39
|
review = header[3][1].split("/")
|
40
40
|
comments = []
|
41
41
|
comment_element = (@page/%{div[@class="comments"] > dl > *})
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
42
|
+
if comment_element.size > 0
|
43
|
+
comment_element[1, comment_element.size-1].each_slice(2) do |element|
|
44
|
+
bobj = element[0].search("b").map{|n| n.inner_html.to_s.toutf8.strip}
|
45
|
+
point = element[0].search("span").inner_html.to_s.toutf8.to_i
|
46
|
+
id = element[0].inner_html.to_s.toutf8.split(/\r?\n/).map{|n| n.strip}[1].to_i
|
47
|
+
comment = Comment.new(
|
48
|
+
:id => id,
|
49
|
+
:point => point,
|
50
|
+
:name => bobj[0],
|
51
|
+
:created_at => Time.parse(bobj[1].gsub(/[^\/\d\s:]/, "")),
|
52
|
+
:text => element[1].inner_html.to_s.toutf8.strip
|
53
|
+
)
|
54
|
+
comments << comment
|
55
|
+
end
|
54
56
|
end
|
55
57
|
novel = {
|
56
58
|
:title => title,
|
@@ -92,11 +94,11 @@ module Sosowa
|
|
92
94
|
end
|
93
95
|
|
94
96
|
class Comment < Scheme
|
95
|
-
|
97
|
+
|
96
98
|
end
|
97
99
|
|
98
100
|
class Author < Scheme
|
99
|
-
|
101
|
+
|
100
102
|
end
|
101
103
|
|
102
104
|
class Index < Scheme
|
@@ -105,4 +107,26 @@ module Sosowa
|
|
105
107
|
end
|
106
108
|
alias_method :get, :fetch
|
107
109
|
end
|
110
|
+
|
111
|
+
class Log < Array
|
112
|
+
attr_reader :log
|
113
|
+
|
114
|
+
def initialize(page, log=0)
|
115
|
+
@page = page
|
116
|
+
@log = log
|
117
|
+
super(page)
|
118
|
+
end
|
119
|
+
|
120
|
+
def next_page
|
121
|
+
parser = Parser.new
|
122
|
+
parser.fetch_index(@log-1)
|
123
|
+
end
|
124
|
+
alias_method :next, :next_page
|
125
|
+
|
126
|
+
def prev_page
|
127
|
+
parser = Parser.new
|
128
|
+
parser.fetch_index(@log+1)
|
129
|
+
end
|
130
|
+
alias_method :prev, :prev_page
|
131
|
+
end
|
108
132
|
end
|
data/lib/sosowa/version.rb
CHANGED
@@ -0,0 +1,37 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
# 創想話の最新版から適当なSSのテキストを取得してMeCab(+ 東方MeCab辞書)を用いてテキスト中のセリフの発言者を予測します。
|
4
|
+
# 精度低いので誰かちゃんとしたの作ってください!
|
5
|
+
|
6
|
+
require "mecab-modern"
|
7
|
+
require "kconv"
|
8
|
+
require "sosowa"
|
9
|
+
require "pp"
|
10
|
+
|
11
|
+
puts "東方MeCab辞書をダウンロード中..."
|
12
|
+
system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic") unless FileTest.exists? "thdic-mecab.dic"
|
13
|
+
|
14
|
+
puts "完了. MeCab::Taggerを初期化します"
|
15
|
+
mecab = MeCab::Tagger.new#("-u thdic-mecab.dic")
|
16
|
+
|
17
|
+
#novel = Sosowa.get.sample.fetch
|
18
|
+
novel = Sosowa.get(:log => 170, :key => 1342037924)
|
19
|
+
puts "-"*30
|
20
|
+
puts novel.title
|
21
|
+
puts "作者: #{novel.author.name}"
|
22
|
+
puts "-"*30
|
23
|
+
lines = novel.text.gsub(/\r?\n/, "").split("<br>").reject{|t| t == ""}.map{|n| n.strip}
|
24
|
+
num = 0
|
25
|
+
lines.each do |line|
|
26
|
+
name_nodes = mecab.parseToNode(line).select{|n| n.feature =~ /名詞,固有名詞,人名/}
|
27
|
+
unless name_nodes[0]
|
28
|
+
num += 1
|
29
|
+
next
|
30
|
+
end
|
31
|
+
unless lines[num+1] =~ /(「|」)/
|
32
|
+
num += 1
|
33
|
+
next
|
34
|
+
end
|
35
|
+
puts "#{name_nodes[0].surface}: #{lines[num+1]}"
|
36
|
+
num += 1
|
37
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
require "sosowa"
|
5
|
+
|
6
|
+
# 最新版の作品集を取得
|
7
|
+
latest = Sosowa.get
|
8
|
+
|
9
|
+
# 最新版よりひとつ古い作品集を取得
|
10
|
+
next_log = latest.next_page
|
11
|
+
|
12
|
+
# 最近版から直近3ページまで遡ってSSのタイトルを列挙する
|
13
|
+
3.times do |n|
|
14
|
+
Sosowa.get(:log => latest.log - n).each do |index|
|
15
|
+
puts index.title
|
16
|
+
end
|
17
|
+
end
|
data/samples/tf-idf.rb
CHANGED
@@ -1,42 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# coding: utf-8
|
3
3
|
# 創想話の最新版から適当なSSのテキストを取得してMeCab(+ 東方MeCab辞書)を用いて代表キーワード候補を名詞限定で選出し、TF-IDF法による特徴語抽出を行います。
|
4
|
-
# 注意: ugigi gemが必要です
|
4
|
+
# 注意: ugigi gemとmecab-modern gemが必要です
|
5
5
|
|
6
|
-
require "MeCab"
|
6
|
+
require "mecab-modern"
|
7
7
|
require "kconv"
|
8
8
|
require "sosowa"
|
9
9
|
require "ugigi"
|
10
10
|
|
11
|
-
module MeCab
|
12
|
-
class Tagger
|
13
|
-
alias_method :parseToNode_org, :parseToNode
|
14
|
-
private :parseToNode_org
|
15
|
-
|
16
|
-
def parseToNode(*args)
|
17
|
-
node = parseToNode_org(*args)
|
18
|
-
nodes = []
|
19
|
-
while node
|
20
|
-
nodes.push(node)
|
21
|
-
node = node.next
|
22
|
-
end
|
23
|
-
return nodes[1, nodes.size - 2]
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
class Node
|
28
|
-
alias_method :feature_org, :feature
|
29
|
-
alias_method :surface_org, :surface
|
30
|
-
private :feature_org
|
31
|
-
private :surface_org
|
32
|
-
|
33
|
-
def feature ; feature_org.toutf8 end
|
34
|
-
def surface ; surface_org.toutf8 end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
11
|
puts "東方MeCab辞書をダウンロード中..."
|
39
|
-
system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic")
|
12
|
+
system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic") unless FileTest.exists? "thdic-mecab.dic"
|
40
13
|
|
41
14
|
puts "完了. MeCab::Taggerを初期化します"
|
42
15
|
mecab = MeCab::Tagger.new("-u thdic-mecab.dic")
|
data/samples/token_segment.rb
CHANGED
@@ -1,34 +1,18 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# coding: utf-8
|
3
3
|
# 創想話の最新版から適当なSSを取得してMeCab(+ 東方MeCab辞書)を用いてトークナイズします。
|
4
|
+
# mecab-modern gemが必要です
|
4
5
|
|
5
|
-
require "MeCab"
|
6
|
+
require "mecab-modern"
|
6
7
|
require "sosowa"
|
7
8
|
|
8
|
-
module MeCab
|
9
|
-
class Tagger
|
10
|
-
alias_method :parseToNode_org, :parseToNode
|
11
|
-
private :parseToNode_org
|
12
|
-
|
13
|
-
def parseToNode(*args)
|
14
|
-
node = parseToNode_org(*args)
|
15
|
-
nodes = []
|
16
|
-
while node
|
17
|
-
nodes.push(node)
|
18
|
-
node = node.next
|
19
|
-
end
|
20
|
-
return nodes[1, nodes.size - 2]
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
9
|
puts "Fetching thdic-mecab..."
|
26
|
-
system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic")
|
10
|
+
system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic") unless FileTest.exists? "thdic-mecab.dic"
|
27
11
|
|
28
12
|
puts "Done. Initialize MeCab::Tagger"
|
29
13
|
mecab = MeCab::Tagger.new("-u thdic-mecab.dic")
|
30
14
|
|
31
|
-
text = Sosowa.get.sample.fetch.text.
|
15
|
+
text = Sosowa.get.sample.fetch.text.plain
|
32
16
|
tokens = mecab.parseToNode(text)
|
33
17
|
tokens.each do |token|
|
34
18
|
puts token.feature
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sosowa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.2'
|
4
|
+
version: '0.3'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-21 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70129121669320 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70129121669320
|
25
25
|
description: Sosowa Parser for Ruby
|
26
26
|
email:
|
27
27
|
- oame@oameya.com
|
@@ -39,6 +39,8 @@ files:
|
|
39
39
|
- lib/sosowa/parser.rb
|
40
40
|
- lib/sosowa/scheme.rb
|
41
41
|
- lib/sosowa/version.rb
|
42
|
+
- samples/chara_recognize.rb
|
43
|
+
- samples/feature-0.3.rb
|
42
44
|
- samples/tf-idf.rb
|
43
45
|
- samples/token_segment.rb
|
44
46
|
- sosowa.gemspec
|