sosowa 0.2 → 0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +6 -0
- data/README.md +1 -1
- data/lib/sosowa/parser.rb +17 -4
- data/lib/sosowa/scheme.rb +38 -14
- data/lib/sosowa/version.rb +1 -1
- data/samples/chara_recognize.rb +37 -0
- data/samples/feature-0.3.rb +17 -0
- data/samples/tf-idf.rb +3 -30
- data/samples/token_segment.rb +4 -20
- metadata +6 -4
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
## 0.3
|
2
|
+
* Sosowa#Logを追加。作品集単位で抽象化出来るようになりました。殆どの場合、このクラスはArrayとして振る舞います。
|
3
|
+
* Sosowa#Log.logで絶対作品集番号を得ることが出来ます。このメソッドは最新作品集であっても0では無く実際の番号が割り振られます。
|
4
|
+
* Sosowa#Log.next_page, Sosowa#Log.prev_pageが追加されました。前後のページを取得してSosowa#Logを返します。
|
5
|
+
* 0.3のサンプルはsamples/feature-0.3.rbで確認することが出来ます。
|
6
|
+
|
1
7
|
## 0.2
|
2
8
|
* Sosowa::Novel.titleを追加。むしろどうして今まで無かった
|
3
9
|
* Sosowa::Author, Sosowa::Commentが取得出来ないバグを修正
|
data/README.md
CHANGED
data/lib/sosowa/parser.rb
CHANGED
@@ -8,16 +8,29 @@ module Sosowa
|
|
8
8
|
|
9
9
|
def search(query, args={})
|
10
10
|
params = Sosowa.serialize_parameter({:mode => :search, :type => (args[:type] ? args[:type] : :insubject), :query => query.tosjis})
|
11
|
-
parse_index(URI.join(Sosowa::BASE_URL, params))
|
11
|
+
parse_index(@agent.get(URI.join(Sosowa::BASE_URL, params)))
|
12
12
|
end
|
13
13
|
|
14
14
|
def fetch_index(log)
|
15
15
|
params = Sosowa.serialize_parameter({:log => log})
|
16
|
-
|
16
|
+
page = @agent.get(URI.join(Sosowa::BASE_URL, params))
|
17
|
+
indexes = parse_index(page)
|
18
|
+
abs_log_num = parse_absolute_log_number(page)
|
19
|
+
Log.new(indexes, abs_log_num)
|
20
|
+
end
|
21
|
+
|
22
|
+
def parse_absolute_log_number(page)
|
23
|
+
li = page.search(%{ul[@id="pages"] li > *})
|
24
|
+
log = li.size
|
25
|
+
li.each do |l|
|
26
|
+
if l.attributes["id"] && l.attributes["id"].value == "selectedPage"
|
27
|
+
return log
|
28
|
+
end
|
29
|
+
log -= 1
|
30
|
+
end
|
17
31
|
end
|
18
32
|
|
19
|
-
def parse_index(url)
|
20
|
-
page = @agent.get(url)
|
33
|
+
def parse_index(page)
|
21
34
|
indexes = []
|
22
35
|
tr = page.search("tr")
|
23
36
|
tr = tr[1, tr.size-1]
|
data/lib/sosowa/scheme.rb
CHANGED
@@ -39,18 +39,20 @@ module Sosowa
|
|
39
39
|
review = header[3][1].split("/")
|
40
40
|
comments = []
|
41
41
|
comment_element = (@page/%{div[@class="comments"] > dl > *})
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
42
|
+
if comment_element.size > 0
|
43
|
+
comment_element[1, comment_element.size-1].each_slice(2) do |element|
|
44
|
+
bobj = element[0].search("b").map{|n| n.inner_html.to_s.toutf8.strip}
|
45
|
+
point = element[0].search("span").inner_html.to_s.toutf8.to_i
|
46
|
+
id = element[0].inner_html.to_s.toutf8.split(/\r?\n/).map{|n| n.strip}[1].to_i
|
47
|
+
comment = Comment.new(
|
48
|
+
:id => id,
|
49
|
+
:point => point,
|
50
|
+
:name => bobj[0],
|
51
|
+
:created_at => Time.parse(bobj[1].gsub(/[^\/\d\s:]/, "")),
|
52
|
+
:text => element[1].inner_html.to_s.toutf8.strip
|
53
|
+
)
|
54
|
+
comments << comment
|
55
|
+
end
|
54
56
|
end
|
55
57
|
novel = {
|
56
58
|
:title => title,
|
@@ -92,11 +94,11 @@ module Sosowa
|
|
92
94
|
end
|
93
95
|
|
94
96
|
class Comment < Scheme
|
95
|
-
|
97
|
+
|
96
98
|
end
|
97
99
|
|
98
100
|
class Author < Scheme
|
99
|
-
|
101
|
+
|
100
102
|
end
|
101
103
|
|
102
104
|
class Index < Scheme
|
@@ -105,4 +107,26 @@ module Sosowa
|
|
105
107
|
end
|
106
108
|
alias_method :get, :fetch
|
107
109
|
end
|
110
|
+
|
111
|
+
class Log < Array
|
112
|
+
attr_reader :log
|
113
|
+
|
114
|
+
def initialize(page, log=0)
|
115
|
+
@page = page
|
116
|
+
@log = log
|
117
|
+
super(page)
|
118
|
+
end
|
119
|
+
|
120
|
+
def next_page
|
121
|
+
parser = Parser.new
|
122
|
+
parser.fetch_index(@log-1)
|
123
|
+
end
|
124
|
+
alias_method :next, :next_page
|
125
|
+
|
126
|
+
def prev_page
|
127
|
+
parser = Parser.new
|
128
|
+
parser.fetch_index(@log+1)
|
129
|
+
end
|
130
|
+
alias_method :prev, :prev_page
|
131
|
+
end
|
108
132
|
end
|
data/lib/sosowa/version.rb
CHANGED
data/samples/chara_recognize.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
# 創想話の最新版から適当なSSのテキストを取得してMeCab(+ 東方MeCab辞書)を用いてテキスト中のセリフの発言者を予測します。
|
4
|
+
# 精度低いので誰かちゃんとしたの作ってください!
|
5
|
+
|
6
|
+
require "mecab-modern"
|
7
|
+
require "kconv"
|
8
|
+
require "sosowa"
|
9
|
+
require "pp"
|
10
|
+
|
11
|
+
puts "東方MeCab辞書をダウンロード中..."
|
12
|
+
system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic") unless FileTest.exists? "thdic-mecab.dic"
|
13
|
+
|
14
|
+
puts "完了. MeCab::Taggerを初期化します"
|
15
|
+
mecab = MeCab::Tagger.new#("-u thdic-mecab.dic")
|
16
|
+
|
17
|
+
#novel = Sosowa.get.sample.fetch
|
18
|
+
novel = Sosowa.get(:log => 170, :key => 1342037924)
|
19
|
+
puts "-"*30
|
20
|
+
puts novel.title
|
21
|
+
puts "作者: #{novel.author.name}"
|
22
|
+
puts "-"*30
|
23
|
+
lines = novel.text.gsub(/\r?\n/, "").split("<br>").reject{|t| t == ""}.map{|n| n.strip}
|
24
|
+
num = 0
|
25
|
+
lines.each do |line|
|
26
|
+
name_nodes = mecab.parseToNode(line).select{|n| n.feature =~ /名詞,固有名詞,人名/}
|
27
|
+
unless name_nodes[0]
|
28
|
+
num += 1
|
29
|
+
next
|
30
|
+
end
|
31
|
+
unless lines[num+1] =~ /(「|」)/
|
32
|
+
num += 1
|
33
|
+
next
|
34
|
+
end
|
35
|
+
puts "#{name_nodes[0].surface}: #{lines[num+1]}"
|
36
|
+
num += 1
|
37
|
+
end
|
data/samples/feature-0.3.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
require "sosowa"
|
5
|
+
|
6
|
+
# 最新版の作品集を取得
|
7
|
+
latest = Sosowa.get
|
8
|
+
|
9
|
+
# 最新版よりひとつ古い作品集を取得
|
10
|
+
next_log = latest.next_page
|
11
|
+
|
12
|
+
# 最新版から直近3ページまで遡ってSSのタイトルを列挙する
|
13
|
+
3.times do |n|
|
14
|
+
Sosowa.get(:log => latest.log - n).each do |index|
|
15
|
+
puts index.title
|
16
|
+
end
|
17
|
+
end
|
data/samples/tf-idf.rb
CHANGED
@@ -1,42 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# coding: utf-8
|
3
3
|
# 創想話の最新版から適当なSSのテキストを取得してMeCab(+ 東方MeCab辞書)を用いて代表キーワード候補を名詞限定で選出し、TF-IDF法による特徴語抽出を行います。
|
4
|
-
# 注意: ugigi gemが必要です
|
4
|
+
# 注意: ugigi gemとmecab-modern gemが必要です
|
5
5
|
|
6
|
-
require "
|
6
|
+
require "mecab-modern"
|
7
7
|
require "kconv"
|
8
8
|
require "sosowa"
|
9
9
|
require "ugigi"
|
10
10
|
|
11
|
-
module MeCab
|
12
|
-
class Tagger
|
13
|
-
alias_method :parseToNode_org, :parseToNode
|
14
|
-
private :parseToNode_org
|
15
|
-
|
16
|
-
def parseToNode(*args)
|
17
|
-
node = parseToNode_org(*args)
|
18
|
-
nodes = []
|
19
|
-
while node
|
20
|
-
nodes.push(node)
|
21
|
-
node = node.next
|
22
|
-
end
|
23
|
-
return nodes[1, nodes.size - 2]
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
class Node
|
28
|
-
alias_method :feature_org, :feature
|
29
|
-
alias_method :surface_org, :surface
|
30
|
-
private :feature_org
|
31
|
-
private :surface_org
|
32
|
-
|
33
|
-
def feature ; feature_org.toutf8 end
|
34
|
-
def surface ; surface_org.toutf8 end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
11
|
puts "東方MeCab辞書をダウンロード中..."
|
39
|
-
system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic")
|
12
|
+
system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic") unless FileTest.exists? "thdic-mecab.dic"
|
40
13
|
|
41
14
|
puts "完了. MeCab::Taggerを初期化します"
|
42
15
|
mecab = MeCab::Tagger.new("-u thdic-mecab.dic")
|
data/samples/token_segment.rb
CHANGED
@@ -1,34 +1,18 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# coding: utf-8
|
3
3
|
# 創想話の最新版から適当なSSを取得してMeCab(+ 東方MeCab辞書)を用いてトークナイズします。
|
4
|
+
# mecab-modern gemが必要です
|
4
5
|
|
5
|
-
require "
|
6
|
+
require "mecab-modern"
|
6
7
|
require "sosowa"
|
7
8
|
|
8
|
-
module MeCab
|
9
|
-
class Tagger
|
10
|
-
alias_method :parseToNode_org, :parseToNode
|
11
|
-
private :parseToNode_org
|
12
|
-
|
13
|
-
def parseToNode(*args)
|
14
|
-
node = parseToNode_org(*args)
|
15
|
-
nodes = []
|
16
|
-
while node
|
17
|
-
nodes.push(node)
|
18
|
-
node = node.next
|
19
|
-
end
|
20
|
-
return nodes[1, nodes.size - 2]
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
9
|
puts "Fetching thdic-mecab..."
|
26
|
-
system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic")
|
10
|
+
system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic") unless FileTest.exists? "thdic-mecab.dic"
|
27
11
|
|
28
12
|
puts "Done. Initialize MeCab::Tagger"
|
29
13
|
mecab = MeCab::Tagger.new("-u thdic-mecab.dic")
|
30
14
|
|
31
|
-
text = Sosowa.get.sample.fetch.text.
|
15
|
+
text = Sosowa.get.sample.fetch.text.plain
|
32
16
|
tokens = mecab.parseToNode(text)
|
33
17
|
tokens.each do |token|
|
34
18
|
puts token.feature
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sosowa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.2'
|
4
|
+
version: '0.3'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-21 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70129121669320 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70129121669320
|
25
25
|
description: Sosowa Parser for Ruby
|
26
26
|
email:
|
27
27
|
- oame@oameya.com
|
@@ -39,6 +39,8 @@ files:
|
|
39
39
|
- lib/sosowa/parser.rb
|
40
40
|
- lib/sosowa/scheme.rb
|
41
41
|
- lib/sosowa/version.rb
|
42
|
+
- samples/chara_recognize.rb
|
43
|
+
- samples/feature-0.3.rb
|
42
44
|
- samples/tf-idf.rb
|
43
45
|
- samples/token_segment.rb
|
44
46
|
- sosowa.gemspec
|