sosowa 0.2 → 0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.3
2
+ * Sosowa#Logを追加。作品集単位で抽象化出来るようになりました。殆どの場合、このクラスはArrayとして振る舞います。
3
+ * Sosowa#Log.logで絶対作品集番号を得ることが出来ます。このメソッドは最新作品集であっても0では無く実際の番号が割り振られます。
4
+ * Sosowa#Log.next_page, Sosowa#Log.prev_pageが追加されました。前後のページを取得してSosowa#Logを返します。
5
+ * 0.3のサンプルはsamples/feature-0.3.rbで確認することが出来ます。
6
+
1
7
  ## 0.2
2
8
  * Sosowa::Novel.titleを追加。むしろどうして今まで無かった
3
9
  * Sosowa::Author, Sosowa::Commentが取得出来ないバグを修正
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Sosowa
2
2
 
3
- 創想話パーサー for Ruby 1.9.x
3
+ 創想話パーサー for Ruby 1.9.x<br>
4
4
  samples/に各種サンプルが入っています。
5
5
 
6
6
  ## Requirements
data/lib/sosowa/parser.rb CHANGED
@@ -8,16 +8,29 @@ module Sosowa
8
8
 
9
9
  def search(query, args={})
10
10
  params = Sosowa.serialize_parameter({:mode => :search, :type => (args[:type] ? args[:type] : :insubject), :query => query.tosjis})
11
- parse_index(URI.join(Sosowa::BASE_URL, params))
11
+ parse_index(@agent.get(URI.join(Sosowa::BASE_URL, params)))
12
12
  end
13
13
 
14
14
  def fetch_index(log)
15
15
  params = Sosowa.serialize_parameter({:log => log})
16
- parse_index(URI.join(Sosowa::BASE_URL, params))
16
+ page = @agent.get(URI.join(Sosowa::BASE_URL, params))
17
+ indexes = parse_index(page)
18
+ abs_log_num = parse_absolute_log_number(page)
19
+ Log.new(indexes, abs_log_num)
20
+ end
21
+
22
+ def parse_absolute_log_number(page)
23
+ li = page.search(%{ul[@id="pages"] li > *})
24
+ log = li.size
25
+ li.each do |l|
26
+ if l.attributes["id"] && l.attributes["id"].value == "selectedPage"
27
+ return log
28
+ end
29
+ log -= 1
30
+ end
17
31
  end
18
32
 
19
- def parse_index(url)
20
- page = @agent.get(url)
33
+ def parse_index(page)
21
34
  indexes = []
22
35
  tr = page.search("tr")
23
36
  tr = tr[1, tr.size-1]
data/lib/sosowa/scheme.rb CHANGED
@@ -39,18 +39,20 @@ module Sosowa
39
39
  review = header[3][1].split("/")
40
40
  comments = []
41
41
  comment_element = (@page/%{div[@class="comments"] > dl > *})
42
- comment_element[1, comment_element.size-1].each_slice(2) do |element|
43
- bobj = element[0].search("b").map{|n| n.inner_html.to_s.toutf8.strip}
44
- point = element[0].search("span").inner_html.to_s.toutf8.to_i
45
- id = element[0].inner_html.to_s.toutf8.split(/\r?\n/).map{|n| n.strip}[1].to_i
46
- comment = Comment.new(
47
- :id => id,
48
- :point => point,
49
- :name => bobj[0],
50
- :created_at => Time.parse(bobj[1].gsub(/[^\/\d\s:]/, "")),
51
- :text => element[1].inner_html.to_s.toutf8.strip
52
- )
53
- comments << comment
42
+ if comment_element.size > 0
43
+ comment_element[1, comment_element.size-1].each_slice(2) do |element|
44
+ bobj = element[0].search("b").map{|n| n.inner_html.to_s.toutf8.strip}
45
+ point = element[0].search("span").inner_html.to_s.toutf8.to_i
46
+ id = element[0].inner_html.to_s.toutf8.split(/\r?\n/).map{|n| n.strip}[1].to_i
47
+ comment = Comment.new(
48
+ :id => id,
49
+ :point => point,
50
+ :name => bobj[0],
51
+ :created_at => Time.parse(bobj[1].gsub(/[^\/\d\s:]/, "")),
52
+ :text => element[1].inner_html.to_s.toutf8.strip
53
+ )
54
+ comments << comment
55
+ end
54
56
  end
55
57
  novel = {
56
58
  :title => title,
@@ -92,11 +94,11 @@ module Sosowa
92
94
  end
93
95
 
94
96
  class Comment < Scheme
95
-
97
+
96
98
  end
97
99
 
98
100
  class Author < Scheme
99
-
101
+
100
102
  end
101
103
 
102
104
  class Index < Scheme
@@ -105,4 +107,26 @@ module Sosowa
105
107
  end
106
108
  alias_method :get, :fetch
107
109
  end
110
+
111
+ class Log < Array
112
+ attr_reader :log
113
+
114
+ def initialize(page, log=0)
115
+ @page = page
116
+ @log = log
117
+ super(page)
118
+ end
119
+
120
+ def next_page
121
+ parser = Parser.new
122
+ parser.fetch_index(@log-1)
123
+ end
124
+ alias_method :next, :next_page
125
+
126
+ def prev_page
127
+ parser = Parser.new
128
+ parser.fetch_index(@log+1)
129
+ end
130
+ alias_method :prev, :prev_page
131
+ end
108
132
  end
data/lib/sosowa/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Sosowa
2
- VERSION = "0.2"
2
+ VERSION = "0.3"
3
3
  end
data/samples/chara_recognize.rb ADDED
@@ -0,0 +1,37 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+ # 創想話の最新版から適当なSSのテキストを取得してMeCab(+ 東方MeCab辞書)を用いてテキスト中のセリフの発言者を予測します。
4
+ # 精度低いので誰かちゃんとしたの作ってください!
5
+
6
+ require "mecab-modern"
7
+ require "kconv"
8
+ require "sosowa"
9
+ require "pp"
10
+
11
+ puts "東方MeCab辞書をダウンロード中..."
12
+ system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic") unless FileTest.exists? "thdic-mecab.dic"
13
+
14
+ puts "完了. MeCab::Taggerを初期化します"
15
+ mecab = MeCab::Tagger.new#("-u thdic-mecab.dic")
16
+
17
+ #novel = Sosowa.get.sample.fetch
18
+ novel = Sosowa.get(:log => 170, :key => 1342037924)
19
+ puts "-"*30
20
+ puts novel.title
21
+ puts "作者: #{novel.author.name}"
22
+ puts "-"*30
23
+ lines = novel.text.gsub(/\r?\n/, "").split("<br>").reject{|t| t == ""}.map{|n| n.strip}
24
+ num = 0
25
+ lines.each do |line|
26
+ name_nodes = mecab.parseToNode(line).select{|n| n.feature =~ /名詞,固有名詞,人名/}
27
+ unless name_nodes[0]
28
+ num += 1
29
+ next
30
+ end
31
+ unless lines[num+1] =~ /(「|」)/
32
+ num += 1
33
+ next
34
+ end
35
+ puts "#{name_nodes[0].surface}: #{lines[num+1]}"
36
+ num += 1
37
+ end
data/samples/feature-0.3.rb ADDED
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ require "sosowa"
5
+
6
+ # 最新版の作品集を取得
7
+ latest = Sosowa.get
8
+
9
+ # 最新版よりひとつ古い作品集を取得
10
+ next_log = latest.next_page
11
+
12
+ # 最新版から直近3ページまで遡ってSSのタイトルを列挙する
13
+ 3.times do |n|
14
+ Sosowa.get(:log => latest.log - n).each do |index|
15
+ puts index.title
16
+ end
17
+ end
data/samples/tf-idf.rb CHANGED
@@ -1,42 +1,15 @@
1
1
  #!/usr/bin/env ruby
2
2
  # coding: utf-8
3
3
  # 創想話の最新版から適当なSSのテキストを取得してMeCab(+ 東方MeCab辞書)を用いて代表キーワード候補を名詞限定で選出し、TF-IDF法による特徴語抽出を行います。
4
- # 注意: ugigi gemが必要です
4
+ # 注意: ugigi gemとmecab-modern gemが必要です
5
5
 
6
- require "MeCab"
6
+ require "mecab-modern"
7
7
  require "kconv"
8
8
  require "sosowa"
9
9
  require "ugigi"
10
10
 
11
- module MeCab
12
- class Tagger
13
- alias_method :parseToNode_org, :parseToNode
14
- private :parseToNode_org
15
-
16
- def parseToNode(*args)
17
- node = parseToNode_org(*args)
18
- nodes = []
19
- while node
20
- nodes.push(node)
21
- node = node.next
22
- end
23
- return nodes[1, nodes.size - 2]
24
- end
25
- end
26
-
27
- class Node
28
- alias_method :feature_org, :feature
29
- alias_method :surface_org, :surface
30
- private :feature_org
31
- private :surface_org
32
-
33
- def feature ; feature_org.toutf8 end
34
- def surface ; surface_org.toutf8 end
35
- end
36
- end
37
-
38
11
  puts "東方MeCab辞書をダウンロード中..."
39
- system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic")
12
+ system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic") unless FileTest.exists? "thdic-mecab.dic"
40
13
 
41
14
  puts "完了. MeCab::Taggerを初期化します"
42
15
  mecab = MeCab::Tagger.new("-u thdic-mecab.dic")
data/samples/token_segment.rb CHANGED
@@ -1,34 +1,18 @@
1
1
  #!/usr/bin/env ruby
2
2
  # coding: utf-8
3
3
  # 創想話の最新版から適当なSSを取得してMeCab(+ 東方MeCab辞書)を用いてトークナイズします。
4
+ # mecab-modern gemが必要です
4
5
 
5
- require "MeCab"
6
+ require "mecab-modern"
6
7
  require "sosowa"
7
8
 
8
- module MeCab
9
- class Tagger
10
- alias_method :parseToNode_org, :parseToNode
11
- private :parseToNode_org
12
-
13
- def parseToNode(*args)
14
- node = parseToNode_org(*args)
15
- nodes = []
16
- while node
17
- nodes.push(node)
18
- node = node.next
19
- end
20
- return nodes[1, nodes.size - 2]
21
- end
22
- end
23
- end
24
-
25
9
  puts "Fetching thdic-mecab..."
26
- system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic")
10
+ system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic") unless FileTest.exists? "thdic-mecab.dic"
27
11
 
28
12
  puts "Done. Initialize MeCab::Tagger"
29
13
  mecab = MeCab::Tagger.new("-u thdic-mecab.dic")
30
14
 
31
- text = Sosowa.get.sample.fetch.text.gsub(/(<br>|\r?\n)/, "")
15
+ text = Sosowa.get.sample.fetch.text.plain
32
16
  tokens = mecab.parseToNode(text)
33
17
  tokens.each do |token|
34
18
  puts token.feature
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sosowa
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.2'
4
+ version: '0.3'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-11 00:00:00.000000000Z
12
+ date: 2012-07-21 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70146581283880 !ruby/object:Gem::Requirement
16
+ requirement: &70129121669320 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70146581283880
24
+ version_requirements: *70129121669320
25
25
  description: Sosowa Parser for Ruby
26
26
  email:
27
27
  - oame@oameya.com
@@ -39,6 +39,8 @@ files:
39
39
  - lib/sosowa/parser.rb
40
40
  - lib/sosowa/scheme.rb
41
41
  - lib/sosowa/version.rb
42
+ - samples/chara_recognize.rb
43
+ - samples/feature-0.3.rb
42
44
  - samples/tf-idf.rb
43
45
  - samples/token_segment.rb
44
46
  - sosowa.gemspec