sosowa 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.2
2
+ * Sosowa::Novel.titleを追加。むしろどうして今まで無かった
3
+ * Sosowa::Author, Sosowa::Commentが取得出来ないバグを修正
4
+
1
5
  ## 0.1
2
6
  * Sosowa#searchを追加
3
7
  * Sosowa::Novel#plainを使って<br>タグや改行コードが取り除かれたテキストを得ることが出来ます。
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # Sosowa
2
2
 
3
- Sosowa Parser for Ruby 1.9.x
3
+ 創想話パーサー for Ruby 1.9.x
4
+ samples/に各種サンプルが入っています。
4
5
 
5
6
  ## Requirements
6
7
 
data/lib/sosowa/scheme.rb CHANGED
@@ -2,6 +2,10 @@ module Sosowa
2
2
  class Scheme
3
3
  protected
4
4
 
5
+ def initialize(element)
6
+ @element = element
7
+ end
8
+
5
9
  def method_missing(action, *args)
6
10
  return @element[action.to_s.to_sym] rescue nil
7
11
  end
@@ -24,6 +28,7 @@ module Sosowa
24
28
  def fetch(log, key)
25
29
  params = Sosowa.serialize_parameter({:mode => :read, :log => log, :key => key})
26
30
  @page = @agent.get(URI.join(Sosowa::BASE_URL, params))
31
+ title = (@page/%{div[@class="header"] > h1})[0].inner_html.to_s.toutf8.strip
27
32
  tags = (@page/%{dl[@class="info"][1] > dd > a}).map{|t| t.inner_html.to_s.toutf8 }
28
33
  text = (@page/%{div[@class="contents ss"]})[0].inner_html.to_s.toutf8
29
34
  ps = (@page/%{div[@class="aft"]})[0].inner_html.to_s.toutf8
@@ -48,6 +53,7 @@ module Sosowa
48
53
  comments << comment
49
54
  end
50
55
  novel = {
56
+ :title => title,
51
57
  :text => text,
52
58
  :ps => ps,
53
59
  :author => author,
@@ -93,12 +99,7 @@ module Sosowa
93
99
 
94
100
  end
95
101
 
96
- class Index < Scheme
97
- def initialize(element)
98
- super(@element)
99
- @element = element
100
- end
101
-
102
+ class Index < Scheme
102
103
  def fetch
103
104
  Novel.new(:log => @element[:log], :key => @element[:key])
104
105
  end
@@ -1,3 +1,3 @@
1
1
  module Sosowa
2
- VERSION = "0.1"
2
+ VERSION = "0.2"
3
3
  end
data/lib/sosowa.rb CHANGED
@@ -10,6 +10,8 @@ require "sosowa/parser"
10
10
 
11
11
  module Sosowa
12
12
  BASE_URL = "http://coolier.sytes.net:8080/sosowa/ssw_l/"
13
+
14
+ protected
13
15
 
14
16
  # @param [Hash] parameter
15
17
  # @return [String] URL Serialized parameters
@@ -22,6 +24,8 @@ module Sosowa
22
24
  param = ant.inject(""){|k,v|k+"&#{v[0]}=#{URI.escape(v[1])}"}.sub!(/^&/,"?")
23
25
  return param ? param : ""
24
26
  end
27
+
28
+ public
25
29
 
26
30
  def self.get(args={})
27
31
  args[:log] ||= 0
data/samples/tf-idf.rb ADDED
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+ # 創想話の最新版から適当なSSのテキストを取得してMeCab(+ 東方MeCab辞書)を用いて代表キーワード候補を名詞限定で選出し、TF-IDF法による特徴語抽出を行います。
4
+ # 注意: ugigi gemが必要です
5
+
6
+ require "MeCab"
7
+ require "kconv"
8
+ require "sosowa"
9
+ require "ugigi"
10
+
11
+ module MeCab
12
+ class Tagger
13
+ alias_method :parseToNode_org, :parseToNode
14
+ private :parseToNode_org
15
+
16
+ def parseToNode(*args)
17
+ node = parseToNode_org(*args)
18
+ nodes = []
19
+ while node
20
+ nodes.push(node)
21
+ node = node.next
22
+ end
23
+ return nodes[1, nodes.size - 2]
24
+ end
25
+ end
26
+
27
+ class Node
28
+ alias_method :feature_org, :feature
29
+ alias_method :surface_org, :surface
30
+ private :feature_org
31
+ private :surface_org
32
+
33
+ def feature ; feature_org.toutf8 end
34
+ def surface ; surface_org.toutf8 end
35
+ end
36
+ end
37
+
38
+ puts "東方MeCab辞書をダウンロード中..."
39
+ system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic")
40
+
41
+ puts "完了. MeCab::Taggerを初期化します"
42
+ mecab = MeCab::Tagger.new("-u thdic-mecab.dic")
43
+
44
+ novel = Sosowa.get.sample.fetch
45
+ puts "-"*30
46
+ puts novel.title
47
+ puts "作者: #{novel.author.name}"
48
+ puts "-"*30
49
+ text = novel.plain
50
+ tf = {}
51
+ n = 15646.0
52
+ puts "代表キーワード候補を抽出中..."
53
+ tokens = mecab.parseToNode(text)
54
+ tokens.each do |token|
55
+ next unless token.feature =~ /名詞/
56
+ tf[token.surface] ||= 0
57
+ tf[token.surface] += 1
58
+ end
59
+
60
+ puts "代表キーワード候補数: #{tf.size}"
61
+
62
+ tfidf_list = []
63
+ tf.each do |e|
64
+ print "TF: #{e[0]} ... \t"
65
+ df = Ugigi.total_count(:free => e[0], :sswp => 0, :compe => 0)
66
+ if df == 0
67
+ print "N/A\n"
68
+ tfidf_list << [e[0], 0]
69
+ next
70
+ end
71
+ print "DF: #{df} \t"
72
+ tfidf = e[1] * Math.log(n/df)
73
+ print "TF-IDF: #{tfidf}\n"
74
+ tfidf_list << [e[0], tfidf]
75
+ end
76
+ tfidf_list = tfidf_list.sort{|a, b| b[1] <=> a[1]}
77
+
78
+ puts "集計終わり!"
79
+
80
+ 10.times do |n|
81
+ l = tfidf_list[n]
82
+ puts "#{n+1}. #{l[0]} \tTF-IDF: #{l[1]}"
83
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sosowa
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-10 00:00:00.000000000Z
12
+ date: 2012-07-11 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70307433904340 !ruby/object:Gem::Requirement
16
+ requirement: &70146581283880 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70307433904340
24
+ version_requirements: *70146581283880
25
25
  description: Sosowa Parser for Ruby
26
26
  email:
27
27
  - oame@oameya.com
@@ -39,6 +39,7 @@ files:
39
39
  - lib/sosowa/parser.rb
40
40
  - lib/sosowa/scheme.rb
41
41
  - lib/sosowa/version.rb
42
+ - samples/tf-idf.rb
42
43
  - samples/token_segment.rb
43
44
  - sosowa.gemspec
44
45
  homepage: ''