sosowa 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.2
2
+ * Sosowa::Novel.titleを追加。むしろどうして今まで無かった
3
+ * Sosowa::Author, Sosowa::Commentが取得出来ないバグを修正
4
+
1
5
  ## 0.1
2
6
  * Sosowa#searchを追加
3
7
  * Sosowa::Novel#plainを使って<br>タグや改行コードが取り除かれたテキストを得ることが出来ます。
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # Sosowa
2
2
 
3
- Sosowa Parser for Ruby 1.9.x
3
+ 創想話パーサー for Ruby 1.9.x
4
+ samples/に各種サンプルが入っています。
4
5
 
5
6
  ## Requirements
6
7
 
data/lib/sosowa/scheme.rb CHANGED
@@ -2,6 +2,10 @@ module Sosowa
2
2
  class Scheme
3
3
  protected
4
4
 
5
+ def initialize(element)
6
+ @element = element
7
+ end
8
+
5
9
  def method_missing(action, *args)
6
10
  return @element[action.to_s.to_sym] rescue nil
7
11
  end
@@ -24,6 +28,7 @@ module Sosowa
24
28
  def fetch(log, key)
25
29
  params = Sosowa.serialize_parameter({:mode => :read, :log => log, :key => key})
26
30
  @page = @agent.get(URI.join(Sosowa::BASE_URL, params))
31
+ title = (@page/%{div[@class="header"] > h1})[0].inner_html.to_s.toutf8.strip
27
32
  tags = (@page/%{dl[@class="info"][1] > dd > a}).map{|t| t.inner_html.to_s.toutf8 }
28
33
  text = (@page/%{div[@class="contents ss"]})[0].inner_html.to_s.toutf8
29
34
  ps = (@page/%{div[@class="aft"]})[0].inner_html.to_s.toutf8
@@ -48,6 +53,7 @@ module Sosowa
48
53
  comments << comment
49
54
  end
50
55
  novel = {
56
+ :title => title,
51
57
  :text => text,
52
58
  :ps => ps,
53
59
  :author => author,
@@ -93,12 +99,7 @@ module Sosowa
93
99
 
94
100
  end
95
101
 
96
- class Index < Scheme
97
- def initialize(element)
98
- super(@element)
99
- @element = element
100
- end
101
-
102
+ class Index < Scheme
102
103
  def fetch
103
104
  Novel.new(:log => @element[:log], :key => @element[:key])
104
105
  end
@@ -1,3 +1,3 @@
1
1
  module Sosowa
2
- VERSION = "0.1"
2
+ VERSION = "0.2"
3
3
  end
data/lib/sosowa.rb CHANGED
@@ -10,6 +10,8 @@ require "sosowa/parser"
10
10
 
11
11
  module Sosowa
12
12
  BASE_URL = "http://coolier.sytes.net:8080/sosowa/ssw_l/"
13
+
14
+ protected
13
15
 
14
16
  # @param [Hash] parameter
15
17
  # @return [String] URL Serialized parameters
@@ -22,6 +24,8 @@ module Sosowa
22
24
  param = ant.inject(""){|k,v|k+"&#{v[0]}=#{URI.escape(v[1])}"}.sub!(/^&/,"?")
23
25
  return param ? param : ""
24
26
  end
27
+
28
+ public
25
29
 
26
30
  def self.get(args={})
27
31
  args[:log] ||= 0
data/samples/tf-idf.rb ADDED
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+ # 創想話の最新版から適当なSSのテキストを取得してMeCab(+ 東方MeCab辞書)を用いて代表キーワード候補を名詞限定で選出し、TF-IDF法による特徴語抽出を行います。
4
+ # 注意: ugigi gemが必要です
5
+
6
+ require "MeCab"
7
+ require "kconv"
8
+ require "sosowa"
9
+ require "ugigi"
10
+
11
+ module MeCab
12
+ class Tagger
13
+ alias_method :parseToNode_org, :parseToNode
14
+ private :parseToNode_org
15
+
16
+ def parseToNode(*args)
17
+ node = parseToNode_org(*args)
18
+ nodes = []
19
+ while node
20
+ nodes.push(node)
21
+ node = node.next
22
+ end
23
+ return nodes[1, nodes.size - 2]
24
+ end
25
+ end
26
+
27
+ class Node
28
+ alias_method :feature_org, :feature
29
+ alias_method :surface_org, :surface
30
+ private :feature_org
31
+ private :surface_org
32
+
33
+ def feature ; feature_org.toutf8 end
34
+ def surface ; surface_org.toutf8 end
35
+ end
36
+ end
37
+
38
+ puts "東方MeCab辞書をダウンロード中..."
39
+ system("curl -L https://github.com/oame/thdic-mecab/raw/master/pkg/thdic-mecab.dic > thdic-mecab.dic")
40
+
41
+ puts "完了. MeCab::Taggerを初期化します"
42
+ mecab = MeCab::Tagger.new("-u thdic-mecab.dic")
43
+
44
+ novel = Sosowa.get.sample.fetch
45
+ puts "-"*30
46
+ puts novel.title
47
+ puts "作者: #{novel.author.name}"
48
+ puts "-"*30
49
+ text = novel.plain
50
+ tf = {}
51
+ n = 15646.0
52
+ puts "代表キーワード候補を抽出中..."
53
+ tokens = mecab.parseToNode(text)
54
+ tokens.each do |token|
55
+ next unless token.feature =~ /名詞/
56
+ tf[token.surface] ||= 0
57
+ tf[token.surface] += 1
58
+ end
59
+
60
+ puts "代表キーワード候補数: #{tf.size}"
61
+
62
+ tfidf_list = []
63
+ tf.each do |e|
64
+ print "TF: #{e[0]} ... \t"
65
+ df = Ugigi.total_count(:free => e[0], :sswp => 0, :compe => 0)
66
+ if df == 0
67
+ print "N/A\n"
68
+ tfidf_list << [e[0], 0]
69
+ next
70
+ end
71
+ print "DF: #{df} \t"
72
+ tfidf = e[1] * Math.log(n/df)
73
+ print "TF-IDF: #{tfidf}\n"
74
+ tfidf_list << [e[0], tfidf]
75
+ end
76
+ tfidf_list = tfidf_list.sort{|a, b| b[1] <=> a[1]}
77
+
78
+ puts "集計終わり!"
79
+
80
+ 10.times do |n|
81
+ l = tfidf_list[n]
82
+ puts "#{n+1}. #{l[0]} \tTF-IDF: #{l[1]}"
83
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sosowa
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-10 00:00:00.000000000Z
12
+ date: 2012-07-11 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70307433904340 !ruby/object:Gem::Requirement
16
+ requirement: &70146581283880 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70307433904340
24
+ version_requirements: *70146581283880
25
25
  description: Sosowa Parser for Ruby
26
26
  email:
27
27
  - oame@oameya.com
@@ -39,6 +39,7 @@ files:
39
39
  - lib/sosowa/parser.rb
40
40
  - lib/sosowa/scheme.rb
41
41
  - lib/sosowa/version.rb
42
+ - samples/tf-idf.rb
42
43
  - samples/token_segment.rb
43
44
  - sosowa.gemspec
44
45
  homepage: ''