analy_z 0.1.5 → 0.1.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: dab7044b4666701f270c5441f871c3567cac1123
-   data.tar.gz: 00444c7e412b8931688633b0eb93485d66d8341f
+   metadata.gz: df13178fbd1dcad7f742f0276f16afaa8fd1097a
+   data.tar.gz: 18409354de2247a9f98721b7c5ab64384419f6c2
  SHA512:
-   metadata.gz: c823771d1c96c3e7b890256c5946b34a1f91376b66f8b381c35d47cdb5c048a775d064b491c946d1307a4ffac5f7031df6fe7470d7bc9a5d6a05d06ebd7a2a17
-   data.tar.gz: 25b1dadad75ec7382756047cf14f3ce2988a7b5f14a0f2d8af75c5ebb0b294a893753b4afd9c1f15dfae6a0c9c38901f8aaa73dd72ded105a806d019e5ee8f54
+   metadata.gz: ed74cb9fe090d407a18ece87f5e84dfea9178c5825cbbb34f2340f3b6b16de2dec23dad566760c14b446126294e8bf3816ad80e4ebac2fe6607001d195925c10
+   data.tar.gz: b875b6ca63249366070cfbf74b60b24c8ba42c868493ba26a516d70f737e272fa1b0637f6a0d77d85b1ea9a6104c603f591c9ac45bc28f7732575a84c55d2aca
data/README.md CHANGED
@@ -1,8 +1,34 @@
  # AnalyZ
 
- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/analy_z`. To experiment with that code, run `bin/console` for an interactive prompt.
+ This is a gem for text analysis.
+ Now you can compute the hse-tf-idf value for each word.
 
- TODO: Delete this and the text above, and describe your gem
+ ## What is hse-tf-idf
+
+ hse-tf-idf = hse * tf-idf
+
+ ### What is hse
+
+ Hse stands for HTML Semantic Element (valuation).
+ It evaluates an HTML tag and expresses its value as a number.
+
+ For example:
+
+ | tag name | font-size | font-weight | valuation |
+ |:--------:|:---------:|:-----------:|:---------:|
+ | h1       | 2         | 1.75        | 3.5       |
+ | h2       | 1.5       | 1.75        | 2.625     |
+ | h3       | 1.17      | 1.75        | 2.0475    |
+ | h4       | 1         | 1.75        | 1.75      |
+
+ `valuation = font-size * font-weight`
+
+ I'm still looking for better valuations.
+ Please let me know if you find a more suitable tag or style.
+
+ I want to add hse valuation logic for the following as well:
+ - font size set by CSS
+ - font color
 
  ## Installation
 
@@ -14,15 +40,36 @@ gem 'analy_z'
 
  And then execute:
 
- $ bundle
+ $ bundle install
 
  Or install it yourself as:
 
  $ gem install analy_z
 
+
  ## Usage
 
- TODO: Write usage instructions here
+ ```ruby
+ require 'analy_z'
+
+ # file_path : glob pattern for the files you want to analyze,
+ #             for example 'html/*.html'
+ #             NOTE: pass at least 2 files; with only 1 file,
+ #             analy_z can't calculate idf
+ # selector  : CSS selector for the part of the page you want to analyze,
+ #             for example '#main .content'
+
+ a = AnalyZ::HTML.word_val(file_path, selector)
+
+ a.tf         # tf
+ a.idf        # idf
+ a.tf_idf     # tf-idf
+ a.hse_tf_idf # hse-tf-idf
+ a.words      # the words analy_z analyzed
+ a.texts      # the texts analy_z analyzed
+ a.sentences  # the sentences analy_z analyzed
+
+ ```
 
  ## Development
 
@@ -32,7 +79,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 
  ## Contributing
 
- 1. Fork it ( https://github.com/[my-github-username]/analy_z/fork )
+ 1. Fork it ( https://github.com/nao215/analy_z/fork )
  2. Create your feature branch (`git checkout -b my-new-feature`)
  3. Commit your changes (`git commit -am 'Add some feature'`)
  4. Push to the branch (`git push origin my-new-feature`)
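To make the hse valuation in the README above concrete, here is a minimal sketch of how a heading rate scales a tf-idf score, assuming the rates from the table. `HSE_RATES` and the `hse_tf_idf` helper are illustrative names for this note only; in the gem itself the rates are hard-coded in `WordVal#parse_html` and applied in `WordVal#calc_hse_tf_idf` (see the new `lib/analy_z/html/word_val.rb` below).

```ruby
# Illustrative only: each rate is font-size * font-weight for the default h1-h4 styles.
HSE_RATES = {
  'h1' => 2.0  * 1.75, # 3.5
  'h2' => 1.5  * 1.75, # 2.625
  'h3' => 1.17 * 1.75, # 2.0475
  'h4' => 1.0  * 1.75  # 1.75
}.freeze

# hse-tf-idf = hse * tf-idf: a word's tf-idf is boosted by the rate of the
# heading it appears in, and left unchanged (rate 1) for ordinary text.
def hse_tf_idf(tf_idf, tag = nil)
  tf_idf * HSE_RATES.fetch(tag, 1)
end

hse_tf_idf(0.12, 'h2') # => 0.315 (0.12 * 2.625)
hse_tf_idf(0.12)       # => 0.12
```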
data/lib/analy_z/html/word_val.rb ADDED
@@ -0,0 +1,179 @@
+ module AnalyZ
+
+   module HTML
+
+     class WordVal
+
+       attr_accessor :tf
+       attr_accessor :idf
+       attr_accessor :tf_idf
+       attr_accessor :hse_tf_idf
+       attr_accessor :words
+       attr_accessor :texts
+       attr_accessor :sentences
+
+       def initialize html_path, selector = 'body', type_ary = ['名詞']
+         @sentences = {}
+         Dir.glob(html_path).each do |f|
+           print '.'
+           @sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
+         end
+
+         puts "\n=== creating sentences file ==="
+         txt = ""
+         @sentences.each do |k, sentences|
+           print '.'
+           txt << sentences.map{|s| s[0] }.join + '/=== EOS ===/'
+         end
+
+         FileUtils.mkdir_p('tmp') unless FileTest.exist?('tmp')
+         text_file_path = "tmp/#{DateTime.now}.txt"
+         File.write(text_file_path, txt)
+
+         puts "\n=== analyzing... ==="
+         analyze_words(@sentences, text_file_path)
+       end
+
+       def analyze_words sentences, text_file_path, type_ary = ['名詞']
+
+         @words, @tf, @idf, @hse = {}, {}, {}, {}
+
+         puts "=== calculating tf and idf and hse ==="
+         sentences.each do |key, sentence_ary|
+           print '.'
+           text = sentence_ary.map {|s| s[0] }.join
+           @words[key] = parse_by_natto(text, type_ary)
+           @tf[key] = calc_tf(@words[key])
+           @idf[key] = calc_idf(@words[key], text_file_path)
+           @hse[key] = calc_hse(@words[key], sentence_ary)
+         end
+
+         puts "\n=== calculating tf idf ==="
+         @tf_idf = calc_tf_idf(@tf, @idf)
+
+         puts "=== calculating hse tf idf ==="
+         @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
+
+       end
+
+       def parse_html html
+         sentences, important_tags = [], []
+         tag_rep = /<(".*?"|'.*?'|[^'"])*?>/
+         h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
+
+         important_tags = html.scan(h_tag_reg)
+                              .map{|m| [m.gsub(/<(".*?"|'.*?'|[^'"])*?>/, ''), m.match(/[hH][1-4]/)[0] ]}
+
+         sentences = html.gsub(/\"/, '')
+                         .split(tag_rep)
+                         .delete_if{|el| el != ['h1', 'h2', 'h3', 'h4'] && el =~ /\s/ || el.length <= 1}
+                         .map{|m| [m, 1]}
+
+         sentences.each_with_index do |sentence, i|
+           important_tags.each do |tag_data|
+             rate = 2 * 1.75 if tag_data[1] == 'h1'
+             rate = 1.5 * 1.75 if tag_data[1] == 'h2'
+             rate = 1.17 * 1.75 if tag_data[1] == 'h3'
+             rate = 1.17 * 1.75 if tag_data[1] == 'h4'
+             sentences[i][1] = rate if sentence[0].include?(tag_data[0]) || tag_data[0].include?(sentence[0])
+           end
+         end
+
+         sentences
+
+       end
+
+       def parse_by_natto text, type_ary
+         words = []
+
+         Natto::MeCab.new.parse(text).split(/\n/).map do |row|
+           row = row.split(/\t|,/)
+           words << row[0] if type_ary.include?(row[1]) # row[0] is word, row[1] is a part of speech
+         end
+
+         words
+       end
+
+       def calc_tf words
+         freq_hash = {}
+
+         words.each_with_index do |word, i|
+           freq_hash[word] = freq_hash.has_key?(word) ? freq_hash[word] + 1 : 1
+         end
+
+         tf_list = freq_hash.sort_by {|k, v| v }.reverse.map do |k, v|
+           [k, v / words.length.to_f]
+         end
+
+         tf_list
+       end
+
+       def standardization_tf tf_ary_list, ave_word_num
+         return tf_ary_list.map do |tf_ary|
+           tf_ary.map do |tf|
+             [tf[0], tf[1] * (tf_ary.length / ave_word_num.to_f), tf_ary.length / ave_word_num.to_f]
+           end
+         end
+       end
+
+       def calc_idf words, text_file_path
+         texts = File.read(text_file_path).split('/=== EOS ===/')
+         words.map do |word|
+           cnt = 0
+           texts.each do |text|
+             cnt += 1 if text.include?(word)
+           end
+           [word, Math.log(sentences.length / cnt.to_f)]
+         end
+       end
+
+       def calc_hse words, sentence_ary
+         sentence_ary = sentence_ary.select{|sentence| sentence[1] != 1}
+         words.map do |word|
+           rate = 1
+           sentence_ary.each do |sentence|
+             rate = sentence[1] if sentence[0].include?(word[0])
+           end
+           [word, rate]
+         end.uniq
+       end
+
+       def calc_tf_idf tf_list_hash, idf_list_hash
+
+         tf_idfs = {}
+
+         tf_list_hash.each do |k, tf|
+           tf_idf = []
+           idf_list_hash[k].each do |idf|
+             tf_idf << [idf[0], idf[1] * tf.assoc(idf[0])[1]]
+           end
+           tf_idfs[k] = tf_idf.sort{ |a, b| b[1] <=> a[1] }.uniq
+         end
+
+         tf_idfs
+
+       end
+
+       def calc_hse_tf_idf tf_idf_list_hash, hse
+
+         hse_tf_idf = {}
+
+         hse.each do |k, h|
+           hse[k] = hse[k].select {|h| h[1] != 1 }
+         end
+
+         tf_idf_list_hash.each do |k, tf_idf_list|
+           hse_tf_idf[k] = tf_idf_list.map do |tf_idf|
+             rate = hse[k].assoc(tf_idf[0]) ? hse[k].assoc(tf_idf[0])[1] : 1
+             [tf_idf[0], tf_idf[1] * rate]
+           end
+         end
+
+         hse_tf_idf
+       end
+
+     end
+
+   end
+
+ end
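A side note on the "at least 2 files" advice in the README: `calc_idf` above computes `Math.log(sentences.length / cnt.to_f)`, where `sentences.length` is the number of files and `cnt` is how many of them contain the word. With a single file every word yields `log(1/1) = 0`, so all tf-idf scores collapse to zero. A quick check of that arithmetic in plain Ruby (the lambda is only for this note, not part of the gem):

```ruby
# idf as computed in WordVal#calc_idf: log(number of files / files containing the word)
idf = ->(n_files, n_files_with_word) { Math.log(n_files / n_files_with_word.to_f) }

idf.call(1, 1) # => 0.0     every word scores 0 with a single file
idf.call(3, 1) # => 1.0986  rarer words get a higher weight
idf.call(3, 3) # => 0.0     words that appear in every file carry no weight
```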
data/lib/analy_z/html.rb ADDED
@@ -0,0 +1,19 @@
+ module AnalyZ
+
+   class << self
+     def HTML html_path, selector = 'body', type_ary = ['名詞']
+       AnalyZ::HTML
+     end
+
+   end
+
+   module HTML
+
+     def self.word_val html_path, selector = 'body', type_ary = ['名詞']
+       WordVal.new(html_path, selector, type_ary)
+     end
+
+   end
+
+ end
+
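For readers skimming `AnalyZ::HTML.word_val` above: the main accessors on the returned `WordVal` (`tf`, `idf`, `tf_idf`, `hse_tf_idf`) each hold a Hash keyed by file path whose values are arrays of `[word, score]` pairs, while `sentences` maps each file to `[text, rate]` pairs. A hedged illustration of those shapes; the file names, words, and numbers are invented:

```ruby
a = AnalyZ::HTML.word_val('html/*.html', '#main')

a.tf         # { "html/a.html" => [["word", 0.04], ...], ... }  term frequency, highest first
a.tf_idf     # { "html/a.html" => [["word", 0.02], ...], ... }  tf * idf, highest first
a.hse_tf_idf # { "html/a.html" => [["word", 0.05], ...], ... }  tf-idf scaled by the heading rate
a.sentences  # { "html/a.html" => [["Heading text", 2.625], ["Body text", 1], ...] }
```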
data/lib/analy_z/version.rb CHANGED
@@ -1,3 +1,3 @@
  module AnalyZ
-   VERSION = "0.1.5"
+   VERSION = "0.1.6"
  end
data/lib/analy_z.rb CHANGED
@@ -1,183 +1,13 @@
  require "analy_z/version"
+ require 'pp'
+ require 'date'
+ require 'natto'
+ require 'nokogiri'
+ require 'fileutils'
 
- module AnalyZ
-
-   class Analyzer
-
-     require 'pp'
-     require 'date'
-     require 'natto'
-     require 'nokogiri'
-     require 'fileutils'
-
-     attr_accessor :tf
-     attr_accessor :idf
-     attr_accessor :tf_idf
-     attr_accessor :hse_tf_idf
-     attr_accessor :words
-     attr_accessor :texts
-     attr_accessor :sentences
-
-     def initialize html_path, selector = 'body', type_ary = ['名詞']
-       @sentences = {}
-       Dir.glob(html_path).each do |f|
-         print '.'
-         @sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
-       end
-
-       puts "\n=== creating sentences file ==="
-       txt = ""
-       @sentences.each do |k, sentences|
-         print '.'
-         txt << sentences.map{|s| s[0] }.join + '/=== EOS ===/'
-       end
-
-       FileUtils.mkdir_p('tmp') unless FileTest.exist?('tmp')
-       text_file_path = "tmp/#{DateTime.now}.txt"
-       File.write(text_file_path, txt)
-
-       puts "\n=== analyzing... ==="
-       analyze_words(@sentences, text_file_path)
-     end
-
-     def analyze_words sentences, text_file_path, type_ary = ['名詞']
-
-       @words, @tf, @idf, @hse = {}, {}, {}, {}
-
-       puts "=== calculating tf and idf and hse ==="
-       sentences.each do |key, sentence_ary|
-         print '.'
-         text = sentence_ary.map {|s| s[0] }.join
-         @words[key] = parse_by_natto(text, type_ary)
-         @tf[key] = calc_tf(@words[key])
-         @idf[key] = calc_idf(@words[key], text_file_path)
-         @hse[key] = calc_hse(@words[key], sentence_ary)
-       end
-
-       puts "\n=== calculating tf idf ==="
-       @tf_idf = calc_tf_idf(@tf, @idf)
-
-       puts "=== calculating hse tf idf ==="
-       @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
-
-     end
-
-     def parse_html html
-       sentences, important_tags = [], []
-       tag_rep = /<(".*?"|'.*?'|[^'"])*?>/
-       h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
-
-       important_tags = html.scan(h_tag_reg)
-                            .map{|m| [m.gsub(/<(".*?"|'.*?'|[^'"])*?>/, ''), m.match(/[hH][1-4]/)[0] ]}
-
-       sentences = html.gsub(/\"/, '')
-                       .split(tag_rep)
-                       .delete_if{|el| el != ['h1', 'h2', 'h3', 'h4'] && el =~ /\s/ || el.length <= 1}
-                       .map{|m| [m, 1]}
-
-       sentences.each_with_index do |sentence, i|
-         important_tags.each do |tag_data|
-           rate = 2 * 1.75 if tag_data[1] == 'h1'
-           rate = 1.5 * 1.75 if tag_data[1] == 'h2'
-           rate = 1.17 * 1.75 if tag_data[1] == 'h3'
-           rate = 1.17 * 1.75 if tag_data[1] == 'h4'
-           sentences[i][1] = rate if sentence[0].include?(tag_data[0]) || tag_data[0].include?(sentence[0])
-         end
-       end
-
-       sentences
-
-     end
-
-     def parse_by_natto text, type_ary
-       words = []
-
-       Natto::MeCab.new.parse(text).split(/\n/).map do |row|
-         row = row.split(/\t|,/)
-         words << row[0] if type_ary.include?(row[1]) # row[0] is word, row[1] is a part of speech
-       end
-
-       words
-     end
-
-     def calc_tf words
-       freq_hash = {}
-
-       words.each_with_index do |word, i|
-         freq_hash[word] = freq_hash.has_key?(word) ? freq_hash[word] + 1 : 1
-       end
-
-       tf_list = freq_hash.sort_by {|k, v| v }.reverse.map do |k, v|
-         [k, v / words.length.to_f]
-       end
-
-       tf_list
-     end
-
-     def standardization_tf tf_ary_list, ave_word_num
-       return tf_ary_list.map do |tf_ary|
-         tf_ary.map do |tf|
-           [tf[0], tf[1] * (tf_ary.length / ave_word_num.to_f), tf_ary.length / ave_word_num.to_f]
-         end
-       end
-     end
-
-     def calc_idf words, text_file_path
-       texts = File.read(text_file_path).split('/=== EOS ===/')
-       words.map do |word|
-         cnt = 0
-         texts.each do |text|
-           cnt += 1 if text.include?(word)
-         end
-         [word, Math.log(sentences.length / cnt.to_f)]
-       end
-     end
-
-     def calc_hse words, sentence_ary
-       sentence_ary = sentence_ary.select{|sentence| sentence[1] != 1}
-       words.map do |word|
-         rate = 1
-         sentence_ary.each do |sentence|
-           rate = sentence[1] if sentence[0].include?(word[0])
-         end
-         [word, rate]
-       end.uniq
-     end
-
-     def calc_tf_idf tf_list_hash, idf_list_hash
-
-       tf_idfs = {}
-
-       tf_list_hash.each do |k, tf|
-         tf_idf = []
-         idf_list_hash[k].each do |idf|
-           tf_idf << [idf[0], idf[1] * tf.assoc(idf[0])[1]]
-         end
-         tf_idfs[k] = tf_idf.sort{ |a, b| b[1] <=> a[1] }.uniq
-       end
-
-       tf_idfs
-
-     end
-
-     def calc_hse_tf_idf tf_idf_list_hash, hse
-
-       hse_tf_idf = {}
-
-       hse.each do |k, h|
-         hse[k] = hse[k].select {|h| h[1] != 1 }
-       end
-
-       tf_idf_list_hash.each do |k, tf_idf_list|
-         hse_tf_idf[k] = tf_idf_list.map do |tf_idf|
-           rate = hse[k].assoc(tf_idf[0]) ? hse[k].assoc(tf_idf[0])[1] : 1
-           [tf_idf[0], tf_idf[1] * rate]
-         end
-       end
-
-       hse_tf_idf
-     end
-
-   end
+ require 'analy_z/html'
+ require 'analy_z/html/word_val'
+ require 'analy_z/html/similarity'
 
+ module AnalyZ
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: analy_z
  version: !ruby/object:Gem::Version
-   version: 0.1.5
+   version: 0.1.6
  platform: ruby
  authors:
  - nao215
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2016-04-30 00:00:00.000000000 Z
+ date: 2016-05-04 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: natto
@@ -69,6 +69,8 @@ files:
  - bin/console
  - bin/setup
  - lib/analy_z.rb
+ - lib/analy_z/html.rb
+ - lib/analy_z/html/word_val.rb
  - lib/analy_z/version.rb
  homepage: ''
  licenses: []