analy_z 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: dab7044b4666701f270c5441f871c3567cac1123
-   data.tar.gz: 00444c7e412b8931688633b0eb93485d66d8341f
+   metadata.gz: df13178fbd1dcad7f742f0276f16afaa8fd1097a
+   data.tar.gz: 18409354de2247a9f98721b7c5ab64384419f6c2
  SHA512:
-   metadata.gz: c823771d1c96c3e7b890256c5946b34a1f91376b66f8b381c35d47cdb5c048a775d064b491c946d1307a4ffac5f7031df6fe7470d7bc9a5d6a05d06ebd7a2a17
-   data.tar.gz: 25b1dadad75ec7382756047cf14f3ce2988a7b5f14a0f2d8af75c5ebb0b294a893753b4afd9c1f15dfae6a0c9c38901f8aaa73dd72ded105a806d019e5ee8f54
+   metadata.gz: ed74cb9fe090d407a18ece87f5e84dfea9178c5825cbbb34f2340f3b6b16de2dec23dad566760c14b446126294e8bf3816ad80e4ebac2fe6607001d195925c10
+   data.tar.gz: b875b6ca63249366070cfbf74b60b24c8ba42c868493ba26a516d70f737e272fa1b0637f6a0d77d85b1ea9a6104c603f591c9ac45bc28f7732575a84c55d2aca
data/README.md CHANGED
@@ -1,8 +1,34 @@
  # AnalyZ

- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/analy_z`. To experiment with that code, run `bin/console` for an interactive prompt.
+ This is a gem for text analysis.
+ It lets you compute an hse-tf-idf value for each word.

- TODO: Delete this and the text above, and describe your gem
+ ## What is hse-tf-idf
+
+ hse-tf-idf = hse * tf-idf
+
+ ### What is hse
+
+ Hse stands for HTML Semantic Element (valuation).
+ It evaluates HTML tags and expresses their importance as a number.
+
+ For example:
+
+ | tag name | font-size | font-weight | valuation |
+ |:--------:|:---------:|:-----------:|:---------:|
+ | h1       | 2         | 1.75        | 3.5       |
+ | h2       | 1.5       | 1.75        | 2.625     |
+ | h3       | 1.17      | 1.75        | 2.0475    |
+ | h4       | 1         | 1.75        | 1.75      |
+
+ `valuation = font-size * font-weight`
+
+ I'm still looking for better valuations.
+ Please let me know if you find a more useful tag or style.
+
+ I want to add hse valuation logic for the following:
+ - font size set in CSS
+ - font color

  ## Installation

@@ -14,15 +40,36 @@ gem 'analy_z'
  And then execute:

- $ bundle
+ $ bundle install

  Or install it yourself as:

  $ gem install analy_z

+
  ## Usage

- TODO: Write usage instructions here
+ ```ruby
+ require 'analy_z'
+
+ # file_path : path (glob pattern) for the files you want to analyze,
+ #             for example 'html/*.html'
+ #             NOTE: pass at least 2 files; with only 1 file,
+ #             analy_z can't calculate idf
+ # selector  : CSS selector for the part of the page you want to analyze,
+ #             for example '#main .content'
+
+ a = AnalyZ::HTML.word_val(file_path, selector)
+
+ a.tf         # tf
+ a.idf        # idf
+ a.tf_idf     # tf-idf
+ a.hse_tf_idf # hse-tf-idf
+ a.words      # words analy_z analyzed
+ a.texts      # texts analy_z analyzed
+ a.sentences  # sentences analy_z analyzed
+
+ ```

  ## Development

@@ -32,7 +79,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To

  ## Contributing

- 1. Fork it ( https://github.com/[my-github-username]/analy_z/fork )
+ 1. Fork it ( https://github.com/nao215/analy_z/fork )
  2. Create your feature branch (`git checkout -b my-new-feature`)
  3. Commit your changes (`git commit -am 'Add some feature'`)
  4. Push to the branch (`git push origin my-new-feature`)
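
To make the hse arithmetic from the README above concrete, here is a minimal Ruby sketch. It is not part of the gem: the `hse_valuation` helper and the 0.042 tf-idf score are invented for illustration, while the factors come from the README's table and its `valuation = font-size * font-weight` formula.

```ruby
# Illustration of the README's hse formulas; not code shipped in analy_z.
FONT_SIZE   = { 'h1' => 2, 'h2' => 1.5, 'h3' => 1.17, 'h4' => 1 }
FONT_WEIGHT = 1.75

# valuation = font-size * font-weight; plain text keeps a neutral weight of 1
def hse_valuation(tag)
  return 1 unless FONT_SIZE.key?(tag)
  FONT_SIZE[tag] * FONT_WEIGHT
end

hse_valuation('h1')          # => 3.5
hse_valuation('h2')          # => 2.625

tf_idf = 0.042               # made-up tf-idf score for some word
hse_valuation('h2') * tf_idf # hse-tf-idf => 0.11025
```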
data/lib/analy_z/html/word_val.rb ADDED
@@ -0,0 +1,179 @@
+ module AnalyZ
+
+   module HTML
+
+     class WordVal
+
+       attr_accessor :tf
+       attr_accessor :idf
+       attr_accessor :tf_idf
+       attr_accessor :hse_tf_idf
+       attr_accessor :words
+       attr_accessor :texts
+       attr_accessor :sentences
+
+       def initialize html_path, selector = 'body', type_ary = ['名詞']
+         @sentences = {}
+         Dir.glob(html_path).each do |f|
+           print '.'
+           @sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
+         end
+
+         puts "\n=== creating sentences file ==="
+         txt = ""
+         @sentences.each do |k, sentences|
+           print '.'
+           txt << sentences.map{|s| s[0] }.join + '/=== EOS ===/'
+         end
+
+         FileUtils.mkdir_p('tmp') unless FileTest.exist?('tmp')
+         text_file_path = "tmp/#{DateTime.now}.txt"
+         File.write(text_file_path, txt)
+
+         puts "\n=== analyzing... ==="
+         analyze_words(@sentences, text_file_path)
+       end
+
+       def analyze_words sentences, text_file_path, type_ary = ['名詞']
+
+         @words, @tf, @idf, @hse = {}, {}, {}, {}
+
+         puts "=== calculating tf and idf and hse ==="
+         sentences.each do |key, sentence_ary|
+           print '.'
+           text = sentence_ary.map {|s| s[0] }.join
+           @words[key] = parse_by_natto(text, type_ary)
+           @tf[key] = calc_tf(@words[key])
+           @idf[key] = calc_idf(@words[key], text_file_path)
+           @hse[key] = calc_hse(@words[key], sentence_ary)
+         end
+
+         puts "\n=== calculating tf idf ==="
+         @tf_idf = calc_tf_idf(@tf, @idf)
+
+         puts "=== calculating hse tf idf ==="
+         @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
+
+       end
+
+       def parse_html html
+         sentences, important_tags = [], []
+         tag_rep = /<(".*?"|'.*?'|[^'"])*?>/
+         h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
+
+         important_tags = html.scan(h_tag_reg)
+                              .map{|m| [m.gsub(/<(".*?"|'.*?'|[^'"])*?>/, ''), m.match(/[hH][1-4]/)[0] ]}
+
+         sentences = html.gsub(/\"/, '')
+                         .split(tag_rep)
+                         .delete_if{|el| el != ['h1', 'h2', 'h3', 'h4'] && el =~ /\s/ || el.length <= 1}
+                         .map{|m| [m, 1]}
+
+         sentences.each_with_index do |sentence, i|
+           important_tags.each do |tag_data|
+             rate = 2 * 1.75 if tag_data[1] == 'h1'
+             rate = 1.5 * 1.75 if tag_data[1] == 'h2'
+             rate = 1.17 * 1.75 if tag_data[1] == 'h3'
+             rate = 1.17 * 1.75 if tag_data[1] == 'h4'
+             sentences[i][1] = rate if sentence[0].include?(tag_data[0]) || tag_data[0].include?(sentence[0])
+           end
+         end
+
+         sentences
+
+       end
+
+       def parse_by_natto text, type_ary
+         words = []
+
+         Natto::MeCab.new.parse(text).split(/\n/).map do |row|
+           row = row.split(/\t|,/)
+           words << row[0] if type_ary.include?(row[1]) # row[0] is word, row[1] is a part of speech
+         end
+
+         words
+       end
+
+       def calc_tf words
+         freq_hash = {}
+
+         words.each_with_index do |word, i|
+           freq_hash[word] = freq_hash.has_key?(word) ? freq_hash[word] + 1 : 1
+         end
+
+         tf_list = freq_hash.sort_by {|k, v| v }.reverse.map do |k, v|
+           [k, v / words.length.to_f]
+         end
+
+         tf_list
+       end
+
+       def standardization_tf tf_ary_list, ave_word_num
+         return tf_ary_list.map do |tf_ary|
+           tf_ary.map do |tf|
+             [tf[0], tf[1] * (tf_ary.length / ave_word_num.to_f), tf_ary.length / ave_word_num.to_f]
+           end
+         end
+       end
+
+       def calc_idf words, text_file_path
+         texts = File.read(text_file_path).split('/=== EOS ===/')
+         words.map do |word|
+           cnt = 0
+           texts.each do |text|
+             cnt += 1 if text.include?(word)
+           end
+           [word, Math.log(sentences.length / cnt.to_f)]
+         end
+       end
+
+       def calc_hse words, sentence_ary
+         sentence_ary = sentence_ary.select{|sentence| sentence[1] != 1}
+         words.map do |word|
+           rate = 1
+           sentence_ary.each do |sentence|
+             rate = sentence[1] if sentence[0].include?(word[0])
+           end
+           [word, rate]
+         end.uniq
+       end
+
+       def calc_tf_idf tf_list_hash, idf_list_hash
+
+         tf_idfs = {}
+
+         tf_list_hash.each do |k, tf|
+           tf_idf = []
+           idf_list_hash[k].each do |idf|
+             tf_idf << [idf[0], idf[1] * tf.assoc(idf[0])[1]]
+           end
+           tf_idfs[k] = tf_idf.sort{ |a, b| b[1] <=> a[1] }.uniq
+         end
+
+         tf_idfs
+
+       end
+
+       def calc_hse_tf_idf tf_idf_list_hash, hse
+
+         hse_tf_idf = {}
+
+         hse.each do |k, h|
+           hse[k] = hse[k].select {|h| h[1] != 1 }
+         end
+
+         tf_idf_list_hash.each do |k, tf_idf_list|
+           hse_tf_idf[k] = tf_idf_list.map do |tf_idf|
+             rate = hse[k].assoc(tf_idf[0]) ? hse[k].assoc(tf_idf[0])[1] : 1
+             [tf_idf[0], tf_idf[1] * rate]
+           end
+         end
+
+         hse_tf_idf
+       end
+
+     end
+
+   end
+
+ end
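
Reading `WordVal` above: for each input file, tf is a word's count over the total number of words extracted from that file, idf is the log of the number of files divided by the number of files containing the word, tf-idf is their product, and hse-tf-idf multiplies tf-idf by the rate that `parse_html` assigned to the heading containing the word. Note that `parse_html` applies the same 1.17 factor to h4 as to h3, while the README table lists 1 for h4. A self-contained toy recomputation of that arithmetic (invented data, not gem code):

```ruby
# Toy recomputation of the scores WordVal produces; data is invented.
docs = {
  'a.html' => %w[猫 犬 猫],   # words extracted from a.html
  'b.html' => %w[犬 鳥]       # words extracted from b.html
}

word    = '猫'
words_a = docs['a.html']

tf  = words_a.count(word) / words_a.length.to_f                                  # 2/3
idf = Math.log(docs.length / docs.values.count { |ws| ws.include?(word) }.to_f)  # log(2/1)
hse = 2 * 1.75                                                                   # word sits inside an <h1>

tf * idf * hse   # hse-tf-idf => roughly 1.62
```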
data/lib/analy_z/html.rb ADDED
@@ -0,0 +1,19 @@
+ module AnalyZ
+
+   class << self
+     def HTML html_path, selector = 'body', type_ary = ['名詞']
+       AnalyZ::HTML
+     end
+
+   end
+
+   module HTML
+
+     def self.word_val html_path, selector = 'body', type_ary = ['名詞']
+       WordVal.new(html_path, selector, type_ary)
+     end
+
+   end
+
+ end
+
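
For orientation, here is a hedged sketch of how results from `AnalyZ::HTML.word_val` (the factory defined just above, shown in the README usage section) can be inspected. It relies only on the structures visible in `WordVal`: hashes keyed by input file path, holding `[word, score]` pairs. The file names and scores below are invented:

```ruby
# Illustrative only; paths and scores are made up.
a = AnalyZ::HTML.word_val('html/*.html', '#main .content')

a.hse_tf_idf.each do |file, pairs|
  # e.g. "html/page1.html" => [["見出し", 0.31], ["単語", 0.12], ...]
  top_word, score = pairs.first   # pairs keep the descending tf-idf order
  puts "#{file}: #{top_word} (#{score.round(3)})"
end
```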
data/lib/analy_z/version.rb CHANGED
@@ -1,3 +1,3 @@
  module AnalyZ
-   VERSION = "0.1.5"
+   VERSION = "0.1.6"
  end
data/lib/analy_z.rb CHANGED
@@ -1,183 +1,13 @@
  require "analy_z/version"
+ require 'pp'
+ require 'date'
+ require 'natto'
+ require 'nokogiri'
+ require 'fileutils'

- module AnalyZ
-
-   class Analyzer
-
-     require 'pp'
-     require 'date'
-     require 'natto'
-     require 'nokogiri'
-     require 'fileutils'
-
-     attr_accessor :tf
-     attr_accessor :idf
-     attr_accessor :tf_idf
-     attr_accessor :hse_tf_idf
-     attr_accessor :words
-     attr_accessor :texts
-     attr_accessor :sentences
-
-     def initialize html_path, selector = 'body', type_ary = ['名詞']
-       @sentences = {}
-       Dir.glob(html_path).each do |f|
-         print '.'
-         @sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
-       end
-
-       puts "\n=== creating sentences file ==="
-       txt = ""
-       @sentences.each do |k, sentences|
-         print '.'
-         txt << sentences.map{|s| s[0] }.join + '/=== EOS ===/'
-       end
-
-       FileUtils.mkdir_p('tmp') unless FileTest.exist?('tmp')
-       text_file_path = "tmp/#{DateTime.now}.txt"
-       File.write(text_file_path, txt)
-
-       puts "\n=== analyzing... ==="
-       analyze_words(@sentences, text_file_path)
-     end
-
-     def analyze_words sentences, text_file_path, type_ary = ['名詞']
-
-       @words, @tf, @idf, @hse = {}, {}, {}, {}
-
-       puts "=== calculating tf and idf and hse ==="
-       sentences.each do |key, sentence_ary|
-         print '.'
-         text = sentence_ary.map {|s| s[0] }.join
-         @words[key] = parse_by_natto(text, type_ary)
-         @tf[key] = calc_tf(@words[key])
-         @idf[key] = calc_idf(@words[key], text_file_path)
-         @hse[key] = calc_hse(@words[key], sentence_ary)
-       end
-
-       puts "\n=== calculating tf idf ==="
-       @tf_idf = calc_tf_idf(@tf, @idf)
-
-       puts "=== calculating hse tf idf ==="
-       @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
-
-     end
-
-     def parse_html html
-       sentences, important_tags = [], []
-       tag_rep = /<(".*?"|'.*?'|[^'"])*?>/
-       h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
-
-       important_tags = html.scan(h_tag_reg)
-                            .map{|m| [m.gsub(/<(".*?"|'.*?'|[^'"])*?>/, ''), m.match(/[hH][1-4]/)[0] ]}
-
-       sentences = html.gsub(/\"/, '')
-                       .split(tag_rep)
-                       .delete_if{|el| el != ['h1', 'h2', 'h3', 'h4'] && el =~ /\s/ || el.length <= 1}
-                       .map{|m| [m, 1]}
-
-       sentences.each_with_index do |sentence, i|
-         important_tags.each do |tag_data|
-           rate = 2 * 1.75 if tag_data[1] == 'h1'
-           rate = 1.5 * 1.75 if tag_data[1] == 'h2'
-           rate = 1.17 * 1.75 if tag_data[1] == 'h3'
-           rate = 1.17 * 1.75 if tag_data[1] == 'h4'
-           sentences[i][1] = rate if sentence[0].include?(tag_data[0]) || tag_data[0].include?(sentence[0])
-         end
-       end
-
-       sentences
-
-     end
-
-     def parse_by_natto text, type_ary
-       words = []
-
-       Natto::MeCab.new.parse(text).split(/\n/).map do |row|
-         row = row.split(/\t|,/)
-         words << row[0] if type_ary.include?(row[1]) # row[0] is word, row[1] is a part of speech
-       end
-
-       words
-     end
-
-     def calc_tf words
-       freq_hash = {}
-
-       words.each_with_index do |word, i|
-         freq_hash[word] = freq_hash.has_key?(word) ? freq_hash[word] + 1 : 1
-       end
-
-       tf_list = freq_hash.sort_by {|k, v| v }.reverse.map do |k, v|
-         [k, v / words.length.to_f]
-       end
-
-       tf_list
-     end
-
-     def standardization_tf tf_ary_list, ave_word_num
-       return tf_ary_list.map do |tf_ary|
-         tf_ary.map do |tf|
-           [tf[0], tf[1] * (tf_ary.length / ave_word_num.to_f), tf_ary.length / ave_word_num.to_f]
-         end
-       end
-     end
-
-     def calc_idf words, text_file_path
-       texts = File.read(text_file_path).split('/=== EOS ===/')
-       words.map do |word|
-         cnt = 0
-         texts.each do |text|
-           cnt += 1 if text.include?(word)
-         end
-         [word, Math.log(sentences.length / cnt.to_f)]
-       end
-     end
-
-     def calc_hse words, sentence_ary
-       sentence_ary = sentence_ary.select{|sentence| sentence[1] != 1}
-       words.map do |word|
-         rate = 1
-         sentence_ary.each do |sentence|
-           rate = sentence[1] if sentence[0].include?(word[0])
-         end
-         [word, rate]
-       end.uniq
-     end
-
-     def calc_tf_idf tf_list_hash, idf_list_hash
-
-       tf_idfs = {}
-
-       tf_list_hash.each do |k, tf|
-         tf_idf = []
-         idf_list_hash[k].each do |idf|
-           tf_idf << [idf[0], idf[1] * tf.assoc(idf[0])[1]]
-         end
-         tf_idfs[k] = tf_idf.sort{ |a, b| b[1] <=> a[1] }.uniq
-       end
-
-       tf_idfs
-
-     end
-
-     def calc_hse_tf_idf tf_idf_list_hash, hse
-
-       hse_tf_idf = {}
-
-       hse.each do |k, h|
-         hse[k] = hse[k].select {|h| h[1] != 1 }
-       end
-
-       tf_idf_list_hash.each do |k, tf_idf_list|
-         hse_tf_idf[k] = tf_idf_list.map do |tf_idf|
-           rate = hse[k].assoc(tf_idf[0]) ? hse[k].assoc(tf_idf[0])[1] : 1
-           [tf_idf[0], tf_idf[1] * rate]
-         end
-       end
-
-       hse_tf_idf
-     end
-
-   end
+ require 'analy_z/html'
+ require 'analy_z/html/word_val'
+ require 'analy_z/html/similarity'

+ module AnalyZ
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: analy_z
  version: !ruby/object:Gem::Version
-   version: 0.1.5
+   version: 0.1.6
  platform: ruby
  authors:
  - nao215
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2016-04-30 00:00:00.000000000 Z
+ date: 2016-05-04 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: natto
@@ -69,6 +69,8 @@ files:
  - bin/console
  - bin/setup
  - lib/analy_z.rb
+ - lib/analy_z/html.rb
+ - lib/analy_z/html/word_val.rb
  - lib/analy_z/version.rb
  homepage: ''
  licenses: []