analy_z 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/analy_z.rb +28 -8
- data/lib/analy_z/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 257d11a71d341c246d265954ebdd1f557332867d
|
4
|
+
data.tar.gz: 0ad929efdd028daa2509005473322951f9d37ba1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6da8e91ad8f952e7f2f0c9ee02c79557fda628f5b2995c60a119d0de91d71fd7937965c77d9ee2e7277fbaa24b1b83c82c20d95e4c1bb336a9b745da1d91e891
|
7
|
+
data.tar.gz: 448bbe2ae21c0bde8d4964e966cd6909de326cc63696b74a631d8a41b5923d292af8d0e6e6706fce47301cf9f73f3d1ae2f8f3435e601e9fad596e7d49a10a55
|
data/lib/analy_z.rb
CHANGED
@@ -5,6 +5,7 @@ module AnalyZ
|
|
5
5
|
class Analyzer
|
6
6
|
|
7
7
|
require 'pp'
|
8
|
+
require 'date'
|
8
9
|
require 'natto'
|
9
10
|
require 'nokogiri'
|
10
11
|
|
@@ -19,26 +20,42 @@ module AnalyZ
|
|
19
20
|
def initialize html_path, selector = 'body', type_ary = ['名詞']
|
20
21
|
@sentences = {}
|
21
22
|
Dir.glob(html_path).each do |f|
|
23
|
+
print '.'
|
22
24
|
@sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
|
23
25
|
end
|
24
|
-
analyze_words(@sentences)
|
25
|
-
end
|
26
26
|
|
27
|
-
|
27
|
+
puts "\n=== creating sentences file ==="
|
28
|
+
txt = ""
|
29
|
+
@sentences.each do |k, sentences|
|
30
|
+
print '.'
|
31
|
+
txt << sentences.map{|s| s[0] }.join + '/=== EOS ===/'
|
32
|
+
end
|
33
|
+
|
34
|
+
text_file_path = "tmp/#{DateTime.now}.txt"
|
35
|
+
File.write(text_file_path, txt)
|
28
36
|
|
29
|
-
|
37
|
+
puts "\n=== analyzing... ==="
|
38
|
+
analyze_words(@sentences, text_file_path)
|
39
|
+
end
|
30
40
|
|
31
|
-
|
41
|
+
def analyze_words sentences, text_file_path, type_ary = ['名詞']
|
32
42
|
|
43
|
+
@words, @tf, @idf, @hse = {}, {}, {}, {}
|
44
|
+
|
45
|
+
puts "=== calculating tf and idf and hse ==="
|
33
46
|
sentences.each do |key, sentence_ary|
|
47
|
+
print '.'
|
34
48
|
text = sentence_ary.map {|s| s[0] }.join
|
35
49
|
@words[key] = parse_by_natto(text, type_ary)
|
36
50
|
@tf[key] = calc_tf(@words[key])
|
37
|
-
@idf[key] = calc_idf(@
|
51
|
+
@idf[key] = calc_idf(@words[key], text_file_path)
|
38
52
|
@hse[key] = calc_hse(@words[key], sentence_ary)
|
39
53
|
end
|
40
54
|
|
55
|
+
puts "\n=== calculating tf idf ==="
|
41
56
|
@tf_idf = calc_tf_idf(@tf, @idf)
|
57
|
+
|
58
|
+
puts "=== calculating hse tf idf ==="
|
42
59
|
@hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
|
43
60
|
|
44
61
|
end
|
@@ -103,10 +120,13 @@ module AnalyZ
|
|
103
120
|
end
|
104
121
|
end
|
105
122
|
|
106
|
-
def calc_idf
|
123
|
+
def calc_idf words, text_file_path
|
124
|
+
texts = File.read(text_file_path).split('/=== EOS ===/')
|
107
125
|
words.map do |word|
|
108
126
|
cnt = 0
|
109
|
-
|
127
|
+
texts.each do |text|
|
128
|
+
cnt += 1 if text.include?(word)
|
129
|
+
end
|
110
130
|
[word, Math.log(sentences.length / cnt.to_f)]
|
111
131
|
end
|
112
132
|
end
|
data/lib/analy_z/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: analy_z
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- nao215
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-04-
|
11
|
+
date: 2016-04-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: natto
|