analy_z 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 758343df058bfed451689c1c991e1354fb92d289
4
- data.tar.gz: e150373446af0e1430e78a8db54f8abbd5f0db93
3
+ metadata.gz: 257d11a71d341c246d265954ebdd1f557332867d
4
+ data.tar.gz: 0ad929efdd028daa2509005473322951f9d37ba1
5
5
  SHA512:
6
- metadata.gz: 8a7858dab8bf27d15663fbba157b11de25fbc6b955b675f812ecc422d0c90ab5f885be5cd6e0956dc4c4310a8465e9e425c5e1eaf674857aeb01b655ca9882c2
7
- data.tar.gz: 106f76e8109d8762fdcf49d3fdfe8853d1a6aff6028bc10bf1613d1f0bfc616354bbe4da0987b4bc3190122582f579b807d93b2545acf75335b24b7f33981ec0
6
+ metadata.gz: 6da8e91ad8f952e7f2f0c9ee02c79557fda628f5b2995c60a119d0de91d71fd7937965c77d9ee2e7277fbaa24b1b83c82c20d95e4c1bb336a9b745da1d91e891
7
+ data.tar.gz: 448bbe2ae21c0bde8d4964e966cd6909de326cc63696b74a631d8a41b5923d292af8d0e6e6706fce47301cf9f73f3d1ae2f8f3435e601e9fad596e7d49a10a55
@@ -5,6 +5,7 @@ module AnalyZ
5
5
  class Analyzer
6
6
 
7
7
  require 'pp'
8
+ require 'date'
8
9
  require 'natto'
9
10
  require 'nokogiri'
10
11
 
@@ -19,26 +20,42 @@ module AnalyZ
19
20
  def initialize html_path, selector = 'body', type_ary = ['名詞']
20
21
  @sentences = {}
21
22
  Dir.glob(html_path).each do |f|
23
+ print '.'
22
24
  @sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
23
25
  end
24
- analyze_words(@sentences)
25
- end
26
26
 
27
- def analyze_words sentences, type_ary = ['名詞']
27
+ puts "\n=== creating sentences file ==="
28
+ txt = ""
29
+ @sentences.each do |k, sentences|
30
+ print '.'
31
+ txt << sentences.map{|s| s[0] }.join + '/=== EOS ===/'
32
+ end
33
+
34
+ text_file_path = "tmp/#{DateTime.now}.txt"
35
+ File.write(text_file_path, txt)
28
36
 
29
- @texts, @words, @tf, @idf, @hse = {}, {}, {}, {}, {}
37
+ puts "\n=== analyzing... ==="
38
+ analyze_words(@sentences, text_file_path)
39
+ end
30
40
 
31
- sentences.each{|k, sentence| @texts[k] = sentence.map {|s| s[0]}.join }
41
+ def analyze_words sentences, text_file_path, type_ary = ['名詞']
32
42
 
43
+ @words, @tf, @idf, @hse = {}, {}, {}, {}
44
+
45
+ puts "=== calculating tf and idf and hse ==="
33
46
  sentences.each do |key, sentence_ary|
47
+ print '.'
34
48
  text = sentence_ary.map {|s| s[0] }.join
35
49
  @words[key] = parse_by_natto(text, type_ary)
36
50
  @tf[key] = calc_tf(@words[key])
37
- @idf[key] = calc_idf(@texts, @words[key])
51
+ @idf[key] = calc_idf(@words[key], text_file_path)
38
52
  @hse[key] = calc_hse(@words[key], sentence_ary)
39
53
  end
40
54
 
55
+ puts "\n=== calculating tf idf ==="
41
56
  @tf_idf = calc_tf_idf(@tf, @idf)
57
+
58
+ puts "=== calculating hse tf idf ==="
42
59
  @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
43
60
 
44
61
  end
@@ -103,10 +120,13 @@ module AnalyZ
103
120
  end
104
121
  end
105
122
 
106
- def calc_idf sentences, words
123
+ def calc_idf words, text_file_path
124
+ texts = File.read(text_file_path).split('/=== EOS ===/')
107
125
  words.map do |word|
108
126
  cnt = 0
109
- sentences.each {|k, v| cnt += 1 if v.include?(word) }
127
+ texts.each do |text|
128
+ cnt += 1 if text.include?(word)
129
+ end
110
130
  [word, Math.log(sentences.length / cnt.to_f)]
111
131
  end
112
132
  end
@@ -1,3 +1,3 @@
1
1
  module AnalyZ
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: analy_z
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - nao215
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-04-29 00:00:00.000000000 Z
11
+ date: 2016-04-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: natto