analy_z 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 758343df058bfed451689c1c991e1354fb92d289
4
- data.tar.gz: e150373446af0e1430e78a8db54f8abbd5f0db93
3
+ metadata.gz: 257d11a71d341c246d265954ebdd1f557332867d
4
+ data.tar.gz: 0ad929efdd028daa2509005473322951f9d37ba1
5
5
  SHA512:
6
- metadata.gz: 8a7858dab8bf27d15663fbba157b11de25fbc6b955b675f812ecc422d0c90ab5f885be5cd6e0956dc4c4310a8465e9e425c5e1eaf674857aeb01b655ca9882c2
7
- data.tar.gz: 106f76e8109d8762fdcf49d3fdfe8853d1a6aff6028bc10bf1613d1f0bfc616354bbe4da0987b4bc3190122582f579b807d93b2545acf75335b24b7f33981ec0
6
+ metadata.gz: 6da8e91ad8f952e7f2f0c9ee02c79557fda628f5b2995c60a119d0de91d71fd7937965c77d9ee2e7277fbaa24b1b83c82c20d95e4c1bb336a9b745da1d91e891
7
+ data.tar.gz: 448bbe2ae21c0bde8d4964e966cd6909de326cc63696b74a631d8a41b5923d292af8d0e6e6706fce47301cf9f73f3d1ae2f8f3435e601e9fad596e7d49a10a55
@@ -5,6 +5,7 @@ module AnalyZ
5
5
  class Analyzer
6
6
 
7
7
  require 'pp'
8
+ require 'date'
8
9
  require 'natto'
9
10
  require 'nokogiri'
10
11
 
@@ -19,26 +20,42 @@ module AnalyZ
19
20
  def initialize html_path, selector = 'body', type_ary = ['名詞']
20
21
  @sentences = {}
21
22
  Dir.glob(html_path).each do |f|
23
+ print '.'
22
24
  @sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
23
25
  end
24
- analyze_words(@sentences)
25
- end
26
26
 
27
- def analyze_words sentences, type_ary = ['名詞']
27
+ puts "\n=== creating sentences file ==="
28
+ txt = ""
29
+ @sentences.each do |k, sentences|
30
+ print '.'
31
+ txt << sentences.map{|s| s[0] }.join + '/=== EOS ===/'
32
+ end
33
+
34
+ text_file_path = "tmp/#{DateTime.now}.txt"
35
+ File.write(text_file_path, txt)
28
36
 
29
- @texts, @words, @tf, @idf, @hse = {}, {}, {}, {}, {}
37
+ puts "\n=== analyzing... ==="
38
+ analyze_words(@sentences, text_file_path)
39
+ end
30
40
 
31
- sentences.each{|k, sentence| @texts[k] = sentence.map {|s| s[0]}.join }
41
+ def analyze_words sentences, text_file_path, type_ary = ['名詞']
32
42
 
43
+ @words, @tf, @idf, @hse = {}, {}, {}, {}
44
+
45
+ puts "=== calculating tf and idf and hse ==="
33
46
  sentences.each do |key, sentence_ary|
47
+ print '.'
34
48
  text = sentence_ary.map {|s| s[0] }.join
35
49
  @words[key] = parse_by_natto(text, type_ary)
36
50
  @tf[key] = calc_tf(@words[key])
37
- @idf[key] = calc_idf(@texts, @words[key])
51
+ @idf[key] = calc_idf(@words[key], text_file_path)
38
52
  @hse[key] = calc_hse(@words[key], sentence_ary)
39
53
  end
40
54
 
55
+ puts "\n=== calculating tf idf ==="
41
56
  @tf_idf = calc_tf_idf(@tf, @idf)
57
+
58
+ puts "=== calculating hse tf idf ==="
42
59
  @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
43
60
 
44
61
  end
@@ -103,10 +120,13 @@ module AnalyZ
103
120
  end
104
121
  end
105
122
 
106
- def calc_idf sentences, words
123
+ def calc_idf words, text_file_path
124
+ texts = File.read(text_file_path).split('/=== EOS ===/')
107
125
  words.map do |word|
108
126
  cnt = 0
109
- sentences.each {|k, v| cnt += 1 if v.include?(word) }
127
+ texts.each do |text|
128
+ cnt += 1 if text.include?(word)
129
+ end
110
130
  [word, Math.log(sentences.length / cnt.to_f)]
111
131
  end
112
132
  end
@@ -1,3 +1,3 @@
1
1
  module AnalyZ
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: analy_z
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - nao215
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-04-29 00:00:00.000000000 Z
11
+ date: 2016-04-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: natto