analy_z 0.1.5 → 0.1.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: dab7044b4666701f270c5441f871c3567cac1123
-   data.tar.gz: 00444c7e412b8931688633b0eb93485d66d8341f
+   metadata.gz: df13178fbd1dcad7f742f0276f16afaa8fd1097a
+   data.tar.gz: 18409354de2247a9f98721b7c5ab64384419f6c2
  SHA512:
-   metadata.gz: c823771d1c96c3e7b890256c5946b34a1f91376b66f8b381c35d47cdb5c048a775d064b491c946d1307a4ffac5f7031df6fe7470d7bc9a5d6a05d06ebd7a2a17
-   data.tar.gz: 25b1dadad75ec7382756047cf14f3ce2988a7b5f14a0f2d8af75c5ebb0b294a893753b4afd9c1f15dfae6a0c9c38901f8aaa73dd72ded105a806d019e5ee8f54
+   metadata.gz: ed74cb9fe090d407a18ece87f5e84dfea9178c5825cbbb34f2340f3b6b16de2dec23dad566760c14b446126294e8bf3816ad80e4ebac2fe6607001d195925c10
+   data.tar.gz: b875b6ca63249366070cfbf74b60b24c8ba42c868493ba26a516d70f737e272fa1b0637f6a0d77d85b1ea9a6104c603f591c9ac45bc28f7732575a84c55d2aca
data/README.md CHANGED
@@ -1,8 +1,34 @@
  # AnalyZ
 
- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/analy_z`. To experiment with that code, run `bin/console` for an interactive prompt.
+ This is a gem for text analysis.
+ Now you can compute the hse-tf-idf value for each word.
 
- TODO: Delete this and the text above, and describe your gem
+ ## What is hse-tf-idf
+
+ hse-tf-idf = hse * tf-idf
+
+ ### What is hse
+
+ Hse stands for HTML Semantic Element (valuation).
+ It evaluates an HTML tag and expresses its value as a number.
+
+ For example:
+
+ | tag name | font-size | font-weight | valuation |
+ |:--------:|:---------:|:-----------:|:---------:|
+ | h1       | 2         | 1.75        | 3.5       |
+ | h2       | 1.5       | 1.75        | 2.625     |
+ | h3       | 1.17      | 1.75        | 2.0475    |
+ | h4       | 1         | 1.75        | 1.75      |
+
+ `valuation = font-size * font-weight`
+
+ I'm still looking for better valuations.
+ Please let me know if you find a more suitable tag or style.
+
+ I want to add hse valuation logic for the following as well:
+ - font size set by CSS
+ - font color
 
  ## Installation
 
@@ -14,15 +40,36 @@ gem 'analy_z'
 
  And then execute:
 
- $ bundle
+ $ bundle install
 
  Or install it yourself as:
 
  $ gem install analy_z
 
+
  ## Usage
 
- TODO: Write usage instructions here
+ ```ruby
+ require 'analy_z'
+
+ # file_path : glob pattern for the files you want to analyze,
+ #             for example 'html/*.html'
+ #             NOTE: pass at least 2 files; with only 1 file,
+ #             analy_z can't calculate idf
+ # selector  : CSS selector for the part of the page you want to analyze,
+ #             for example '#main .content'
+
+ a = AnalyZ::HTML.word_val(file_path, selector)
+
+ a.tf         # tf
+ a.idf        # idf
+ a.tf_idf     # tf-idf
+ a.hse_tf_idf # hse-tf-idf
+ a.words      # the words analy_z analyzed
+ a.texts      # the texts analy_z analyzed
+ a.sentences  # the sentences analy_z analyzed
+
+ ```
 
  ## Development
 
@@ -32,7 +79,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 
  ## Contributing
 
- 1. Fork it ( https://github.com/[my-github-username]/analy_z/fork )
+ 1. Fork it ( https://github.com/nao215/analy_z/fork )
  2. Create your feature branch (`git checkout -b my-new-feature`)
  3. Commit your changes (`git commit -am 'Add some feature'`)
  4. Push to the branch (`git push origin my-new-feature`)
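To make the hse valuation in the README above concrete, here is a minimal sketch of how a heading rate scales a tf-idf score, assuming the rates from the table. `HSE_RATES` and the `hse_tf_idf` helper are illustrative names for this note only; in the gem itself the rates are hard-coded in `WordVal#parse_html` and applied in `WordVal#calc_hse_tf_idf` (see the new `lib/analy_z/html/word_val.rb` below).

```ruby
# Illustrative only: each rate is font-size * font-weight for the default h1-h4 styles.
HSE_RATES = {
  'h1' => 2.0  * 1.75, # 3.5
  'h2' => 1.5  * 1.75, # 2.625
  'h3' => 1.17 * 1.75, # 2.0475
  'h4' => 1.0  * 1.75  # 1.75
}.freeze

# hse-tf-idf = hse * tf-idf: a word's tf-idf is boosted by the rate of the
# heading it appears in, and left unchanged (rate 1) for ordinary text.
def hse_tf_idf(tf_idf, tag = nil)
  tf_idf * HSE_RATES.fetch(tag, 1)
end

hse_tf_idf(0.12, 'h2') # => 0.315 (0.12 * 2.625)
hse_tf_idf(0.12)       # => 0.12
```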
data/lib/analy_z/html/word_val.rb ADDED
@@ -0,0 +1,179 @@
+ module AnalyZ
+
+   module HTML
+
+     class WordVal
+
+       attr_accessor :tf
+       attr_accessor :idf
+       attr_accessor :tf_idf
+       attr_accessor :hse_tf_idf
+       attr_accessor :words
+       attr_accessor :texts
+       attr_accessor :sentences
+
+       def initialize html_path, selector = 'body', type_ary = ['名詞']
+         @sentences = {}
+         Dir.glob(html_path).each do |f|
+           print '.'
+           @sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
+         end
+
+         puts "\n=== creating sentences file ==="
+         txt = ""
+         @sentences.each do |k, sentences|
+           print '.'
+           txt << sentences.map{|s| s[0] }.join + '/=== EOS ===/'
+         end
+
+         FileUtils.mkdir_p('tmp') unless FileTest.exist?('tmp')
+         text_file_path = "tmp/#{DateTime.now}.txt"
+         File.write(text_file_path, txt)
+
+         puts "\n=== analyzing... ==="
+         analyze_words(@sentences, text_file_path)
+       end
+
+       def analyze_words sentences, text_file_path, type_ary = ['名詞']
+
+         @words, @tf, @idf, @hse = {}, {}, {}, {}
+
+         puts "=== calculating tf and idf and hse ==="
+         sentences.each do |key, sentence_ary|
+           print '.'
+           text = sentence_ary.map {|s| s[0] }.join
+           @words[key] = parse_by_natto(text, type_ary)
+           @tf[key] = calc_tf(@words[key])
+           @idf[key] = calc_idf(@words[key], text_file_path)
+           @hse[key] = calc_hse(@words[key], sentence_ary)
+         end
+
+         puts "\n=== calculating tf idf ==="
+         @tf_idf = calc_tf_idf(@tf, @idf)
+
+         puts "=== calculating hse tf idf ==="
+         @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
+
+       end
+
+       def parse_html html
+         sentences, important_tags = [], []
+         tag_rep = /<(".*?"|'.*?'|[^'"])*?>/
+         h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
+
+         important_tags = html.scan(h_tag_reg)
+                              .map{|m| [m.gsub(/<(".*?"|'.*?'|[^'"])*?>/, ''), m.match(/[hH][1-4]/)[0] ]}
+
+         sentences = html.gsub(/\"/, '')
+                         .split(tag_rep)
+                         .delete_if{|el| el != ['h1', 'h2', 'h3', 'h4'] && el =~ /\s/ || el.length <= 1}
+                         .map{|m| [m, 1]}
+
+         sentences.each_with_index do |sentence, i|
+           important_tags.each do |tag_data|
+             rate = 2 * 1.75 if tag_data[1] == 'h1'
+             rate = 1.5 * 1.75 if tag_data[1] == 'h2'
+             rate = 1.17 * 1.75 if tag_data[1] == 'h3'
+             rate = 1.17 * 1.75 if tag_data[1] == 'h4'
+             sentences[i][1] = rate if sentence[0].include?(tag_data[0]) || tag_data[0].include?(sentence[0])
+           end
+         end
+
+         sentences
+
+       end
+
+       def parse_by_natto text, type_ary
+         words = []
+
+         Natto::MeCab.new.parse(text).split(/\n/).map do |row|
+           row = row.split(/\t|,/)
+           words << row[0] if type_ary.include?(row[1]) # row[0] is word, row[1] is a part of speech
+         end
+
+         words
+       end
+
+       def calc_tf words
+         freq_hash = {}
+
+         words.each_with_index do |word, i|
+           freq_hash[word] = freq_hash.has_key?(word) ? freq_hash[word] + 1 : 1
+         end
+
+         tf_list = freq_hash.sort_by {|k, v| v }.reverse.map do |k, v|
+           [k, v / words.length.to_f]
+         end
+
+         tf_list
+       end
+
+       def standardization_tf tf_ary_list, ave_word_num
+         return tf_ary_list.map do |tf_ary|
+           tf_ary.map do |tf|
+             [tf[0], tf[1] * (tf_ary.length / ave_word_num.to_f), tf_ary.length / ave_word_num.to_f]
+           end
+         end
+       end
+
+       def calc_idf words, text_file_path
+         texts = File.read(text_file_path).split('/=== EOS ===/')
+         words.map do |word|
+           cnt = 0
+           texts.each do |text|
+             cnt += 1 if text.include?(word)
+           end
+           [word, Math.log(sentences.length / cnt.to_f)]
+         end
+       end
+
+       def calc_hse words, sentence_ary
+         sentence_ary = sentence_ary.select{|sentence| sentence[1] != 1}
+         words.map do |word|
+           rate = 1
+           sentence_ary.each do |sentence|
+             rate = sentence[1] if sentence[0].include?(word[0])
+           end
+           [word, rate]
+         end.uniq
+       end
+
+       def calc_tf_idf tf_list_hash, idf_list_hash
+
+         tf_idfs = {}
+
+         tf_list_hash.each do |k, tf|
+           tf_idf = []
+           idf_list_hash[k].each do |idf|
+             tf_idf << [idf[0], idf[1] * tf.assoc(idf[0])[1]]
+           end
+           tf_idfs[k] = tf_idf.sort{ |a, b| b[1] <=> a[1] }.uniq
+         end
+
+         tf_idfs
+
+       end
+
+       def calc_hse_tf_idf tf_idf_list_hash, hse
+
+         hse_tf_idf = {}
+
+         hse.each do |k, h|
+           hse[k] = hse[k].select {|h| h[1] != 1 }
+         end
+
+         tf_idf_list_hash.each do |k, tf_idf_list|
+           hse_tf_idf[k] = tf_idf_list.map do |tf_idf|
+             rate = hse[k].assoc(tf_idf[0]) ? hse[k].assoc(tf_idf[0])[1] : 1
+             [tf_idf[0], tf_idf[1] * rate]
+           end
+         end
+
+         hse_tf_idf
+       end
+
+     end
+
+   end
+
+ end
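A side note on the "at least 2 files" advice in the README: `calc_idf` above computes `Math.log(sentences.length / cnt.to_f)`, where `sentences.length` is the number of files and `cnt` is how many of them contain the word. With a single file every word yields `log(1/1) = 0`, so all tf-idf scores collapse to zero. A quick check of that arithmetic in plain Ruby (the lambda is only for this note, not part of the gem):

```ruby
# idf as computed in WordVal#calc_idf: log(number of files / files containing the word)
idf = ->(n_files, n_files_with_word) { Math.log(n_files / n_files_with_word.to_f) }

idf.call(1, 1) # => 0.0     every word scores 0 with a single file
idf.call(3, 1) # => 1.0986  rarer words get a higher weight
idf.call(3, 3) # => 0.0     words that appear in every file carry no weight
```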
data/lib/analy_z/html.rb ADDED
@@ -0,0 +1,19 @@
+ module AnalyZ
+
+   class << self
+     def HTML html_path, selector = 'body', type_ary = ['名詞']
+       AnalyZ::HTML
+     end
+
+   end
+
+   module HTML
+
+     def self.word_val html_path, selector = 'body', type_ary = ['名詞']
+       WordVal.new(html_path, selector, type_ary)
+     end
+
+   end
+
+ end
+
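For readers skimming `AnalyZ::HTML.word_val` above: the main accessors on the returned `WordVal` (`tf`, `idf`, `tf_idf`, `hse_tf_idf`) each hold a Hash keyed by file path whose values are arrays of `[word, score]` pairs, while `sentences` maps each file to `[text, rate]` pairs. A hedged illustration of those shapes; the file names, words, and numbers are invented:

```ruby
a = AnalyZ::HTML.word_val('html/*.html', '#main')

a.tf         # { "html/a.html" => [["word", 0.04], ...], ... }  term frequency, highest first
a.tf_idf     # { "html/a.html" => [["word", 0.02], ...], ... }  tf * idf, highest first
a.hse_tf_idf # { "html/a.html" => [["word", 0.05], ...], ... }  tf-idf scaled by the heading rate
a.sentences  # { "html/a.html" => [["Heading text", 2.625], ["Body text", 1], ...] }
```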
data/lib/analy_z/version.rb CHANGED
@@ -1,3 +1,3 @@
  module AnalyZ
-   VERSION = "0.1.5"
+   VERSION = "0.1.6"
  end
data/lib/analy_z.rb CHANGED
@@ -1,183 +1,13 @@
  require "analy_z/version"
+ require 'pp'
+ require 'date'
+ require 'natto'
+ require 'nokogiri'
+ require 'fileutils'
 
- module AnalyZ
-
-   class Analyzer
-
-     require 'pp'
-     require 'date'
-     require 'natto'
-     require 'nokogiri'
-     require 'fileutils'
-
-     attr_accessor :tf
-     attr_accessor :idf
-     attr_accessor :tf_idf
-     attr_accessor :hse_tf_idf
-     attr_accessor :words
-     attr_accessor :texts
-     attr_accessor :sentences
-
-     def initialize html_path, selector = 'body', type_ary = ['名詞']
-       @sentences = {}
-       Dir.glob(html_path).each do |f|
-         print '.'
-         @sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
-       end
-
-       puts "\n=== creating sentences file ==="
-       txt = ""
-       @sentences.each do |k, sentences|
-         print '.'
-         txt << sentences.map{|s| s[0] }.join + '/=== EOS ===/'
-       end
-
-       FileUtils.mkdir_p('tmp') unless FileTest.exist?('tmp')
-       text_file_path = "tmp/#{DateTime.now}.txt"
-       File.write(text_file_path, txt)
-
-       puts "\n=== analyzing... ==="
-       analyze_words(@sentences, text_file_path)
-     end
-
-     def analyze_words sentences, text_file_path, type_ary = ['名詞']
-
-       @words, @tf, @idf, @hse = {}, {}, {}, {}
-
-       puts "=== calculating tf and idf and hse ==="
-       sentences.each do |key, sentence_ary|
-         print '.'
-         text = sentence_ary.map {|s| s[0] }.join
-         @words[key] = parse_by_natto(text, type_ary)
-         @tf[key] = calc_tf(@words[key])
-         @idf[key] = calc_idf(@words[key], text_file_path)
-         @hse[key] = calc_hse(@words[key], sentence_ary)
-       end
-
-       puts "\n=== calculating tf idf ==="
-       @tf_idf = calc_tf_idf(@tf, @idf)
-
-       puts "=== calculating hse tf idf ==="
-       @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
-
-     end
-
-     def parse_html html
-       sentences, important_tags = [], []
-       tag_rep = /<(".*?"|'.*?'|[^'"])*?>/
-       h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
-
-       important_tags = html.scan(h_tag_reg)
-                            .map{|m| [m.gsub(/<(".*?"|'.*?'|[^'"])*?>/, ''), m.match(/[hH][1-4]/)[0] ]}
-
-       sentences = html.gsub(/\"/, '')
-                       .split(tag_rep)
-                       .delete_if{|el| el != ['h1', 'h2', 'h3', 'h4'] && el =~ /\s/ || el.length <= 1}
-                       .map{|m| [m, 1]}
-
-       sentences.each_with_index do |sentence, i|
-         important_tags.each do |tag_data|
-           rate = 2 * 1.75 if tag_data[1] == 'h1'
-           rate = 1.5 * 1.75 if tag_data[1] == 'h2'
-           rate = 1.17 * 1.75 if tag_data[1] == 'h3'
-           rate = 1.17 * 1.75 if tag_data[1] == 'h4'
-           sentences[i][1] = rate if sentence[0].include?(tag_data[0]) || tag_data[0].include?(sentence[0])
-         end
-       end
-
-       sentences
-
-     end
-
-     def parse_by_natto text, type_ary
-       words = []
-
-       Natto::MeCab.new.parse(text).split(/\n/).map do |row|
-         row = row.split(/\t|,/)
-         words << row[0] if type_ary.include?(row[1]) # row[0] is word, row[1] is a part of speech
-       end
-
-       words
-     end
-
-     def calc_tf words
-       freq_hash = {}
-
-       words.each_with_index do |word, i|
-         freq_hash[word] = freq_hash.has_key?(word) ? freq_hash[word] + 1 : 1
-       end
-
-       tf_list = freq_hash.sort_by {|k, v| v }.reverse.map do |k, v|
-         [k, v / words.length.to_f]
-       end
-
-       tf_list
-     end
-
-     def standardization_tf tf_ary_list, ave_word_num
-       return tf_ary_list.map do |tf_ary|
-         tf_ary.map do |tf|
-           [tf[0], tf[1] * (tf_ary.length / ave_word_num.to_f), tf_ary.length / ave_word_num.to_f]
-         end
-       end
-     end
-
-     def calc_idf words, text_file_path
-       texts = File.read(text_file_path).split('/=== EOS ===/')
-       words.map do |word|
-         cnt = 0
-         texts.each do |text|
-           cnt += 1 if text.include?(word)
-         end
-         [word, Math.log(sentences.length / cnt.to_f)]
-       end
-     end
-
-     def calc_hse words, sentence_ary
-       sentence_ary = sentence_ary.select{|sentence| sentence[1] != 1}
-       words.map do |word|
-         rate = 1
-         sentence_ary.each do |sentence|
-           rate = sentence[1] if sentence[0].include?(word[0])
-         end
-         [word, rate]
-       end.uniq
-     end
-
-     def calc_tf_idf tf_list_hash, idf_list_hash
-
-       tf_idfs = {}
-
-       tf_list_hash.each do |k, tf|
-         tf_idf = []
-         idf_list_hash[k].each do |idf|
-           tf_idf << [idf[0], idf[1] * tf.assoc(idf[0])[1]]
-         end
-         tf_idfs[k] = tf_idf.sort{ |a, b| b[1] <=> a[1] }.uniq
-       end
-
-       tf_idfs
-
-     end
-
-     def calc_hse_tf_idf tf_idf_list_hash, hse
-
-       hse_tf_idf = {}
-
-       hse.each do |k, h|
-         hse[k] = hse[k].select {|h| h[1] != 1 }
-       end
-
-       tf_idf_list_hash.each do |k, tf_idf_list|
-         hse_tf_idf[k] = tf_idf_list.map do |tf_idf|
-           rate = hse[k].assoc(tf_idf[0]) ? hse[k].assoc(tf_idf[0])[1] : 1
-           [tf_idf[0], tf_idf[1] * rate]
-         end
-       end
-
-       hse_tf_idf
-     end
-
-   end
+ require 'analy_z/html'
+ require 'analy_z/html/word_val'
+ require 'analy_z/html/similarity'
 
+ module AnalyZ
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: analy_z
  version: !ruby/object:Gem::Version
-   version: 0.1.5
+   version: 0.1.6
  platform: ruby
  authors:
  - nao215
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2016-04-30 00:00:00.000000000 Z
+ date: 2016-05-04 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: natto
@@ -69,6 +69,8 @@ files:
  - bin/console
  - bin/setup
  - lib/analy_z.rb
+ - lib/analy_z/html.rb
+ - lib/analy_z/html/word_val.rb
  - lib/analy_z/version.rb
  homepage: ''
  licenses: []