analy_z 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: dab7044b4666701f270c5441f871c3567cac1123
-   data.tar.gz: 00444c7e412b8931688633b0eb93485d66d8341f
+   metadata.gz: df13178fbd1dcad7f742f0276f16afaa8fd1097a
+   data.tar.gz: 18409354de2247a9f98721b7c5ab64384419f6c2
  SHA512:
-   metadata.gz: c823771d1c96c3e7b890256c5946b34a1f91376b66f8b381c35d47cdb5c048a775d064b491c946d1307a4ffac5f7031df6fe7470d7bc9a5d6a05d06ebd7a2a17
-   data.tar.gz: 25b1dadad75ec7382756047cf14f3ce2988a7b5f14a0f2d8af75c5ebb0b294a893753b4afd9c1f15dfae6a0c9c38901f8aaa73dd72ded105a806d019e5ee8f54
+   metadata.gz: ed74cb9fe090d407a18ece87f5e84dfea9178c5825cbbb34f2340f3b6b16de2dec23dad566760c14b446126294e8bf3816ad80e4ebac2fe6607001d195925c10
+   data.tar.gz: b875b6ca63249366070cfbf74b60b24c8ba42c868493ba26a516d70f737e272fa1b0637f6a0d77d85b1ea9a6104c603f591c9ac45bc28f7732575a84c55d2aca
data/README.md CHANGED
@@ -1,8 +1,34 @@
  # AnalyZ

- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/analy_z`. To experiment with that code, run `bin/console` for an interactive prompt.
+ This is a gem for text analysis.
+ It lets you compute an hse-tf-idf value for each word.

- TODO: Delete this and the text above, and describe your gem
+ ## What is hse-tf-idf
+
+ hse-tf-idf = hse * tf-idf
+
+ ### What is hse
+
+ Hse stands for HTML Semantic Element (valuation).
+ It evaluates HTML tags and expresses their importance as a number.
+
+ For example:
+
+ | tag name | font-size | font-weight | valuation |
+ |:--------:|:---------:|:-----------:|:---------:|
+ | h1       | 2         | 1.75        | 3.5       |
+ | h2       | 1.5       | 1.75        | 2.625     |
+ | h3       | 1.17      | 1.75        | 2.0475    |
+ | h4       | 1         | 1.75        | 1.75      |
+
+ `valuation = font-size * font-weight`
+
+ I'm still looking for better valuations.
+ Please let me know if you find a more useful tag or style.
+
+ I want to add hse valuation logic for the following:
+ - font size set in CSS
+ - font color

  ## Installation

@@ -14,15 +40,36 @@ gem 'analy_z'
  And then execute:

- $ bundle
+ $ bundle install

  Or install it yourself as:

  $ gem install analy_z

+
  ## Usage

- TODO: Write usage instructions here
+ ```ruby
+ require 'analy_z'
+
+ # file_path : path (glob pattern) for the files you want to analyze,
+ #             for example 'html/*.html'
+ #             NOTE: pass at least 2 files; with only 1 file,
+ #             analy_z can't calculate idf
+ # selector  : CSS selector for the part of the page you want to analyze,
+ #             for example '#main .content'
+
+ a = AnalyZ::HTML.word_val(file_path, selector)
+
+ a.tf         # tf
+ a.idf        # idf
+ a.tf_idf     # tf-idf
+ a.hse_tf_idf # hse-tf-idf
+ a.words      # words analy_z analyzed
+ a.texts      # texts analy_z analyzed
+ a.sentences  # sentences analy_z analyzed
+
+ ```

  ## Development

@@ -32,7 +79,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To

  ## Contributing

- 1. Fork it ( https://github.com/[my-github-username]/analy_z/fork )
+ 1. Fork it ( https://github.com/nao215/analy_z/fork )
  2. Create your feature branch (`git checkout -b my-new-feature`)
  3. Commit your changes (`git commit -am 'Add some feature'`)
  4. Push to the branch (`git push origin my-new-feature`)
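
To make the hse arithmetic from the README above concrete, here is a minimal Ruby sketch. It is not part of the gem: the `hse_valuation` helper and the 0.042 tf-idf score are invented for illustration, while the factors come from the README's table and its `valuation = font-size * font-weight` formula.

```ruby
# Illustration of the README's hse formulas; not code shipped in analy_z.
FONT_SIZE   = { 'h1' => 2, 'h2' => 1.5, 'h3' => 1.17, 'h4' => 1 }
FONT_WEIGHT = 1.75

# valuation = font-size * font-weight; plain text keeps a neutral weight of 1
def hse_valuation(tag)
  return 1 unless FONT_SIZE.key?(tag)
  FONT_SIZE[tag] * FONT_WEIGHT
end

hse_valuation('h1')          # => 3.5
hse_valuation('h2')          # => 2.625

tf_idf = 0.042               # made-up tf-idf score for some word
hse_valuation('h2') * tf_idf # hse-tf-idf => 0.11025
```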
data/lib/analy_z/html/word_val.rb ADDED
@@ -0,0 +1,179 @@
+ module AnalyZ
+
+   module HTML
+
+     class WordVal
+
+       attr_accessor :tf
+       attr_accessor :idf
+       attr_accessor :tf_idf
+       attr_accessor :hse_tf_idf
+       attr_accessor :words
+       attr_accessor :texts
+       attr_accessor :sentences
+
+       def initialize html_path, selector = 'body', type_ary = ['名詞']
+         @sentences = {}
+         Dir.glob(html_path).each do |f|
+           print '.'
+           @sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
+         end
+
+         puts "\n=== creating sentences file ==="
+         txt = ""
+         @sentences.each do |k, sentences|
+           print '.'
+           txt << sentences.map{|s| s[0] }.join + '/=== EOS ===/'
+         end
+
+         FileUtils.mkdir_p('tmp') unless FileTest.exist?('tmp')
+         text_file_path = "tmp/#{DateTime.now}.txt"
+         File.write(text_file_path, txt)
+
+         puts "\n=== analyzing... ==="
+         analyze_words(@sentences, text_file_path)
+       end
+
+       def analyze_words sentences, text_file_path, type_ary = ['名詞']
+
+         @words, @tf, @idf, @hse = {}, {}, {}, {}
+
+         puts "=== calculating tf and idf and hse ==="
+         sentences.each do |key, sentence_ary|
+           print '.'
+           text = sentence_ary.map {|s| s[0] }.join
+           @words[key] = parse_by_natto(text, type_ary)
+           @tf[key] = calc_tf(@words[key])
+           @idf[key] = calc_idf(@words[key], text_file_path)
+           @hse[key] = calc_hse(@words[key], sentence_ary)
+         end
+
+         puts "\n=== calculating tf idf ==="
+         @tf_idf = calc_tf_idf(@tf, @idf)
+
+         puts "=== calculating hse tf idf ==="
+         @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
+
+       end
+
+       def parse_html html
+         sentences, important_tags = [], []
+         tag_rep = /<(".*?"|'.*?'|[^'"])*?>/
+         h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
+
+         important_tags = html.scan(h_tag_reg)
+                              .map{|m| [m.gsub(/<(".*?"|'.*?'|[^'"])*?>/, ''), m.match(/[hH][1-4]/)[0] ]}
+
+         sentences = html.gsub(/\"/, '')
+                         .split(tag_rep)
+                         .delete_if{|el| el != ['h1', 'h2', 'h3', 'h4'] && el =~ /\s/ || el.length <= 1}
+                         .map{|m| [m, 1]}
+
+         sentences.each_with_index do |sentence, i|
+           important_tags.each do |tag_data|
+             rate = 2 * 1.75 if tag_data[1] == 'h1'
+             rate = 1.5 * 1.75 if tag_data[1] == 'h2'
+             rate = 1.17 * 1.75 if tag_data[1] == 'h3'
+             rate = 1.17 * 1.75 if tag_data[1] == 'h4'
+             sentences[i][1] = rate if sentence[0].include?(tag_data[0]) || tag_data[0].include?(sentence[0])
+           end
+         end
+
+         sentences
+
+       end
+
+       def parse_by_natto text, type_ary
+         words = []
+
+         Natto::MeCab.new.parse(text).split(/\n/).map do |row|
+           row = row.split(/\t|,/)
+           words << row[0] if type_ary.include?(row[1]) # row[0] is word, row[1] is a part of speech
+         end
+
+         words
+       end
+
+       def calc_tf words
+         freq_hash = {}
+
+         words.each_with_index do |word, i|
+           freq_hash[word] = freq_hash.has_key?(word) ? freq_hash[word] + 1 : 1
+         end
+
+         tf_list = freq_hash.sort_by {|k, v| v }.reverse.map do |k, v|
+           [k, v / words.length.to_f]
+         end
+
+         tf_list
+       end
+
+       def standardization_tf tf_ary_list, ave_word_num
+         return tf_ary_list.map do |tf_ary|
+           tf_ary.map do |tf|
+             [tf[0], tf[1] * (tf_ary.length / ave_word_num.to_f), tf_ary.length / ave_word_num.to_f]
+           end
+         end
+       end
+
+       def calc_idf words, text_file_path
+         texts = File.read(text_file_path).split('/=== EOS ===/')
+         words.map do |word|
+           cnt = 0
+           texts.each do |text|
+             cnt += 1 if text.include?(word)
+           end
+           [word, Math.log(sentences.length / cnt.to_f)]
+         end
+       end
+
+       def calc_hse words, sentence_ary
+         sentence_ary = sentence_ary.select{|sentence| sentence[1] != 1}
+         words.map do |word|
+           rate = 1
+           sentence_ary.each do |sentence|
+             rate = sentence[1] if sentence[0].include?(word[0])
+           end
+           [word, rate]
+         end.uniq
+       end
+
+       def calc_tf_idf tf_list_hash, idf_list_hash
+
+         tf_idfs = {}
+
+         tf_list_hash.each do |k, tf|
+           tf_idf = []
+           idf_list_hash[k].each do |idf|
+             tf_idf << [idf[0], idf[1] * tf.assoc(idf[0])[1]]
+           end
+           tf_idfs[k] = tf_idf.sort{ |a, b| b[1] <=> a[1] }.uniq
+         end
+
+         tf_idfs
+
+       end
+
+       def calc_hse_tf_idf tf_idf_list_hash, hse
+
+         hse_tf_idf = {}
+
+         hse.each do |k, h|
+           hse[k] = hse[k].select {|h| h[1] != 1 }
+         end
+
+         tf_idf_list_hash.each do |k, tf_idf_list|
+           hse_tf_idf[k] = tf_idf_list.map do |tf_idf|
+             rate = hse[k].assoc(tf_idf[0]) ? hse[k].assoc(tf_idf[0])[1] : 1
+             [tf_idf[0], tf_idf[1] * rate]
+           end
+         end
+
+         hse_tf_idf
+       end
+
+     end
+
+   end
+
+ end
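
Reading `WordVal` above: for each input file, tf is a word's count over the total number of words extracted from that file, idf is the log of the number of files divided by the number of files containing the word, tf-idf is their product, and hse-tf-idf multiplies tf-idf by the rate that `parse_html` assigned to the heading containing the word. Note that `parse_html` applies the same 1.17 factor to h4 as to h3, while the README table lists 1 for h4. A self-contained toy recomputation of that arithmetic (invented data, not gem code):

```ruby
# Toy recomputation of the scores WordVal produces; data is invented.
docs = {
  'a.html' => %w[猫 犬 猫],   # words extracted from a.html
  'b.html' => %w[犬 鳥]       # words extracted from b.html
}

word    = '猫'
words_a = docs['a.html']

tf  = words_a.count(word) / words_a.length.to_f                                  # 2/3
idf = Math.log(docs.length / docs.values.count { |ws| ws.include?(word) }.to_f)  # log(2/1)
hse = 2 * 1.75                                                                   # word sits inside an <h1>

tf * idf * hse   # hse-tf-idf => roughly 1.62
```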
data/lib/analy_z/html.rb ADDED
@@ -0,0 +1,19 @@
+ module AnalyZ
+
+   class << self
+     def HTML html_path, selector = 'body', type_ary = ['名詞']
+       AnalyZ::HTML
+     end
+
+   end
+
+   module HTML
+
+     def self.word_val html_path, selector = 'body', type_ary = ['名詞']
+       WordVal.new(html_path, selector, type_ary)
+     end
+
+   end
+
+ end
+
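
For orientation, here is a hedged sketch of how results from `AnalyZ::HTML.word_val` (the factory defined just above, shown in the README usage section) can be inspected. It relies only on the structures visible in `WordVal`: hashes keyed by input file path, holding `[word, score]` pairs. The file names and scores below are invented:

```ruby
# Illustrative only; paths and scores are made up.
a = AnalyZ::HTML.word_val('html/*.html', '#main .content')

a.hse_tf_idf.each do |file, pairs|
  # e.g. "html/page1.html" => [["見出し", 0.31], ["単語", 0.12], ...]
  top_word, score = pairs.first   # pairs keep the descending tf-idf order
  puts "#{file}: #{top_word} (#{score.round(3)})"
end
```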
data/lib/analy_z/version.rb CHANGED
@@ -1,3 +1,3 @@
  module AnalyZ
-   VERSION = "0.1.5"
+   VERSION = "0.1.6"
  end
data/lib/analy_z.rb CHANGED
@@ -1,183 +1,13 @@
  require "analy_z/version"
+ require 'pp'
+ require 'date'
+ require 'natto'
+ require 'nokogiri'
+ require 'fileutils'

- module AnalyZ
-
-   class Analyzer
-
-     require 'pp'
-     require 'date'
-     require 'natto'
-     require 'nokogiri'
-     require 'fileutils'
-
-     attr_accessor :tf
-     attr_accessor :idf
-     attr_accessor :tf_idf
-     attr_accessor :hse_tf_idf
-     attr_accessor :words
-     attr_accessor :texts
-     attr_accessor :sentences
-
-     def initialize html_path, selector = 'body', type_ary = ['名詞']
-       @sentences = {}
-       Dir.glob(html_path).each do |f|
-         print '.'
-         @sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
-       end
-
-       puts "\n=== creating sentences file ==="
-       txt = ""
-       @sentences.each do |k, sentences|
-         print '.'
-         txt << sentences.map{|s| s[0] }.join + '/=== EOS ===/'
-       end
-
-       FileUtils.mkdir_p('tmp') unless FileTest.exist?('tmp')
-       text_file_path = "tmp/#{DateTime.now}.txt"
-       File.write(text_file_path, txt)
-
-       puts "\n=== analyzing... ==="
-       analyze_words(@sentences, text_file_path)
-     end
-
-     def analyze_words sentences, text_file_path, type_ary = ['名詞']
-
-       @words, @tf, @idf, @hse = {}, {}, {}, {}
-
-       puts "=== calculating tf and idf and hse ==="
-       sentences.each do |key, sentence_ary|
-         print '.'
-         text = sentence_ary.map {|s| s[0] }.join
-         @words[key] = parse_by_natto(text, type_ary)
-         @tf[key] = calc_tf(@words[key])
-         @idf[key] = calc_idf(@words[key], text_file_path)
-         @hse[key] = calc_hse(@words[key], sentence_ary)
-       end
-
-       puts "\n=== calculating tf idf ==="
-       @tf_idf = calc_tf_idf(@tf, @idf)
-
-       puts "=== calculating hse tf idf ==="
-       @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
-
-     end
-
-     def parse_html html
-       sentences, important_tags = [], []
-       tag_rep = /<(".*?"|'.*?'|[^'"])*?>/
-       h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
-
-       important_tags = html.scan(h_tag_reg)
-                            .map{|m| [m.gsub(/<(".*?"|'.*?'|[^'"])*?>/, ''), m.match(/[hH][1-4]/)[0] ]}
-
-       sentences = html.gsub(/\"/, '')
-                       .split(tag_rep)
-                       .delete_if{|el| el != ['h1', 'h2', 'h3', 'h4'] && el =~ /\s/ || el.length <= 1}
-                       .map{|m| [m, 1]}
-
-       sentences.each_with_index do |sentence, i|
-         important_tags.each do |tag_data|
-           rate = 2 * 1.75 if tag_data[1] == 'h1'
-           rate = 1.5 * 1.75 if tag_data[1] == 'h2'
-           rate = 1.17 * 1.75 if tag_data[1] == 'h3'
-           rate = 1.17 * 1.75 if tag_data[1] == 'h4'
-           sentences[i][1] = rate if sentence[0].include?(tag_data[0]) || tag_data[0].include?(sentence[0])
-         end
-       end
-
-       sentences
-
-     end
-
-     def parse_by_natto text, type_ary
-       words = []
-
-       Natto::MeCab.new.parse(text).split(/\n/).map do |row|
-         row = row.split(/\t|,/)
-         words << row[0] if type_ary.include?(row[1]) # row[0] is word, row[1] is a part of speech
-       end
-
-       words
-     end
-
-     def calc_tf words
-       freq_hash = {}
-
-       words.each_with_index do |word, i|
-         freq_hash[word] = freq_hash.has_key?(word) ? freq_hash[word] + 1 : 1
-       end
-
-       tf_list = freq_hash.sort_by {|k, v| v }.reverse.map do |k, v|
-         [k, v / words.length.to_f]
-       end
-
-       tf_list
-     end
-
-     def standardization_tf tf_ary_list, ave_word_num
-       return tf_ary_list.map do |tf_ary|
-         tf_ary.map do |tf|
-           [tf[0], tf[1] * (tf_ary.length / ave_word_num.to_f), tf_ary.length / ave_word_num.to_f]
-         end
-       end
-     end
-
-     def calc_idf words, text_file_path
-       texts = File.read(text_file_path).split('/=== EOS ===/')
-       words.map do |word|
-         cnt = 0
-         texts.each do |text|
-           cnt += 1 if text.include?(word)
-         end
-         [word, Math.log(sentences.length / cnt.to_f)]
-       end
-     end
-
-     def calc_hse words, sentence_ary
-       sentence_ary = sentence_ary.select{|sentence| sentence[1] != 1}
-       words.map do |word|
-         rate = 1
-         sentence_ary.each do |sentence|
-           rate = sentence[1] if sentence[0].include?(word[0])
-         end
-         [word, rate]
-       end.uniq
-     end
-
-     def calc_tf_idf tf_list_hash, idf_list_hash
-
-       tf_idfs = {}
-
-       tf_list_hash.each do |k, tf|
-         tf_idf = []
-         idf_list_hash[k].each do |idf|
-           tf_idf << [idf[0], idf[1] * tf.assoc(idf[0])[1]]
-         end
-         tf_idfs[k] = tf_idf.sort{ |a, b| b[1] <=> a[1] }.uniq
-       end
-
-       tf_idfs
-
-     end
-
-     def calc_hse_tf_idf tf_idf_list_hash, hse
-
-       hse_tf_idf = {}
-
-       hse.each do |k, h|
-         hse[k] = hse[k].select {|h| h[1] != 1 }
-       end
-
-       tf_idf_list_hash.each do |k, tf_idf_list|
-         hse_tf_idf[k] = tf_idf_list.map do |tf_idf|
-           rate = hse[k].assoc(tf_idf[0]) ? hse[k].assoc(tf_idf[0])[1] : 1
-           [tf_idf[0], tf_idf[1] * rate]
-         end
-       end
-
-       hse_tf_idf
-     end
-
-   end
+ require 'analy_z/html'
+ require 'analy_z/html/word_val'
+ require 'analy_z/html/similarity'

+ module AnalyZ
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: analy_z
  version: !ruby/object:Gem::Version
-   version: 0.1.5
+   version: 0.1.6
  platform: ruby
  authors:
  - nao215
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2016-04-30 00:00:00.000000000 Z
+ date: 2016-05-04 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: natto
@@ -69,6 +69,8 @@ files:
  - bin/console
  - bin/setup
  - lib/analy_z.rb
+ - lib/analy_z/html.rb
+ - lib/analy_z/html/word_val.rb
  - lib/analy_z/version.rb
  homepage: ''
  licenses: []