RubyGems - analy_z - Versions diffs - 0.1.0 - Mend

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: cafbd605a88e44b0b2e80aa2d919acaa099a0bc8
+  data.tar.gz: 71e59a86e64bf5f99ed72e520edbe444e1d86189
+SHA512:
+  metadata.gz: 2a97c6c54a3311af0e19517f0d5e09fbc461569d9c12d93be8e780a5efc22fd3766f7eafb0d2981993e4235abdd7d983b94b6d48148ccc0cbf780e586e716849
+  data.tar.gz: 10a3efaf2f55fa5b1146f070cf5ff983146f62197afeaffbe0407e68bf9376d0da7f453f82fe5665fe5d3cc9885a17d1660d6fd01cd123452fbeba3172d65010

data/.gitignore ADDED Viewed

@@ -0,0 +1,9 @@
+/.bundle/
+/.yardoc
+/Gemfile.lock
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/

data/.rspec ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ --format documentation
2	+ --color

data/.travis.yml ADDED Viewed

@@ -0,0 +1,3 @@
+language: ruby
+rvm:
+  - 2.1.1

data/Gemfile ADDED Viewed

@@ -0,0 +1,7 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in analy_z.gemspec
+gemspec
+gem 'nokogiri' # required by lib/analy_z.rb (Nokogiri::HTML.parse)
+gem 'natto' # required by lib/analy_z.rb (Natto::MeCab)

data/README.md ADDED Viewed

@@ -0,0 +1,39 @@
+# AnalyZ
+Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/analy_z.rb`. To experiment with that code, run `bin/console` for an interactive prompt.
+TODO: Delete this and the text above, and describe your gem
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'analy_z'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install analy_z
+## Usage
+TODO: Write usage instructions here
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+## Contributing
+1. Fork it ( https://github.com/[my-github-username]/analy_z/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request

data/Rakefile ADDED Viewed

	@@ -0,0 +1 @@
1	+ require "bundler/gem_tasks"

data/analy_z.gemspec ADDED Viewed

@@ -0,0 +1,23 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'analy_z/version'
+Gem::Specification.new do |spec|
+  spec.name          = "analy_z"
+  spec.version       = AnalyZ::VERSION
+  spec.authors       = ["nao215"]
+  spec.email         = ["xxxxxy.naoxxxxx@gmail.com"]
+  spec.summary       = %q{Text Analyzer}
+  spec.description   = %q{calcurate tf idf and hse(html semantic element) }
+  spec.homepage      = ""
+  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  spec.add_development_dependency "bundler", "~> 1.9"
+  spec.add_development_dependency "rake", "~> 10.0"
+end

data/bin/console ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require "bundler/setup" # load Bundler before the gem itself is required
+require "analy_z" # the library under development
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require "irb"
+IRB.start # open an interactive session with analy_z already required

data/bin/setup ADDED Viewed

@@ -0,0 +1,7 @@
+#!/bin/bash
+set -euo pipefail # abort on any error, unset variable, or failed pipeline stage
+IFS=$'\n\t' # split words on newline and tab only, not on spaces
+bundle install # install the dependencies declared by the Gemfile/gemspec
+# Do any other automated setup that you need to do here

data/lib/analy_z.rb ADDED Viewed

@@ -0,0 +1,161 @@
+require "analy_z/version"
+module AnalyZ
+  class Analyzer
+    require 'pp'
+    require 'natto'
+    require 'nokogiri'
+    attr_accessor :tf
+    attr_accessor :idf
+    attr_accessor :tf_idf
+    attr_accessor :hse_tf_idf
+    attr_accessor :words
+    attr_accessor :texts
+    attr_accessor :sentences
+    def initialize html, selector = 'body', type_ary = ['名詞']
+      @sentences = {}
+      Dir.glob("htmls/*.html").each do |f|
+        @sentences[f] = parse_html(Nokogiri::HTML.parse(File.read(f), nil, nil).css(selector).to_html)
+      end
+      analyze_words(@sentences)
+    end
+    def analyze_words sentences, type_ary = ['名詞']
+      @texts, @words, @tf, @idf, @hse = {}, {}, {}, {}, {}
+      sentences.each{|k, sentence| @texts[k] = sentence.map {|s| s[0]}.join }
+      sentences.each do |key, sentence_ary|
+        text = sentence_ary.map {|s| s[0] }.join
+        @words[key] = parse_by_natto(text, type_ary)
+        @tf[key] = calc_tf(@words[key])
+        @idf[key] = calc_idf(@texts, @words[key])
+        @hse[key] = calc_hse(@words[key], sentence_ary)
+      end
+      @tf_idf = calc_tf_idf(@tf, @idf)
+      @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
+    end
+    def parse_html html
+      sentences, important_tags = [], []
+      tag_rep = /<(".*?"|'.*?'|[^'"])*?>/
+      h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
+      important_tags = html.scan(h_tag_reg)
+        .map{|m| [m.gsub(/<(".*?"|'.*?'|[^'"])*?>/, ''), m.match(/[hH][1-4]/)[0] ]}
+      sentences = html.gsub(/\"/, '')
+          .split(tag_rep)
+          .delete_if{|el| el != ['h1', 'h2', 'h3', 'h4'] && el =~ /\s/ || el.length <= 1}
+          .map{|m| [m, 1]}
+      sentences.each_with_index do |sentence, i|
+        important_tags.each do |tag_data|
+          rate = 2    * 1.75  if tag_data[1] == 'h1'
+          rate = 1.5  * 1.75  if tag_data[1] == 'h2'
+          rate = 1.17 * 1.75  if tag_data[1] == 'h3'
+          rate = 1.17 * 1.75  if tag_data[1] == 'h4'
+          sentences[i][1] = rate if sentence[0].include?(tag_data[0]) || tag_data[0].include?(sentence[0])
+        end
+      end
+      sentences
+    end
+    def parse_by_natto text, type_ary
+      words = []
+      Natto::MeCab.new.parse(text).split(/\n/).map do |row|
+        row = row.split(/\t|,/)
+        words << row[0] if type_ary.include?(row[1]) # row[0] is word, row[1] is a part of speech
+      end
+      words
+    end
+    def calc_tf words
+      freq_hash = {}
+      words.each_with_index do |word, i|
+        freq_hash[word] = freq_hash.has_key?(word) ? freq_hash[word] + 1 : 1
+      end
+      tf_list = freq_hash.sort_by {|k, v| v }.reverse.map do |k, v|
+        [k, v / words.length.to_f]
+      end
+      tf_list
+    end
+    def standardization_tf tf_ary_list, ave_word_num
+      return tf_ary_list.map do |tf_ary|
+        tf_ary.map do |tf|
+          [tf[0], tf[1] * (tf_ary.length / ave_word_num.to_f), tf_ary.length / ave_word_num.to_f]
+        end
+      end
+    end
+    def calc_idf sentences, words
+      words.map do |word|
+        cnt = 0
+        sentences.each {|k, v| cnt += 1 if v.include?(word) }
+        [word, Math.log(sentences.length / cnt.to_f)]
+      end
+    end
+    def calc_hse words, sentence_ary
+      sentence_ary = sentence_ary.select{|sentence| sentence[1] != 1}
+      words.map do |word|
+        rate = 1
+        sentence_ary.each do |sentence|
+          rate = sentence[1] if sentence[0].include?(word[0])
+        end
+        [word, rate]
+      end.uniq
+    end
+    def calc_tf_idf tf_list_hash, idf_list_hash
+      tf_idfs = {}
+      tf_list_hash.each do |k, tf|
+        tf_idf = []
+        idf_list_hash[k].each do |idf|
+          tf_idf << [idf[0], idf[1] * tf.assoc(idf[0])[1]]
+        end
+        tf_idfs[k] = tf_idf.sort{ |a, b| b[1] <=> a[1] }.uniq
+      end
+      tf_idfs
+    end
+    def calc_hse_tf_idf tf_idf_list_hash, hse
+      hse_tf_idf = {}
+      hse.each do |k, h|
+        hse[k] = hse[k].select {|h| h[1] != 1 }
+      end
+      tf_idf_list_hash.each do |k, tf_idf_list|
+        hse_tf_idf[k] = tf_idf_list.map do |tf_idf|
+          rate = hse[k].assoc(tf_idf[0]) ? hse[k].assoc(tf_idf[0])[1] : 1
+          [tf_idf[0], tf_idf[1] * rate]
+        end
+      end
+      hse_tf_idf
+    end
+  end
+end

data/lib/analy_z/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module AnalyZ
+  VERSION = "0.1.0" # gem version; analy_z.gemspec assigns it to spec.version
+end

metadata ADDED Viewed

@@ -0,0 +1,82 @@
+--- !ruby/object:Gem::Specification
+name: analy_z
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- nao215
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2016-04-29 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.9'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.9'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+description: 'calcurate tf idf and hse(html semantic element) '
+email:
+- xxxxxy.naoxxxxx@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
+- Gemfile
+- README.md
+- Rakefile
+- analy_z.gemspec
+- bin/console
+- bin/setup
+- lib/analy_z.rb
+- lib/analy_z/version.rb
+homepage: ''
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.7
+signing_key:
+specification_version: 4
+summary: Text Analyzer
+test_files: []

analy_z 0.1.0