analy_z 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cafbd605a88e44b0b2e80aa2d919acaa099a0bc8
4
+ data.tar.gz: 71e59a86e64bf5f99ed72e520edbe444e1d86189
5
+ SHA512:
6
+ metadata.gz: 2a97c6c54a3311af0e19517f0d5e09fbc461569d9c12d93be8e780a5efc22fd3766f7eafb0d2981993e4235abdd7d983b94b6d48148ccc0cbf780e586e716849
7
+ data.tar.gz: 10a3efaf2f55fa5b1146f070cf5ff983146f62197afeaffbe0407e68bf9376d0da7f453f82fe5665fe5d3cc9885a17d1660d6fd01cd123452fbeba3172d65010
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.1
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in analy_z.gemspec
4
+ gemspec
5
+
6
+ gem 'nokogiri'
7
+ gem 'natto'
data/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # AnalyZ
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/analy_z.rb`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'analy_z'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install analy_z
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ 1. Fork it ( https://github.com/[my-github-username]/analy_z/fork )
36
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
37
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
38
+ 4. Push to the branch (`git push origin my-new-feature`)
39
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/analy_z.gemspec ADDED
@@ -0,0 +1,23 @@
1
# coding: utf-8
# Gem specification for analy_z. Puts ./lib on the load path so the version
# constant can be read without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'analy_z/version'

Gem::Specification.new do |spec|
  spec.name          = "analy_z"
  spec.version       = AnalyZ::VERSION
  spec.authors       = ["nao215"]
  spec.email         = ["xxxxxy.naoxxxxx@gmail.com"]

  spec.summary       = %q{Text Analyzer}
  spec.description   = %q{calculate tf idf and hse(html semantic element) }
  spec.homepage      = ""

  # Package every git-tracked file except test/spec/feature directories.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # lib/analy_z.rb requires both of these at load time, so they must be
  # runtime dependencies; listing them only in the Gemfile leaves an
  # installed gem unable to load.
  spec.add_dependency "natto"
  spec.add_dependency "nokogiri"

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rake", "~> 10.0"
end
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "analy_z"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
data/lib/analy_z.rb ADDED
@@ -0,0 +1,161 @@
1
+ require "analy_z/version"
2
+
3
+ module AnalyZ
4
+
5
+ class Analyzer
6
+
7
+ require 'pp'
8
+ require 'natto'
9
+ require 'nokogiri'
10
+
11
+ attr_accessor :tf
12
+ attr_accessor :idf
13
+ attr_accessor :tf_idf
14
+ attr_accessor :hse_tf_idf
15
+ attr_accessor :words
16
+ attr_accessor :texts
17
+ attr_accessor :sentences
18
+
19
# Reads every file matching "htmls/*.html" (relative to the current working
# directory), extracts the weighted sentences of the part of each document
# selected by +selector+, and runs the full word analysis on them.
#
# @param html [Object] not read by this method.
#   NOTE(review): looks like it was meant to supply the HTML source or its
#   location instead of the hard-coded glob - confirm with callers.
# @param selector [String] CSS selector handed to Nokogiri's +css+
# @param type_ary [Array<String>] parts of speech to keep, forwarded to
#   +analyze_words+
def initialize(html, selector = 'body', type_ary = ['名詞'])
  @sentences = {}
  Dir.glob("htmls/*.html").each do |path|
    fragment = Nokogiri::HTML.parse(File.read(path), nil, nil).css(selector).to_html
    @sentences[path] = parse_html(fragment)
  end
  # Forward the caller's part-of-speech filter; without it analyze_words
  # always falls back to its own default.
  analyze_words(@sentences, type_ary)
end
26
+
27
# Runs the whole pipeline over +sentences+ and stores each stage in an
# instance variable (@texts, @words, @tf, @idf, @hse, @tf_idf, @hse_tf_idf).
#
# @param sentences [Hash] document key => array of [text, rate] pairs
# @param type_ary [Array<String>] parts of speech passed to parse_by_natto
# @return [Hash] the value assigned to @hse_tf_idf
def analyze_words(sentences, type_ary = ['名詞'])
  @texts = {}
  @words = {}
  @tf = {}
  @idf = {}
  @hse = {}

  # The full text of every document must exist before any IDF is computed,
  # because calc_idf looks at all documents.
  sentences.each do |doc, pairs|
    @texts[doc] = pairs.map { |pair| pair[0] }.join
  end

  sentences.each do |doc, pairs|
    doc_words = parse_by_natto(pairs.map { |pair| pair[0] }.join, type_ary)
    @words[doc] = doc_words
    @tf[doc] = calc_tf(doc_words)
    @idf[doc] = calc_idf(@texts, doc_words)
    @hse[doc] = calc_hse(doc_words, pairs)
  end

  @tf_idf = calc_tf_idf(@tf, @idf)
  @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
end
45
+
46
# Splits an HTML string into text segments and attaches a weight to each.
#
# Segments are the pieces of +html+ between tags (double quotes are removed
# first). A segment keeps weight 1 unless its text contains, or is contained
# in, the text of an h1-h4 heading found in +html+; then it receives that
# heading's rate. When several headings match, the last one wins.
#
# @param html [String] HTML source
# @return [Array<Array>] [text, rate] pairs in document order
def parse_html(html)
  # Non-capturing group: String#split appends captured groups to its result,
  # which would leak tag fragments (e.g. a trailing 'quoted' attribute value)
  # into the sentence list.
  tag_rep = /<(?:".*?"|'.*?'|[^'"])*?>/
  h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
  heading_rates = {
    'h1' => 2 * 1.75,
    'h2' => 1.5 * 1.75,
    'h3' => 1.17 * 1.75,
    'h4' => 1.17 * 1.75
  }

  important_tags = html.scan(h_tag_reg).map do |heading|
    # Normalise the level so <H1> is weighted like <h1> instead of yielding
    # a nil rate.
    [heading.gsub(tag_rep, ''), heading.match(/[hH][1-4]/)[0].downcase]
  end
  # An empty heading text is a substring of every sentence and would give
  # the heading rate to the whole document.
  important_tags.reject! { |text, _level| text.empty? }

  # NOTE(review): every segment containing any whitespace is dropped, not
  # only blank ones - confirm this is intended for non-Japanese input.
  sentences = html.gsub(/\"/, '')
                  .split(tag_rep)
                  .reject { |el| el =~ /\s/ || el.length <= 1 }
                  .map { |el| [el, 1] }

  sentences.each do |sentence|
    important_tags.each do |text, level|
      next unless sentence[0].include?(text) || text.include?(sentence[0])
      sentence[1] = heading_rates[level]
    end
  end

  sentences
end
72
+
73
# Tokenises +text+ with MeCab (through Natto) and returns the surface forms
# whose part of speech is listed in +type_ary+.
#
# Each output line is split on tabs and commas; field 0 is taken as the word
# and field 1 as its part of speech.
#
# @param text [String] text to analyse
# @param type_ary [Array<String>] accepted parts of speech
# @return [Array<String>] matching words in order of appearance
def parse_by_natto(text, type_ary)
  rows = Natto::MeCab.new.parse(text).split(/\n/).map { |line| line.split(/\t|,/) }
  rows.select { |fields| type_ary.include?(fields[1]) }
      .map { |fields| fields[0] }
end
83
+
84
# Computes term frequencies: occurrences of each word divided by the total
# number of words.
#
# @param words [Array<String>] words of one document (duplicates expected)
# @return [Array<Array>] [word, frequency] pairs, highest count first
def calc_tf(words)
  counts = Hash.new(0)
  words.each { |word| counts[word] += 1 }

  total = words.length.to_f
  ranked = counts.sort_by { |_word, count| count }.reverse
  ranked.map { |word, count| [word, count / total] }
end
97
+
98
# Scales every tf value by (entries in its list / ave_word_num) and appends
# that scaling factor as a third element.
#
# @param tf_ary_list [Array<Array>] one [word, tf] list per document
# @param ave_word_num [Numeric] divisor for each list's length
# @return [Array<Array>] per document: [word, scaled tf, factor] triples
def standardization_tf(tf_ary_list, ave_word_num)
  tf_ary_list.map do |tf_ary|
    factor = tf_ary.length / ave_word_num.to_f
    tf_ary.map { |tf| [tf[0], tf[1] * factor, factor] }
  end
end
105
+
106
# Computes the inverse document frequency of each word:
# log(number of documents / number of documents whose text includes the word).
#
# @param sentences [Hash] document key => full text
# @param words [Array<String>] words to score (duplicates are kept)
# @return [Array<Array>] [word, idf] pairs in the order of +words+
def calc_idf(sentences, words)
  doc_total = sentences.length
  words.map do |word|
    hits = sentences.count { |_key, text| text.include?(word) }
    [word, Math.log(doc_total / hits.to_f)]
  end
end
113
+
114
# Assigns each word the rate of the weighted sentence that contains it.
#
# Only sentences whose rate differs from 1 are considered. A word found in
# none of them gets rate 1; when several contain it, the last one wins.
#
# @param words [Array<String>] words of one document
# @param sentence_ary [Array<Array>] [text, rate] pairs of that document
# @return [Array<Array>] unique [word, rate] pairs
def calc_hse(words, sentence_ary)
  weighted = sentence_ary.reject { |sentence| sentence[1] == 1 }

  words.map do |word|
    rate = 1
    weighted.each do |text, weight|
      # Match the whole word; word[0] would only test its first character.
      rate = weight if text.include?(word)
    end
    [word, rate]
  end.uniq
end
124
+
125
# Multiplies each word's idf by its tf, per document.
#
# @param tf_list_hash [Hash] document key => [word, tf] pairs
# @param idf_list_hash [Hash] document key => [word, idf] pairs
# @return [Hash] document key => unique [word, tf * idf] pairs, highest
#   score first
def calc_tf_idf(tf_list_hash, idf_list_hash)
  tf_list_hash.each_with_object({}) do |(doc, tf_list), scores_by_doc|
    scored = idf_list_hash[doc].map do |idf|
      [idf[0], idf[1] * tf_list.assoc(idf[0])[1]]
    end
    scores_by_doc[doc] = scored.sort { |a, b| b[1] <=> a[1] }.uniq
  end
end
140
+
141
# Multiplies every tf-idf score by the word's heading rate.
#
# Words without an entry in +hse+, or whose rate is 1, keep their score.
# +hse+ is only read; the caller's hash is left untouched.
#
# @param tf_idf_list_hash [Hash] document key => [word, tf-idf] pairs
# @param hse [Hash] document key => [word, rate] pairs
# @return [Hash] document key => [word, weighted score] pairs, in the order
#   of the tf-idf list
def calc_hse_tf_idf(tf_idf_list_hash, hse)
  tf_idf_list_hash.each_with_object({}) do |(doc, tf_idf_list), weighted_by_doc|
    # Build a word => rate lookup once per document; the first non-1 entry
    # for a word wins. A document absent from hse simply has no boosts.
    rates = {}
    (hse[doc] || []).each do |word, rate|
      next if rate == 1
      rates[word] = rate unless rates.key?(word)
    end

    weighted_by_doc[doc] = tf_idf_list.map do |word, score|
      [word, score * rates.fetch(word, 1)]
    end
  end
end
158
+
159
+ end
160
+
161
+ end
@@ -0,0 +1,3 @@
1
module AnalyZ
  # Version string of the analy_z gem. Frozen so the shared constant cannot
  # be mutated in place.
  VERSION = "0.1.0".freeze
end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: analy_z
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - nao215
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-04-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.9'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: 'calcurate tf idf and hse(html semantic element) '
42
+ email:
43
+ - xxxxxy.naoxxxxx@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".gitignore"
49
+ - ".rspec"
50
+ - ".travis.yml"
51
+ - Gemfile
52
+ - README.md
53
+ - Rakefile
54
+ - analy_z.gemspec
55
+ - bin/console
56
+ - bin/setup
57
+ - lib/analy_z.rb
58
+ - lib/analy_z/version.rb
59
+ homepage: ''
60
+ licenses: []
61
+ metadata: {}
62
+ post_install_message:
63
+ rdoc_options: []
64
+ require_paths:
65
+ - lib
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ required_rubygems_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ requirements: []
77
+ rubyforge_project:
78
+ rubygems_version: 2.4.7
79
+ signing_key:
80
+ specification_version: 4
81
+ summary: Text Analyzer
82
+ test_files: []