analy_z 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +3 -0
- data/Gemfile +7 -0
- data/README.md +39 -0
- data/Rakefile +1 -0
- data/analy_z.gemspec +23 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/analy_z.rb +161 -0
- data/lib/analy_z/version.rb +3 -0
- metadata +82 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cafbd605a88e44b0b2e80aa2d919acaa099a0bc8
|
4
|
+
data.tar.gz: 71e59a86e64bf5f99ed72e520edbe444e1d86189
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2a97c6c54a3311af0e19517f0d5e09fbc461569d9c12d93be8e780a5efc22fd3766f7eafb0d2981993e4235abdd7d983b94b6d48148ccc0cbf780e586e716849
|
7
|
+
data.tar.gz: 10a3efaf2f55fa5b1146f070cf5ff983146f62197afeaffbe0407e68bf9376d0da7f453f82fe5665fe5d3cc9885a17d1660d6fd01cd123452fbeba3172d65010
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# AnalyZ
|
2
|
+
|
3
|
+
Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/analy_z`. To experiment with that code, run `bin/console` for an interactive prompt.
|
4
|
+
|
5
|
+
TODO: Delete this and the text above, and describe your gem
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'analy_z'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install analy_z
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
TODO: Write usage instructions here
|
26
|
+
|
27
|
+
## Development
|
28
|
+
|
29
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
30
|
+
|
31
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
32
|
+
|
33
|
+
## Contributing
|
34
|
+
|
35
|
+
1. Fork it ( https://github.com/[my-github-username]/analy_z/fork )
|
36
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
37
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
38
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
39
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/analy_z.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for analy_z.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'analy_z/version'

Gem::Specification.new do |spec|
  spec.name          = "analy_z"
  spec.version       = AnalyZ::VERSION
  spec.authors       = ["nao215"]
  spec.email         = ["xxxxxy.naoxxxxx@gmail.com"]

  spec.summary       = %q{Text Analyzer}
  spec.description   = %q{calculate tf idf and hse(html semantic element) }
  spec.homepage      = ""

  # Package everything tracked by git except test/spec/feature files.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # lib/analy_z.rb requires both of these at load time, so they must be
  # declared as runtime dependencies; otherwise `require "analy_z"` raises
  # LoadError on a fresh install.
  spec.add_dependency "natto"
  spec.add_dependency "nokogiri"

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rake", "~> 10.0"
end
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Interactive console with the gem preloaded.
require "bundler/setup"
require "analy_z"
require "irb"

# Fixtures and/or initialization code can be added here to make
# experimenting with the gem easier.
#
# A different console may be used instead of IRB. For Pry, add pry to the
# Gemfile and swap the IRB lines for:
#   require "pry"
#   Pry.start

IRB.start
|
data/bin/setup
ADDED
data/lib/analy_z.rb
ADDED
@@ -0,0 +1,161 @@
|
|
1
|
+
require "analy_z/version"
|
2
|
+
|
3
|
+
module AnalyZ
|
4
|
+
|
5
|
+
class Analyzer
|
6
|
+
|
7
|
+
require 'pp'
|
8
|
+
require 'natto'
|
9
|
+
require 'nokogiri'
|
10
|
+
|
11
|
+
attr_accessor :tf
|
12
|
+
attr_accessor :idf
|
13
|
+
attr_accessor :tf_idf
|
14
|
+
attr_accessor :hse_tf_idf
|
15
|
+
attr_accessor :words
|
16
|
+
attr_accessor :texts
|
17
|
+
attr_accessor :sentences
|
18
|
+
|
19
|
+
# Parses every file matching "htmls/*.html" (relative to the current working
# directory) and runs the word analysis over the extracted sentences.
#
# @param html [Object] currently not read by this method; kept so existing
#   callers keep working.
#   NOTE(review): presumably intended as the HTML source or path — confirm.
# @param selector [String] CSS selector for the part of each document to analyze
# @param type_ary [Array<String>] parts of speech to keep, forwarded to
#   #analyze_words
def initialize(html, selector = 'body', type_ary = ['名詞'])
  @sentences = {}
  Dir.glob("htmls/*.html").each do |path|
    fragment = Nokogiri::HTML.parse(File.read(path), nil, nil).css(selector).to_html
    @sentences[path] = parse_html(fragment)
  end
  # Forward type_ary: it used to be dropped, so a caller-supplied filter was
  # silently replaced by the default of #analyze_words.
  analyze_words(@sentences, type_ary)
end
|
26
|
+
|
27
|
+
# Computes words, tf, idf, heading weights, tf-idf and heading-weighted
# tf-idf for every document and stores them in instance variables.
#
# @param sentences [Hash] document key => array of [text, rate] pairs
# @param type_ary [Array<String>] parts of speech passed to #parse_by_natto
# @return [Hash] the heading-weighted tf-idf table (also stored in @hse_tf_idf)
def analyze_words(sentences, type_ary = ['名詞'])
  @texts = {}
  @words = {}
  @tf    = {}
  @idf   = {}
  @hse   = {}

  # All texts must exist before the main loop because idf looks at every document.
  sentences.each do |key, sentence_ary|
    @texts[key] = sentence_ary.map { |pair| pair[0] }.join
  end

  sentences.each do |key, sentence_ary|
    document_words = parse_by_natto(@texts[key], type_ary)
    @words[key] = document_words
    @tf[key]    = calc_tf(document_words)
    @idf[key]   = calc_idf(@texts, document_words)
    @hse[key]   = calc_hse(document_words, sentence_ary)
  end

  @tf_idf     = calc_tf_idf(@tf, @idf)
  @hse_tf_idf = calc_hse_tf_idf(@tf_idf, @hse)
end
|
45
|
+
|
46
|
+
# Splits an HTML string into text fragments and assigns each a weight based
# on whether it overlaps the text of an h1-h4 heading.
#
# @param html [String] HTML markup
# @return [Array<Array>] [text, rate] pairs; rate is 1 unless the fragment
#   contains, or is contained in, the text of a heading. When several
#   headings match, the last one in document order wins.
def parse_html(html)
  # Non-capturing group: String#split appends capture-group matches to its
  # result, which leaked the last attribute token of a tag (e.g. a
  # single-quoted attribute value) into the sentence list.
  tag_rep   = /<(?:".*?"|'.*?'|[^'"])*?>/
  h_tag_reg = /<[hH][1-4].*?>.*?<\/[hH][1-4]>/
  heading_rates = {
    'h1' => 2 * 1.75,
    'h2' => 1.5 * 1.75,
    'h3' => 1.17 * 1.75,
    'h4' => 1.17 * 1.75
  }

  # [heading text, lower-cased tag name]. Downcasing makes <H1> resolve to a
  # rate instead of nil. Empty headings are skipped because "".include? is
  # true for every string and would weight every fragment.
  important_tags = html.scan(h_tag_reg)
                       .map { |m| [m.gsub(tag_rep, ''), m.match(/[hH][1-4]/)[0].downcase] }
                       .reject { |heading_text, _| heading_text.empty? }

  # NOTE(review): every fragment containing whitespace is dropped here, not
  # only blank ones — this matches the previous behaviour; confirm it is intended.
  sentences = html.gsub(/\"/, '')
                  .split(tag_rep)
                  .reject { |el| el =~ /\s/ || el.length <= 1 }
                  .map { |el| [el, 1] }

  sentences.each do |sentence|
    important_tags.each do |heading_text, tag_name|
      next unless sentence[0].include?(heading_text) || heading_text.include?(sentence[0])
      sentence[1] = heading_rates[tag_name]
    end
  end

  sentences
end
|
72
|
+
|
73
|
+
# Runs the text through Natto::MeCab and keeps the words whose part of
# speech is listed in type_ary.
#
# Each output row is split on tabs and commas; the first field is treated as
# the word and the second as its part of speech.
#
# @param text [String] text to analyze
# @param type_ary [Array<String>] parts of speech to keep
# @return [Array<String>] matching words in order of appearance, duplicates included
def parse_by_natto(text, type_ary)
  Natto::MeCab.new.parse(text).split(/\n/).each_with_object([]) do |row, words|
    word, part_of_speech = row.split(/\t|,/)
    words << word if type_ary.include?(part_of_speech)
  end
end
|
83
|
+
|
84
|
+
# Computes the term frequency of each distinct word.
#
# @param words [Array<String>] words of one document, duplicates included
# @return [Array<Array>] [word, frequency] pairs sorted by descending
#   frequency, where frequency = occurrences / total number of words.
#   Returns [] for an empty word list.
def calc_tf(words)
  total = words.length.to_f
  counts = Hash.new(0)
  words.each { |word| counts[word] += 1 }

  counts.sort_by { |_, count| count }.reverse.map do |word, count|
    [word, count / total]
  end
end
|
97
|
+
|
98
|
+
# Scales every tf value by the ratio of its list's length to ave_word_num.
#
# @param tf_ary_list [Array<Array>] one [word, tf] list per document
# @param ave_word_num [Numeric] value each list length is divided by
# @return [Array<Array>] per document, [word, tf * ratio, ratio] triples
#   where ratio = list length / ave_word_num
def standardization_tf(tf_ary_list, ave_word_num)
  average = ave_word_num.to_f

  tf_ary_list.map do |tf_ary|
    # Constant for the whole list, so compute it once.
    ratio = tf_ary.length / average
    tf_ary.map { |tf| [tf[0], tf[1] * ratio, ratio] }
  end
end
|
105
|
+
|
106
|
+
# Computes the inverse document frequency of each word:
# log(number of documents / number of documents whose text contains the word).
#
# @param sentences [Hash] document key => document text
# @param words [Array<String>] words to score, duplicates allowed
# @return [Array<Array>] one [word, idf] pair per entry of words, in the same
#   order (duplicates are preserved). A word contained in no document yields
#   Float::INFINITY because of the division by 0.0.
def calc_idf(sentences, words)
  document_count = sentences.length
  idf_by_word = {}

  words.map do |word|
    # Scan the documents only once per distinct word.
    idf_by_word[word] ||= begin
      hits = sentences.count { |_, text| text.include?(word) }
      Math.log(document_count / hits.to_f)
    end
    [word, idf_by_word[word]]
  end
end
|
113
|
+
|
114
|
+
# Assigns each word the rate of the last weighted sentence that contains it.
#
# @param words [Array<String>] words of one document
# @param sentence_ary [Array<Array>] [text, rate] pairs; pairs whose rate is 1
#   are ignored
# @return [Array<Array>] unique [word, rate] pairs; rate is 1 when no weighted
#   sentence contains the word
def calc_hse(words, sentence_ary)
  weighted = sentence_ary.select { |sentence| sentence[1] != 1 }

  words.map do |word|
    # Match on the whole word. The previous `word[0]` took only the first
    # character of a String, so any heading sharing that character boosted
    # the word. Array() keeps [word, ...] style entries working as before.
    term = Array(word).first
    hit = weighted.reverse_each.find { |text, _| text.include?(term) }
    [word, hit ? hit[1] : 1]
  end.uniq
end
|
124
|
+
|
125
|
+
# Multiplies tf by idf for every word of every document.
#
# @param tf_list_hash [Hash] document key => [word, tf] list
# @param idf_list_hash [Hash] document key => [word, idf] list
# @return [Hash] document key => unique [word, tf * idf] pairs sorted by
#   descending score
# @raise [KeyError] if a word in an idf list has no entry in the matching tf list
def calc_tf_idf(tf_list_hash, idf_list_hash)
  tf_idfs = {}

  tf_list_hash.each do |key, tf_list|
    # Index tf by word once, replacing a linear assoc scan per idf entry.
    # The first entry wins, as it did with assoc.
    tf_by_word = {}
    tf_list.each { |entry| tf_by_word[entry[0]] = entry[1] unless tf_by_word.key?(entry[0]) }

    scores = idf_list_hash[key].map do |word, idf|
      [word, idf * tf_by_word.fetch(word)]
    end
    tf_idfs[key] = scores.sort { |a, b| b[1] <=> a[1] }.uniq
  end

  tf_idfs
end
|
140
|
+
|
141
|
+
# Multiplies each tf-idf score by the word's heading rate.
#
# @param tf_idf_list_hash [Hash] document key => [word, tf_idf] list
# @param hse [Hash] document key => [word, rate] list; entries with rate 1
#   are ignored. This hash is no longer modified by the call.
# @return [Hash] document key => [word, tf_idf * rate] list in the input
#   order; words without a rate (or a document without an hse entry) keep
#   their score
def calc_hse_tf_idf(tf_idf_list_hash, hse)
  hse_tf_idf = {}

  tf_idf_list_hash.each do |key, tf_idf_list|
    # Build a local word => rate index without filtering the caller's hash
    # in place. The first non-1 entry per word wins, matching the previous
    # select + assoc lookup.
    rate_by_word = {}
    (hse[key] || []).each do |word, rate|
      next if rate == 1
      rate_by_word[word] = rate unless rate_by_word.key?(word)
    end

    hse_tf_idf[key] = tf_idf_list.map do |word, score|
      [word, score * rate_by_word.fetch(word, 1)]
    end
  end

  hse_tf_idf
end
|
158
|
+
|
159
|
+
end
|
160
|
+
|
161
|
+
end
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: analy_z
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- nao215
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-04-29 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.9'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.9'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
description: 'calculate tf idf and hse(html semantic element) '
|
42
|
+
email:
|
43
|
+
- xxxxxy.naoxxxxx@gmail.com
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- ".gitignore"
|
49
|
+
- ".rspec"
|
50
|
+
- ".travis.yml"
|
51
|
+
- Gemfile
|
52
|
+
- README.md
|
53
|
+
- Rakefile
|
54
|
+
- analy_z.gemspec
|
55
|
+
- bin/console
|
56
|
+
- bin/setup
|
57
|
+
- lib/analy_z.rb
|
58
|
+
- lib/analy_z/version.rb
|
59
|
+
homepage: ''
|
60
|
+
licenses: []
|
61
|
+
metadata: {}
|
62
|
+
post_install_message:
|
63
|
+
rdoc_options: []
|
64
|
+
require_paths:
|
65
|
+
- lib
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
71
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
requirements: []
|
77
|
+
rubyforge_project:
|
78
|
+
rubygems_version: 2.4.7
|
79
|
+
signing_key:
|
80
|
+
specification_version: 4
|
81
|
+
summary: Text Analyzer
|
82
|
+
test_files: []
|