lang_libs 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c4e9e14ff662cf98d5553724efbf699ac82186b8
4
+ data.tar.gz: c1efa4106f25da2fc98f8fad1dbab5bade51e8a0
5
+ SHA512:
6
+ metadata.gz: c9fc4359beba39069aa8982b10c044bc5931dbe688ad5a1928472254be5049d54e68c8c9c669c24466552eba2ae5d2716b7c83e34d2d072fbc6de8856e4069ff
7
+ data.tar.gz: 67d944df3f7239107095b3705fcbb7d5fc7860b0fd8adc9a5cf3d837a79bf9c6b46de457841c0939113396f26baa2ce4ca3739e719be007d1aebcd6468e42804
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.5
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in lang_libs.gemspec
4
+ gemspec
@@ -0,0 +1,39 @@
1
+ # LangLibs
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/lang_libs.rb`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'lang_libs'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install lang_libs
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ 1. Fork it ( https://github.com/[my-github-username]/lang_libs/fork )
36
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
37
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
38
+ 4. Push to the branch (`git push origin my-new-feature`)
39
+ 5. Create a new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "lang_libs"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,31 @@
# coding: utf-8
# Gem specification for lang_libs: package identity, file manifest and
# development-time dependencies.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'lang_libs/version'

Gem::Specification.new do |spec|
  spec.name          = "lang_libs"
  spec.version       = LangLibs::VERSION
  spec.authors       = ["Ryu_Mac"]
  spec.email         = ["ts@mail.com"]

  spec.summary       = "This is gem for language processing."
  spec.description   = "This gem can calculate tf-idf etc."
  #spec.homepage      = "TODO: Put your gem's website or public repo URL here."

  # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
  # delete this section to allow pushing this gem to any host.
  # The push-host restriction itself is currently disabled (commented out); only
  # the check that the running RubyGems supports spec metadata remains active.
  unless spec.respond_to?(:metadata)
    raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
  end
  #spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files      = `git ls-files -z`.split("\x0")
  spec.files         = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rake", "~> 10.0"
end
@@ -0,0 +1,9 @@
1
+ require "lang_libs/version"
2
+ require "lang_libs/model"
3
+ require "lang_libs/n_gram"
4
+ require "lang_libs/tf_idf"
5
+ require "lang_libs/file_access"
6
+ require "lang_libs/accessor"
7
+ require "lang_libs/index"
8
+ require "lang_libs/getter"
9
+ require "lang_libs/setter"
@@ -0,0 +1,61 @@
require "fileutils"

module LangLibs
  # Marshal-based persistence for the statistics held by a Corpus.
  #
  # Each dump_* method writes one value to a file under @dump_path. The matching
  # load_* method reads that file back, stores the value through the
  # corresponding set_* method and returns it.
  #
  # NOTE(review): Marshal.load can instantiate arbitrary objects, so the files
  # under @dump_path must only ever come from a trusted source.
  class Corpus
    # Writes the term-frequency entry stored for +index+ to "tf_<index>".
    def dump_tf(index)
      dump_object("tf_#{index}", @tf[index])
    end

    # Reads "tf_<index>", stores it via set_tf and returns it.
    def load_tf(index)
      result = load_object("tf_#{index}")
      set_tf(index, result)
      result
    end

    # Writes @idf to the file "idf".
    def dump_idf
      dump_object("idf", @idf)
    end

    # Reads the file "idf", stores it via set_idf and returns it.
    def load_idf
      result = load_object("idf")
      set_idf(result)
      result
    end

    # Writes the tf-idf entry stored for +index+ to "tfidf_<index>".
    def dump_tfidf(index)
      dump_object("tfidf_#{index}", @tfidf[index])
    end

    # Reads "tfidf_<index>", stores it via set_tfidf and returns it.
    def load_tfidf(index)
      result = load_object("tfidf_#{index}")
      set_tfidf(index, result)
      result
    end

    # Writes @count_terms to the file "count_terms".
    def dump_count_terms
      dump_object("count_terms", @count_terms)
    end

    # Reads the file "count_terms", stores it via set_count_terms and returns it.
    def load_count_terms
      result = load_object("count_terms")
      set_count_terms(result)
      result
    end

    private

    # Serializes +object+ into "<@dump_path>/<name>".
    # The dump directory is created first so that the very first dump does not
    # fail with Errno::ENOENT when the directory does not exist yet.
    def dump_object(name, object)
      FileUtils.mkdir_p(@dump_path)
      File.open("#{@dump_path}/#{name}", "wb") do |f|
        Marshal.dump(object, f)
      end
    end

    # Deserializes and returns the object stored in "<@dump_path>/<name>".
    # Raises the usual Errno error from File.open when the file is missing.
    def load_object(name)
      File.open("#{@dump_path}/#{name}", "rb") do |f|
        Marshal.load(f)
      end
    end
  end
end
@@ -0,0 +1,22 @@
require 'yaml'

module LangLibs
  class Corpus

    # Writes selected instance variables of this corpus to a YAML file.
    #
    # @param file_path [String] path of the YAML file to (over)write
    # @param instance_names [Array<String, Symbol>] instance variable names
    #   without the leading "@"; each one must hold a Hash-like value, whose
    #   keys are converted to strings before dumping
    # @yield [saving_hash] optional block that may add extra entries to the
    #   hash before the instance variables are copied into it
    # @raise [RuntimeError] when a requested instance variable is not defined
    #   (raised before anything is written to +file_path+)
    def save(file_path, *instance_names)
      saving_hash = {}
      yield(saving_hash) if block_given?
      instance_names.each do |name|
        ivar = "@#{name}"
        raise "instance variable not defined: #{name}" unless instance_variable_defined?(ivar)
        # instance_variable_get reads the value directly; no need to eval a string.
        stringified = instance_variable_get(ivar).map { |key, val| [key.to_s, val] }.to_h
        saving_hash[name.to_s] = stringified
      end
      # File.open instead of Kernel#open: the latter treats a leading "|" as a command.
      File.open(file_path, "w") do |f|
        YAML.dump(saving_hash, f)
      end
    end

  end
end
@@ -0,0 +1,33 @@
module LangLibs
  # Read access to the cached statistics of a Corpus. Every getter falls back
  # to the matching load_* method when the value is not held in memory.
  class Corpus
    # Term frequencies for +index+; load_tf(index) is used when @tf has no
    # truthy entry for it.
    def get_tf(index)
      @tf[index] || load_tf(index)
    end

    # The idf table; load_idf is used while @idf is empty.
    def get_idf
      return @idf unless @idf.empty?
      load_idf
    end

    # tf-idf values for +index+; load_tfidf(index) is used when @tfidf has no
    # truthy entry for it.
    def get_tfidf(index)
      @tfidf[index] || load_tfidf(index)
    end

    # The term counts; load_count_terms is used while @count_terms is empty.
    def get_count_terms
      return @count_terms unless @count_terms.empty?
      load_count_terms
    end

    # Calls the block with the term frequencies of every entry of @index.
    def each_tf(&block)
      @index.each { |key| block.(get_tf(key)) }
    end

    # Calls the block with (index entry, term frequencies) for every entry of @index.
    def each_tf_with_index(&block)
      @index.each { |key| block.(key, get_tf(key)) }
    end

    # Calls the block with the tf-idf values of every entry of @index.
    def each_tfidf(&block)
      @index.each { |key| block.(get_tfidf(key)) }
    end

    # Calls the block with (index entry, tf-idf values) for every entry of @index.
    def each_tfidf_with_index(&block)
      @index.each { |key| block.(key, get_tfidf(key)) }
    end
  end
end
@@ -0,0 +1,23 @@
module LangLibs
  # Insertion-ordered list of unique document labels.
  class Index
    include Enumerable

    def initialize
      @index = []
    end

    # Appends +name+ unless it is already registered.
    # Returns the underlying array when the name was added, nil otherwise.
    def regist(name)
      @index.push(name) unless @index.include?(name)
    end

    # Yields every registered label in insertion order.
    # Without a block an Enumerator is returned (previously this raised
    # NoMethodError on nil as soon as the index was non-empty).
    def each(&block)
      return to_enum(:each) unless block
      @index.each(&block)
    end

    # Number of registered labels.
    def size
      @index.size
    end

    # The first registered label, or nil when the index is empty.
    def first
      @index.first
    end
  end
end
@@ -0,0 +1,77 @@
require "fileutils"

module LangLibs
  # A labelled collection of documents plus the caches (@tf, @idf, @tfidf,
  # @count_terms) that the statistics methods of this class fill in.
  class Corpus
    # @param documents [Array, Hash, nil] documents to register right away;
    #   an Array is labelled by position, a Hash by its own keys
    # @param dump_path [String] directory used for the Marshal dump files
    # @param dump [Boolean] stored in @dump
    # @raise [RuntimeError] when +documents+ is neither Array, Hash nor nil
    def initialize(documents: nil, dump_path: "./dump", dump: false)
      @dump_path = File.expand_path(dump_path)
      @corpus = {}
      @tf = {}
      @idf = {}
      @tfidf = {}
      @count_terms = Hash.new(0)
      @dump = dump
      configure(dump_path)
      documents_set(documents)
    end

    # Registers +documents+ according to their container type; nil is a no-op.
    # @raise [RuntimeError] for any other argument type
    def documents_set(documents)
      case documents
      when Array then add_documents(documents)
      when Hash  then add_hashed_documents(documents)
      when nil   then nil
      else raise "LangLibs::Corpus#documents_set : augument is not Array or Hash"
      end
    end

    # Restores @index from "<dump_path>/configure" when that file exists,
    # otherwise starts with an empty Index.
    # NOTE(review): Marshal.load must only be used on trusted files.
    def configure(dump_path)
      config_file_path = File.join(dump_path, "configure")
      unless File.exist?(config_file_path)
        @index = Index.new
        return
      end
      config = File.open(config_file_path, "rb") { |f| Marshal.load(f) }
      @index = config["index"]
    end

    # Persists @index to "<@dump_path>/configure".
    # The directory is created first so a fresh dump path does not make the
    # write fail with Errno::ENOENT.
    def store_configure
      config = { "index" => @index }
      FileUtils.mkdir_p(@dump_path)
      File.open(File.join(@dump_path, "configure"), "wb") do |f|
        Marshal.dump(config, f)
      end
    end

    # The label => document hash itself (not a copy).
    def all_document
      @corpus
    end

    # The document stored under +index+, or nil.
    def [](index)
      @corpus[index]
    end

    # Stores +document+ under the current index size and registers that label.
    # NOTE(review): if that integer is already a registered label (e.g. after
    # add_hashed_documents with integer keys) the existing document is
    # overwritten and the index does not grow — confirm whether callers mix both.
    def add_document(document)
      label = @index.size
      @corpus[label] = document
      @index.regist(label)
    end

    # Adds every element of +documents+ via add_document.
    def add_documents(documents)
      documents.each { |doc| add_document(doc) }
    end

    # Adds every (label, document) pair of +hashed_documents+.
    def add_hashed_documents(hashed_documents)
      hashed_documents.each do |label, doc|
        @corpus[label] = doc
        @index.regist(label)
      end
    end

    # Yields every document; returns an Enumerator when no block is given.
    def each(&block)
      return to_enum(:each) unless block
      @corpus.each { |_, val| block.call(val) }
    end

    # Yields (label, document) pairs; returns an Enumerator when no block is given.
    def each_with_key(&block)
      return to_enum(:each_with_key) unless block
      @corpus.each { |key, val| block.call(key, val) }
    end
  end
end
@@ -0,0 +1,18 @@
# n-gram counting helpers, written to be mixed into another class or object.
module NGram
  # Counts the n-grams of +doc+.
  #
  # @param doc [String] text to split
  # @param n [Integer] number of tokens per gram
  # @param sep [String] token separator, also used to re-join the tokens of a gram
  # @return [Hash] gram => occurrence count; unknown grams read as 0
  def ngram(doc, n, sep: "\s")
    split_for_ngram(doc, n, sep: sep).each_with_object(Hash.new(0)) do |gram, counts|
      counts[gram] += 1
    end
  end

  private

  # Splits +doc+ on +sep+ and returns every run of +n+ consecutive tokens,
  # re-joined with +sep+. A document with at most +n+ tokens is returned
  # unchanged as the only gram.
  def split_for_ngram(doc, n, sep: "\s")
    tokens = doc.split(sep)
    if tokens.size > n
      tokens.each_cons(n).map { |window| window.join(sep) }
    else
      [doc]
    end
  end
end
@@ -0,0 +1,8 @@
module LangLibs
  # Write access to the cached statistics of a Corpus.
  class Corpus
    # Stores the term frequencies for +index+ in @tf.
    def set_tf(index, value)
      @tf[index] = value
    end

    # Replaces the whole idf table.
    def set_idf(value)
      @idf = value
    end

    # Stores the tf-idf values for +index+ in @tfidf.
    def set_tfidf(index, value)
      @tfidf[index] = value
    end

    # Replaces the whole term-count table.
    def set_count_terms(value)
      @count_terms = value
    end
  end
end
@@ -0,0 +1,71 @@
1
+ require "lang_libs/n_gram"
2
+
module LangLibs
  # Term-frequency, inverse-document-frequency and tf-idf computation over the
  # documents of a Corpus.
  class Corpus
    include NGram

    # Computes relative n-gram frequencies for every document.
    #
    # Every document's table contains every term seen in any document, with
    # 0.0 for terms the document does not contain. Results are stored via
    # set_tf; the corpus-wide absolute counts are stored via set_count_terms.
    # When @dump is truthy the tables are also written through dump_tf and
    # dump_count_terms.
    #
    # @param n [Integer] number of tokens per gram
    # @param sep [String] token separator handed to ngram
    def calc_tf(n, sep: "\s")
      vocabulary = {}
      ngrams = {}
      count_terms = Hash.new(0)
      each_with_key do |i, doc|
        ngrams[i] = ngram(doc, n, sep: sep)
        ngrams[i].each do |term, num|
          vocabulary[term] = 0
          count_terms[term] += num
        end
      end
      set_count_terms(count_terms)

      ngrams.each do |i, terms|
        total = terms.values.inject(0, :+)
        # Start from the all-zero vocabulary so every table has the same keys.
        counts = vocabulary.merge(terms)
        set_tf(i, counts.map { |term, freq| [term, freq.to_f / total] }.to_h)
      end

      return unless @dump
      @index.each { |i| dump_tf(i) }
      dump_count_terms
    end

    # Computes idf(term) = ln(number of index entries / number of documents
    # whose tf for the term is > 0) and stores it via set_idf. The term list is
    # taken from the tf table of the first index entry. Dumps when @dump is truthy.
    def calc_idf
      doc_freq = Hash.new(0)
      get_tf(@index.first).each_key do |term|
        each_tf do |tfs|
          doc_freq[term] += 1 if tfs[term] > 0
        end
      end
      doc_count = @index.size.to_f
      set_idf(doc_freq.map { |term, freq| [term, Math.log(doc_count / freq)] }.to_h)
      dump_idf if @dump
    end

    # Multiplies every tf value by the idf of its term and stores the result
    # per document via set_tfidf. Dumps when @dump is truthy.
    def calc_tfidf
      idf = nil
      each_tf_with_index do |i, tfs|
        # Fetched once, on first use, instead of once per term.
        idf ||= get_idf
        weights = idf.dup
        tfs.each do |term, tf|
          weights[term] = tf * idf[term]
        end
        set_tfidf(i, weights)
      end
      @index.each { |i| dump_tfidf(i) } if @dump
    end

    # Returns, for every document, the tf-idf values of the +n+ terms with the
    # highest corpus-wide count.
    #
    # Side effect kept from the original: the term counts are replaced (via
    # set_count_terms) by a copy sorted by descending count.
    #
    # @param n [Integer] number of terms; 0 or less yields empty hashes
    #   (previously one term was always returned)
    # @return [Hash] index entry => { term => tf-idf }
    def top_term_tfidf(n)
      ranked = get_count_terms.sort_by { |_term, num| -num }
      set_count_terms(ranked.to_h)
      top_terms = ranked.first([n, 0].max).map(&:first)

      result = {}
      each_tfidf_with_index do |i, tfidfs|
        result[i] = top_terms.map { |term| [term, tfidfs[term]] }.to_h
      end
      result
    end

  end
end
@@ -0,0 +1,3 @@
module LangLibs
  # Version of the lang_libs gem; frozen so it cannot be mutated at runtime.
  VERSION = "0.1.0".freeze
end
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lang_libs
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ryu_Mac
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-07-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.9'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ description: This gem can calculate tf-idf etc.
42
+ email:
43
+ - ts@mail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".gitignore"
49
+ - ".rspec"
50
+ - ".travis.yml"
51
+ - Gemfile
52
+ - README.md
53
+ - Rakefile
54
+ - bin/console
55
+ - bin/setup
56
+ - lang_libs.gemspec
57
+ - lib/lang_libs.rb
58
+ - lib/lang_libs/accessor.rb
59
+ - lib/lang_libs/file_access.rb
60
+ - lib/lang_libs/getter.rb
61
+ - lib/lang_libs/index.rb
62
+ - lib/lang_libs/model.rb
63
+ - lib/lang_libs/n_gram.rb
64
+ - lib/lang_libs/setter.rb
65
+ - lib/lang_libs/tf_idf.rb
66
+ - lib/lang_libs/version.rb
67
+ homepage:
68
+ licenses: []
69
+ metadata: {}
70
+ post_install_message:
71
+ rdoc_options: []
72
+ require_paths:
73
+ - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ required_rubygems_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ requirements: []
85
+ rubyforge_project:
86
+ rubygems_version: 2.2.2
87
+ signing_key:
88
+ specification_version: 4
89
+ summary: This is gem for language processing.
90
+ test_files: []
91
+ has_rdoc: