ruby-tf-idf 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in ruby-tf-idf.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 mathieuripert
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # RubyTfIdf
2
+
3
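+ Computes TF-IDF (Term Frequency - Inverse Document Frequency) scores to find the most relevant words in each document of a corpus.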
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'ruby-tf-idf'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install ruby-tf-idf
18
+
19
+ ## Usage
20
+
21
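+ After `require 'ruby-tf-idf'`, build a model with `RubyTfIdf::TfIdf.new(docs, limit, exclude_stop_words)`, where `docs` is an array of strings, `limit` is the number of top-scoring terms to keep per document, and `exclude_stop_words` set to `true` drops English and French stop words. Read the per-document scores from `tf_idf` (the raw term counts and corpus-wide IDF values are available through `tf` and `idf`).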
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,144 @@
1
+ require "ruby-tf-idf/version"
2
+
3
module RubyTfIdf

  # Computes TF-IDF (term frequency - inverse document frequency) scores for
  # a corpus of text documents.
  #
  # After construction:
  # * +tf+     is an Array with one Hash per document (term => occurrences),
  # * +idf+    is a Hash (term => log10(number of documents / number of
  #            documents containing the term)),
  # * +tf_idf+ is an Array with one Hash per document (term => tf * idf),
  #            ordered by descending score and truncated to +limit+ entries.
  class TfIdf

    STOP_WORDS_EN = [
      'a','cannot','into','our','thus','about','co','is','ours','to','above',
      'could','it','ourselves','together','across','down','its','out','too',
      'after','during','itself','over','toward','afterwards','each','last','own',
      'towards','again','eg','latter','per','under','against','either','latterly',
      'perhaps','until','all','else','least','rather','up','almost','elsewhere',
      'less','same','upon','alone','enough','ltd','seem','us','along','etc',
      'many','seemed','very','already','even','may','seeming','via','also','ever',
      'me','seems','was','although','every','meanwhile','several','we','always',
      'everyone','might','she','well','among','everything','more','should','were',
      'amongst','everywhere','moreover','since','what','an','except','most','so',
      'whatever','and','few','mostly','some','when','another','first','much',
      'somehow','whence','any','for','must','someone','whenever','anyhow',
      'former','my','something','where','anyone','formerly','myself','sometime',
      'whereafter','anything','from','namely','sometimes','whereas','anywhere',
      'further','neither','somewhere','whereby','are','had','never','still',
      'wherein','around','has','nevertheless','such','whereupon','as','have',
      'next','than','wherever','at','he','no','that','whether','be','hence',
      'nobody','the','whither','became','her','none','their','which','because',
      'here','noone','them','while','become','hereafter','nor','themselves','who',
      'becomes','hereby','not','then','whoever','becoming','herein','nothing',
      'thence','whole','been','hereupon','now','there','whom','before','hers',
      'nowhere','thereafter','whose','beforehand','herself','of','thereby','why',
      'behind','him','off','therefore','will','being','himself','often','therein',
      'with','below','his','on','thereupon','within','beside','how','once',
      'these','without','besides','however','one','they','would','between','i',
      'only','this','yet','beyond','ie','onto','those','you','both','if','or',
      'though','your','but','in','other','through','yours','by','inc','others',
      'throughout','yourself','can','indeed','otherwise','thru','yourselves'
    ].freeze

    # NOTE: the list contains duplicate entries; they are harmless and kept
    # so the constant's contents stay as published. The only change is the
    # typo 'troiw', corrected to 'trois'.
    STOP_WORDS_FR = [
      '-elle','-il','10ème','1er','1ère','2ème','3ème','4ème','5ème','6ème','7ème','8ème',
      '9ème','à','a','afin','ai','ainsi','ais','ait','alors','après','as','assez','au','aucun',
      'aucune','auprès','auquel','auquelles','auquels','auraient','aurais','aurait','aurez',
      'auriez','aurions','aurons','auront','aussi','aussitôt','autre','autres','aux',
      'avaient','avais','avait','avant','avec','avez','aviez','avoir','avons','ayant',
      'beaucoup','c','car','ce','ceci','cela','celle','celles','celui','cependant',
      'certes','ces','cet','cette','ceux','chacun','chacune','chaque','chez','cinq',
      'comme','d','abord','dans','de','dehors','delà','depuis','des','dessous',
      'dessus','deux','deça','dix','doit','donc','dont','du','durant','dès','déjà',
      'elle','elles','en','encore','enfin','entre','er','est','est-ce','et','etc',
      'eu','eurent','eut','faut','fur','hormis','hors','huit','il','ils','j','je',
      'jusqu','l','la','laquelle','le','lequel','les','lesquels','leur','leurs',
      'lors','lorsque','lui','là','m','mais','malgré','me','melle','mes','mm','mme',
      'moi','moins','mon','mr','même','mêmes','n','neuf','ni','non-','nos','notamment',
      'notre','nous','néanmoins','nôtres','on','ont','ou','où','par','parce','parfois',
      'parmi','partout','pas','pendant','peu','peut','peut-être','plus','plutôt','pour',
      'pourquoi','près','puisqu','puisque','qu','quand','quant','quatre','que','quel',
      'quelle','quelles','quelqu','quelque','quelquefois','quelques','quels','qui',
      'quoi','quot','s','sa','sans','se','sept','sera','serai','seraient','serais',
      'serait','seras','serez','seriez','serions','serons','seront','ses','si','sien',
      'siennes','siens','sitôt','six','soi','sommes','son','sont','sous','souvent',
      'suis','sur','t','toi','ton','toujours','tous','tout','toutefois','toutes',
      'trois','tu','un','une','unes','uns','voici','voilà','vos','votre','vous','vôtres',
      'y','à','ème','étaient','étais','était','étant','étiez','étions','êtes','être',
      'afin','ainsi','alors','après','aucun','aucune','auprès','auquel','aussi','autant',
      'aux','avec','car','ceci','cela','celle','celles','celui','cependant','ces',
      'cet','cette','ceux','chacun','chacune','chaque','chez','comme','comment','dans',
      'des','donc','donné','dont','duquel','dès','déjà','elle','elles','encore','entre',
      'étant','etc','été','eux','furent','grâce','hors','ici','ils','jusqu','les','leur',
      'leurs','lors','lui','mais','malgré','mes','mien','mienne','miennes','miens',
      'moins','moment','mon','même','mêmes','non','nos','notre','notres','nous','notre',
      'oui','par','parce','parmi','plus','pour','près','puis','puisque','quand','quant',
      'que','quel','quelle','quelque','quelquun','quelques','quels','qui','quoi','sans',
      'sauf','selon','ses','sien','sienne','siennes','siens','soi','soit','sont','sous',
      'suis','sur','tandis','tant','tes','tienne','tiennes','tiens','toi','ton','tous',
      'tout','toute','toutes','trop','très','une','vos','votre','vous','étaient','était',
      'étant','être'
    ].freeze

    # Hash index over both stop-word lists so that a membership test is a
    # single hash lookup instead of two linear array scans per term.
    STOP_WORD_INDEX = (STOP_WORDS_EN + STOP_WORDS_FR).each_with_object({}) do |word, index|
      index[word] = true
    end.freeze

    attr_accessor :tf, :idf, :tf_idf

    # Tokenizes the corpus and eagerly computes +tf+, +idf+ and +tf_idf+.
    #
    # @param docs [Array<String>] the documents of the corpus
    # @param limit [Integer, nil] number of highest-scoring terms kept per
    #   document; +nil+ keeps every term
    # @param exclude_stop_words [Boolean] when truthy, English and French
    #   stop words are left out of +tf_idf+ (they remain in +tf+ and +idf+)
    def initialize(docs, limit = nil, exclude_stop_words = false)
      @docs = split_docs(docs)
      @tf = []
      @idf = {}
      @tf_idf = []
      @docs_size = @docs.size
      compute_tf_and_idf
      compute_tf_idf(limit, exclude_stop_words)
    end

    # Turns each document into an array of lowercase tokens.
    #
    # Commas, periods and apostrophes are removed, then the text is split on
    # whitespace. Entries that are not Strings are skipped.
    #
    # @param docs [Array<String>]
    # @return [Array<Array<String>>] one token list per String document
    def split_docs(docs)
      docs.each_with_object([]) do |doc, tokenized|
        next unless doc.is_a?(String)

        # Non-destructive downcase: `downcase!` returns nil when the string
        # is already lowercase (which used to drop the document) and would
        # also modify the caller's string. Argument-less `split` ignores
        # leading whitespace, so no empty-string token is produced.
        tokenized << doc.downcase.gsub(/[,.']/, '').split
      end
    end

    # Fills +@tf+ with one term-count Hash per document and +@idf+ with
    # log10(document count / documents containing the term) for every term.
    def compute_tf_and_idf
      document_frequency = Hash.new(0)

      @docs.each do |words|
        term_counts = words.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
        @tf.push(term_counts)
        # Each distinct term of a document counts once towards its
        # document frequency.
        term_counts.each_key { |word| document_frequency[word] += 1 }
      end

      document_frequency.each do |word, count|
        @idf[word] = Math.log10(@docs_size.to_f / count)
      end
    end

    # Fills +@tf_idf+ with one Hash per document mapping term => tf * idf,
    # ordered by descending score.
    #
    # @param limit [Integer, nil] number of top terms to keep; +nil+ keeps
    #   all of them, 0 keeps none
    # @param exclude_stop_words [Boolean] drop stop words when truthy
    # @raise [ArgumentError] if +limit+ is negative
    def compute_tf_idf(limit, exclude_stop_words)
      @tf.each do |term_counts|
        scores = {}
        term_counts.each do |term, count|
          next if exclude_stop_words && STOP_WORD_INDEX.key?(term)

          scores[term] = @idf[term] * count
        end

        ranked = scores.sort_by { |_term, score| -score }
        # `first` handles a limit of 0 correctly, whereas the range
        # 0..limit-1 would become 0..-1 and select every term.
        ranked = ranked.first(limit) unless limit.nil?
        @tf_idf.push(Hash[ranked])
      end
    end

  end

end
@@ -0,0 +1,3 @@
1
module RubyTfIdf
  # Version of the gem; the gemspec reads it to fill in `gem.version`.
  # Intentionally not frozen: older RubyGems releases may modify the
  # version string they are handed.
  VERSION = "0.0.1"
end
@@ -0,0 +1,19 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for ruby-tf-idf.

# Put the gem's own lib/ directory on the load path so the version file can
# be required before the gem is installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'ruby-tf-idf/version'

Gem::Specification.new do |gem|
  gem.name          = "ruby-tf-idf"
  gem.version       = RubyTfIdf::VERSION
  gem.authors       = ["mathieuripert"]
  gem.email         = ["mathieu.ripert@gmail.com"]
  gem.description   = %q{Term Frequency - Inverse Document Frequency }
  gem.summary       = %q{Gem that calculates TF-IDF out of a text to find most relevant words in each document of the corpus}
  gem.homepage      = ""
  # The package ships an MIT LICENSE.txt; declare it so the built gem's
  # metadata no longer reports an empty license list.
  gem.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
metadata ADDED
@@ -0,0 +1,54 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby-tf-idf
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - mathieuripert
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-03 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: ! 'Term Frequency - Inverse Document Frequency '
15
+ email:
16
+ - mathieu.ripert@gmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - .gitignore
22
+ - Gemfile
23
+ - LICENSE.txt
24
+ - README.md
25
+ - Rakefile
26
+ - lib/ruby-tf-idf.rb
27
+ - lib/ruby-tf-idf/version.rb
28
+ - ruby-tf-idf.gemspec
29
+ homepage: ''
30
+ licenses: []
31
+ post_install_message:
32
+ rdoc_options: []
33
+ require_paths:
34
+ - lib
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ requirements: []
48
+ rubyforge_project:
49
+ rubygems_version: 1.8.10
50
+ signing_key:
51
+ specification_version: 3
52
+ summary: Gem that calculates TF-IDF out of a text to find most relevant words in each
53
+ document of the corpus
54
+ test_files: []