text_classifier 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +15 -0
  2. data/lib/text_classifier.rb +75 -0
  3. metadata +44 -0
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MzZkN2ZkMTg3N2U3NTk0ODk5YTg1Nzc3ZmNjMTZhNjQyMDJiYWE4Zg==
5
+ data.tar.gz: !binary |-
6
+ OGI2N2NmNWYwM2I4ODBhNWQ2Mzg1NGMxMzU2OWNhNGI0MGE3YTU3YQ==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ YWQ1OTQwNTYwYjllY2M2OTMzZjI1YmRmN2NmZDllZGZiYzJjZjc2NzgwMjA0
10
+ M2MwYmViMGNkMTA0MDFmZWY2YjdhNTkxNjZiMTgzYmYwYTFjNTNjYzgwNDEx
11
+ NzQ3NjYzNzUzMTk1ZGYwMzdmZGMyODE4ZGI0ZjEyMGI2MTkwODU=
12
+ data.tar.gz: !binary |-
13
+ YmVhMGJjZjVkMWQzMzZjMTZmMWNlMTdjOThiNmI0YTA2NjI0Y2QxZmQyOTUz
14
+ YjU1YTRkNzVhMjVmYTMwNmY3OTA0ZDkyYjExOTY0NzkyZWFmMmFjOGMwZWMz
15
+ MmIyNzA2ZDhhMTI0YjM1NjYzNmY2NTlhMzU4NmIxMWQ5MjE4MzM=
@@ -0,0 +1,75 @@
require 'set'

# Multinomial naive Bayes text classifier.
#
# Usage:
#   TextClassifier.classify([["docs for cat 0", ...], ["docs for cat 1", ...]],
#                           "document to classify")
class TextClassifier
  # English stop words.
  # NOTE(review): this list was built but never applied in the original
  # implementation; preserved as a constant for reference / future filtering.
  STOP_WORDS = Set.new(
    %w[a about above after again against all am an and any are aren't as at
       be because been before being below between both but by can't cannot
       could couldn't did didn't do does doesn't doing don't down during each
       few for from further had hadn't has hasn't have haven't having he he'd
       he'll he's her here here's hers herself him himself his how how's i
       i'd i'll i'm i've if in into is isn't it it's its itself let's me more
       most mustn't my myself no nor not of off on once only or other ought
       our ours ourselves out over own same shan't she she'd she'll she's
       should shouldn't so some such than that that's the their theirs them
       themselves then there there's these they they'd they'll they're
       they've this those through to too under until up very was wasn't we
       we'd we'll we're we've were weren't what what's when when's where
       where's which while who who's whom why why's with won't would wouldn't
       you you'd you'll you're you've your yours yourself yourselves]
  ).freeze

  # Classify +test_doc+ against a training corpus.
  #
  # documents_by_category:: array of arrays of strings; element i holds the
  #                         training documents for category i.
  # test_doc::              the document (String) to classify.
  #
  # Returns an array of posterior probabilities, one per category, scaled to
  # sum to 1.
  def self.classify(documents_by_category, test_doc)
    num_categories = documents_by_category.size

    # Normalize every document: lowercase, replace non-letters (apostrophes
    # kept) with spaces, collapse runs of spaces.
    # BUG FIX: the original reassigned the block variable inside `each`,
    # which discarded the result — training documents stayed un-normalized
    # while the test document WAS normalized, so any cased or punctuated
    # training word could never match a test word.
    normalized = documents_by_category.map do |docs|
      docs.map { |doc| doc.downcase.gsub(/[^a-z']/, ' ').squeeze(' ') }
    end
    test_doc = test_doc.downcase.gsub(/[^a-z']/, ' ').squeeze(' ')

    # Total number of training documents across all categories.
    num_docs = normalized.sum(&:size)

    probability_of_category = Array.new(num_categories)
    num_words_in_category = Array.new(num_categories)
    count_words_by_category = Array.new(num_categories)
    entire_vocabulary = Set.new

    # Count word occurrences per category and build the overall vocabulary.
    normalized.each_with_index do |docs, i|
      # Prior P(category) estimated from the document counts.
      probability_of_category[i] = docs.size.to_f / num_docs

      counts = Hash.new(0)
      total = 0
      docs.each do |document|
        document.split.each do |word|
          entire_vocabulary.add(word)
          total += 1
          counts[word] += 1
        end
      end
      num_words_in_category[i] = total
      count_words_by_category[i] = counts
    end

    # Conditional probability P(word | category) with Laplace (add-one)
    # smoothing over the vocabulary.
    vocab_size = entire_vocabulary.size
    cond_probs = Array.new(num_categories) do |i|
      denom = num_words_in_category[i] + vocab_size
      # BUG FIX: the original defaulted unseen words to probability 0, which
      # zeroed EVERY category's posterior for a test word outside the
      # training vocabulary and produced NaN after normalization (0.0 / 0).
      # Unseen words now get the same add-one smoothed mass as a known word
      # with zero count in this category.
      prob = Hash.new(1.0 / denom)
      entire_vocabulary.each do |word|
        prob[word] = (1.0 + count_words_by_category[i][word]) / denom
      end
      prob
    end

    # Posterior (up to a shared constant): prior * product of per-word
    # likelihoods for the test document.
    test_doc_probs = Array.new(num_categories) do |i|
      test_doc.split.reduce(probability_of_category[i]) do |acc, word|
        acc * cond_probs[i][word]
      end
    end

    # The posteriors are proportional to each other; scale to sum to 1.
    sum_test_doc_probs = test_doc_probs.sum
    test_doc_probs.map { |p| p / sum_test_doc_probs }
  end
end
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_classifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Brendan Lundy
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-07-12 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Text classification of a new document using multinomial naive bayes
14
+ email: absolartstudio@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/text_classifier.rb
20
+ homepage: http://rubygems.org/gems/text_classifier
21
+ licenses:
22
+ - MIT
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.4.3
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Text classifier
44
+ test_files: []