text_classifier 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/lib/text_classifier.rb +75 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MzZkN2ZkMTg3N2U3NTk0ODk5YTg1Nzc3ZmNjMTZhNjQyMDJiYWE4Zg==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
OGI2N2NmNWYwM2I4ODBhNWQ2Mzg1NGMxMzU2OWNhNGI0MGE3YTU3YQ==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
YWQ1OTQwNTYwYjllY2M2OTMzZjI1YmRmN2NmZDllZGZiYzJjZjc2NzgwMjA0
|
10
|
+
M2MwYmViMGNkMTA0MDFmZWY2YjdhNTkxNjZiMTgzYmYwYTFjNTNjYzgwNDEx
|
11
|
+
NzQ3NjYzNzUzMTk1ZGYwMzdmZGMyODE4ZGI0ZjEyMGI2MTkwODU=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
YmVhMGJjZjVkMWQzMzZjMTZmMWNlMTdjOThiNmI0YTA2NjI0Y2QxZmQyOTUz
|
14
|
+
YjU1YTRkNzVhMjVmYTMwNmY3OTA0ZDkyYjExOTY0NzkyZWFmMmFjOGMwZWMz
|
15
|
+
MmIyNzA2ZDhhMTI0YjM1NjYzNmY2NTlhMzU4NmIxMWQ5MjE4MzM=
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
# Multinomial naive Bayes text classifier with add-one (Laplace) smoothing.
#
# The classifier is stateless: every call to {TextClassifier.classify} trains
# on the supplied documents and immediately scores one test document.
class TextClassifier
  # Common English function words that carry no topical signal. They are
  # removed from both the training documents and the test document.
  # Entries keep their apostrophes because the tokenizer preserves "'".
  STOP_WORDS = Set.new(%w[
    a about above after again against all am an and any are aren't as at be
    because been before being below between both but by can't cannot could
    couldn't did didn't do does doesn't doing don't down during each few for
    from further had hadn't has hasn't have haven't having he he'd he'll he's
    her here here's hers herself him himself his how how's i i'd i'll i'm
    i've if in into is isn't it it's its itself let's me more most mustn't my
    myself no nor not of off on once only or other ought our ours ourselves
    out over own same shan't she she'd she'll she's should shouldn't so some
    such than that that's the their theirs them themselves then there there's
    these they they'd they'll they're they've this those through to too under
    until up very was wasn't we we'd we'll we're we've were weren't what
    what's when when's where where's which while who who's whom why why's
    with won't would wouldn't you you'd you'll you're you've your yours
    yourself yourselves
  ]).freeze

  # Estimates, for each category, the probability that +test_doc+ belongs to it.
  #
  # @param documents_by_category [Array<Array<String>>] training documents;
  #   element +i+ holds every document of category +i+. It is read through
  #   +#size+ and integer +#[]+ only.
  # @param test_doc [String] the document to classify.
  # @return [Array<Float>] one probability per category, in the same order as
  #   +documents_by_category+, summing to 1. Returns +[]+ when there are no
  #   categories and a uniform distribution when there are no training
  #   documents at all (there is nothing to estimate priors from).
  def self.classify(documents_by_category, test_doc)
    num_categories = documents_by_category.size
    return [] if num_categories.zero?

    # Index-based read keeps the original access contract (#size and #[]).
    categories = Array.new(num_categories) { |i| documents_by_category[i] }
    num_docs = categories.inject(0) { |total, docs| total + docs.size }
    return Array.new(num_categories, 1.0 / num_categories) if num_docs.zero?

    # Training documents go through the same normalisation as the test
    # document; otherwise "Cat," in training would never match "cat" in test.
    tokens_by_category = categories.map do |docs|
      docs.flat_map { |doc| tokenize(doc) }
    end

    word_counts = tokens_by_category.map do |tokens|
      tokens.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
    end

    vocabulary = Set.new
    tokens_by_category.each { |tokens| vocabulary.merge(tokens) }
    vocab_size = vocabulary.size

    # Words never seen in training carry no evidence for any category, so they
    # are skipped instead of forcing every category's score to zero.
    test_tokens = tokenize(test_doc).select { |word| vocabulary.include?(word) }

    # Score in log space: a product of many small probabilities underflows to
    # 0.0 on long documents, which would make the normalisation yield NaN.
    log_scores = Array.new(num_categories) do |i|
      # A category without documents has prior 0; Math.log(0.0) is -Infinity,
      # which normalises to probability 0 below.
      log_prior = Math.log(categories[i].size.to_f / num_docs)
      denom = (tokens_by_category[i].size + vocab_size).to_f
      counts = word_counts[i]

      test_tokens.inject(log_prior) do |score, word|
        score + Math.log((1.0 + counts[word]) / denom)
      end
    end

    # Subtract the maximum before exponentiating so the largest weight is
    # exactly 1.0; the scores are only proportional, so rescale to sum to 1.
    max_score = log_scores.max
    weights = log_scores.map { |score| Math.exp(score - max_score) }
    total = weights.inject(:+)
    weights.map { |weight| weight / total }
  end

  # Lowercases +text+, replaces every character other than a-z and "'" with a
  # space, splits on whitespace and drops stop words.
  #
  # @param text [String]
  # @return [Array<String>] the remaining tokens, in order of appearance.
  def self.tokenize(text)
    text.downcase.gsub(/[^a-z']/, ' ').split.reject { |word| STOP_WORDS.include?(word) }
  end
  private_class_method :tokenize
end
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: text_classifier
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Brendan Lundy
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-07-12 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Text classification of a new document using multinomial naive bayes
|
14
|
+
email: absolartstudio@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/text_classifier.rb
|
20
|
+
homepage: http://rubygems.org/gems/text_classifier
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.4.3
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Text classifier
|
44
|
+
test_files: []
|