unsupervised-language-detection 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
@@ -4,11 +4,13 @@ require File.expand_path('../naive-bayes-classifier', __FILE__)
|
|
4
4
|
class String
|
5
5
|
# Returns an array of `n`-grams (consecutive, non-overlapping `n`-character chunks) computed from the normalized form of this string.
|
6
6
|
def to_ngrams(n)
|
7
|
-
self.
|
7
|
+
self.normalize_tweet.scan(/.{#{n}}/)
|
8
8
|
end
|
9
9
|
|
10
|
+
private
|
11
|
+
|
10
12
|
# TODO: Try keeping non-ASCII characters instead of stripping them all out during normalization; this should significantly reduce the false-positive rate.
|
11
|
-
def
|
13
|
+
def normalize_tweet
|
12
14
|
self.remove_tweeters.remove_links.remove_hashtags.downcase.gsub(/\s/, " ").gsub(/[^a-z0-9\s]/, "")
|
13
15
|
end
|
14
16
|
|
@@ -1,13 +1,3 @@
|
|
1
|
-
class Array
|
2
|
-
def sum
|
3
|
-
self.reduce(0) { |total, element| total + element }
|
4
|
-
end
|
5
|
-
|
6
|
-
def product
|
7
|
-
self.reduce(1) { |total, element| total * element }
|
8
|
-
end
|
9
|
-
end
|
10
|
-
|
11
1
|
class NaiveBayesClassifier
|
12
2
|
attr_reader :num_categories, :prior_token_count, :prior_category_counts
|
13
3
|
attr_accessor :category_names
|
@@ -72,10 +62,10 @@ class NaiveBayesClassifier
|
|
72
62
|
# Returns p(category | tokens) for each category, as an array.
|
73
63
|
def get_posterior_category_probabilities(tokens)
|
74
64
|
unnormalized_posterior_probs = (0..@num_categories-1).map do |category|
|
75
|
-
p = tokens.map { |token| get_token_probability(token, category) }.
|
65
|
+
p = tokens.map { |token| get_token_probability(token, category) }.reduce(:*) # p(tokens | category)
|
76
66
|
p * get_prior_category_probability(category) # p(tokens | category) * p(category)
|
77
67
|
end
|
78
|
-
normalization = unnormalized_posterior_probs.
|
68
|
+
normalization = unnormalized_posterior_probs.reduce(:+)
|
79
69
|
normalization = 1 if normalization == 0
|
80
70
|
return unnormalized_posterior_probs.map{ |p| p / normalization }
|
81
71
|
end
|
@@ -92,7 +82,7 @@ class NaiveBayesClassifier
|
|
92
82
|
|
93
83
|
# p(category)
|
94
84
|
def get_prior_category_probability(category_index)
|
95
|
-
denom = @category_counts.
|
85
|
+
denom = @category_counts.reduce(:+) + @prior_category_counts.reduce(:+)
|
96
86
|
if denom == 0
|
97
87
|
return 0
|
98
88
|
else
|
data/website/views/layout.haml
CHANGED
@@ -11,4 +11,9 @@
|
|
11
11
|
%body
|
12
12
|
#container
|
13
13
|
%h1 Unsupervised Language Detection on Twitter
|
14
|
-
= yield
|
14
|
+
= yield
|
15
|
+
|
16
|
+
%footer
|
17
|
+
%p
|
18
|
+
%strong How does this work?
|
19
|
+
Learn more <a href="http://blog.echen.me/2011/05/05/twss-building-a-thats-what-she-said-classifier/">here</a>. By <a href="http://echen.me">Edwin Chen</a>.
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unsupervised-language-detection
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease:
|
4
|
+
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
7
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
8
|
+
- 3
|
9
|
+
version: 0.0.3
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Edwin Chen
|
@@ -15,7 +14,7 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2011-
|
17
|
+
date: 2011-07-22 00:00:00 -07:00
|
19
18
|
default_executable:
|
20
19
|
dependencies: []
|
21
20
|
|
@@ -79,7 +78,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
79
78
|
requirements:
|
80
79
|
- - ">="
|
81
80
|
- !ruby/object:Gem::Version
|
82
|
-
hash: 3
|
83
81
|
segments:
|
84
82
|
- 0
|
85
83
|
version: "0"
|
@@ -88,14 +86,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
86
|
requirements:
|
89
87
|
- - ">="
|
90
88
|
- !ruby/object:Gem::Version
|
91
|
-
hash: 3
|
92
89
|
segments:
|
93
90
|
- 0
|
94
91
|
version: "0"
|
95
92
|
requirements: []
|
96
93
|
|
97
94
|
rubyforge_project: unsupervised-language-detection
|
98
|
-
rubygems_version: 1.
|
95
|
+
rubygems_version: 1.3.7
|
99
96
|
signing_key:
|
100
97
|
specification_version: 3
|
101
98
|
summary: Perform unsupervised language detection.
|