unsupervised-language-detection 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,11 +4,13 @@ require File.expand_path('../naive-bayes-classifier', __FILE__)
|
|
4
4
|
class String
|
5
5
|
# Returns a set of `n`-grams computed from this string.
|
6
6
|
def to_ngrams(n)
|
7
|
-
self.
|
7
|
+
self.normalize_tweet.scan(/.{#{n}}/)
|
8
8
|
end
|
9
9
|
|
10
|
+
private
|
11
|
+
|
10
12
|
# TODO: Try not normalizing out all non-ASCII characters! Should significantly reduce false positive rate.
|
11
|
-
def
|
13
|
+
def normalize_tweet
|
12
14
|
self.remove_tweeters.remove_links.remove_hashtags.downcase.gsub(/\s/, " ").gsub(/[^a-z0-9\s]/, "")
|
13
15
|
end
|
14
16
|
|
@@ -1,13 +1,3 @@
|
|
1
|
-
class Array
|
2
|
-
def sum
|
3
|
-
self.reduce(0) { |total, element| total + element }
|
4
|
-
end
|
5
|
-
|
6
|
-
def product
|
7
|
-
self.reduce(1) { |total, element| total * element }
|
8
|
-
end
|
9
|
-
end
|
10
|
-
|
11
1
|
class NaiveBayesClassifier
|
12
2
|
attr_reader :num_categories, :prior_token_count, :prior_category_counts
|
13
3
|
attr_accessor :category_names
|
@@ -72,10 +62,10 @@ class NaiveBayesClassifier
|
|
72
62
|
# Returns p(category | token), for each category, in an array.
|
73
63
|
def get_posterior_category_probabilities(tokens)
|
74
64
|
unnormalized_posterior_probs = (0..@num_categories-1).map do |category|
|
75
|
-
p = tokens.map { |token| get_token_probability(token, category) }.product # p(tokens | category)
|
65
|
+
p = tokens.map { |token| get_token_probability(token, category) }.reduce(:*) # p(tokens | category)
|
76
66
|
p * get_prior_category_probability(category) # p(tokens | category) * p(category)
|
77
67
|
end
|
78
|
-
normalization = unnormalized_posterior_probs.sum
|
68
|
+
normalization = unnormalized_posterior_probs.reduce(:+)
|
79
69
|
normalization = 1 if normalization == 0
|
80
70
|
return unnormalized_posterior_probs.map{ |p| p / normalization }
|
81
71
|
end
|
@@ -92,7 +82,7 @@ class NaiveBayesClassifier
|
|
92
82
|
|
93
83
|
# p(category)
|
94
84
|
def get_prior_category_probability(category_index)
|
95
|
-
denom = @category_counts.sum + @prior_category_counts.sum
|
85
|
+
denom = @category_counts.reduce(:+) + @prior_category_counts.reduce(:+)
|
96
86
|
if denom == 0
|
97
87
|
return 0
|
98
88
|
else
|
data/website/views/layout.haml
CHANGED
@@ -11,4 +11,9 @@
|
|
11
11
|
%body
|
12
12
|
#container
|
13
13
|
%h1 Unsupervised Language Detection on Twitter
|
14
|
-
= yield
|
14
|
+
= yield
|
15
|
+
|
16
|
+
%footer
|
17
|
+
%p
|
18
|
+
%strong How does this work?
|
19
|
+
Learn more at <a href = "http://blog.echen.me/2011/05/05/twss-building-a-thats-what-she-said-classifier/">here</a>. By <a href="http://echen.me">Edwin Chen</a>.
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unsupervised-language-detection
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease:
|
4
|
+
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
7
|
- 0
|
9
|
-
- 2
|
10
|
-
version: 0.0.2
|
8
|
+
- 3
|
9
|
+
version: 0.0.3
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Edwin Chen
|
@@ -15,7 +14,7 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2011-
|
17
|
+
date: 2011-07-22 00:00:00 -07:00
|
19
18
|
default_executable:
|
20
19
|
dependencies: []
|
21
20
|
|
@@ -79,7 +78,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
79
78
|
requirements:
|
80
79
|
- - ">="
|
81
80
|
- !ruby/object:Gem::Version
|
82
|
-
hash: 3
|
83
81
|
segments:
|
84
82
|
- 0
|
85
83
|
version: "0"
|
@@ -88,14 +86,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
86
|
requirements:
|
89
87
|
- - ">="
|
90
88
|
- !ruby/object:Gem::Version
|
91
|
-
hash: 3
|
92
89
|
segments:
|
93
90
|
- 0
|
94
91
|
version: "0"
|
95
92
|
requirements: []
|
96
93
|
|
97
94
|
rubyforge_project: unsupervised-language-detection
|
98
|
-
rubygems_version: 1.
|
95
|
+
rubygems_version: 1.3.7
|
99
96
|
signing_key:
|
100
97
|
specification_version: 3
|
101
98
|
summary: Perform unsupervised language detection.
|