unsupervised-language-detection 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,11 +4,13 @@ require File.expand_path('../naive-bayes-classifier', __FILE__)
4
4
  class String
5
5
  # Returns an array of `n`-grams (consecutive, non-overlapping `n`-character chunks) computed from this string after normalization.
6
6
  def to_ngrams(n)
7
- self.normalize.scan(/.{#{n}}/)
7
+ self.normalize_tweet.scan(/.{#{n}}/)
8
8
  end
9
9
 
10
+ private
11
+
10
12
  # TODO: Try not normalizing out all non-ASCII characters! Should significantly reduce false positive rate.
11
- def normalize
13
+ def normalize_tweet
12
14
  self.remove_tweeters.remove_links.remove_hashtags.downcase.gsub(/\s/, " ").gsub(/[^a-z0-9\s]/, "")
13
15
  end
14
16
 
@@ -1,13 +1,3 @@
1
- class Array
2
- def sum
3
- self.reduce(0) { |total, element| total + element }
4
- end
5
-
6
- def product
7
- self.reduce(1) { |total, element| total * element }
8
- end
9
- end
10
-
11
1
  class NaiveBayesClassifier
12
2
  attr_reader :num_categories, :prior_token_count, :prior_category_counts
13
3
  attr_accessor :category_names
@@ -72,10 +62,10 @@ class NaiveBayesClassifier
72
62
  # Returns p(category | tokens), for each category, in an array.
73
63
  def get_posterior_category_probabilities(tokens)
74
64
  unnormalized_posterior_probs = (0..@num_categories-1).map do |category|
75
- p = tokens.map { |token| get_token_probability(token, category) }.product # p(tokens | category)
65
+ p = tokens.map { |token| get_token_probability(token, category) }.reduce(:*) # p(tokens | category)
76
66
  p * get_prior_category_probability(category) # p(tokens | category) * p(category)
77
67
  end
78
- normalization = unnormalized_posterior_probs.sum
68
+ normalization = unnormalized_posterior_probs.reduce(:+)
79
69
  normalization = 1 if normalization == 0
80
70
  return unnormalized_posterior_probs.map{ |p| p / normalization }
81
71
  end
@@ -92,7 +82,7 @@ class NaiveBayesClassifier
92
82
 
93
83
  # p(category)
94
84
  def get_prior_category_probability(category_index)
95
- denom = @category_counts.sum + @prior_category_counts.sum
85
+ denom = @category_counts.reduce(:+) + @prior_category_counts.reduce(:+)
96
86
  if denom == 0
97
87
  return 0
98
88
  else
@@ -1,3 +1,3 @@
1
1
  module UnsupervisedLanguageDetection
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
@@ -11,4 +11,9 @@
11
11
  %body
12
12
  #container
13
13
  %h1 Unsupervised Language Detection on Twitter
14
- = yield
14
+ = yield
15
+
16
+ %footer
17
+ %p
18
+ %strong How does this work?
19
+ Learn more at <a href = "http://blog.echen.me/2011/05/05/twss-building-a-thats-what-she-said-classifier/">here</a>. By <a href="http://echen.me">Edwin Chen</a>.
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unsupervised-language-detection
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
5
- prerelease:
4
+ prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 0
9
- - 2
10
- version: 0.0.2
8
+ - 3
9
+ version: 0.0.3
11
10
  platform: ruby
12
11
  authors:
13
12
  - Edwin Chen
@@ -15,7 +14,7 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2011-05-14 00:00:00 -07:00
17
+ date: 2011-07-22 00:00:00 -07:00
19
18
  default_executable:
20
19
  dependencies: []
21
20
 
@@ -79,7 +78,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
79
78
  requirements:
80
79
  - - ">="
81
80
  - !ruby/object:Gem::Version
82
- hash: 3
83
81
  segments:
84
82
  - 0
85
83
  version: "0"
@@ -88,14 +86,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
88
86
  requirements:
89
87
  - - ">="
90
88
  - !ruby/object:Gem::Version
91
- hash: 3
92
89
  segments:
93
90
  - 0
94
91
  version: "0"
95
92
  requirements: []
96
93
 
97
94
  rubyforge_project: unsupervised-language-detection
98
- rubygems_version: 1.4.1
95
+ rubygems_version: 1.3.7
99
96
  signing_key:
100
97
  specification_version: 3
101
98
  summary: Perform unsupervised language detection.