unsupervised-language-detection 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,11 +4,13 @@ require File.expand_path('../naive-bayes-classifier', __FILE__)
4
4
  class String
5
5
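  # Returns an array of consecutive, non-overlapping `n`-character chunks (n-grams) of this string's normalized form; leftover trailing characters shorter than `n` are dropped.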
6
6
  def to_ngrams(n)
7
- self.normalize.scan(/.{#{n}}/)
7
+ self.normalize_tweet.scan(/.{#{n}}/)
8
8
  end
9
9
 
10
+ private
11
+
10
12
  # TODO: Try keeping non-ASCII characters instead of stripping them all during normalization; this should significantly reduce the false positive rate.
11
- def normalize
13
+ def normalize_tweet
12
14
  self.remove_tweeters.remove_links.remove_hashtags.downcase.gsub(/\s/, " ").gsub(/[^a-z0-9\s]/, "")
13
15
  end
14
16
 
@@ -1,13 +1,3 @@
1
- class Array
2
- def sum
3
- self.reduce(0) { |total, element| total + element }
4
- end
5
-
6
- def product
7
- self.reduce(1) { |total, element| total * element }
8
- end
9
- end
10
-
11
1
  class NaiveBayesClassifier
12
2
  attr_reader :num_categories, :prior_token_count, :prior_category_counts
13
3
  attr_accessor :category_names
@@ -72,10 +62,10 @@ class NaiveBayesClassifier
72
62
  # Returns p(category | tokens), for each category, in an array.
73
63
  def get_posterior_category_probabilities(tokens)
74
64
  unnormalized_posterior_probs = (0..@num_categories-1).map do |category|
75
- p = tokens.map { |token| get_token_probability(token, category) }.product # p(tokens | category)
65
+ p = tokens.map { |token| get_token_probability(token, category) }.reduce(:*) # p(tokens | category)
76
66
  p * get_prior_category_probability(category) # p(tokens | category) * p(category)
77
67
  end
78
- normalization = unnormalized_posterior_probs.sum
68
+ normalization = unnormalized_posterior_probs.reduce(:+)
79
69
  normalization = 1 if normalization == 0
80
70
  return unnormalized_posterior_probs.map{ |p| p / normalization }
81
71
  end
@@ -92,7 +82,7 @@ class NaiveBayesClassifier
92
82
 
93
83
  # p(category)
94
84
  def get_prior_category_probability(category_index)
95
- denom = @category_counts.sum + @prior_category_counts.sum
85
+ denom = @category_counts.reduce(:+) + @prior_category_counts.reduce(:+)
96
86
  if denom == 0
97
87
  return 0
98
88
  else
@@ -1,3 +1,3 @@
1
1
  module UnsupervisedLanguageDetection
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
@@ -11,4 +11,9 @@
11
11
  %body
12
12
  #container
13
13
  %h1 Unsupervised Language Detection on Twitter
14
- = yield
14
+ = yield
15
+
16
+ %footer
17
+ %p
18
+ %strong How does this work?
19
+ Learn more at <a href = "http://blog.echen.me/2011/05/05/twss-building-a-thats-what-she-said-classifier/">here</a>. By <a href="http://echen.me">Edwin Chen</a>.
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unsupervised-language-detection
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
5
- prerelease:
4
+ prerelease: false
6
5
  segments:
7
6
  - 0
8
7
  - 0
9
- - 2
10
- version: 0.0.2
8
+ - 3
9
+ version: 0.0.3
11
10
  platform: ruby
12
11
  authors:
13
12
  - Edwin Chen
@@ -15,7 +14,7 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2011-05-14 00:00:00 -07:00
17
+ date: 2011-07-22 00:00:00 -07:00
19
18
  default_executable:
20
19
  dependencies: []
21
20
 
@@ -79,7 +78,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
79
78
  requirements:
80
79
  - - ">="
81
80
  - !ruby/object:Gem::Version
82
- hash: 3
83
81
  segments:
84
82
  - 0
85
83
  version: "0"
@@ -88,14 +86,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
88
86
  requirements:
89
87
  - - ">="
90
88
  - !ruby/object:Gem::Version
91
- hash: 3
92
89
  segments:
93
90
  - 0
94
91
  version: "0"
95
92
  requirements: []
96
93
 
97
94
  rubyforge_project: unsupervised-language-detection
98
- rubygems_version: 1.4.1
95
+ rubygems_version: 1.3.7
99
96
  signing_key:
100
97
  specification_version: 3
101
98
  summary: Perform unsupervised language detection.