unsupervised-language-detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. data/Gemfile +4 -0
  2. data/README.md +28 -0
  3. data/Rakefile +2 -0
  4. data/datasets/gutenberg-test-du.txt +1224 -0
  5. data/datasets/gutenberg-test-en.txt +1130 -0
  6. data/datasets/gutenberg-test-sp.txt +1031 -0
  7. data/datasets/gutenberg-training-du.txt +1140 -0
  8. data/datasets/gutenberg-training-en.txt +2823 -0
  9. data/datasets/gutenberg-training-sp.txt +971 -0
  10. data/datasets/gutenberg-training.txt +3237 -0
  11. data/datasets/gutenberg-training_en_du.txt +3301 -0
  12. data/datasets/smiley_tweets_tiny.txt +1000 -0
  13. data/datasets/tweets_5000.txt +5000 -0
  14. data/language-detector-demo.rb +39 -0
  15. data/lib/unsupervised-language-detection.rb +8 -0
  16. data/lib/unsupervised-language-detection/english-tweet-detector.yaml +1658 -0
  17. data/lib/unsupervised-language-detection/language-detector.rb +68 -0
  18. data/lib/unsupervised-language-detection/naive-bayes-classifier.rb +102 -0
  19. data/lib/unsupervised-language-detection/train-english-tweet-detector.rb +11 -0
  20. data/lib/unsupervised-language-detection/version.rb +3 -0
  21. data/test/test_language_detector.rb +19 -0
  22. data/test/test_naive_bayes_classifier.rb +60 -0
  23. data/test/test_naive_bayes_em.rb +23 -0
  24. data/test/test_suite.rb +4 -0
  25. data/unsupervised-language-detection.gemspec +21 -0
  26. data/website/Gemfile +12 -0
  27. data/website/README.md +1 -0
  28. data/website/config.ru +2 -0
  29. data/website/detector.yaml +1658 -0
  30. data/website/detector2.yaml +1658 -0
  31. data/website/main.rb +46 -0
  32. data/website/public/jquery.inlineformlabels.js +53 -0
  33. data/website/public/main.css +23 -0
  34. data/website/views/index.haml +36 -0
  35. data/website/views/layout.haml +14 -0
  36. data/website/views/tweet.haml +3 -0
  37. metadata +106 -0
data/website/main.rb ADDED
@@ -0,0 +1,46 @@
1
+ require 'sinatra'
2
+ require 'uri'
3
+ require 'mongo'
4
+ require 'date'
5
+ require 'time'
6
+ require File.expand_path('../../src/language-detector', __FILE__)
7
+
8
+ use Rack::MethodOverride
9
+
10
+ # Tweets come from a MongoDB collection.
11
+ uri = URI.parse(ENV['MONGOHQ_URL'])
12
+ conn = Mongo::Connection.from_uri(ENV['MONGOHQ_URL'])
13
+ db = conn.db(uri.path.gsub(/^\//, ''))
14
+ coll = db["tweets"]
15
+
16
+ DETECTOR = LanguageDetector.load_yaml("detector2.yaml")
17
+
18
+ helpers do
19
+ def partial(page, locals = {})
20
+ haml page, :layout => false, :locals => locals
21
+ end
22
+ end
23
+
24
+ layout 'layout'
25
+
26
+ get '/' do
27
+ haml :index
28
+ end
29
+
30
+ post '/' do
31
+ @sentence = nil
32
+ if params[:sentence]
33
+ @sentence = params[:sentence]
34
+ @language = DETECTOR.classify(@sentence) == "majority" ? "English" : "Not English"
35
+ end
36
+
37
+ haml :index
38
+ end
39
+
40
+ get '/tweet' do
41
+ @tweet = coll.find().limit(-1).skip(rand(coll.count())).first()['text']
42
+ @language = DETECTOR.classify(@tweet) == "majority" ? "English" : "Not English"
43
+ @language = "Not English" if @tweet.split.select{ |c| c =~ /[^\x00-\x80]/ }.size > 1 # Use this if you want to check for non-Roman characters. Not necessary, but sometimes there are tweets consisting solely of non-Roman characters, in which case the classifier fails (since it currently removes all non-ASCII characters).
44
+
45
+ haml :tweet, :layout => false
46
+ end
@@ -0,0 +1,53 @@
1
+ (function($) {
2
+
3
+ $.fn.inlineFormLabels = function() {
4
+
5
+ var self = this;
6
+
7
+ // Hide all the labels, because we're going to put them in the input field instead.
8
+ $("label", self).hide();
9
+
10
+ // Grab all input fields (inputs and textareas) preceded by a label sibling...
11
+ $("label + input, label + textarea", self).each(function(type) {
12
+
13
+ // If the field is empty, display the label and add a class that indicates it's holding the label.
14
+ if ($(this).val() == "") {
15
+ var labelText = $(this).prev("label, textarea").text().trim();
16
+ $(this).val(labelText).addClass("has-inline-label");
17
+ }
18
+
19
+ // If we click in the field, remove the label.
20
+ $(this).focus(function() {
21
+ if ($(this).hasClass("has-inline-label")) {
22
+ $(this).removeClass("has-inline-label");
23
+ $(this).val("");
24
+ }
25
+ });
26
+
27
+ // Not doing anything here yet...
28
+ $(this).keypress(function() {
29
+ });
30
+
31
+ // If we click out of the field and we haven't entered anything, redisplay the label and add back the label-indicator class.
32
+ $(this).blur(function() {
33
+ if ($(this).val() == "") {
34
+ var labelText = $(this).prev("label").text().trim();
35
+ $(this).val(labelText).addClass("has-inline-label");
36
+ }
37
+ });
38
+
39
+ });
40
+
41
+ // When submitting, remove the values from fields holding a label, so that we don't mistakenly think those are real inputs.
42
+ $(self).submit(function() {
43
+ $("input, textarea", self).each(function() {
44
+ if ($(this).hasClass("has-inline-label")) {
45
+ $(this).val("");
46
+ }
47
+ });
48
+ });
49
+
50
+ return self;
51
+ };
52
+
53
+ })(jQuery);
@@ -0,0 +1,23 @@
1
+ #container {
2
+ margin: 10px 20px;
3
+ }
4
+
5
+ table {
6
+ text-align: left;
7
+ }
8
+
9
+ table th.language, table td.language {
10
+ width: 150px;
11
+ }
12
+
13
+ .english {
14
+ background-color: #99FF99;
15
+ }
16
+
17
+ .other {
18
+ background-color: #FF9999;
19
+ }
20
+
21
+ form {
22
+ margin-bottom: 10px;
23
+ }
@@ -0,0 +1,36 @@
1
+ %p An unsupervised language identification algorithm. Trained on tweets with lang = "en" according to the Twitter API (which, in practice, returns tweets in Spanish, Portuguese, Dutch, Russian, and a couple other languages as well). More information <a href="http://blog.echen.me/2011/05/01/unsupervised-language-detection-algorithms/">here</a>.
2
+
3
+ %form{ :action => "/", :method => "post" }
4
+ %label{ :for => "sentence" } Sentence
5
+ %input{ :id => "sentence", :name => "sentence" }
6
+ %button{ :type => "submit" } Detect Language
7
+
8
+ - if @sentence and !@sentence.empty?
9
+ %p
10
+ %strong= @sentence
11
+ is
12
+ %span{ :class => "#{@language == "English" ? "english" : "other"}"}= @language
13
+
14
+ %table
15
+ %tr#header
16
+ %th.language Language
17
+ %th.tweet Tweet
18
+
19
+ :javascript
20
+ function addTweet() {
21
+ $.ajax({
22
+ method: 'GET',
23
+ url: '/tweet',
24
+ cache: false,
25
+ success: function(data) {
26
+ $("table tr#header:first").after(data).slideDown('slow');
27
+ setTimeout(addTweet, 500);
28
+ }
29
+ });
30
+ $('table tr.tweet:gt(20)').remove();
31
+ }
32
+
33
+ $(function() {
34
+ $("form").inlineFormLabels();
35
+ setTimeout(addTweet, 500);
36
+ });
@@ -0,0 +1,14 @@
1
+ !!! 5
2
+ %html
3
+ %head
4
+ %meta{ "http-equiv" => "Content-Type", :content => "text/html", :charset => "UTF-8" }
5
+
6
+ %title Babel Fett
7
+ %script{ :type => "text/javascript", :src => "http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" }
8
+ %script{ :type => "text/javascript", :src => "/jquery.inlineformlabels.js" }
9
+ %link{ :href => "/main.css", :rel => "stylesheet", :type => "text/css" }
10
+
11
+ %body
12
+ #container
13
+ %h1 Unsupervised Language Detection on Twitter
14
+ = yield
@@ -0,0 +1,3 @@
1
+ %tr{:class => "tweet #{@language == "English" ? "english" : "other"}"}
2
+ %td.language= @language
3
+ %td.text= @tweet
metadata ADDED
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unsupervised-language-detection
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Edwin Chen
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-05-14 00:00:00 -07:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: Perform unsupervised language detection, specifically for the purpose of finding English-language tweets.
23
+ email:
24
+ - hello@echen.me
25
+ executables: []
26
+
27
+ extensions: []
28
+
29
+ extra_rdoc_files: []
30
+
31
+ files:
32
+ - Gemfile
33
+ - README.md
34
+ - Rakefile
35
+ - datasets/gutenberg-test-du.txt
36
+ - datasets/gutenberg-test-en.txt
37
+ - datasets/gutenberg-test-sp.txt
38
+ - datasets/gutenberg-training-du.txt
39
+ - datasets/gutenberg-training-en.txt
40
+ - datasets/gutenberg-training-sp.txt
41
+ - datasets/gutenberg-training.txt
42
+ - datasets/gutenberg-training_en_du.txt
43
+ - datasets/smiley_tweets_tiny.txt
44
+ - datasets/tweets_5000.txt
45
+ - language-detector-demo.rb
46
+ - lib/unsupervised-language-detection.rb
47
+ - lib/unsupervised-language-detection/english-tweet-detector.yaml
48
+ - lib/unsupervised-language-detection/language-detector.rb
49
+ - lib/unsupervised-language-detection/naive-bayes-classifier.rb
50
+ - lib/unsupervised-language-detection/train-english-tweet-detector.rb
51
+ - lib/unsupervised-language-detection/version.rb
52
+ - test/test_language_detector.rb
53
+ - test/test_naive_bayes_classifier.rb
54
+ - test/test_naive_bayes_em.rb
55
+ - test/test_suite.rb
56
+ - unsupervised-language-detection.gemspec
57
+ - website/Gemfile
58
+ - website/README.md
59
+ - website/config.ru
60
+ - website/detector.yaml
61
+ - website/detector2.yaml
62
+ - website/main.rb
63
+ - website/public/jquery.inlineformlabels.js
64
+ - website/public/main.css
65
+ - website/views/index.haml
66
+ - website/views/layout.haml
67
+ - website/views/tweet.haml
68
+ has_rdoc: true
69
+ homepage: http://blog.echen.me/2011/05/01/unsupervised-language-detection-algorithms/
70
+ licenses: []
71
+
72
+ post_install_message:
73
+ rdoc_options: []
74
+
75
+ require_paths:
76
+ - lib
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ hash: 3
83
+ segments:
84
+ - 0
85
+ version: "0"
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ none: false
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ hash: 3
92
+ segments:
93
+ - 0
94
+ version: "0"
95
+ requirements: []
96
+
97
+ rubyforge_project: unsupervised-language-detection
98
+ rubygems_version: 1.4.1
99
+ signing_key:
100
+ specification_version: 3
101
+ summary: Perform unsupervised language detection.
102
+ test_files:
103
+ - test/test_language_detector.rb
104
+ - test/test_naive_bayes_classifier.rb
105
+ - test/test_naive_bayes_em.rb
106
+ - test/test_suite.rb