unsupervised-language-detection 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/unsupervised-language-detection.rb +4 -2
- data/lib/unsupervised-language-detection/language-detector.rb +15 -7
- data/lib/unsupervised-language-detection/naive-bayes-classifier.rb +40 -14
- data/lib/unsupervised-language-detection/train-english-tweet-detector.rb +5 -5
- data/lib/unsupervised-language-detection/version.rb +1 -1
- data/test/test_language_detector.rb +1 -1
- data/test/test_naive_bayes_classifier.rb +1 -1
- data/test/test_naive_bayes_em.rb +1 -1
- data/test/test_suite.rb +2 -1
- data/test/test_tweet_language_detection.rb +49 -0
- metadata +5 -14
- data/website/Gemfile +0 -12
- data/website/README.md +0 -1
- data/website/config.ru +0 -2
- data/website/detector.yaml +0 -1658
- data/website/detector2.yaml +0 -1658
- data/website/main.rb +0 -46
- data/website/public/jquery.inlineformlabels.js +0 -53
- data/website/public/main.css +0 -23
- data/website/views/index.haml +0 -36
- data/website/views/layout.haml +0 -19
- data/website/views/tweet.haml +0 -3
data/website/main.rb
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
require 'sinatra'
|
2
|
-
require 'uri'
|
3
|
-
require 'mongo'
|
4
|
-
require 'date'
|
5
|
-
require 'time'
|
6
|
-
require File.expand_path('../../src/language-detector', __FILE__)
|
7
|
-
|
8
|
-
use Rack::MethodOverride
|
9
|
-
|
10
|
-
# Tweets come from a MongoDB collection.
|
11
|
-
uri = URI.parse(ENV['MONGOHQ_URL'])
|
12
|
-
conn = Mongo::Connection.from_uri(ENV['MONGOHQ_URL'])
|
13
|
-
db = conn.db(uri.path.gsub(/^\//, ''))
|
14
|
-
coll = db["tweets"]
|
15
|
-
|
16
|
-
DETECTOR = LanguageDetector.load_yaml("detector2.yaml")
|
17
|
-
|
18
|
-
helpers do
|
19
|
-
def partial(page, locals = {})
|
20
|
-
haml page, :layout => false, :locals => locals
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
layout 'layout'
|
25
|
-
|
26
|
-
get '/' do
|
27
|
-
haml :index
|
28
|
-
end
|
29
|
-
|
30
|
-
post '/' do
|
31
|
-
@sentence = nil
|
32
|
-
if params[:sentence]
|
33
|
-
@sentence = params[:sentence]
|
34
|
-
@language = DETECTOR.classify(@sentence) == "majority" ? "English" : "Not English"
|
35
|
-
end
|
36
|
-
|
37
|
-
haml :index
|
38
|
-
end
|
39
|
-
|
40
|
-
get '/tweet' do
|
41
|
-
@tweet = coll.find().limit(-1).skip(rand(coll.count())).first()['text']
|
42
|
-
@language = DETECTOR.classify(@tweet) == "majority" ? "English" : "Not English"
|
43
|
-
@language = "Not English" if @tweet.split.select{ |c| c =~ /[^\x00-\x80]/ }.size > 1 # Use this if you want to check for non-Roman characters. Not necessary, but sometimes there are tweets consisting solely of non-Roman characters, in which case the classifier fails (since it currently removes all non-ASCII characters).
|
44
|
-
|
45
|
-
haml :tweet, :layout => false
|
46
|
-
end
|
data/website/public/jquery.inlineformlabels.js
DELETED
@@ -1,53 +0,0 @@
|
|
1
|
-
(function($) {
|
2
|
-
|
3
|
-
$.fn.inlineFormLabels = function() {
|
4
|
-
|
5
|
-
var self = this;
|
6
|
-
|
7
|
-
// Hide all the labels, because we're going to put them in the input field instead.
|
8
|
-
$("label", self).hide();
|
9
|
-
|
10
|
-
// Grab all input fields (inputs and textareas) preceded by a label sibling...
|
11
|
-
$("label + input, label + textarea", self).each(function(type) {
|
12
|
-
|
13
|
-
// If the field is empty, display the label and add a class that indicates its holding the label.
|
14
|
-
if ($(this).val() == "") {
|
15
|
-
var labelText = $(this).prev("label, textarea").text().trim();
|
16
|
-
$(this).val(labelText).addClass("has-inline-label");
|
17
|
-
}
|
18
|
-
|
19
|
-
// If we click in the field, remove the label.
|
20
|
-
$(this).focus(function() {
|
21
|
-
if ($(this).hasClass("has-inline-label")) {
|
22
|
-
$(this).removeClass("has-inline-label");
|
23
|
-
$(this).val("");
|
24
|
-
}
|
25
|
-
});
|
26
|
-
|
27
|
-
// Not doing anything here yet...
|
28
|
-
$(this).keypress(function() {
|
29
|
-
});
|
30
|
-
|
31
|
-
// If we click out of the field and we haven't entered anything, redisplay the label and add back the label-indicator class.
|
32
|
-
$(this).blur(function() {
|
33
|
-
if ($(this).val() == "") {
|
34
|
-
var labelText = $(this).prev("label").text().trim();
|
35
|
-
$(this).val(labelText).addClass("has-inline-label");
|
36
|
-
}
|
37
|
-
});
|
38
|
-
|
39
|
-
});
|
40
|
-
|
41
|
-
// When submitting, remove the values from fields holding a label, so that we don't mistakenly think those are real inputs.
|
42
|
-
$(self).submit(function() {
|
43
|
-
$("input, textarea", self).each(function() {
|
44
|
-
if ($(this).hasClass("has-inline-label")) {
|
45
|
-
$(this).val("");
|
46
|
-
}
|
47
|
-
});
|
48
|
-
});
|
49
|
-
|
50
|
-
return self;
|
51
|
-
};
|
52
|
-
|
53
|
-
})(jQuery);
|
data/website/public/main.css
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
#container {
|
2
|
-
margin: 10px 20px;
|
3
|
-
}
|
4
|
-
|
5
|
-
table {
|
6
|
-
text-align: left;
|
7
|
-
}
|
8
|
-
|
9
|
-
table th.language, table td.language {
|
10
|
-
width: 150px;
|
11
|
-
}
|
12
|
-
|
13
|
-
.english {
|
14
|
-
background-color: #99FF99;
|
15
|
-
}
|
16
|
-
|
17
|
-
.other {
|
18
|
-
background-color: #FF9999;
|
19
|
-
}
|
20
|
-
|
21
|
-
form {
|
22
|
-
margin-bottom: 10px;
|
23
|
-
}
|
data/website/views/index.haml
DELETED
@@ -1,36 +0,0 @@
|
|
1
|
-
%p An unsupervised language identification algorithm. Trained on tweets with lang = "en" according to the Twitter API (which, in practice, returns tweets in Spanish, Portuguese, Dutch, Russian, and a couple other languages as well). More information <a href="http://blog.echen.me/2011/05/01/unsupervised-language-detection-algorithms/">here</a>.
|
2
|
-
|
3
|
-
%form{ :action => "/", :method => "post" }
|
4
|
-
%label{ :for => "sentence" } Sentence
|
5
|
-
%input{ :id => "sentence", :name => "sentence" }
|
6
|
-
%button{ :type => "submit" } Detect Language
|
7
|
-
|
8
|
-
- if @sentence and !@sentence.empty?
|
9
|
-
%p
|
10
|
-
%strong= @sentence
|
11
|
-
is
|
12
|
-
%span{ :class => "#{@language == "English" ? "english" : "other"}"}= @language
|
13
|
-
|
14
|
-
%table
|
15
|
-
%tr#header
|
16
|
-
%th.language Language
|
17
|
-
%th.tweet Tweet
|
18
|
-
|
19
|
-
:javascript
|
20
|
-
function addTweet() {
|
21
|
-
$.ajax({
|
22
|
-
method: 'GET',
|
23
|
-
url: '/tweet',
|
24
|
-
cache: false,
|
25
|
-
success: function(data) {
|
26
|
-
$("table tr#header:first").after(data).slideDown('slow');
|
27
|
-
setTimeout(addTweet, 500);
|
28
|
-
}
|
29
|
-
});
|
30
|
-
$('table tr.tweet:gt(20)').remove();
|
31
|
-
}
|
32
|
-
|
33
|
-
$(function() {
|
34
|
-
$("form").inlineFormLabels();
|
35
|
-
setTimeout(addTweet, 500);
|
36
|
-
});
|
data/website/views/layout.haml
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
!!! 5
|
2
|
-
%html
|
3
|
-
%head
|
4
|
-
%meta{ "http-equiv" => "Content-Type", :content => "text/html", :charset => "UTF-8" }
|
5
|
-
|
6
|
-
%title Babel Fett
|
7
|
-
%script{ :type => "text/javascript", :src => "http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" }
|
8
|
-
%script{ :type => "text/javascript", :src => "/jquery.inlineformlabels.js" }
|
9
|
-
%link{ :href => "/main.css", :rel => "stylesheet", :type => "text/css" }
|
10
|
-
|
11
|
-
%body
|
12
|
-
#container
|
13
|
-
%h1 Unsupervised Language Detection on Twitter
|
14
|
-
= yield
|
15
|
-
|
16
|
-
%footer
|
17
|
-
%p
|
18
|
-
%strong How does this work?
|
19
|
-
Learn more at <a href = "http://blog.echen.me/2011/05/05/twss-building-a-thats-what-she-said-classifier/">here</a>. By <a href="http://echen.me">Edwin Chen</a>.
|
data/website/views/tweet.haml
DELETED