unsupervised-language-detection 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/unsupervised-language-detection.rb +4 -2
- data/lib/unsupervised-language-detection/language-detector.rb +15 -7
- data/lib/unsupervised-language-detection/naive-bayes-classifier.rb +40 -14
- data/lib/unsupervised-language-detection/train-english-tweet-detector.rb +5 -5
- data/lib/unsupervised-language-detection/version.rb +1 -1
- data/test/test_language_detector.rb +1 -1
- data/test/test_naive_bayes_classifier.rb +1 -1
- data/test/test_naive_bayes_em.rb +1 -1
- data/test/test_suite.rb +2 -1
- data/test/test_tweet_language_detection.rb +49 -0
- metadata +5 -14
- data/website/Gemfile +0 -12
- data/website/README.md +0 -1
- data/website/config.ru +0 -2
- data/website/detector.yaml +0 -1658
- data/website/detector2.yaml +0 -1658
- data/website/main.rb +0 -46
- data/website/public/jquery.inlineformlabels.js +0 -53
- data/website/public/main.css +0 -23
- data/website/views/index.haml +0 -36
- data/website/views/layout.haml +0 -19
- data/website/views/tweet.haml +0 -3
data/website/main.rb
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
require 'sinatra'
|
2
|
-
require 'uri'
|
3
|
-
require 'mongo'
|
4
|
-
require 'date'
|
5
|
-
require 'time'
|
6
|
-
require File.expand_path('../../src/language-detector', __FILE__)
|
7
|
-
|
8
|
-
use Rack::MethodOverride
|
9
|
-
|
10
|
-
# Tweets come from a MongoDB collection.
|
11
|
-
uri = URI.parse(ENV['MONGOHQ_URL'])
|
12
|
-
conn = Mongo::Connection.from_uri(ENV['MONGOHQ_URL'])
|
13
|
-
db = conn.db(uri.path.gsub(/^\//, ''))
|
14
|
-
coll = db["tweets"]
|
15
|
-
|
16
|
-
DETECTOR = LanguageDetector.load_yaml("detector2.yaml")
|
17
|
-
|
18
|
-
helpers do
|
19
|
-
def partial(page, locals = {})
|
20
|
-
haml page, :layout => false, :locals => locals
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
layout 'layout'
|
25
|
-
|
26
|
-
get '/' do
|
27
|
-
haml :index
|
28
|
-
end
|
29
|
-
|
30
|
-
post '/' do
|
31
|
-
@sentence = nil
|
32
|
-
if params[:sentence]
|
33
|
-
@sentence = params[:sentence]
|
34
|
-
@language = DETECTOR.classify(@sentence) == "majority" ? "English" : "Not English"
|
35
|
-
end
|
36
|
-
|
37
|
-
haml :index
|
38
|
-
end
|
39
|
-
|
40
|
-
get '/tweet' do
|
41
|
-
@tweet = coll.find().limit(-1).skip(rand(coll.count())).first()['text']
|
42
|
-
@language = DETECTOR.classify(@tweet) == "majority" ? "English" : "Not English"
|
43
|
-
@language = "Not English" if @tweet.split.select{ |c| c =~ /[^\x00-\x80]/ }.size > 1 # Use this if you want to check for non-Roman characters. Not necessary, but sometimes there are tweets consisting solely of non-Roman characters, in which case the classifier fails (since it currently removes all non-ASCII characters).
|
44
|
-
|
45
|
-
haml :tweet, :layout => false
|
46
|
-
end
|
@@ -1,53 +0,0 @@
|
|
1
|
-
(function($) {
|
2
|
-
|
3
|
-
$.fn.inlineFormLabels = function() {
|
4
|
-
|
5
|
-
var self = this;
|
6
|
-
|
7
|
-
// Hide all the labels, because we're going to put them in the input field instead.
|
8
|
-
$("label", self).hide();
|
9
|
-
|
10
|
-
// Grab all input fields (inputs and textareas) preceded by a label sibling...
|
11
|
-
$("label + input, label + textarea", self).each(function(type) {
|
12
|
-
|
13
|
-
// If the field is empty, display the label and add a class that indicates it's holding the label.
|
14
|
-
if ($(this).val() == "") {
|
15
|
-
var labelText = $(this).prev("label, textarea").text().trim();
|
16
|
-
$(this).val(labelText).addClass("has-inline-label");
|
17
|
-
}
|
18
|
-
|
19
|
-
// If we click in the field, remove the label.
|
20
|
-
$(this).focus(function() {
|
21
|
-
if ($(this).hasClass("has-inline-label")) {
|
22
|
-
$(this).removeClass("has-inline-label");
|
23
|
-
$(this).val("");
|
24
|
-
}
|
25
|
-
});
|
26
|
-
|
27
|
-
// Not doing anything here yet...
|
28
|
-
$(this).keypress(function() {
|
29
|
-
});
|
30
|
-
|
31
|
-
// If we click out of the field and we haven't entered anything, redisplay the label and add back the label-indicator class.
|
32
|
-
$(this).blur(function() {
|
33
|
-
if ($(this).val() == "") {
|
34
|
-
var labelText = $(this).prev("label").text().trim();
|
35
|
-
$(this).val(labelText).addClass("has-inline-label");
|
36
|
-
}
|
37
|
-
});
|
38
|
-
|
39
|
-
});
|
40
|
-
|
41
|
-
// When submitting, remove the values from fields holding a label, so that we don't mistakenly think those are real inputs.
|
42
|
-
$(self).submit(function() {
|
43
|
-
$("input, textarea", self).each(function() {
|
44
|
-
if ($(this).hasClass("has-inline-label")) {
|
45
|
-
$(this).val("");
|
46
|
-
}
|
47
|
-
});
|
48
|
-
});
|
49
|
-
|
50
|
-
return self;
|
51
|
-
};
|
52
|
-
|
53
|
-
})(jQuery);
|
data/website/public/main.css
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
#container {
|
2
|
-
margin: 10px 20px;
|
3
|
-
}
|
4
|
-
|
5
|
-
table {
|
6
|
-
text-align: left;
|
7
|
-
}
|
8
|
-
|
9
|
-
table th.language, table td.language {
|
10
|
-
width: 150px;
|
11
|
-
}
|
12
|
-
|
13
|
-
.english {
|
14
|
-
background-color: #99FF99;
|
15
|
-
}
|
16
|
-
|
17
|
-
.other {
|
18
|
-
background-color: #FF9999;
|
19
|
-
}
|
20
|
-
|
21
|
-
form {
|
22
|
-
margin-bottom: 10px;
|
23
|
-
}
|
data/website/views/index.haml
DELETED
@@ -1,36 +0,0 @@
|
|
1
|
-
%p An unsupervised language identification algorithm. Trained on tweets with lang = "en" according to the Twitter API (which, in practice, returns tweets in Spanish, Portuguese, Dutch, Russian, and a couple other languages as well). More information <a href="http://blog.echen.me/2011/05/01/unsupervised-language-detection-algorithms/">here</a>.
|
2
|
-
|
3
|
-
%form{ :action => "/", :method => "post" }
|
4
|
-
%label{ :for => "sentence" } Sentence
|
5
|
-
%input{ :id => "sentence", :name => "sentence" }
|
6
|
-
%button{ :type => "submit" } Detect Language
|
7
|
-
|
8
|
-
- if @sentence and !@sentence.empty?
|
9
|
-
%p
|
10
|
-
%strong= @sentence
|
11
|
-
is
|
12
|
-
%span{ :class => "#{@language == "English" ? "english" : "other"}"}= @language
|
13
|
-
|
14
|
-
%table
|
15
|
-
%tr#header
|
16
|
-
%th.language Language
|
17
|
-
%th.tweet Tweet
|
18
|
-
|
19
|
-
:javascript
|
20
|
-
function addTweet() {
|
21
|
-
$.ajax({
|
22
|
-
method: 'GET',
|
23
|
-
url: '/tweet',
|
24
|
-
cache: false,
|
25
|
-
success: function(data) {
|
26
|
-
$("table tr#header:first").after(data).slideDown('slow');
|
27
|
-
setTimeout(addTweet, 500);
|
28
|
-
}
|
29
|
-
});
|
30
|
-
$('table tr.tweet:gt(20)').remove();
|
31
|
-
}
|
32
|
-
|
33
|
-
$(function() {
|
34
|
-
$("form").inlineFormLabels();
|
35
|
-
setTimeout(addTweet, 500);
|
36
|
-
});
|
data/website/views/layout.haml
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
!!! 5
|
2
|
-
%html
|
3
|
-
%head
|
4
|
-
%meta{ "http-equiv" => "Content-Type", :content => "text/html", :charset => "UTF-8" }
|
5
|
-
|
6
|
-
%title Babel Fett
|
7
|
-
%script{ :type => "text/javascript", :src => "http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" }
|
8
|
-
%script{ :type => "text/javascript", :src => "/jquery.inlineformlabels.js" }
|
9
|
-
%link{ :href => "/main.css", :rel => "stylesheet", :type => "text/css" }
|
10
|
-
|
11
|
-
%body
|
12
|
-
#container
|
13
|
-
%h1 Unsupervised Language Detection on Twitter
|
14
|
-
= yield
|
15
|
-
|
16
|
-
%footer
|
17
|
-
%p
|
18
|
-
%strong How does this work?
|
19
|
-
Learn more <a href = "http://blog.echen.me/2011/05/05/twss-building-a-thats-what-she-said-classifier/">here</a>. By <a href="http://echen.me">Edwin Chen</a>.
|
data/website/views/tweet.haml
DELETED