unsupervised-language-detection 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -0
- data/README.md +28 -0
- data/Rakefile +2 -0
- data/datasets/gutenberg-test-du.txt +1224 -0
- data/datasets/gutenberg-test-en.txt +1130 -0
- data/datasets/gutenberg-test-sp.txt +1031 -0
- data/datasets/gutenberg-training-du.txt +1140 -0
- data/datasets/gutenberg-training-en.txt +2823 -0
- data/datasets/gutenberg-training-sp.txt +971 -0
- data/datasets/gutenberg-training.txt +3237 -0
- data/datasets/gutenberg-training_en_du.txt +3301 -0
- data/datasets/smiley_tweets_tiny.txt +1000 -0
- data/datasets/tweets_5000.txt +5000 -0
- data/language-detector-demo.rb +39 -0
- data/lib/unsupervised-language-detection.rb +8 -0
- data/lib/unsupervised-language-detection/english-tweet-detector.yaml +1658 -0
- data/lib/unsupervised-language-detection/language-detector.rb +68 -0
- data/lib/unsupervised-language-detection/naive-bayes-classifier.rb +102 -0
- data/lib/unsupervised-language-detection/train-english-tweet-detector.rb +11 -0
- data/lib/unsupervised-language-detection/version.rb +3 -0
- data/test/test_language_detector.rb +19 -0
- data/test/test_naive_bayes_classifier.rb +60 -0
- data/test/test_naive_bayes_em.rb +23 -0
- data/test/test_suite.rb +4 -0
- data/unsupervised-language-detection.gemspec +21 -0
- data/website/Gemfile +12 -0
- data/website/README.md +1 -0
- data/website/config.ru +2 -0
- data/website/detector.yaml +1658 -0
- data/website/detector2.yaml +1658 -0
- data/website/main.rb +46 -0
- data/website/public/jquery.inlineformlabels.js +53 -0
- data/website/public/main.css +23 -0
- data/website/views/index.haml +36 -0
- data/website/views/layout.haml +14 -0
- data/website/views/tweet.haml +3 -0
- metadata +106 -0
data/website/main.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'sinatra'
|
2
|
+
require 'uri'
|
3
|
+
require 'mongo'
|
4
|
+
require 'date'
|
5
|
+
require 'time'
|
6
|
+
require File.expand_path('../../src/language-detector', __FILE__)
|
7
|
+
|
8
|
+
# Install Rack's method-override middleware for this application.
use Rack::MethodOverride

# Tweets come from a MongoDB collection.
# ENV.fetch raises a KeyError naming the variable when MONGOHQ_URL is unset,
# instead of failing later with an opaque URI/driver error.
mongo_url = ENV.fetch('MONGOHQ_URL')
uri = URI.parse(mongo_url)
conn = Mongo::Connection.from_uri(mongo_url)
# The database name is the URI path with its single leading slash removed.
db = conn.db(uri.path.sub(%r{\A/}, ''))
coll = db["tweets"]

# Classifier shared by every request handler below.
DETECTOR = LanguageDetector.load_yaml("detector2.yaml")
|
17
|
+
|
18
|
+
helpers do
  # Renders the Haml template +page+ without a layout, handing +locals+
  # to the template as its local variables.
  def partial(page, locals = {})
    render_options = { :layout => false, :locals => locals }
    haml(page, render_options)
  end
end
|
23
|
+
|
24
|
+
# NOTE(review): `layout` is called with a name but no block here - confirm this
# has the intended effect with the Sinatra version in use.
layout 'layout'
|
25
|
+
|
26
|
+
# Landing page: renders the index view.
get('/') { haml :index }
|
29
|
+
|
30
|
+
# Classifies the submitted sentence (if any) and re-renders the index view.
# @sentence stays nil when no "sentence" parameter was sent.
post '/' do
  @sentence = params[:sentence]

  if @sentence
    verdict = DETECTOR.classify(@sentence)
    @language = verdict == "majority" ? "English" : "Not English"
  end

  haml :index
end
|
39
|
+
|
40
|
+
# Picks one stored tweet at random, labels it "English" or "Not English",
# and renders the tweet view without the layout.
# Responds with 404 when no tweet can be fetched.
get '/tweet' do
  total = coll.count
  # rand(0) returns a Float rather than an index, so an empty collection
  # must be handled before computing the random offset.
  halt 404, 'No tweets available' if total.zero?

  doc = coll.find.limit(-1).skip(rand(total)).first
  # The collection may have shrunk between the count and the query.
  halt 404, 'No tweets available' unless doc

  @tweet = doc['text'].to_s
  @language = DETECTOR.classify(@tweet) == "majority" ? "English" : "Not English"

  # Override the classifier when more than one whitespace-separated token
  # contains a non-ASCII character. Not strictly necessary, but some tweets
  # consist solely of non-Roman characters, in which case the classifier
  # fails (since it currently removes all non-ASCII characters).
  # The ASCII range ends at \x7F; \x80 is already outside it.
  non_ascii_tokens = @tweet.split.count { |token| token =~ /[^\x00-\x7F]/ }
  @language = "Not English" if non_ascii_tokens > 1

  haml :tweet, :layout => false
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
(function($) {

  /**
   * jQuery plugin: shows each label's text inside the input/textarea that
   * immediately follows it, acting as an inline placeholder.
   *
   * - Labels inside the matched element(s) are hidden.
   * - An empty field is filled with its label text and tagged with the
   *   "has-inline-label" class; focusing the field clears it, and leaving it
   *   empty restores the label text.
   * - On submit, fields still holding a label are blanked so the label text
   *   is not sent as real input.
   *
   * Returns the jQuery set it was called on, so calls can be chained.
   */
  $.fn.inlineFormLabels = function() {

    var self = this;
    var MARKER = "has-inline-label";

    // Text of the label directly preceding the field. $.trim is used instead
    // of String.prototype.trim, which older browsers do not provide.
    function labelTextFor(field) {
      return $.trim($(field).prev("label").text());
    }

    // If the field is empty, display the label in it and add the class that
    // indicates it is holding the label.
    function showLabelIfEmpty(field) {
      if ($(field).val() == "") {
        $(field).val(labelTextFor(field)).addClass(MARKER);
      }
    }

    // Hide all the labels, because we're going to put them in the input field instead.
    $("label", self).hide();

    // Grab all input fields (inputs and textareas) preceded by a label sibling...
    $("label + input, label + textarea", self).each(function() {

      showLabelIfEmpty(this);

      // If we click in the field, remove the label.
      $(this).focus(function() {
        if ($(this).hasClass(MARKER)) {
          $(this).removeClass(MARKER).val("");
        }
      });

      // If we click out of the field and we haven't entered anything,
      // redisplay the label and add back the label-indicator class.
      $(this).blur(function() {
        showLabelIfEmpty(this);
      });

    });

    // When submitting, remove the values from fields holding a label, so that
    // we don't mistakenly think those are real inputs.
    $(self).submit(function() {
      $("input, textarea", self).each(function() {
        if ($(this).hasClass(MARKER)) {
          $(this).val("");
        }
      });
    });

    return self;
  };

})(jQuery);
|
@@ -0,0 +1,23 @@
|
|
1
|
+
/* Page wrapper. */
#container { margin: 10px 20px; }

/* Results table. */
table { text-align: left; }

/* Fixed-width column holding the detected language. */
table th.language,
table td.language { width: 150px; }

/* Highlight colours for the classifier verdict. */
.english { background-color: #99FF99; }
.other { background-color: #FF9999; }

form { margin-bottom: 10px; }
|
@@ -0,0 +1,36 @@
|
|
1
|
+
%p An unsupervised language identification algorithm. Trained on tweets with lang = "en" according to the Twitter API (which, in practice, returns tweets in Spanish, Portuguese, Dutch, Russian, and a couple other languages as well). More information <a href="http://blog.echen.me/2011/05/01/unsupervised-language-detection-algorithms/">here</a>.

-# Sentence form; the label is shown inside the input by inlineFormLabels().
%form{ :action => "/", :method => "post" }
  %label{ :for => "sentence" } Sentence
  %input{ :id => "sentence", :name => "sentence" }
  %button{ :type => "submit" } Detect Language

-# Verdict for the submitted sentence, shown only when one was entered.
- if @sentence && !@sentence.empty?
  %p
    %strong= @sentence
    is
    %span{ :class => (@language == "English" ? "english" : "other") }= @language

-# Live feed: rows fetched from /tweet are inserted below the header row.
%table
  %tr#header
    %th.language Language
    %th.tweet Tweet

:javascript
  // Fetches one classified tweet, inserts it at the top of the table, and
  // schedules the next fetch.
  function addTweet() {
    $.ajax({
      // jQuery 1.4.x names this option `type`; `method` is not recognised there.
      type: 'GET',
      url: '/tweet',
      cache: false,
      success: function(data) {
        $("table tr#header:first").after(data).slideDown('slow');
        setTimeout(addTweet, 500);
      },
      // Keep polling after a failed request, at a slower rate, instead of
      // stopping the feed permanently.
      error: function() {
        setTimeout(addTweet, 5000);
      }
    });
    // Drop tweet rows beyond the most recent ones.
    $('table tr.tweet:gt(20)').remove();
  }

  $(function() {
    $("form").inlineFormLabels();
    setTimeout(addTweet, 500);
  });
|
@@ -0,0 +1,14 @@
|
|
1
|
+
!!! 5
%html
  %head
    -# The charset belongs inside the Content-Type value, not in a separate attribute.
    %meta{ "http-equiv" => "Content-Type", :content => "text/html; charset=UTF-8" }

    %title Babel Fett
    -# Loaded over HTTPS so the script is not blocked as mixed content on secure pages.
    %script{ :type => "text/javascript", :src => "https://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" }
    %script{ :type => "text/javascript", :src => "/jquery.inlineformlabels.js" }
    %link{ :href => "/main.css", :rel => "stylesheet", :type => "text/css" }

  %body
    #container
      %h1 Unsupervised Language Detection on Twitter
      = yield
|
metadata
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: unsupervised-language-detection
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Edwin Chen
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-05-14 00:00:00 -07:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: Perform unsupervised language detection, specifically for the purpose of finding English-language tweets.
|
23
|
+
email:
|
24
|
+
- hello@echen.me
|
25
|
+
executables: []
|
26
|
+
|
27
|
+
extensions: []
|
28
|
+
|
29
|
+
extra_rdoc_files: []
|
30
|
+
|
31
|
+
files:
|
32
|
+
- Gemfile
|
33
|
+
- README.md
|
34
|
+
- Rakefile
|
35
|
+
- datasets/gutenberg-test-du.txt
|
36
|
+
- datasets/gutenberg-test-en.txt
|
37
|
+
- datasets/gutenberg-test-sp.txt
|
38
|
+
- datasets/gutenberg-training-du.txt
|
39
|
+
- datasets/gutenberg-training-en.txt
|
40
|
+
- datasets/gutenberg-training-sp.txt
|
41
|
+
- datasets/gutenberg-training.txt
|
42
|
+
- datasets/gutenberg-training_en_du.txt
|
43
|
+
- datasets/smiley_tweets_tiny.txt
|
44
|
+
- datasets/tweets_5000.txt
|
45
|
+
- language-detector-demo.rb
|
46
|
+
- lib/unsupervised-language-detection.rb
|
47
|
+
- lib/unsupervised-language-detection/english-tweet-detector.yaml
|
48
|
+
- lib/unsupervised-language-detection/language-detector.rb
|
49
|
+
- lib/unsupervised-language-detection/naive-bayes-classifier.rb
|
50
|
+
- lib/unsupervised-language-detection/train-english-tweet-detector.rb
|
51
|
+
- lib/unsupervised-language-detection/version.rb
|
52
|
+
- test/test_language_detector.rb
|
53
|
+
- test/test_naive_bayes_classifier.rb
|
54
|
+
- test/test_naive_bayes_em.rb
|
55
|
+
- test/test_suite.rb
|
56
|
+
- unsupervised-language-detection.gemspec
|
57
|
+
- website/Gemfile
|
58
|
+
- website/README.md
|
59
|
+
- website/config.ru
|
60
|
+
- website/detector.yaml
|
61
|
+
- website/detector2.yaml
|
62
|
+
- website/main.rb
|
63
|
+
- website/public/jquery.inlineformlabels.js
|
64
|
+
- website/public/main.css
|
65
|
+
- website/views/index.haml
|
66
|
+
- website/views/layout.haml
|
67
|
+
- website/views/tweet.haml
|
68
|
+
has_rdoc: true
|
69
|
+
homepage: http://blog.echen.me/2011/05/01/unsupervised-language-detection-algorithms/
|
70
|
+
licenses: []
|
71
|
+
|
72
|
+
post_install_message:
|
73
|
+
rdoc_options: []
|
74
|
+
|
75
|
+
require_paths:
|
76
|
+
- lib
|
77
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
hash: 3
|
83
|
+
segments:
|
84
|
+
- 0
|
85
|
+
version: "0"
|
86
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
|
+
none: false
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
hash: 3
|
92
|
+
segments:
|
93
|
+
- 0
|
94
|
+
version: "0"
|
95
|
+
requirements: []
|
96
|
+
|
97
|
+
rubyforge_project: unsupervised-language-detection
|
98
|
+
rubygems_version: 1.4.1
|
99
|
+
signing_key:
|
100
|
+
specification_version: 3
|
101
|
+
summary: Perform unsupervised language detection.
|
102
|
+
test_files:
|
103
|
+
- test/test_language_detector.rb
|
104
|
+
- test/test_naive_bayes_classifier.rb
|
105
|
+
- test/test_naive_bayes_em.rb
|
106
|
+
- test/test_suite.rb
|