unsupervised-language-detection 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/website/main.rb DELETED
@@ -1,46 +0,0 @@
1
- require 'sinatra'
2
- require 'uri'
3
- require 'mongo'
4
- require 'date'
5
- require 'time'
6
- require File.expand_path('../../src/language-detector', __FILE__)
7
-
8
- use Rack::MethodOverride
9
-
10
- # Tweets come from a MongoDB collection.
11
- uri = URI.parse(ENV['MONGOHQ_URL'])
12
- conn = Mongo::Connection.from_uri(ENV['MONGOHQ_URL'])
13
- db = conn.db(uri.path.gsub(/^\//, ''))
14
- coll = db["tweets"]
15
-
16
- DETECTOR = LanguageDetector.load_yaml("detector2.yaml")
17
-
18
- helpers do
19
- def partial(page, locals = {})
20
- haml page, :layout => false, :locals => locals
21
- end
22
- end
23
-
24
- layout 'layout'
25
-
26
- get '/' do
27
- haml :index
28
- end
29
-
30
- post '/' do
31
- @sentence = nil
32
- if params[:sentence]
33
- @sentence = params[:sentence]
34
- @language = DETECTOR.classify(@sentence) == "majority" ? "English" : "Not English"
35
- end
36
-
37
- haml :index
38
- end
39
-
40
- get '/tweet' do
41
- @tweet = coll.find().limit(-1).skip(rand(coll.count())).first()['text']
42
- @language = DETECTOR.classify(@tweet) == "majority" ? "English" : "Not English"
43
- @language = "Not English" if @tweet.split.select{ |c| c =~ /[^\x00-\x80]/ }.size > 1 # Use this if you want to check for non-Roman characters. Not necessary, but sometimes there are tweets consisting solely of non-Roman characters, in which case the classifier fails (since it currently removes all non-ASCII characters).
44
-
45
- haml :tweet, :layout => false
46
- end
@@ -1,53 +0,0 @@
1
- (function($) {
2
-
3
- $.fn.inlineFormLabels = function() {
4
-
5
- var self = this;
6
-
7
- // Hide all the labels, because we're going to put them in the input field instead.
8
- $("label", self).hide();
9
-
10
- // Grab all input fields (inputs and textareas) preceded by a label sibling...
11
- $("label + input, label + textarea", self).each(function(type) {
12
-
13
- // If the field is empty, display the label and add a class that indicates it's holding the label.
14
- if ($(this).val() == "") {
15
- var labelText = $(this).prev("label, textarea").text().trim();
16
- $(this).val(labelText).addClass("has-inline-label");
17
- }
18
-
19
- // If we click in the field, remove the label.
20
- $(this).focus(function() {
21
- if ($(this).hasClass("has-inline-label")) {
22
- $(this).removeClass("has-inline-label");
23
- $(this).val("");
24
- }
25
- });
26
-
27
- // Not doing anything here yet...
28
- $(this).keypress(function() {
29
- });
30
-
31
- // If we click out of the field and we haven't entered anything, redisplay the label and add back the label-indicator class.
32
- $(this).blur(function() {
33
- if ($(this).val() == "") {
34
- var labelText = $(this).prev("label").text().trim();
35
- $(this).val(labelText).addClass("has-inline-label");
36
- }
37
- });
38
-
39
- });
40
-
41
- // When submitting, remove the values from fields holding a label, so that we don't mistakenly think those are real inputs.
42
- $(self).submit(function() {
43
- $("input, textarea", self).each(function() {
44
- if ($(this).hasClass("has-inline-label")) {
45
- $(this).val("");
46
- }
47
- });
48
- });
49
-
50
- return self;
51
- };
52
-
53
- })(jQuery);
@@ -1,23 +0,0 @@
1
- #container {
2
- margin: 10px 20px;
3
- }
4
-
5
- table {
6
- text-align: left;
7
- }
8
-
9
- table th.language, table td.language {
10
- width: 150px;
11
- }
12
-
13
- .english {
14
- background-color: #99FF99;
15
- }
16
-
17
- .other {
18
- background-color: #FF9999;
19
- }
20
-
21
- form {
22
- margin-bottom: 10px;
23
- }
@@ -1,36 +0,0 @@
1
- %p An unsupervised language identification algorithm. Trained on tweets with lang = "en" according to the Twitter API (which, in practice, returns tweets in Spanish, Portuguese, Dutch, Russian, and a couple other languages as well). More information <a href="http://blog.echen.me/2011/05/01/unsupervised-language-detection-algorithms/">here</a>.
2
-
3
- %form{ :action => "/", :method => "post" }
4
- %label{ :for => "sentence" } Sentence
5
- %input{ :id => "sentence", :name => "sentence" }
6
- %button{ :type => "submit" } Detect Language
7
-
8
- - if @sentence and !@sentence.empty?
9
- %p
10
- %strong= @sentence
11
- is
12
- %span{ :class => "#{@language == "English" ? "english" : "other"}"}= @language
13
-
14
- %table
15
- %tr#header
16
- %th.language Language
17
- %th.tweet Tweet
18
-
19
- :javascript
20
- function addTweet() {
21
- $.ajax({
22
- method: 'GET',
23
- url: '/tweet',
24
- cache: false,
25
- success: function(data) {
26
- $("table tr#header:first").after(data).slideDown('slow');
27
- setTimeout(addTweet, 500);
28
- }
29
- });
30
- $('table tr.tweet:gt(20)').remove();
31
- }
32
-
33
- $(function() {
34
- $("form").inlineFormLabels();
35
- setTimeout(addTweet, 500);
36
- });
@@ -1,19 +0,0 @@
1
- !!! 5
2
- %html
3
- %head
4
- %meta{ "http-equiv" => "Content-Type", :content => "text/html", :charset => "UTF-8" }
5
-
6
- %title Babel Fett
7
- %script{ :type => "text/javascript", :src => "http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" }
8
- %script{ :type => "text/javascript", :src => "/jquery.inlineformlabels.js" }
9
- %link{ :href => "/main.css", :rel => "stylesheet", :type => "text/css" }
10
-
11
- %body
12
- #container
13
- %h1 Unsupervised Language Detection on Twitter
14
- = yield
15
-
16
- %footer
17
- %p
18
- %strong How does this work?
19
- Learn more <a href = "http://blog.echen.me/2011/05/05/twss-building-a-thats-what-she-said-classifier/">here</a>. By <a href="http://echen.me">Edwin Chen</a>.
@@ -1,3 +0,0 @@
1
- %tr{:class => "tweet #{@language == "English" ? "english" : "other"}"}
2
- %td.language= @language
3
- %td.text= @tweet