igrigorik-language_detector 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'test/unit'
3
+ require File.dirname(__FILE__) + '/../lib/language_detector'
4
+
5
+ class ProfileTest < Test::Unit::TestCase # Unit tests for Profile: punctuation check, tokenizing, n-gram counting, construction from text/file, and profile distance.
6
+ def test_is_punctuation # ',' and '.' must be reported as punctuation; the letters 'A' and 'a' must not.
7
+ p = Profile.new(:name => "test")
8
+ assert p.is_punctuation?(',')
9
+ assert p.is_punctuation?('.')
10
+ assert !p.is_punctuation?('A')
11
+ assert !p.is_punctuation?('a')
12
+ end
13
+
14
+ def test_tokenize # Expected tokens keep their case and omit the punctuation run, the whitespace and the digit run "123".
15
+ p = Profile.new(:name => "test")
16
+ assert_equal ["this", "is", "A", "test"], p.tokenize("this is ,+_ A \t 123 test")
17
+ end
18
+
19
+ def test_count_ngram # Checks count_ngram('words', n, {}) for n = 1..6; each call is given a fresh empty hash.
20
+ p = Profile.new(:name => "test")
21
+ assert_equal({"w"=>1, "o"=>1, "r"=>1, "d"=>1, "s"=>1}, p.count_ngram('words', 1, {})) # n = 1: single characters only, no "_" entries expected
22
+ assert_equal({"wo"=>1, "or"=>1, "rd"=>1, "ds"=>1, "_w" => 1, "s_" => 1}, p.count_ngram('words', 2, {})) # n >= 2: expected keys also include "_"-padded n-grams at the word edges
23
+ assert_equal({"wor"=>1, "ord"=>1, "rds"=>1, "_wo" => 1, "ds_" => 1, "s__" => 1}, p.count_ngram('words', 3, {}))
24
+ assert_equal({"word"=>1, "ords"=>1, "_wor" => 1, "rds_" => 1, "ds__" => 1, "s___" => 1}, p.count_ngram('words', 4, {}))
25
+ assert_equal({"words"=>1, "_word" => 1, "ords_" => 1, "rds__" => 1, "ds___" => 1, "s____" => 1}, p.count_ngram('words', 5, {}))
26
+ assert_equal({}, p.count_ngram('words', 6, {})) # n = 6 exceeds the 5-letter word: an empty hash is expected
27
+ end
28
+
29
+ def test_init_with_string # Builds a Profile from :text and checks that every listed n-gram is a key of p.ngrams.
30
+ # ruby 1.8 / 1.9 sort has slightly different semantics, hence test the presence of each ngram instead
31
+ p = Profile.new(:text => "this is ,+_ A \t 123 test")
32
+ [
33
+ ["t_", 30], ["st__", 29], ["st", 16], ["hi", 8], ["_tes", 7], ["is__", 6], ["s___", 5], ["s_", 3], ["his_", 11],
34
+ ["tes", 10], ["t___", 9], ["es", 12], ["_te", 14], ["est_", 13], ["est", 15], ["te", 4], ["his", 17], ["_th", 20],
35
+ ["s__", 19], ["st_", 18], ["th", 24], ["_thi", 23], ["t__", 22], ["test", 21], ["thi", 28], ["is_", 27], ["this", 26],
36
+ ["_i", 25], ["is", 2], ["_t", 1]
37
+ ].each do |ngram| # each entry is an [ngram, number] pair; only the first element is used below
38
+ assert p.ngrams.has_key?(ngram.first) # the number in the pair is never asserted
39
+ end
40
+ end
41
+
42
+ def test_init_with_file # Builds a Profile from :file and only asserts that p.ngrams is not empty.
43
+ p = Profile.new(:file => "bg-utf8.txt") # NOTE(review): relative path; how it is resolved depends on Profile, which is not shown here - confirm
44
+ assert !p.ngrams.empty?
45
+ end
46
+
47
+ def test_compute_distance # Distance between two profiles built from the same text must be 0; a profile of "xxxx" must be 18000 away.
48
+ p1 = Profile.new(:name => "test", :text => "this is ,+_ A \t 123 test")
49
+ p2 = Profile.new(:name => "test", :text => "this is ,+_ A \t 123 test")
50
+ assert_equal 0, p1.compute_distance(p2)
51
+
52
+ p3 = Profile.new(:name => "test", :text => "xxxx")
53
+ assert_equal 18000, p1.compute_distance(p3) # NOTE(review): how 18000 is derived is defined by Profile#compute_distance, not visible in this file
54
+ end
55
+ end
56
+
57
+ class LanguageDetectorTest < Test::Unit::TestCase # End-to-end checks that LanguageDetector#detect returns the expected language name for sample sentences.
58
+ def test_detect # One detector built with no arguments is reused for every sample; each result is compared to a lowercase language name.
59
+ d = LanguageDetector.new
60
+
61
+ assert_equal "spanish", d.detect("para poner este importante proyecto en práctica")
62
+ assert_equal "english", d.detect("this is a test of the Emergency text categorizing system.")
63
+ assert_equal "french", d.detect("serait désigné peu après PDG d'Antenne 2 et de FR 3. Pas même lui ! Le")
64
+ assert_equal "italian", d.detect("studio dell'uomo interiore? La scienza del cuore umano, che")
65
+ assert_equal "romanian", d.detect("taiate pe din doua, in care vezi stralucind brun sau violet cristalele interioare")
66
+ assert_equal "polish", d.detect("na porozumieniu, na ³±czeniu si³ i ¶rodków. Dlatego szukam ludzi, którzy") # NOTE(review): sample looks mis-encoded (possibly ISO-8859-2 read as Latin-1); detection may depend on these exact bytes - confirm before changing
67
+ assert_equal "german", d.detect("sagt Hühsam das war bei Über eine Annonce in einem Frankfurter der Töpfer ein. Anhand von gefundenen gut kennt, hatte ihm die wahren Tatsachen Sechzehn Adorno-Schüler erinnern und daß ein Weiterdenken der Theorie für ihre Festlegung sind drei Jahre Erschütterung Einblick in die Abhängigkeit der Bauarbeiten sei")
68
+ assert_equal "hungarian", d.detect("esôzéseket egy kissé túlméretezte, ebbôl kifolyólag a Földet egy hatalmas árvíz mosta el")
69
+ assert_equal "finnish", d.detect("koulun arkistoihin pölyttymään, vaan nuoret saavat itse vaikuttaa ajatustensa eteenpäinviemiseen esimerkiksi")
70
+ assert_equal "dutch", d.detect("tegen de kabinetsplannen. Een speciaal in het leven geroepen Landelijk")
71
+ assert_equal "danish", d.detect("viksomhed, 58 pct. har et arbejde eller er under uddannelse, 76 pct. forsørges ikke længere af Kolding")
72
+ assert_equal "czech", d.detect("datují rokem 1862. Naprosto zakázán byl v pocitech smutku, beznadìje èi jiné") # NOTE(review): "beznadìje èi" looks mis-encoded as well - confirm before changing
73
+ # assert_equal "norwegian", d.detect("hånd på den enda hvitere restaurant-duken med en bevegelse så forfinet bevegelse")
74
+ assert_equal "portuguese", d.detect("popular. Segundo o seu biógrafo, a Maria Adelaide auxiliava muita gente")
75
+ assert_equal "english", d.detect("TaffyDB finders looking nice so far! Testing this long sentence.") # second English sample, containing a non-dictionary token ("TaffyDB")
76
+ end
77
+ end
metadata ADDED
@@ -0,0 +1,58 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: igrigorik-language_detector
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.2
5
+ platform: ruby
6
+ authors:
7
+ - feedbackmine
8
+ - igrigorik
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-05-16 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description: n-gram based language detector, written in ruby
18
+ email: feedbackmine@feedbackmine.com
19
+ executables: []
20
+
21
+ extensions: []
22
+
23
+ extra_rdoc_files: []
24
+
25
+ files:
26
+ - README.rdoc
27
+ - lib/language_detector.rb
28
+ - lib/model-fm.yml
29
+ - lib/model-tc.yml
30
+ - test/language_detector_test.rb
31
+ has_rdoc: false
32
+ homepage: http://www.tweetjobsearch.com
33
+ post_install_message:
34
+ rdoc_options: []
35
+
36
+ require_paths:
37
+ - lib
38
+ required_ruby_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: "0"
43
+ version:
44
+ required_rubygems_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: "0"
49
+ version:
50
+ requirements: []
51
+
52
+ rubyforge_project:
53
+ rubygems_version: 1.2.0
54
+ signing_key:
55
+ specification_version: 2
56
+ summary: n-gram based language detector, written in ruby
57
+ test_files: []
58
+