feedbackmine-language_detector 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,85 @@
1
+ require 'test/unit'
2
+ require File.dirname(__FILE__) + '/../lib/language_detector'
3
+
4
+ class ProfileTest < Test::Unit::TestCase  # Unit tests for Profile: punctuation check, tokenizing, n-gram counting, profile building and profile distance.
5
+ def test_is_puctuation  # (sic) spelling matches the is_puctuation? method called on the Profile instance below
6
+ p = Profile.new("test")
7
+ assert p.is_puctuation?(?,)  # ?, and ?. are character literals: comma and period must be reported as punctuation
8
+ assert p.is_puctuation?(?.)
9
+ assert !p.is_puctuation?(?A)  # upper- and lower-case letters must not be reported as punctuation
10
+ assert !p.is_puctuation?(?a)
11
+ end
12
+
13
+ def test_tokenize  # tokenize must return only the word tokens of the input, in order
14
+ p = Profile.new("test")
15
+ assert_equal ["this", "is", "A", "test"], p.tokenize("this is ,+_ A \t 123 test")  # expected tokens omit the ",+_" and "123" runs and keep "A" capitalized
16
+ end
17
+
18
+ def test_count_ngram  # count_ngram(word, n, hash) must return a hash of the word's n-grams to their counts
19
+ p = Profile.new("test")
20
+ assert_equal({"w"=>1, "o"=>1, "r"=>1, "d"=>1, "s"=>1}, p.count_ngram('words', 1, {}))  # third argument is a fresh hash, presumably the accumulator for counts - confirm in Profile
21
+ assert_equal({"wo"=>1, "or"=>1, "rd"=>1, "ds"=>1}, p.count_ngram('words', 2, {}))
22
+ assert_equal({"wor"=>1, "ord"=>1, "rds"=>1}, p.count_ngram('words', 3, {}))
23
+ assert_equal({"word"=>1, "ords"=>1}, p.count_ngram('words', 4, {}))
24
+ assert_equal({"words"=>1}, p.count_ngram('words', 5, {}))
25
+ assert_equal({}, p.count_ngram('words', 6, {}))  # n = 6 is longer than the 5-character word, so an empty hash is expected
26
+ end
27
+
28
+ def test_init_with_string  # builds a profile from a string and checks the complete ngrams hash
29
+ p = Profile.new("test")
30
+ p.init_with_string("this is ,+_ A \t 123 test")
31
+ assert_equal({"st"=>12,  # only keys of length 2 to 4 are expected; values are presumably frequency ranks ("is" => 1) - confirm in Profile
32
+ "hi"=>7,
33
+ "tes"=>3,
34
+ "es"=>4,
35
+ "te"=>6,
36
+ "est"=>5,
37
+ "his"=>8,
38
+ "test"=>2,
39
+ "this"=>9,
40
+ "th"=>10,
41
+ "thi"=>11,
42
+ "is"=>1}, p.ngrams)
43
+ end
44
+
45
+ def test_init_with_file  # builds a profile from a text file and checks that it is not empty
46
+ p = Profile.new("test")
47
+ p.init_with_file("bg-utf8.txt")  # NOTE(review): relative path, presumably resolved against the working directory; bg-utf8.txt is not in the gem's files list in the metadata section - verify
48
+ assert !p.ngrams.empty?  # only checks that some n-grams were produced, not their values
49
+ end
50
+
51
+ def test_compute_distance  # compute_distance between two profiles: identical input versus unrelated input
52
+ p1 = Profile.new("test")
53
+ p1.init_with_string("this is ,+_ A \t 123 test")
54
+ p2 = Profile.new("test")
55
+ p2.init_with_string("this is ,+_ A \t 123 test")
56
+ assert_equal 0, p1.compute_distance(p2)  # profiles built from the same string are expected to be at distance 0
57
+
58
+ p3 = Profile.new("test")
59
+ p3.init_with_string("xxxx")
60
+ assert_equal 6000, p1.compute_distance(p3)  # 6000 is the expected distance to the "xxxx" profile; how Profile derives it is not visible here
61
+ end
62
+ end
63
+
64
+ class LanguageDetectorTest < Test::Unit::TestCase  # Checks LanguageDetector#detect against sample sentences and their expected language codes.
65
+ def test_detect  # the "es", "sv" and "da" assertions below are commented out; the reason is not recorded here
66
+ d = LanguageDetector.new  # one detector instance is reused for every assertion
67
+
68
+ #assert_equal "es", d.detect("para poner este importante proyecto en práctica")
69
+ assert_equal "en", d.detect("this is a test of the Emergency text categorizing system.")
70
+ assert_equal "fr", d.detect("serait désigné peu après PDG d'Antenne 2 et de FR 3. Pas même lui ! Le")
71
+ assert_equal "it", d.detect("studio dell'uomo interiore? La scienza del cuore umano, che")
72
+ assert_equal "ro", d.detect("taiate pe din doua, in care vezi stralucind brun sau violet cristalele interioare")
73
+ assert_equal "pl", d.detect("na porozumieniu, na ³±czeniu si³ i ¶rodków. Dlatego szukam ludzi, którzy")  # NOTE(review): sample looks mis-decoded (possibly ISO-8859-2 read as Latin-1); it is test input, so confirm before changing
74
+ assert_equal "de", d.detect("sagt Hühsam das war bei Über eine Annonce in einem Frankfurter der Töpfer ein. Anhand von gefundenen gut kennt, hatte ihm die wahren Tatsachen Sechzehn Adorno-Schüler erinnern und daß ein Weiterdenken der Theorie für ihre Festlegung sind drei Jahre Erschütterung Einblick in die Abhängigkeit der Bauarbeiten sei")
75
+ assert_equal "fi", d.detect("koulun arkistoihin pölyttymään, vaan nuoret saavat itse vaikuttaa ajatustensa eteenpäinviemiseen esimerkiksi")
76
+ #assert_equal "sv", d.detect("enligt all sannolikhet för att få ro oavsiktligt intagit en för")
77
+ assert_equal "hu", d.detect("esôzéseket egy kissé túlméretezte, ebbôl kifolyólag a Földet egy hatalmas árvíz mosta el")
78
+ assert_equal "fi", d.detect("koulun arkistoihin pölyttymään, vaan nuoret saavat itse vaikuttaa ajatustensa eteenpäinviemiseen esimerkiksi")  # NOTE(review): identical to the earlier "fi" assertion
79
+ assert_equal "nl", d.detect("tegen de kabinetsplannen. Een speciaal in het leven geroepen Landelijk")
80
+ #assert_equal "da", d.detect("viksomhed, 58 pct. har et arbejde eller er under uddannelse, 76 pct. forsørges ikke længere af Kolding")
81
+ assert_equal "cs", d.detect("datují rokem 1862. Naprosto zakázán byl v pocitech smutku, beznadìje èi jiné")  # NOTE(review): sample looks mis-decoded (possibly ISO-8859-2 read as Latin-1); it is test input, so confirm before changing
82
+ assert_equal "no", d.detect("hånd på den enda hvitere restaurant-duken med en bevegelse så forfinet")
83
+ assert_equal "pt", d.detect("popular. Segundo o seu biógrafo, a Maria Adelaide auxiliava muita gente")
84
+ end
85
+ end
metadata ADDED
@@ -0,0 +1,57 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: feedbackmine-language_detector
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - feedbackmine
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-02-25 00:00:00 -08:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: n-gram based language detector, written in ruby
17
+ email: feedbackmine@feedbackmine.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - README
26
+ - Manifest.txt
27
+ - lib/language_detector.rb
28
+ - lib/model.yml
29
+ - test/language_detector_test.rb
30
+ has_rdoc: false
31
+ homepage: http://www.tweetjobsearch.com
32
+ post_install_message:
33
+ rdoc_options: []
34
+
35
+ require_paths:
36
+ - lib
37
+ required_ruby_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: "0"
42
+ version:
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: "0"
48
+ version:
49
+ requirements: []
50
+
51
+ rubyforge_project:
52
+ rubygems_version: 1.2.0
53
+ signing_key:
54
+ specification_version: 2
55
+ summary: n-gram based language detector, written in ruby
56
+ test_files: []
57
+