feedbackmine-language_detector 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +5 -0
- data/README +18 -0
- data/lib/language_detector.rb +234 -0
- data/lib/model.yml +119191 -0
- data/test/language_detector_test.rb +85 -0
- metadata +57 -0
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require File.dirname(__FILE__) + '/../lib/language_detector'
|
3
|
+
|
4
|
+
# Unit tests for Profile: punctuation detection, tokenizing, n-gram counting,
# building a profile from a string or a file, and the distance between two
# profiles.
class ProfileTest < Test::Unit::TestCase
  # Comma and period must be reported as punctuation; letters must not.
  def test_is_puctuation
    profile = new_profile

    [?,, ?.].each { |char| assert profile.is_puctuation?(char) }
    [?A, ?a].each { |char| assert !profile.is_puctuation?(char) }
  end

  # The sample text is expected to yield only its four alphabetic words.
  def test_tokenize
    expected_tokens = ["this", "is", "A", "test"]

    assert_equal expected_tokens, new_profile.tokenize(sample_text)
  end

  # Counts the n-grams of 'words' for n = 1..6. Asking for n-grams longer
  # than the word is expected to leave the counts hash empty.
  def test_count_ngram
    profile = new_profile

    cases = [
      [1, {"w"=>1, "o"=>1, "r"=>1, "d"=>1, "s"=>1}],
      [2, {"wo"=>1, "or"=>1, "rd"=>1, "ds"=>1}],
      [3, {"wor"=>1, "ord"=>1, "rds"=>1}],
      [4, {"word"=>1, "ords"=>1}],
      [5, {"words"=>1}],
      [6, {}]
    ]

    cases.each do |size, expected_counts|
      # A fresh word and a fresh accumulator hash are passed on every call.
      assert_equal(expected_counts, profile.count_ngram('words', size, {}))
    end
  end

  # Builds a profile from the sample text and compares its ngrams hash
  # against the expected values.
  # NOTE(review): the values 1..12 look like rank positions rather than raw
  # counts -- confirm against Profile#init_with_string.
  def test_init_with_string
    profile = profile_from(sample_text)

    expected_ngrams = {
      "is"   => 1,
      "test" => 2,
      "tes"  => 3,
      "es"   => 4,
      "est"  => 5,
      "te"   => 6,
      "hi"   => 7,
      "his"  => 8,
      "this" => 9,
      "th"   => 10,
      "thi"  => 11,
      "st"   => 12
    }

    assert_equal(expected_ngrams, profile.ngrams)
  end

  # Loading a profile from a file must produce a non-empty ngrams collection.
  # NOTE(review): "bg-utf8.txt" is a bare relative path, so this presumably
  # depends on the working directory the tests are run from -- confirm where
  # the fixture is expected to live.
  def test_init_with_file
    profile = new_profile
    profile.init_with_file("bg-utf8.txt")

    assert !profile.ngrams.empty?
  end

  # Two profiles built from the same text are at distance 0; a profile built
  # from "xxxx" is expected to be at distance 6000 from the sample profile.
  def test_compute_distance
    baseline  = profile_from(sample_text)
    identical = profile_from(sample_text)
    assert_equal 0, baseline.compute_distance(identical)

    unrelated = profile_from("xxxx")
    assert_equal 6000, baseline.compute_distance(unrelated)
  end

  private

  # Returns a new Profile constructed with the name "test".
  def new_profile
    Profile.new("test")
  end

  # Returns a fresh copy of the sample sentence used by several tests
  # (words mixed with punctuation, a tab and digits).
  def sample_text
    "this is ,+_ A \t 123 test"
  end

  # Returns a new Profile that has been initialised from +text+.
  def profile_from(text)
    profile = new_profile
    profile.init_with_string(text)
    profile
  end
end
|
63
|
+
|
64
|
+
# Checks that LanguageDetector#detect returns the expected language code for
# a sample sentence in each covered language.
class LanguageDetectorTest < Test::Unit::TestCase
  def test_detect
    detector = LanguageDetector.new

    # [expected code, sample text] pairs, asserted in this order against the
    # single detector instance above. Entries that are commented out were
    # already disabled in the original test and are kept for reference.
    # NOTE(review): the "pl", "hu" and "cs" samples look mis-encoded
    # (possibly ISO-8859-2 text read as Latin-1). They are kept byte-for-byte
    # because changing them could change the detection result -- confirm.
    samples = [
      #["es", "para poner este importante proyecto en práctica"],
      ["en", "this is a test of the Emergency text categorizing system."],
      ["fr", "serait désigné peu après PDG d'Antenne 2 et de FR 3. Pas même lui ! Le"],
      ["it", "studio dell'uomo interiore? La scienza del cuore umano, che"],
      ["ro", "taiate pe din doua, in care vezi stralucind brun sau violet cristalele interioare"],
      ["pl", "na porozumieniu, na ³±czeniu si³ i ¶rodków. Dlatego szukam ludzi, którzy"],
      ["de", "sagt Hühsam das war bei Über eine Annonce in einem Frankfurter der Töpfer ein. Anhand von gefundenen gut kennt, hatte ihm die wahren Tatsachen Sechzehn Adorno-Schüler erinnern und daß ein Weiterdenken der Theorie für ihre Festlegung sind drei Jahre Erschütterung Einblick in die Abhängigkeit der Bauarbeiten sei"],
      ["fi", "koulun arkistoihin pölyttymään, vaan nuoret saavat itse vaikuttaa ajatustensa eteenpäinviemiseen esimerkiksi"],
      #["sv", "enligt all sannolikhet för att få ro oavsiktligt intagit en för"],
      ["hu", "esôzéseket egy kissé túlméretezte, ebbôl kifolyólag a Földet egy hatalmas árvíz mosta el"],
      # The Finnish sample appears twice in the original test; kept as-is.
      ["fi", "koulun arkistoihin pölyttymään, vaan nuoret saavat itse vaikuttaa ajatustensa eteenpäinviemiseen esimerkiksi"],
      ["nl", "tegen de kabinetsplannen. Een speciaal in het leven geroepen Landelijk"],
      #["da", "viksomhed, 58 pct. har et arbejde eller er under uddannelse, 76 pct. forsørges ikke længere af Kolding"],
      ["cs", "datují rokem 1862. Naprosto zakázán byl v pocitech smutku, beznadìje èi jiné"],
      ["no", "hånd på den enda hvitere restaurant-duken med en bevegelse så forfinet"],
      ["pt", "popular. Segundo o seu biógrafo, a Maria Adelaide auxiliava muita gente"]
    ]

    samples.each do |expected_code, text|
      assert_equal expected_code, detector.detect(text)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: feedbackmine-language_detector
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- feedbackmine
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-02-25 00:00:00 -08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: n-gram based language detector, written in ruby
|
17
|
+
email: feedbackmine@feedbackmine.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- README
|
26
|
+
- Manifest.txt
|
27
|
+
- lib/language_detector.rb
|
28
|
+
- lib/model.yml
|
29
|
+
- test/language_detector_test.rb
|
30
|
+
has_rdoc: false
|
31
|
+
homepage: http://www.tweetjobsearch.com
|
32
|
+
post_install_message:
|
33
|
+
rdoc_options: []
|
34
|
+
|
35
|
+
require_paths:
|
36
|
+
- lib
|
37
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: "0"
|
42
|
+
version:
|
43
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: "0"
|
48
|
+
version:
|
49
|
+
requirements: []
|
50
|
+
|
51
|
+
rubyforge_project:
|
52
|
+
rubygems_version: 1.2.0
|
53
|
+
signing_key:
|
54
|
+
specification_version: 2
|
55
|
+
summary: n-gram based language detector, written in ruby
|
56
|
+
test_files: []
|
57
|
+
|