whatlanguage 1.0.4 → 1.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/README.md +18 -2
- data/lang/arabic.lang +0 -0
- data/lang/finnish.lang +0 -0
- data/lang/greek.lang +0 -0
- data/lang/hebrew.lang +0 -0
- data/lang/hungarian.lang +0 -0
- data/lang/korean.lang +0 -0
- data/lang/norwegian.lang +0 -0
- data/lang/polish.lang +0 -0
- data/lib/whatlanguage.rb +13 -5
- data/lib/whatlanguage/version.rb +1 -1
- data/test/test_whatlanguage.rb +67 -5
- metadata +17 -7
- checksums.yaml +0 -7
data/History.txt
CHANGED
data/README.md
CHANGED
@@ -4,7 +4,7 @@ by Peter Cooper
|
|
4
4
|
|
5
5
|
Text language detection. Quick, fast, memory efficient, and all in pure Ruby. Uses Bloom filters for aforementioned speed and memory benefits.
|
6
6
|
|
7
|
-
Works with Dutch, English, Farsi, French, German, Italian, Pinyin, Swedish, Portuguese, Russian and Spanish out of the box.
|
7
|
+
Works with Dutch, English, Farsi, French, German, Italian, Pinyin, Swedish, Portuguese, Russian, Arabic, Finnish, Greek, Hebrew, Hungarian, Korean, Norwegian, Polish and Spanish out of the box.
|
8
8
|
|
9
9
|
## Important note
|
10
10
|
|
@@ -25,7 +25,15 @@ Full Example
|
|
25
25
|
texts << %q{Returns the object in enum with the maximum value.}
|
26
26
|
texts << %q{Propose des données au sujet de la langue espagnole.}
|
27
27
|
texts << %q{La palabra "mezquita" se usa en español para referirse a todo tipo de edificios dedicados.}
|
28
|
-
|
28
|
+
texts << %q{اللغة التي هي هذه؟}
|
29
|
+
texts << %q{Mitä kieltä tämä on?}
|
30
|
+
texts << %q{Ποια γλώσσα είναι αυτή;}
|
31
|
+
texts << %q{באיזו שפה זה?}
|
32
|
+
texts << %q{Milyen nyelv ez?}
|
33
|
+
texts << %q{이 어떤 언어인가?}
|
34
|
+
texts << %q{Hvilket språk er dette?}
|
35
|
+
texts << %q{W jakim języku to jest?}
|
36
|
+
|
29
37
|
texts.each { |text| puts "#{text[0..18]}... is in #{text.language.to_s.capitalize}" }
|
30
38
|
|
31
39
|
Initialize WhatLanguage with all filters
|
@@ -44,6 +52,10 @@ Convenience method on String
|
|
44
52
|
|
45
53
|
"This is a test".language # => "English"
|
46
54
|
|
55
|
+
Initialize WhatLanguage with certain languages
|
56
|
+
|
57
|
+
wl = WhatLanguage.new(:english, :german, :french)
|
58
|
+
|
47
59
|
## Requirements
|
48
60
|
|
49
61
|
None, minor libraries (BloominSimple and BitField) included with this release.
|
@@ -57,6 +69,10 @@ To test, go into irb, then:
|
|
57
69
|
require 'whatlanguage'
|
58
70
|
"Je suis un homme".language
|
59
71
|
|
72
|
+
## Credits
|
73
|
+
|
74
|
+
Contributions from Konrad Reiche and Salimane Adjao Moustapha appreciated.
|
75
|
+
|
60
76
|
## License
|
61
77
|
|
62
78
|
MIT License
|
data/lang/arabic.lang
ADDED
Binary file
|
data/lang/finnish.lang
ADDED
Binary file
|
data/lang/greek.lang
ADDED
Binary file
|
data/lang/hebrew.lang
ADDED
Binary file
|
data/lang/hungarian.lang
ADDED
Binary file
|
data/lang/korean.lang
ADDED
Binary file
|
data/lang/norwegian.lang
ADDED
Binary file
|
data/lang/polish.lang
ADDED
Binary file
|
data/lib/whatlanguage.rb
CHANGED
@@ -9,7 +9,8 @@ class WhatLanguage
|
|
9
9
|
|
10
10
|
@@data = {}
|
11
11
|
|
12
|
-
def initialize(
|
12
|
+
def initialize(*selection)
|
13
|
+
@selection = (selection.empty?) ? [:all] : selection
|
13
14
|
languages_folder = File.join(File.dirname(__FILE__), "..", "lang")
|
14
15
|
Dir.entries(languages_folder).grep(/\.lang/).each do |lang|
|
15
16
|
@@data[lang[/\w+/].to_sym] ||= BloominSimple.from_dump(File.new(File.join(languages_folder, lang), 'rb').read, &HASHER)
|
@@ -21,15 +22,22 @@ class WhatLanguage
|
|
21
22
|
def process_text(text)
|
22
23
|
results = Hash.new(0)
|
23
24
|
it = 0
|
24
|
-
text.split.
|
25
|
+
text.downcase.split.each do |word|
|
25
26
|
it += 1
|
26
|
-
|
27
|
+
|
28
|
+
if @selection.include?(:all)
|
29
|
+
languages = @@data.keys
|
30
|
+
else
|
31
|
+
languages = @@data.keys & @selection # intersection
|
32
|
+
end
|
33
|
+
|
34
|
+
languages.each do |lang|
|
27
35
|
results[lang] += 1 if @@data[lang].includes?(word)
|
28
36
|
end
|
29
37
|
|
30
38
|
# Every now and then check to see if we have a really convincing result.. if so, exit early.
|
31
39
|
if it % 4 == 0 && results.size > 1
|
32
|
-
top_results = results.sort_by{|a,b| b}
|
40
|
+
top_results = results.sort_by{|a,b| -b}[0..1]
|
33
41
|
|
34
42
|
# Next line may need some tweaking one day..
|
35
43
|
break if top_results[0][1] > 4 && ((top_results[0][1] > top_results[1][1] * 2) || (top_results[0][1] - top_results[1][1] > 25))
|
@@ -55,4 +63,4 @@ class String
|
|
55
63
|
def language
|
56
64
|
WhatLanguage.new(:all).language(self)
|
57
65
|
end
|
58
|
-
end
|
66
|
+
end
|
data/lib/whatlanguage/version.rb
CHANGED
data/test/test_whatlanguage.rb
CHANGED
@@ -12,25 +12,73 @@ class TestWhatLanguage < Test::Unit::TestCase
|
|
12
12
|
assert_equal :english, "This is a test".language
|
13
13
|
end
|
14
14
|
|
15
|
+
def test_arabic
|
16
|
+
assert_equal :arabic, @wl.language("اللغة التي هي هذه؟")
|
17
|
+
end
|
18
|
+
|
15
19
|
def test_dutch
|
16
20
|
assert_equal :dutch, @wl.language("Als hadden geweest is, is hebben te laat.")
|
17
21
|
end
|
18
22
|
|
23
|
+
def test_farsi
|
24
|
+
assert_equal :farsi, @wl.language("وقتی مادرم به من آموخت که به آواز خواندن.")
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_finnish
|
28
|
+
assert_equal :finnish, @wl.language("Mitä kieltä tämä on?")
|
29
|
+
end
|
30
|
+
|
19
31
|
def test_french
|
20
32
|
assert_equal :french, @wl.language("Bonjour, je m'appelle Sandrine. Voila ma chatte.")
|
21
33
|
end
|
22
34
|
|
23
|
-
def
|
24
|
-
assert_equal :
|
35
|
+
def test_german
|
36
|
+
assert_equal :german, @wl.language("Welche Sprache ist das?")
|
25
37
|
end
|
26
38
|
|
27
|
-
def
|
28
|
-
assert_equal :
|
39
|
+
def test_greek
|
40
|
+
assert_equal :greek, @wl.language("Ποια γλώσσα είναι αυτή;")
|
41
|
+
end
|
42
|
+
|
43
|
+
def test_hebrew
|
44
|
+
assert_equal :hebrew, @wl.language("באיזו שפה זה?")
|
45
|
+
end
|
46
|
+
|
47
|
+
def test_hungarian
|
48
|
+
assert_equal :hungarian, @wl.language("Milyen nyelv ez?")
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_italian
|
52
|
+
assert_equal :italian, @wl.language("Roma, capitale dell'impero romano, è stata per secoli il centro politico e culturale della civiltà occidentale.")
|
53
|
+
end
|
54
|
+
|
55
|
+
def test_korean
|
56
|
+
assert_equal :korean, @wl.language("이 어떤 언어인가?")
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_norwegian
|
60
|
+
assert_equal :norwegian, @wl.language("Hvilket språk er dette?")
|
61
|
+
end
|
62
|
+
|
63
|
+
def test_polish
|
64
|
+
assert_equal :polish, @wl.language("W jakim języku to jest?")
|
65
|
+
end
|
66
|
+
|
67
|
+
def test_portuguese
|
68
|
+
assert_equal :portuguese, @wl.language("Que linguagem é essa?")
|
29
69
|
end
|
30
70
|
|
31
71
|
def test_russian
|
32
72
|
assert_equal :russian, @wl.language("Все новости в хронологическом порядке")
|
33
73
|
end
|
74
|
+
|
75
|
+
def test_spanish
|
76
|
+
assert_equal :spanish, @wl.language("La palabra mezquita se usa en español para referirse a todo tipo de edificios dedicados.")
|
77
|
+
end
|
78
|
+
|
79
|
+
def test_swedish
|
80
|
+
assert_equal :swedish, @wl.language("Vilket språk är detta?")
|
81
|
+
end
|
34
82
|
|
35
83
|
def test_nothing
|
36
84
|
assert_nil @wl.language("")
|
@@ -48,4 +96,18 @@ class TestWhatLanguage < Test::Unit::TestCase
|
|
48
96
|
assert_equal :italian, @wl.language("Roma, capitale dell'impero romano, è stata per secoli il centro politico e culturale della civiltà occidentale.")
|
49
97
|
end
|
50
98
|
|
51
|
-
|
99
|
+
def test_language_selection
|
100
|
+
selective_wl = WhatLanguage.new(:german, :english)
|
101
|
+
assert_equal :german, selective_wl.language("der die das")
|
102
|
+
end
|
103
|
+
|
104
|
+
def test_language_selection_empty
|
105
|
+
selective_wl = WhatLanguage.new()
|
106
|
+
assert_equal :russian, selective_wl.language("Все новости в хронологическом порядке")
|
107
|
+
end
|
108
|
+
|
109
|
+
def test_language_selection_mixed
|
110
|
+
selective_wl = WhatLanguage.new(:german, :all, :english)
|
111
|
+
assert_equal :russian, selective_wl.language("Все новости в хронологическом порядке")
|
112
|
+
end
|
113
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whatlanguage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.4
|
4
|
+
version: 1.0.5
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- Peter Cooper
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2013-
|
12
|
+
date: 2013-10-05 00:00:00.000000000 Z
|
12
13
|
dependencies: []
|
13
14
|
description: WhatLanguage rapidly detects the language of a sample of text
|
14
15
|
email:
|
@@ -28,13 +29,21 @@ files:
|
|
28
29
|
- build_lang_from_wordlists.rb
|
29
30
|
- copyright-en
|
30
31
|
- example.rb
|
32
|
+
- lang/arabic.lang
|
31
33
|
- lang/dutch.lang
|
32
34
|
- lang/english.lang
|
33
35
|
- lang/farsi.lang
|
36
|
+
- lang/finnish.lang
|
34
37
|
- lang/french.lang
|
35
38
|
- lang/german.lang
|
39
|
+
- lang/greek.lang
|
40
|
+
- lang/hebrew.lang
|
41
|
+
- lang/hungarian.lang
|
36
42
|
- lang/italian.lang
|
43
|
+
- lang/korean.lang
|
44
|
+
- lang/norwegian.lang
|
37
45
|
- lang/pinyin.lang
|
46
|
+
- lang/polish.lang
|
38
47
|
- lang/portuguese.lang
|
39
48
|
- lang/russian.lang
|
40
49
|
- lang/spanish.lang
|
@@ -47,26 +56,27 @@ files:
|
|
47
56
|
- whatlanguage.gemspec
|
48
57
|
homepage: https://github.com/peterc/whatlanguage
|
49
58
|
licenses: []
|
50
|
-
metadata: {}
|
51
59
|
post_install_message:
|
52
60
|
rdoc_options: []
|
53
61
|
require_paths:
|
54
62
|
- lib
|
55
63
|
required_ruby_version: !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
56
65
|
requirements:
|
57
|
-
- - '>='
|
66
|
+
- - ! '>='
|
58
67
|
- !ruby/object:Gem::Version
|
59
68
|
version: '0'
|
60
69
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
61
71
|
requirements:
|
62
|
-
- - '>='
|
72
|
+
- - ! '>='
|
63
73
|
- !ruby/object:Gem::Version
|
64
74
|
version: '0'
|
65
75
|
requirements: []
|
66
76
|
rubyforge_project:
|
67
|
-
rubygems_version:
|
77
|
+
rubygems_version: 1.8.24
|
68
78
|
signing_key:
|
69
|
-
specification_version:
|
79
|
+
specification_version: 3
|
70
80
|
summary: Natural language detection for text samples
|
71
81
|
test_files:
|
72
82
|
- test/test_whatlanguage.rb
|
checksums.yaml
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
---
|
2
|
-
SHA1:
|
3
|
-
metadata.gz: ef04e53215ee669aed4e66e5ec8e66bbcc1f7fa0
|
4
|
-
data.tar.gz: 270302e063e5d85ec875de08ac54100c9db5efb8
|
5
|
-
SHA512:
|
6
|
-
metadata.gz: a6be7929232a8b48229cae418de4e4bbb928245f9d4ead0ae38b6286fad8d482e06ec82549781187613799596b3657f99fb442d49348329b3e53ae96130029b0
|
7
|
-
data.tar.gz: 98bea8003cacaf0da3774232f1daf112c11c9b3a826700fd2bb5252e3b9837c012f87f54db17d25c640a45a10e60c5891c8e88b8b0ba7835a6e0389bf2695cce
|