whatlanguage 1.0.4 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/README.md +18 -2
- data/lang/arabic.lang +0 -0
- data/lang/finnish.lang +0 -0
- data/lang/greek.lang +0 -0
- data/lang/hebrew.lang +0 -0
- data/lang/hungarian.lang +0 -0
- data/lang/korean.lang +0 -0
- data/lang/norwegian.lang +0 -0
- data/lang/polish.lang +0 -0
- data/lib/whatlanguage.rb +13 -5
- data/lib/whatlanguage/version.rb +1 -1
- data/test/test_whatlanguage.rb +67 -5
- metadata +17 -7
- checksums.yaml +0 -7
data/History.txt
CHANGED
data/README.md
CHANGED
@@ -4,7 +4,7 @@ by Peter Cooper
|
|
4
4
|
|
5
5
|
Text language detection. Quick, fast, memory efficient, and all in pure Ruby. Uses Bloom filters for aforementioned speed and memory benefits.
|
6
6
|
|
7
|
-
Works with Dutch, English, Farsi, French, German, Italian, Pinyin, Swedish, Portuguese, Russian and Spanish out of the box.
|
7
|
+
Works with Dutch, English, Farsi, French, German, Italian, Pinyin, Swedish, Portuguese, Russian, Arabic, Finnish, Greek, Hebrew, Hungarian, Korean, Norwegian, Polish and Spanish out of the box.
|
8
8
|
|
9
9
|
## Important note
|
10
10
|
|
@@ -25,7 +25,15 @@ Full Example
|
|
25
25
|
texts << %q{Returns the object in enum with the maximum value.}
|
26
26
|
texts << %q{Propose des données au sujet de la langue espagnole.}
|
27
27
|
texts << %q{La palabra "mezquita" se usa en español para referirse a todo tipo de edificios dedicados.}
|
28
|
-
|
28
|
+
texts << %q{اللغة التي هي هذه؟}
|
29
|
+
texts << %q{Mitä kieltä tämä on?}
|
30
|
+
texts << %q{Ποια γλώσσα είναι αυτή;}
|
31
|
+
texts << %q{באיזו שפה זה?}
|
32
|
+
texts << %q{Milyen nyelv ez?}
|
33
|
+
texts << %q{이 어떤 언어인가?}
|
34
|
+
texts << %q{Hvilket språk er dette?}
|
35
|
+
texts << %q{W jakim języku to jest?}
|
36
|
+
|
29
37
|
texts.each { |text| puts "#{text[0..18]}... is in #{text.language.to_s.capitalize}" }
|
30
38
|
|
31
39
|
Initialize WhatLanguage with all filters
|
@@ -44,6 +52,10 @@ Convenience method on String
|
|
44
52
|
|
45
53
|
"This is a test".language # => "English"
|
46
54
|
|
55
|
+
Initialize WhatLanguage with certain languages
|
56
|
+
|
57
|
+
wl = WhatLanguage.new(:english, :german, :french)
|
58
|
+
|
47
59
|
## Requirements
|
48
60
|
|
49
61
|
None, minor libraries (BloominSimple and BitField) included with this release.
|
@@ -57,6 +69,10 @@ To test, go into irb, then:
|
|
57
69
|
require 'whatlanguage'
|
58
70
|
"Je suis un homme".language
|
59
71
|
|
72
|
+
## Credits
|
73
|
+
|
74
|
+
Contributions from Konrad Reiche and Salimane Adjao Moustapha appreciated.
|
75
|
+
|
60
76
|
## License
|
61
77
|
|
62
78
|
MIT License
|
data/lang/arabic.lang
ADDED
Binary file
|
data/lang/finnish.lang
ADDED
Binary file
|
data/lang/greek.lang
ADDED
Binary file
|
data/lang/hebrew.lang
ADDED
Binary file
|
data/lang/hungarian.lang
ADDED
Binary file
|
data/lang/korean.lang
ADDED
Binary file
|
data/lang/norwegian.lang
ADDED
Binary file
|
data/lang/polish.lang
ADDED
Binary file
|
data/lib/whatlanguage.rb
CHANGED
@@ -9,7 +9,8 @@ class WhatLanguage
|
|
9
9
|
|
10
10
|
@@data = {}
|
11
11
|
|
12
|
-
def initialize(
|
12
|
+
def initialize(*selection)
|
13
|
+
@selection = (selection.empty?) ? [:all] : selection
|
13
14
|
languages_folder = File.join(File.dirname(__FILE__), "..", "lang")
|
14
15
|
Dir.entries(languages_folder).grep(/\.lang/).each do |lang|
|
15
16
|
@@data[lang[/\w+/].to_sym] ||= BloominSimple.from_dump(File.new(File.join(languages_folder, lang), 'rb').read, &HASHER)
|
@@ -21,15 +22,22 @@ class WhatLanguage
|
|
21
22
|
def process_text(text)
|
22
23
|
results = Hash.new(0)
|
23
24
|
it = 0
|
24
|
-
text.split.
|
25
|
+
text.downcase.split.each do |word|
|
25
26
|
it += 1
|
26
|
-
|
27
|
+
|
28
|
+
if @selection.include?(:all)
|
29
|
+
languages = @@data.keys
|
30
|
+
else
|
31
|
+
languages = @@data.keys & @selection # intersection
|
32
|
+
end
|
33
|
+
|
34
|
+
languages.each do |lang|
|
27
35
|
results[lang] += 1 if @@data[lang].includes?(word)
|
28
36
|
end
|
29
37
|
|
30
38
|
# Every now and then check to see if we have a really convincing result.. if so, exit early.
|
31
39
|
if it % 4 == 0 && results.size > 1
|
32
|
-
top_results = results.sort_by{|a,b| b}
|
40
|
+
top_results = results.sort_by{|a,b| -b}[0..1]
|
33
41
|
|
34
42
|
# Next line may need some tweaking one day..
|
35
43
|
break if top_results[0][1] > 4 && ((top_results[0][1] > top_results[1][1] * 2) || (top_results[0][1] - top_results[1][1] > 25))
|
@@ -55,4 +63,4 @@ class String
|
|
55
63
|
def language
|
56
64
|
WhatLanguage.new(:all).language(self)
|
57
65
|
end
|
58
|
-
end
|
66
|
+
end
|
data/lib/whatlanguage/version.rb
CHANGED
data/test/test_whatlanguage.rb
CHANGED
@@ -12,25 +12,73 @@ class TestWhatLanguage < Test::Unit::TestCase
|
|
12
12
|
assert_equal :english, "This is a test".language
|
13
13
|
end
|
14
14
|
|
15
|
+
def test_arabic
|
16
|
+
assert_equal :arabic, @wl.language("اللغة التي هي هذه؟")
|
17
|
+
end
|
18
|
+
|
15
19
|
def test_dutch
|
16
20
|
assert_equal :dutch, @wl.language("Als hadden geweest is, is hebben te laat.")
|
17
21
|
end
|
18
22
|
|
23
|
+
def test_farsi
|
24
|
+
assert_equal :farsi, @wl.language("وقتی مادرم به من آموخت که به آواز خواندن.")
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_finnish
|
28
|
+
assert_equal :finnish, @wl.language("Mitä kieltä tämä on?")
|
29
|
+
end
|
30
|
+
|
19
31
|
def test_french
|
20
32
|
assert_equal :french, @wl.language("Bonjour, je m'appelle Sandrine. Voila ma chatte.")
|
21
33
|
end
|
22
34
|
|
23
|
-
def
|
24
|
-
assert_equal :
|
35
|
+
def test_german
|
36
|
+
assert_equal :german, @wl.language("Welche Sprache ist das?")
|
25
37
|
end
|
26
38
|
|
27
|
-
def
|
28
|
-
assert_equal :
|
39
|
+
def test_greek
|
40
|
+
assert_equal :greek, @wl.language("Ποια γλώσσα είναι αυτή;")
|
41
|
+
end
|
42
|
+
|
43
|
+
def test_hebrew
|
44
|
+
assert_equal :hebrew, @wl.language("באיזו שפה זה?")
|
45
|
+
end
|
46
|
+
|
47
|
+
def test_hungarian
|
48
|
+
assert_equal :hungarian, @wl.language("Milyen nyelv ez?")
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_italian
|
52
|
+
assert_equal :italian, @wl.language("Roma, capitale dell'impero romano, è stata per secoli il centro politico e culturale della civiltà occidentale.")
|
53
|
+
end
|
54
|
+
|
55
|
+
def test_korean
|
56
|
+
assert_equal :korean, @wl.language("이 어떤 언어인가?")
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_norwegian
|
60
|
+
assert_equal :norwegian, @wl.language("Hvilket språk er dette?")
|
61
|
+
end
|
62
|
+
|
63
|
+
def test_polish
|
64
|
+
assert_equal :polish, @wl.language("W jakim języku to jest?")
|
65
|
+
end
|
66
|
+
|
67
|
+
def test_portuguese
|
68
|
+
assert_equal :portuguese, @wl.language("Que linguagem é essa?")
|
29
69
|
end
|
30
70
|
|
31
71
|
def test_russian
|
32
72
|
assert_equal :russian, @wl.language("Все новости в хронологическом порядке")
|
33
73
|
end
|
74
|
+
|
75
|
+
def test_spanish
|
76
|
+
assert_equal :spanish, @wl.language("La palabra mezquita se usa en español para referirse a todo tipo de edificios dedicados.")
|
77
|
+
end
|
78
|
+
|
79
|
+
def test_swedish
|
80
|
+
assert_equal :swedish, @wl.language("Vilket språk är detta?")
|
81
|
+
end
|
34
82
|
|
35
83
|
def test_nothing
|
36
84
|
assert_nil @wl.language("")
|
@@ -48,4 +96,18 @@ class TestWhatLanguage < Test::Unit::TestCase
|
|
48
96
|
assert_equal :italian, @wl.language("Roma, capitale dell'impero romano, è stata per secoli il centro politico e culturale della civiltà occidentale.")
|
49
97
|
end
|
50
98
|
|
51
|
-
|
99
|
+
def test_language_selection
|
100
|
+
selective_wl = WhatLanguage.new(:german, :english)
|
101
|
+
assert_equal :german, selective_wl.language("der die das")
|
102
|
+
end
|
103
|
+
|
104
|
+
def test_language_selection_empty
|
105
|
+
selective_wl = WhatLanguage.new()
|
106
|
+
assert_equal :russian, selective_wl.language("Все новости в хронологическом порядке")
|
107
|
+
end
|
108
|
+
|
109
|
+
def test_language_selection_mixed
|
110
|
+
selective_wl = WhatLanguage.new(:german, :all, :english)
|
111
|
+
assert_equal :russian, selective_wl.language("Все новости в хронологическом порядке")
|
112
|
+
end
|
113
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: whatlanguage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.4
|
4
|
+
version: 1.0.5
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- Peter Cooper
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2013-
|
12
|
+
date: 2013-10-05 00:00:00.000000000 Z
|
12
13
|
dependencies: []
|
13
14
|
description: WhatLanguage rapidly detects the language of a sample of text
|
14
15
|
email:
|
@@ -28,13 +29,21 @@ files:
|
|
28
29
|
- build_lang_from_wordlists.rb
|
29
30
|
- copyright-en
|
30
31
|
- example.rb
|
32
|
+
- lang/arabic.lang
|
31
33
|
- lang/dutch.lang
|
32
34
|
- lang/english.lang
|
33
35
|
- lang/farsi.lang
|
36
|
+
- lang/finnish.lang
|
34
37
|
- lang/french.lang
|
35
38
|
- lang/german.lang
|
39
|
+
- lang/greek.lang
|
40
|
+
- lang/hebrew.lang
|
41
|
+
- lang/hungarian.lang
|
36
42
|
- lang/italian.lang
|
43
|
+
- lang/korean.lang
|
44
|
+
- lang/norwegian.lang
|
37
45
|
- lang/pinyin.lang
|
46
|
+
- lang/polish.lang
|
38
47
|
- lang/portuguese.lang
|
39
48
|
- lang/russian.lang
|
40
49
|
- lang/spanish.lang
|
@@ -47,26 +56,27 @@ files:
|
|
47
56
|
- whatlanguage.gemspec
|
48
57
|
homepage: https://github.com/peterc/whatlanguage
|
49
58
|
licenses: []
|
50
|
-
metadata: {}
|
51
59
|
post_install_message:
|
52
60
|
rdoc_options: []
|
53
61
|
require_paths:
|
54
62
|
- lib
|
55
63
|
required_ruby_version: !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
56
65
|
requirements:
|
57
|
-
- - '>='
|
66
|
+
- - ! '>='
|
58
67
|
- !ruby/object:Gem::Version
|
59
68
|
version: '0'
|
60
69
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
61
71
|
requirements:
|
62
|
-
- - '>='
|
72
|
+
- - ! '>='
|
63
73
|
- !ruby/object:Gem::Version
|
64
74
|
version: '0'
|
65
75
|
requirements: []
|
66
76
|
rubyforge_project:
|
67
|
-
rubygems_version:
|
77
|
+
rubygems_version: 1.8.24
|
68
78
|
signing_key:
|
69
|
-
specification_version:
|
79
|
+
specification_version: 3
|
70
80
|
summary: Natural language detection for text samples
|
71
81
|
test_files:
|
72
82
|
- test/test_whatlanguage.rb
|
checksums.yaml
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
---
|
2
|
-
SHA1:
|
3
|
-
metadata.gz: ef04e53215ee669aed4e66e5ec8e66bbcc1f7fa0
|
4
|
-
data.tar.gz: 270302e063e5d85ec875de08ac54100c9db5efb8
|
5
|
-
SHA512:
|
6
|
-
metadata.gz: a6be7929232a8b48229cae418de4e4bbb928245f9d4ead0ae38b6286fad8d482e06ec82549781187613799596b3657f99fb442d49348329b3e53ae96130029b0
|
7
|
-
data.tar.gz: 98bea8003cacaf0da3774232f1daf112c11c9b3a826700fd2bb5252e3b9837c012f87f54db17d25c640a45a10e60c5891c8e88b8b0ba7835a6e0389bf2695cce
|