whatlanguage 1.0.4 → 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,9 @@
1
+ == 1.0.5 / 2013-10-05
2
+
3
+ * Many more languages supported
4
+
5
+ == 1.0.4 / 2013-03-07
6
+
1
7
  == 1.0.1 / 2008-08-22
2
8
 
3
9
  * Public release
data/README.md CHANGED
@@ -4,7 +4,7 @@ by Peter Cooper
4
4
 
5
5
  Text language detection. Quick, fast, memory efficient, and all in pure Ruby. Uses Bloom filters for aforementioned speed and memory benefits.
6
6
 
7
- Works with Dutch, English, Farsi, French, German, Italian, Pinyin, Swedish, Portuguese, Russian and Spanish out of the box.
7
+ Works with Dutch, English, Farsi, French, German, Italian, Pinyin, Swedish, Portuguese, Russian, Arabic, Finnish, Greek, Hebrew, Hungarian, Korean, Norwegian, Polish and Spanish out of the box.
8
8
 
9
9
  ## Important note
10
10
 
@@ -25,7 +25,15 @@ Full Example
25
25
  texts << %q{Returns the object in enum with the maximum value.}
26
26
  texts << %q{Propose des données au sujet de la langue espagnole.}
27
27
  texts << %q{La palabra "mezquita" se usa en español para referirse a todo tipo de edificios dedicados.}
28
-
28
+ texts << %q{اللغة التي هي هذه؟}
29
+ texts << %q{Mitä kieltä tämä on?}
30
+ texts << %q{Ποια γλώσσα είναι αυτή;}
31
+ texts << %q{באיזו שפה זה?}
32
+ texts << %q{Milyen nyelv ez?}
33
+ texts << %q{이 어떤 언어인가?}
34
+ texts << %q{Hvilket språk er dette?}
35
+ texts << %q{W jakim języku to jest?}
36
+
29
37
  texts.each { |text| puts "#{text[0..18]}... is in #{text.language.to_s.capitalize}" }
30
38
 
31
39
  Initialize WhatLanguage with all filters
@@ -44,6 +52,10 @@ Convenience method on String
44
52
 
45
53
  "This is a test".language # => "English"
46
54
 
55
+ Initialize WhatLanguage with certain languages
56
+
57
+ wl = WhatLanguage.new(:english, :german, :french)
58
+
47
59
  ## Requirements
48
60
 
49
61
  None, minor libraries (BloominSimple and BitField) included with this release.
@@ -57,6 +69,10 @@ To test, go into irb, then:
57
69
  require 'whatlanguage'
58
70
  "Je suis un homme".language
59
71
 
72
+ ## Credits
73
+
74
+ Contributions from Konrad Reiche and Salimane Adjao Moustapha appreciated.
75
+
60
76
  ## License
61
77
 
62
78
  MIT License
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -9,7 +9,8 @@ class WhatLanguage
9
9
 
10
10
  @@data = {}
11
11
 
12
- def initialize(options = {})
12
+ def initialize(*selection)
13
+ @selection = (selection.empty?) ? [:all] : selection
13
14
  languages_folder = File.join(File.dirname(__FILE__), "..", "lang")
14
15
  Dir.entries(languages_folder).grep(/\.lang/).each do |lang|
15
16
  @@data[lang[/\w+/].to_sym] ||= BloominSimple.from_dump(File.new(File.join(languages_folder, lang), 'rb').read, &HASHER)
@@ -21,15 +22,22 @@ class WhatLanguage
21
22
  def process_text(text)
22
23
  results = Hash.new(0)
23
24
  it = 0
24
- text.split.collect {|a| a.downcase }.each do |word|
25
+ text.downcase.split.each do |word|
25
26
  it += 1
26
- @@data.keys.each do |lang|
27
+
28
+ if @selection.include?(:all)
29
+ languages = @@data.keys
30
+ else
31
+ languages = @@data.keys & @selection # intersection
32
+ end
33
+
34
+ languages.each do |lang|
27
35
  results[lang] += 1 if @@data[lang].includes?(word)
28
36
  end
29
37
 
30
38
  # Every now and then check to see if we have a really convincing result.. if so, exit early.
31
39
  if it % 4 == 0 && results.size > 1
32
- top_results = results.sort_by{|a,b| b}.reverse[0..1]
40
+ top_results = results.sort_by{|a,b| -b}[0..1]
33
41
 
34
42
  # Next line may need some tweaking one day..
35
43
  break if top_results[0][1] > 4 && ((top_results[0][1] > top_results[1][1] * 2) || (top_results[0][1] - top_results[1][1] > 25))
@@ -55,4 +63,4 @@ class String
55
63
  def language
56
64
  WhatLanguage.new(:all).language(self)
57
65
  end
58
- end
66
+ end
@@ -1,3 +1,3 @@
1
1
  class WhatLanguage
2
- VERSION = '1.0.4'
2
+ VERSION = '1.0.5'
3
3
  end
@@ -12,25 +12,73 @@ class TestWhatLanguage < Test::Unit::TestCase
12
12
  assert_equal :english, "This is a test".language
13
13
  end
14
14
 
15
+ def test_arabic
16
+ assert_equal :arabic, @wl.language("اللغة التي هي هذه؟")
17
+ end
18
+
15
19
  def test_dutch
16
20
  assert_equal :dutch, @wl.language("Als hadden geweest is, is hebben te laat.")
17
21
  end
18
22
 
23
+ def test_farsi
24
+ assert_equal :farsi, @wl.language("وقتی مادرم به من آموخت که به آواز خواندن.")
25
+ end
26
+
27
+ def test_finnish
28
+ assert_equal :finnish, @wl.language("Mitä kieltä tämä on?")
29
+ end
30
+
19
31
  def test_french
20
32
  assert_equal :french, @wl.language("Bonjour, je m'appelle Sandrine. Voila ma chatte.")
21
33
  end
22
34
 
23
- def test_spanish
24
- assert_equal :spanish, @wl.language("La palabra mezquita se usa en español para referirse a todo tipo de edificios dedicados.")
35
+ def test_german
36
+ assert_equal :german, @wl.language("Welche Sprache ist das?")
25
37
  end
26
38
 
27
- def test_swedish
28
- assert_equal :swedish, @wl.language("Den spanska räven rev en annan räv alldeles lagom.")
39
+ def test_greek
40
+ assert_equal :greek, @wl.language("Ποια γλώσσα είναι αυτή;")
41
+ end
42
+
43
+ def test_hebrew
44
+ assert_equal :hebrew, @wl.language("באיזו שפה זה?")
45
+ end
46
+
47
+ def test_hungarian
48
+ assert_equal :hungarian, @wl.language("Milyen nyelv ez?")
49
+ end
50
+
51
+ def test_italian
52
+ assert_equal :italian, @wl.language("Roma, capitale dell'impero romano, è stata per secoli il centro politico e culturale della civiltà occidentale.")
53
+ end
54
+
55
+ def test_korean
56
+ assert_equal :korean, @wl.language("이 어떤 언어인가?")
57
+ end
58
+
59
+ def test_norwegian
60
+ assert_equal :norwegian, @wl.language("Hvilket språk er dette?")
61
+ end
62
+
63
+ def test_polish
64
+ assert_equal :polish, @wl.language("W jakim języku to jest?")
65
+ end
66
+
67
+ def test_portuguese
68
+ assert_equal :portuguese, @wl.language("Que linguagem é essa?")
29
69
  end
30
70
 
31
71
  def test_russian
32
72
  assert_equal :russian, @wl.language("Все новости в хронологическом порядке")
33
73
  end
74
+
75
+ def test_spanish
76
+ assert_equal :spanish, @wl.language("La palabra mezquita se usa en español para referirse a todo tipo de edificios dedicados.")
77
+ end
78
+
79
+ def test_swedish
80
+ assert_equal :swedish, @wl.language("Vilket språk är detta?")
81
+ end
34
82
 
35
83
  def test_nothing
36
84
  assert_nil @wl.language("")
@@ -48,4 +96,18 @@ class TestWhatLanguage < Test::Unit::TestCase
48
96
  assert_equal :italian, @wl.language("Roma, capitale dell'impero romano, è stata per secoli il centro politico e culturale della civiltà occidentale.")
49
97
  end
50
98
 
51
- end
99
+ def test_language_selection
100
+ selective_wl = WhatLanguage.new(:german, :english)
101
+ assert_equal :german, selective_wl.language("der die das")
102
+ end
103
+
104
+ def test_language_selection_empty
105
+ selective_wl = WhatLanguage.new()
106
+ assert_equal :russian, selective_wl.language("Все новости в хронологическом порядке")
107
+ end
108
+
109
+ def test_language_selection_mixed
110
+ selective_wl = WhatLanguage.new(:german, :all, :english)
111
+ assert_equal :russian, selective_wl.language("Все новости в хронологическом порядке")
112
+ end
113
+ end
metadata CHANGED
@@ -1,14 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whatlanguage
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - Peter Cooper
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2013-03-07 00:00:00.000000000 Z
12
+ date: 2013-10-05 00:00:00.000000000 Z
12
13
  dependencies: []
13
14
  description: WhatLanguage rapidly detects the language of a sample of text
14
15
  email:
@@ -28,13 +29,21 @@ files:
28
29
  - build_lang_from_wordlists.rb
29
30
  - copyright-en
30
31
  - example.rb
32
+ - lang/arabic.lang
31
33
  - lang/dutch.lang
32
34
  - lang/english.lang
33
35
  - lang/farsi.lang
36
+ - lang/finnish.lang
34
37
  - lang/french.lang
35
38
  - lang/german.lang
39
+ - lang/greek.lang
40
+ - lang/hebrew.lang
41
+ - lang/hungarian.lang
36
42
  - lang/italian.lang
43
+ - lang/korean.lang
44
+ - lang/norwegian.lang
37
45
  - lang/pinyin.lang
46
+ - lang/polish.lang
38
47
  - lang/portuguese.lang
39
48
  - lang/russian.lang
40
49
  - lang/spanish.lang
@@ -47,26 +56,27 @@ files:
47
56
  - whatlanguage.gemspec
48
57
  homepage: https://github.com/peterc/whatlanguage
49
58
  licenses: []
50
- metadata: {}
51
59
  post_install_message:
52
60
  rdoc_options: []
53
61
  require_paths:
54
62
  - lib
55
63
  required_ruby_version: !ruby/object:Gem::Requirement
64
+ none: false
56
65
  requirements:
57
- - - '>='
66
+ - - ! '>='
58
67
  - !ruby/object:Gem::Version
59
68
  version: '0'
60
69
  required_rubygems_version: !ruby/object:Gem::Requirement
70
+ none: false
61
71
  requirements:
62
- - - '>='
72
+ - - ! '>='
63
73
  - !ruby/object:Gem::Version
64
74
  version: '0'
65
75
  requirements: []
66
76
  rubyforge_project:
67
- rubygems_version: 2.0.0
77
+ rubygems_version: 1.8.24
68
78
  signing_key:
69
- specification_version: 4
79
+ specification_version: 3
70
80
  summary: Natural language detection for text samples
71
81
  test_files:
72
82
  - test/test_whatlanguage.rb
checksums.yaml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- SHA1:
3
- metadata.gz: ef04e53215ee669aed4e66e5ec8e66bbcc1f7fa0
4
- data.tar.gz: 270302e063e5d85ec875de08ac54100c9db5efb8
5
- SHA512:
6
- metadata.gz: a6be7929232a8b48229cae418de4e4bbb928245f9d4ead0ae38b6286fad8d482e06ec82549781187613799596b3657f99fb442d49348329b3e53ae96130029b0
7
- data.tar.gz: 98bea8003cacaf0da3774232f1daf112c11c9b3a826700fd2bb5252e3b9837c012f87f54db17d25c640a45a10e60c5891c8e88b8b0ba7835a6e0389bf2695cce