whatlanguage 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,9 @@
1
+ == 1.0.5 / 2013-10-05
2
+
3
+ * Many more languages supported
4
+
5
+ == 1.0.4 / 2013-03-07
6
+
1
7
  == 1.0.1 / 2008-08-22
2
8
 
3
9
  * Public release
data/README.md CHANGED
@@ -4,7 +4,7 @@ by Peter Cooper
4
4
 
5
5
  Text language detection. Quick, fast, memory efficient, and all in pure Ruby. Uses Bloom filters for aforementioned speed and memory benefits.
6
6
 
7
- Works with Dutch, English, Farsi, French, German, Italian, Pinyin, Swedish, Portuguese, Russian and Spanish out of the box.
7
+ Works with Dutch, English, Farsi, French, German, Italian, Pinyin, Swedish, Portuguese, Russian, Arabic, Finnish, Greek, Hebrew, Hungarian, Korean, Norwegian, Polish and Spanish out of the box.
8
8
 
9
9
  ## Important note
10
10
 
@@ -25,7 +25,15 @@ Full Example
25
25
  texts << %q{Returns the object in enum with the maximum value.}
26
26
  texts << %q{Propose des données au sujet de la langue espagnole.}
27
27
  texts << %q{La palabra "mezquita" se usa en español para referirse a todo tipo de edificios dedicados.}
28
-
28
+ texts << %q{اللغة التي هي هذه؟}
29
+ texts << %q{Mitä kieltä tämä on?}
30
+ texts << %q{Ποια γλώσσα είναι αυτή;}
31
+ texts << %q{באיזו שפה זה?}
32
+ texts << %q{Milyen nyelv ez?}
33
+ texts << %q{이 어떤 언어인가?}
34
+ texts << %q{Hvilket språk er dette?}
35
+ texts << %q{W jakim języku to jest?}
36
+
29
37
  texts.each { |text| puts "#{text[0..18]}... is in #{text.language.to_s.capitalize}" }
30
38
 
31
39
  Initialize WhatLanguage with all filters
@@ -44,6 +52,10 @@ Convenience method on String
44
52
 
45
53
  "This is a test".language # => "English"
46
54
 
55
+ Initialize WhatLanguage with certain languages
56
+
57
+ wl = WhatLanguage.new(:english, :german, :french)
58
+
47
59
  ## Requirements
48
60
 
49
61
  None, minor libraries (BloominSimple and BitField) included with this release.
@@ -57,6 +69,10 @@ To test, go into irb, then:
57
69
  require 'whatlanguage'
58
70
  "Je suis un homme".language
59
71
 
72
+ ## Credits
73
+
74
+ Contributions from Konrad Reiche and Salimane Adjao Moustapha appreciated.
75
+
60
76
  ## License
61
77
 
62
78
  MIT License
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -9,7 +9,8 @@ class WhatLanguage
9
9
 
10
10
  @@data = {}
11
11
 
12
- def initialize(options = {})
12
+ def initialize(*selection)
13
+ @selection = (selection.empty?) ? [:all] : selection
13
14
  languages_folder = File.join(File.dirname(__FILE__), "..", "lang")
14
15
  Dir.entries(languages_folder).grep(/\.lang/).each do |lang|
15
16
  @@data[lang[/\w+/].to_sym] ||= BloominSimple.from_dump(File.new(File.join(languages_folder, lang), 'rb').read, &HASHER)
@@ -21,15 +22,22 @@ class WhatLanguage
21
22
  def process_text(text)
22
23
  results = Hash.new(0)
23
24
  it = 0
24
- text.split.collect {|a| a.downcase }.each do |word|
25
+ text.downcase.split.each do |word|
25
26
  it += 1
26
- @@data.keys.each do |lang|
27
+
28
+ if @selection.include?(:all)
29
+ languages = @@data.keys
30
+ else
31
+ languages = @@data.keys & @selection # intersection
32
+ end
33
+
34
+ languages.each do |lang|
27
35
  results[lang] += 1 if @@data[lang].includes?(word)
28
36
  end
29
37
 
30
38
  # Every now and then check to see if we have a really convincing result.. if so, exit early.
31
39
  if it % 4 == 0 && results.size > 1
32
- top_results = results.sort_by{|a,b| b}.reverse[0..1]
40
+ top_results = results.sort_by{|a,b| -b}[0..1]
33
41
 
34
42
  # Next line may need some tweaking one day..
35
43
  break if top_results[0][1] > 4 && ((top_results[0][1] > top_results[1][1] * 2) || (top_results[0][1] - top_results[1][1] > 25))
@@ -55,4 +63,4 @@ class String
55
63
  def language
56
64
  WhatLanguage.new(:all).language(self)
57
65
  end
58
- end
66
+ end
@@ -1,3 +1,3 @@
1
1
  class WhatLanguage
2
- VERSION = '1.0.4'
2
+ VERSION = '1.0.5'
3
3
  end
@@ -12,25 +12,73 @@ class TestWhatLanguage < Test::Unit::TestCase
12
12
  assert_equal :english, "This is a test".language
13
13
  end
14
14
 
15
+ def test_arabic
16
+ assert_equal :arabic, @wl.language("اللغة التي هي هذه؟")
17
+ end
18
+
15
19
  def test_dutch
16
20
  assert_equal :dutch, @wl.language("Als hadden geweest is, is hebben te laat.")
17
21
  end
18
22
 
23
+ def test_farsi
24
+ assert_equal :farsi, @wl.language("وقتی مادرم به من آموخت که به آواز خواندن.")
25
+ end
26
+
27
+ def test_finnish
28
+ assert_equal :finnish, @wl.language("Mitä kieltä tämä on?")
29
+ end
30
+
19
31
  def test_french
20
32
  assert_equal :french, @wl.language("Bonjour, je m'appelle Sandrine. Voila ma chatte.")
21
33
  end
22
34
 
23
- def test_spanish
24
- assert_equal :spanish, @wl.language("La palabra mezquita se usa en español para referirse a todo tipo de edificios dedicados.")
35
+ def test_german
36
+ assert_equal :german, @wl.language("Welche Sprache ist das?")
25
37
  end
26
38
 
27
- def test_swedish
28
- assert_equal :swedish, @wl.language("Den spanska räven rev en annan räv alldeles lagom.")
39
+ def test_greek
40
+ assert_equal :greek, @wl.language("Ποια γλώσσα είναι αυτή;")
41
+ end
42
+
43
+ def test_hebrew
44
+ assert_equal :hebrew, @wl.language("באיזו שפה זה?")
45
+ end
46
+
47
+ def test_hungarian
48
+ assert_equal :hungarian, @wl.language("Milyen nyelv ez?")
49
+ end
50
+
51
+ def test_italian
52
+ assert_equal :italian, @wl.language("Roma, capitale dell'impero romano, è stata per secoli il centro politico e culturale della civiltà occidentale.")
53
+ end
54
+
55
+ def test_korean
56
+ assert_equal :korean, @wl.language("이 어떤 언어인가?")
57
+ end
58
+
59
+ def test_norwegian
60
+ assert_equal :norwegian, @wl.language("Hvilket språk er dette?")
61
+ end
62
+
63
+ def test_polish
64
+ assert_equal :polish, @wl.language("W jakim języku to jest?")
65
+ end
66
+
67
+ def test_portuguese
68
+ assert_equal :portuguese, @wl.language("Que linguagem é essa?")
29
69
  end
30
70
 
31
71
  def test_russian
32
72
  assert_equal :russian, @wl.language("Все новости в хронологическом порядке")
33
73
  end
74
+
75
+ def test_spanish
76
+ assert_equal :spanish, @wl.language("La palabra mezquita se usa en español para referirse a todo tipo de edificios dedicados.")
77
+ end
78
+
79
+ def test_swedish
80
+ assert_equal :swedish, @wl.language("Vilket språk är detta?")
81
+ end
34
82
 
35
83
  def test_nothing
36
84
  assert_nil @wl.language("")
@@ -48,4 +96,18 @@ class TestWhatLanguage < Test::Unit::TestCase
48
96
  assert_equal :italian, @wl.language("Roma, capitale dell'impero romano, è stata per secoli il centro politico e culturale della civiltà occidentale.")
49
97
  end
50
98
 
51
- end
99
+ def test_language_selection
100
+ selective_wl = WhatLanguage.new(:german, :english)
101
+ assert_equal :german, selective_wl.language("der die das")
102
+ end
103
+
104
+ def test_language_selection_empty
105
+ selective_wl = WhatLanguage.new()
106
+ assert_equal :russian, selective_wl.language("Все новости в хронологическом порядке")
107
+ end
108
+
109
+ def test_language_selection_mixed
110
+ selective_wl = WhatLanguage.new(:german, :all, :english)
111
+ assert_equal :russian, selective_wl.language("Все новости в хронологическом порядке")
112
+ end
113
+ end
metadata CHANGED
@@ -1,14 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: whatlanguage
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - Peter Cooper
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2013-03-07 00:00:00.000000000 Z
12
+ date: 2013-10-05 00:00:00.000000000 Z
12
13
  dependencies: []
13
14
  description: WhatLanguage rapidly detects the language of a sample of text
14
15
  email:
@@ -28,13 +29,21 @@ files:
28
29
  - build_lang_from_wordlists.rb
29
30
  - copyright-en
30
31
  - example.rb
32
+ - lang/arabic.lang
31
33
  - lang/dutch.lang
32
34
  - lang/english.lang
33
35
  - lang/farsi.lang
36
+ - lang/finnish.lang
34
37
  - lang/french.lang
35
38
  - lang/german.lang
39
+ - lang/greek.lang
40
+ - lang/hebrew.lang
41
+ - lang/hungarian.lang
36
42
  - lang/italian.lang
43
+ - lang/korean.lang
44
+ - lang/norwegian.lang
37
45
  - lang/pinyin.lang
46
+ - lang/polish.lang
38
47
  - lang/portuguese.lang
39
48
  - lang/russian.lang
40
49
  - lang/spanish.lang
@@ -47,26 +56,27 @@ files:
47
56
  - whatlanguage.gemspec
48
57
  homepage: https://github.com/peterc/whatlanguage
49
58
  licenses: []
50
- metadata: {}
51
59
  post_install_message:
52
60
  rdoc_options: []
53
61
  require_paths:
54
62
  - lib
55
63
  required_ruby_version: !ruby/object:Gem::Requirement
64
+ none: false
56
65
  requirements:
57
- - - '>='
66
+ - - ! '>='
58
67
  - !ruby/object:Gem::Version
59
68
  version: '0'
60
69
  required_rubygems_version: !ruby/object:Gem::Requirement
70
+ none: false
61
71
  requirements:
62
- - - '>='
72
+ - - ! '>='
63
73
  - !ruby/object:Gem::Version
64
74
  version: '0'
65
75
  requirements: []
66
76
  rubyforge_project:
67
- rubygems_version: 2.0.0
77
+ rubygems_version: 1.8.24
68
78
  signing_key:
69
- specification_version: 4
79
+ specification_version: 3
70
80
  summary: Natural language detection for text samples
71
81
  test_files:
72
82
  - test/test_whatlanguage.rb
checksums.yaml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- SHA1:
3
- metadata.gz: ef04e53215ee669aed4e66e5ec8e66bbcc1f7fa0
4
- data.tar.gz: 270302e063e5d85ec875de08ac54100c9db5efb8
5
- SHA512:
6
- metadata.gz: a6be7929232a8b48229cae418de4e4bbb928245f9d4ead0ae38b6286fad8d482e06ec82549781187613799596b3657f99fb442d49348329b3e53ae96130029b0
7
- data.tar.gz: 98bea8003cacaf0da3774232f1daf112c11c9b3a826700fd2bb5252e3b9837c012f87f54db17d25c640a45a10e60c5891c8e88b8b0ba7835a6e0389bf2695cce