peterc-whatlanguage 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +12 -1
- data/build_filter.rb +1 -1
- data/build_lang_from_wordlists.rb +3 -2
- data/lang/swedish.lang +0 -0
- data/lib/whatlanguage.rb +2 -2
- data/test/test_whatlanguage.rb +4 -0
- data/whatlanguage.gemspec +3 -2
- metadata +3 -2
data/README.txt
CHANGED
@@ -7,9 +7,10 @@ whatlanguage
|
|
7
7
|
|
8
8
|
Text language detection. Quick, fast, memory efficient, and all in pure Ruby. Uses Bloom filters for aforementioned speed and memory benefits.
|
9
9
|
|
10
|
+
Works with Dutch, English, Farsi, French, German, Swedish, Portuguese, Russian and Spanish out of the box.
|
11
|
+
|
10
12
|
== FEATURES/PROBLEMS:
|
11
13
|
|
12
|
-
* Only does French, English and Spanish out of the box. Very easy to train new languages though.
|
13
14
|
* It can be made far more efficient at the comparison stage, but all in good time..! It still beats literal dictionary approaches.
|
14
15
|
* No filter selection yet, you get 'em all loaded.
|
15
16
|
* Tests are reasonably light.
|
@@ -45,6 +46,16 @@ Text language detection. Quick, fast, memory efficient, and all in pure Ruby. Us
|
|
45
46
|
|
46
47
|
* None, minor libraries (BloominSimple and BitField) included with this release.
|
47
48
|
|
49
|
+
== INSTALLATION:
|
50
|
+
|
51
|
+
gem sources -a http://gems.github.com
|
52
|
+
sudo gem install peterc-whatlanguage
|
53
|
+
|
54
|
+
To test, go into irb, then:
|
55
|
+
|
56
|
+
require 'whatlanguage'
|
57
|
+
"Je suis un homme".language
|
58
|
+
|
48
59
|
== LICENSE:
|
49
60
|
|
50
61
|
(The MIT License)
|
data/build_filter.rb
CHANGED
@@ -6,7 +6,8 @@ languages_folder = File.join(File.dirname(__FILE__), "lang")
|
|
6
6
|
wordlists_folder = File.join(File.dirname(__FILE__), "wordlists")
|
7
7
|
|
8
8
|
Dir.entries(wordlists_folder).grep(/\w/).each do |lang|
|
9
|
+
next if lang == 'generators'
|
9
10
|
puts "Doing #{lang}"
|
10
11
|
filter = WhatLanguage.filter_from_dictionary(File.join(wordlists_folder, lang))
|
11
|
-
File.open(File.join(languages_folder, lang + ".lang"), '
|
12
|
-
end
|
12
|
+
File.open(File.join(languages_folder, lang + ".lang"), 'wb') { |f| f.write filter.dump }
|
13
|
+
end
|
data/lang/swedish.lang
ADDED
Binary file
|
data/lib/whatlanguage.rb
CHANGED
@@ -2,7 +2,7 @@ require File.join(File.dirname(__FILE__), 'bloominsimple')
|
|
2
2
|
require 'digest/sha1'
|
3
3
|
|
4
4
|
class WhatLanguage
|
5
|
-
VERSION = '1.0.1'
|
5
|
+
VERSION = '1.0.2'
|
6
6
|
|
7
7
|
HASHER = lambda { |item| Digest::SHA1.digest(item.downcase.strip).unpack("VV") }
|
8
8
|
|
@@ -10,7 +10,7 @@ class WhatLanguage
|
|
10
10
|
|
11
11
|
@@data = {}
|
12
12
|
|
13
|
-
def initialize(options)
|
13
|
+
def initialize(options = {})
|
14
14
|
languages_folder = File.join(File.dirname(__FILE__), "..", "lang")
|
15
15
|
Dir.entries(languages_folder).grep(/\.lang/).each do |lang|
|
16
16
|
@@data[lang[/\w+/].to_sym] ||= BloominSimple.from_dump(File.read(File.join(languages_folder, lang)), &HASHER)
|
data/test/test_whatlanguage.rb
CHANGED
@@ -18,6 +18,10 @@ class TestWhatLanguage < Test::Unit::TestCase
|
|
18
18
|
def test_spanish
|
19
19
|
assert_equal :spanish, @wl.language("La palabra mezquita se usa en español para referirse a todo tipo de edificios dedicados.")
|
20
20
|
end
|
21
|
+
|
22
|
+
def test_swedish
|
23
|
+
assert_equal :swedish, @wl.language("Den spanska räven rev en annan räv alldeles lagom.")
|
24
|
+
end
|
21
25
|
|
22
26
|
def test_nothing
|
23
27
|
assert_nil @wl.language("")
|
data/whatlanguage.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = "whatlanguage"
|
3
|
-
s.version = "1.0.1"
|
4
|
-
s.date = "2008-08-
|
3
|
+
s.version = "1.0.2"
|
4
|
+
s.date = "2008-08-23"
|
5
5
|
s.summary = "Natural language detection for text samples"
|
6
6
|
s.email = "whatlanguage@peterc.org"
|
7
7
|
s.homepage = "http://github.com/peterc/whatlanguage"
|
@@ -22,6 +22,7 @@ Gem::Specification.new do |s|
|
|
22
22
|
"lang/portuguese.lang",
|
23
23
|
"lang/russian.lang",
|
24
24
|
"lang/spanish.lang",
|
25
|
+
"lang/swedish.lang",
|
25
26
|
"lib/bitfield.rb",
|
26
27
|
"lib/bloominsimple.rb",
|
27
28
|
"lib/whatlanguage.rb",
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: peterc-whatlanguage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Cooper
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-08-
|
12
|
+
date: 2008-08-23 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -37,6 +37,7 @@ files:
|
|
37
37
|
- lang/portuguese.lang
|
38
38
|
- lang/russian.lang
|
39
39
|
- lang/spanish.lang
|
40
|
+
- lang/swedish.lang
|
40
41
|
- lib/bitfield.rb
|
41
42
|
- lib/bloominsimple.rb
|
42
43
|
- lib/whatlanguage.rb
|