whatlanguage 1.0.0 → 1.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/History.txt +5 -0
- data/LICENSE.txt +22 -0
- data/{README.txt → README.md} +34 -22
- data/Rakefile +4 -15
- data/build_filter.rb +1 -1
- data/build_lang_from_wordlists.rb +13 -0
- data/copyright-en +243 -0
- data/lang/italian.lang +0 -0
- data/lang/swedish.lang +0 -0
- data/lib/whatlanguage.rb +5 -6
- data/lib/{bitfield.rb → whatlanguage/bitfield.rb} +0 -0
- data/lib/{bloominsimple.rb → whatlanguage/bloominsimple.rb} +1 -1
- data/lib/whatlanguage/version.rb +3 -0
- data/test/test_whatlanguage.rb +21 -3
- data/whatlanguage.gemspec +19 -0
- metadata +46 -57
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ef04e53215ee669aed4e66e5ec8e66bbcc1f7fa0
|
4
|
+
data.tar.gz: 270302e063e5d85ec875de08ac54100c9db5efb8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a6be7929232a8b48229cae418de4e4bbb928245f9d4ead0ae38b6286fad8d482e06ec82549781187613799596b3657f99fb442d49348329b3e53ae96130029b0
|
7
|
+
data.tar.gz: 98bea8003cacaf0da3774232f1daf112c11c9b3a826700fd2bb5252e3b9837c012f87f54db17d25c640a45a10e60c5891c8e88b8b0ba7835a6e0389bf2695cce
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/History.txt
CHANGED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2008-2013 Peter Cooper
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/{README.txt → README.md}
RENAMED
@@ -1,22 +1,21 @@
|
|
1
|
-
whatlanguage
|
2
|
-
|
3
|
-
|
4
|
-
http://www.rubyinside.com/
|
1
|
+
# whatlanguage
|
2
|
+
|
3
|
+
by Peter Cooper
|
5
4
|
|
6
|
-
== DESCRIPTION:
|
7
|
-
|
8
5
|
Text language detection. Quick, fast, memory efficient, and all in pure Ruby. Uses Bloom filters for the aforementioned speed and memory benefits.
|
9
6
|
|
10
|
-
|
7
|
+
Works with Dutch, English, Farsi, French, German, Italian, Pinyin, Swedish, Portuguese, Russian and Spanish out of the box.
|
8
|
+
|
9
|
+
## Important note
|
11
10
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
* Tests are reasonably light.
|
11
|
+
This library was first built in 2007 and has received a few minor updates over the years. There are now more efficient and effective algorithms for doing language detection which I am investigating for a WhatLanguage 2.0.
|
12
|
+
|
13
|
+
This library has been updated to be distributed and to work on modern Ruby implementations but, other than that, has had no improvements.
|
16
14
|
|
17
|
-
|
15
|
+
## Synopsis
|
16
|
+
|
17
|
+
Full Example
|
18
18
|
|
19
|
-
Full Example
|
20
19
|
require 'whatlanguage'
|
21
20
|
|
22
21
|
texts = []
|
@@ -29,27 +28,40 @@ Text language detection. Quick, fast, memory efficient, and all in pure Ruby. Us
|
|
29
28
|
|
30
29
|
texts.each { |text| puts "#{text[0..18]}... is in #{text.language.to_s.capitalize}" }
|
31
30
|
|
32
|
-
|
31
|
+
Initialize WhatLanguage with all filters
|
32
|
+
|
33
33
|
wl = WhatLanguage.new(:all)
|
34
34
|
|
35
|
-
|
35
|
+
Return language with best score
|
36
|
+
|
36
37
|
wl.language(text)
|
37
38
|
|
38
|
-
|
39
|
+
Return hash with scores for all relevant languages
|
40
|
+
|
39
41
|
wl.process_text(text)
|
40
42
|
|
41
|
-
|
43
|
+
Convenience method on String
|
44
|
+
|
42
45
|
"This is a test".language # => "English"
|
43
46
|
|
44
|
-
|
47
|
+
## Requirements
|
48
|
+
|
49
|
+
None. Minor libraries (BloominSimple and BitField) are included with this release.
|
50
|
+
|
51
|
+
## Installation
|
45
52
|
|
46
|
-
|
53
|
+
gem install whatlanguage
|
54
|
+
|
55
|
+
To test, go into irb, then:
|
56
|
+
|
57
|
+
require 'whatlanguage'
|
58
|
+
"Je suis un homme".language
|
47
59
|
|
48
|
-
|
60
|
+
## License
|
49
61
|
|
50
|
-
|
62
|
+
MIT License
|
51
63
|
|
52
|
-
Copyright (c) 2007-
|
64
|
+
Copyright (c) 2007-2013 Peter Cooper
|
53
65
|
|
54
66
|
Permission is hereby granted, free of charge, to any person obtaining
|
55
67
|
a copy of this software and associated documentation files (the
|
data/Rakefile
CHANGED
@@ -1,17 +1,6 @@
|
|
1
|
-
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require 'rake/testtask'
|
2
3
|
|
3
|
-
|
4
|
-
require 'hoe'
|
5
|
-
require './lib/whatlanguage.rb'
|
4
|
+
Rake::TestTask.new
|
6
5
|
|
7
|
-
|
8
|
-
p.rubyforge_name = 'whatlanguage'
|
9
|
-
p.author = 'Peter Cooper'
|
10
|
-
p.email = 'whatlanguage@peterc.org'
|
11
|
-
p.summary = 'Fast, quick, textual language detection'
|
12
|
-
p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
|
13
|
-
p.url = "http://rubyforge.org/projects/whatlanguage/"
|
14
|
-
p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
|
15
|
-
end
|
16
|
-
|
17
|
-
# vim: syntax=Ruby
|
6
|
+
task :default => :test
|
data/build_filter.rb
CHANGED
data/build_lang_from_wordlists.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Builds all of the word lists in ./wordlists/ into filter files in ./lang/
|
2
|
+
|
3
|
+
require 'lib/whatlanguage'
|
4
|
+
|
5
|
+
languages_folder = File.join(File.dirname(__FILE__), "lang")
|
6
|
+
wordlists_folder = File.join(File.dirname(__FILE__), "wordlists")
|
7
|
+
|
8
|
+
Dir.entries(wordlists_folder).grep(/\w/).each do |lang|
|
9
|
+
next if lang == 'generators'
|
10
|
+
puts "Doing #{lang}"
|
11
|
+
filter = WhatLanguage.filter_from_dictionary(File.join(wordlists_folder, lang))
|
12
|
+
File.open(File.join(languages_folder, lang + ".lang"), 'wb') { |f| f.write filter.dump }
|
13
|
+
end
|
data/copyright-en
ADDED
@@ -0,0 +1,243 @@
|
|
1
|
+
The English word list in `wordlists/english` is compiled from a preexisting word list
|
2
|
+
(public domain) in this repository and the word list from SCOWL 7.0. To comply with the
|
3
|
+
license of Kevin Atkinson's work, his copyright file is attached below.
|
4
|
+
|
5
|
+
------------------------------------------------------------------------
|
6
|
+
|
7
|
+
This English word list is comes directly from SCOWL 7.0 (up to level 60,
|
8
|
+
using the speller/make-aspell-dict script, http://wordlist.sourceforge.net/)
|
9
|
+
and is thus under the same copyright of SCOWL. The affix file (only
|
10
|
+
included in the aspell6 package) is based on the Ispell one which is
|
11
|
+
under the same copyright of Ispell. Part of SCOWL is also based on
|
12
|
+
Ispell thus the Ispell copyright is included with the SCOWL copyright.
|
13
|
+
|
14
|
+
The collective work is Copyright 2000-2011 by Kevin Atkinson as well
|
15
|
+
as any of the copyrights mentioned below:
|
16
|
+
|
17
|
+
Copyright 2000-2011 by Kevin Atkinson
|
18
|
+
|
19
|
+
Permission to use, copy, modify, distribute and sell these word
|
20
|
+
lists, the associated scripts, the output created from the scripts,
|
21
|
+
and its documentation for any purpose is hereby granted without fee,
|
22
|
+
provided that the above copyright notice appears in all copies and
|
23
|
+
that both that copyright notice and this permission notice appear in
|
24
|
+
supporting documentation. Kevin Atkinson makes no representations
|
25
|
+
about the suitability of this array for any purpose. It is provided
|
26
|
+
"as is" without express or implied warranty.
|
27
|
+
|
28
|
+
Alan Beale <biljir@pobox.com> also deserves special credit as he has,
|
29
|
+
in addition to providing the 12Dicts package and being a major
|
30
|
+
contributor to the ENABLE word list, given me an incredible amount of
|
31
|
+
feedback and created a number of special lists (those found in the
|
32
|
+
Supplement) in order to help improve the overall quality of SCOWL.
|
33
|
+
|
34
|
+
The 10 level includes the 1000 most common English words (according to
|
35
|
+
the Moby (TM) Words II [MWords] package), a subset of the 1000 most
|
36
|
+
common words on the Internet (again, according to Moby Words II), and
|
37
|
+
frequently class 16 from Brian Kelk's "UK English Wordlist
|
38
|
+
with Frequency Classification".
|
39
|
+
|
40
|
+
The MWords package was explicitly placed in the public domain:
|
41
|
+
|
42
|
+
The Moby lexicon project is complete and has
|
43
|
+
been place into the public domain. Use, sell,
|
44
|
+
rework, excerpt and use in any way on any platform.
|
45
|
+
|
46
|
+
Placing this material on internal or public servers is
|
47
|
+
also encouraged. The compiler is not aware of any
|
48
|
+
export restrictions so freely distribute world-wide.
|
49
|
+
|
50
|
+
You can verify the public domain status by contacting
|
51
|
+
|
52
|
+
Grady Ward
|
53
|
+
3449 Martha Ct.
|
54
|
+
Arcata, CA 95521-4884
|
55
|
+
|
56
|
+
grady@netcom.com
|
57
|
+
grady@northcoast.com
|
58
|
+
|
59
|
+
The "UK English Wordlist With Frequency Classification" is also in the
|
60
|
+
Public Domain:
|
61
|
+
|
62
|
+
Date: Sat, 08 Jul 2000 20:27:21 +0100
|
63
|
+
From: Brian Kelk <Brian.Kelk@cl.cam.ac.uk>
|
64
|
+
|
65
|
+
> I was wondering what the copyright status of your "UK English
|
66
|
+
> Wordlist With Frequency Classification" word list as it seems to
|
67
|
+
> be lacking any copyright notice.
|
68
|
+
|
69
|
+
There were many many sources in total, but any text marked
|
70
|
+
"copyright" was avoided. Locally-written documentation was one
|
71
|
+
source. An earlier version of the list resided in a filespace called
|
72
|
+
PUBLIC on the University mainframe, because it was considered public
|
73
|
+
domain.
|
74
|
+
|
75
|
+
Date: Tue, 11 Jul 2000 19:31:34 +0100
|
76
|
+
|
77
|
+
> So are you saying your word list is also in the public domain?
|
78
|
+
|
79
|
+
That is the intention.
|
80
|
+
|
81
|
+
The 20 level includes frequency classes 7-15 from Brian's word list.
|
82
|
+
|
83
|
+
The 35 level includes frequency classes 2-6 and words appearing in at
|
84
|
+
least 11 of 12 dictionaries as indicated in the 12Dicts package. All
|
85
|
+
words from the 12Dicts package have had likely inflections added via
|
86
|
+
my inflection database.
|
87
|
+
|
88
|
+
The 12Dicts package and Supplement is in the Public Domain.
|
89
|
+
|
90
|
+
The WordNet database, which was used in the creation of the
|
91
|
+
Inflections database, is under the following copyright:
|
92
|
+
|
93
|
+
This software and database is being provided to you, the LICENSEE,
|
94
|
+
by Princeton University under the following license. By obtaining,
|
95
|
+
using and/or copying this software and database, you agree that you
|
96
|
+
have read, understood, and will comply with these terms and
|
97
|
+
conditions.:
|
98
|
+
|
99
|
+
Permission to use, copy, modify and distribute this software and
|
100
|
+
database and its documentation for any purpose and without fee or
|
101
|
+
royalty is hereby granted, provided that you agree to comply with
|
102
|
+
the following copyright notice and statements, including the
|
103
|
+
disclaimer, and that the same appear on ALL copies of the software,
|
104
|
+
database and documentation, including modifications that you make
|
105
|
+
for internal use or for distribution.
|
106
|
+
|
107
|
+
WordNet 1.6 Copyright 1997 by Princeton University. All rights
|
108
|
+
reserved.
|
109
|
+
|
110
|
+
THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
|
111
|
+
UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
|
112
|
+
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
|
113
|
+
UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
|
114
|
+
ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE
|
115
|
+
LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY
|
116
|
+
THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS.
|
117
|
+
|
118
|
+
The name of Princeton University or Princeton may not be used in
|
119
|
+
advertising or publicity pertaining to distribution of the software
|
120
|
+
and/or database. Title to copyright in this software, database and
|
121
|
+
any associated documentation shall at all times remain with
|
122
|
+
Princeton University and LICENSEE agrees to preserve same.
|
123
|
+
|
124
|
+
The 40 level includes words from Alan's 3esl list found in version 4.0
|
125
|
+
of his 12dicts package. Like his other stuff the 3esl list is also in the
|
126
|
+
public domain.
|
127
|
+
|
128
|
+
The 50 level includes Brian's frequency class 1, words words appearing
|
129
|
+
in at least 5 of 12 of the dictionaries as indicated in the 12Dicts
|
130
|
+
package, and uppercase words in at least 4 of the previous 12
|
131
|
+
dictionaries. A decent number of proper names is also included: The
|
132
|
+
top 1000 male, female, and Last names from the 1990 Census report; a
|
133
|
+
list of names sent to me by Alan Beale; and a few names that I added
|
134
|
+
myself. Finally a small list of abbreviations not commonly found in
|
135
|
+
other word lists is included.
|
136
|
+
|
137
|
+
The name files form the Census report is a government document which I
|
138
|
+
don't think can be copyrighted.
|
139
|
+
|
140
|
+
The file special-jargon.50 uses common.lst and word.lst from the
|
141
|
+
"Unofficial Jargon File Word Lists" which is derived from "The Jargon
|
142
|
+
File". All of which is in the Public Domain. This file also contain
|
143
|
+
a few extra UNIX terms which are found in the file "unix-terms" in the
|
144
|
+
special/ directory.
|
145
|
+
|
146
|
+
The 55 level includes words from Alan's 2of4brif list found in version
|
147
|
+
4.0 of his 12dicts package. Like his other stuff the 2of4brif is also
|
148
|
+
in the public domain.
|
149
|
+
|
150
|
+
The 60 level includes all words appearing in at least 2 of the 12
|
151
|
+
dictionaries as indicated by the 12Dicts package.
|
152
|
+
|
153
|
+
The 70 level includes Brian's frequency class 0 and the 74,550 common
|
154
|
+
dictionary words from the MWords package. The common dictionary words,
|
155
|
+
like those from the 12Dicts package, have had all likely inflections
|
156
|
+
added. The 70 level also included the 5desk list from version 4.0 of
|
157
|
+
the 12Dics package which is the public domain.
|
158
|
+
|
159
|
+
The 80 level includes the ENABLE word list, all the lists in the
|
160
|
+
ENABLE supplement package (except for ABLE), the "UK Advanced Cryptics
|
161
|
+
Dictionary" (UKACD), the list of signature words in from YAWL package,
|
162
|
+
and the 10,196 places list from the MWords package.
|
163
|
+
|
164
|
+
The ENABLE package, mainted by M\Cooper <thegrendel@theriver.com>,
|
165
|
+
is in the Public Domain:
|
166
|
+
|
167
|
+
The ENABLE master word list, WORD.LST, is herewith formally released
|
168
|
+
into the Public Domain. Anyone is free to use it or distribute it in
|
169
|
+
any manner they see fit. No fee or registration is required for its
|
170
|
+
use nor are "contributions" solicited (if you feel you absolutely
|
171
|
+
must contribute something for your own peace of mind, the authors of
|
172
|
+
the ENABLE list ask that you make a donation on their behalf to your
|
173
|
+
favorite charity). This word list is our gift to the Scrabble
|
174
|
+
community, as an alternate to "official" word lists. Game designers
|
175
|
+
may feel free to incorporate the WORD.LST into their games. Please
|
176
|
+
mention the source and credit us as originators of the list. Note
|
177
|
+
that if you, as a game designer, use the WORD.LST in your product,
|
178
|
+
you may still copyright and protect your product, but you may *not*
|
179
|
+
legally copyright or in any way restrict redistribution of the
|
180
|
+
WORD.LST portion of your product. This *may* under law restrict your
|
181
|
+
rights to restrict your users' rights, but that is only fair.
|
182
|
+
|
183
|
+
UKACD, by J Ross Beresford <ross@bryson.demon.co.uk>, is under the
|
184
|
+
following copyright:
|
185
|
+
|
186
|
+
Copyright (c) J Ross Beresford 1993-1999. All Rights Reserved.
|
187
|
+
|
188
|
+
The following restriction is placed on the use of this publication:
|
189
|
+
if The UK Advanced Cryptics Dictionary is used in a software package
|
190
|
+
or redistributed in any form, the copyright notice must be
|
191
|
+
prominently displayed and the text of this document must be included
|
192
|
+
verbatim.
|
193
|
+
|
194
|
+
There are no other restrictions: I would like to see the list
|
195
|
+
distributed as widely as possible.
|
196
|
+
|
197
|
+
The 95 level includes the 354,984 single words, 256,772 compound
|
198
|
+
words, 4,946 female names and the 3,897 male names, and 21,986 names
|
199
|
+
from the MWords package, ABLE.LST from the ENABLE Supplement, and some
|
200
|
+
additional words found in my part-of-speech database that were not
|
201
|
+
found anywhere else.
|
202
|
+
|
203
|
+
Accent information was taken from UKACD.
|
204
|
+
|
205
|
+
My VARCON package was used to create the American, British, and
|
206
|
+
Canadian word list.
|
207
|
+
|
208
|
+
Since the original word lists used used in the VARCON package came
|
209
|
+
from the Ispell distribution they are under the Ispell copyright:
|
210
|
+
|
211
|
+
Copyright 1993, Geoff Kuenning, Granada Hills, CA
|
212
|
+
All rights reserved.
|
213
|
+
|
214
|
+
Redistribution and use in source and binary forms, with or without
|
215
|
+
modification, are permitted provided that the following conditions
|
216
|
+
are met:
|
217
|
+
|
218
|
+
1. Redistributions of source code must retain the above copyright
|
219
|
+
notice, this list of conditions and the following disclaimer.
|
220
|
+
2. Redistributions in binary form must reproduce the above copyright
|
221
|
+
notice, this list of conditions and the following disclaimer in the
|
222
|
+
documentation and/or other materials provided with the distribution.
|
223
|
+
3. All modifications to the source code must be clearly marked as
|
224
|
+
such. Binary redistributions based on modified source code
|
225
|
+
must be clearly marked as modified versions in the documentation
|
226
|
+
and/or other materials provided with the distribution.
|
227
|
+
(clause 4 removed with permission from Geoff Kuenning)
|
228
|
+
5. The name of Geoff Kuenning may not be used to endorse or promote
|
229
|
+
products derived from this software without specific prior
|
230
|
+
written permission.
|
231
|
+
|
232
|
+
THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS
|
233
|
+
IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
234
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
235
|
+
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GEOFF
|
236
|
+
KUENNING OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
237
|
+
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
238
|
+
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
239
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
240
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
241
|
+
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
242
|
+
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
243
|
+
POSSIBILITY OF SUCH DAMAGE.
|
data/lang/italian.lang
ADDED
Binary file
|
data/lang/swedish.lang
ADDED
Binary file
|
data/lib/whatlanguage.rb
CHANGED
@@ -1,19 +1,18 @@
|
|
1
|
-
require
|
1
|
+
require 'whatlanguage/bloominsimple'
|
2
|
+
require 'whatlanguage/bitfield'
|
2
3
|
require 'digest/sha1'
|
3
4
|
|
4
|
-
class WhatLanguage
|
5
|
-
VERSION = '1.0.0'
|
6
|
-
|
5
|
+
class WhatLanguage
|
7
6
|
HASHER = lambda { |item| Digest::SHA1.digest(item.downcase.strip).unpack("VV") }
|
8
7
|
|
9
8
|
BITFIELD_WIDTH = 2_000_000
|
10
9
|
|
11
10
|
@@data = {}
|
12
11
|
|
13
|
-
def initialize(options)
|
12
|
+
def initialize(options = {})
|
14
13
|
languages_folder = File.join(File.dirname(__FILE__), "..", "lang")
|
15
14
|
Dir.entries(languages_folder).grep(/\.lang/).each do |lang|
|
16
|
-
@@data[lang[/\w+/].to_sym] ||= BloominSimple.from_dump(File.
|
15
|
+
@@data[lang[/\w+/].to_sym] ||= BloominSimple.from_dump(File.new(File.join(languages_folder, lang), 'rb').read, &HASHER)
|
17
16
|
end
|
18
17
|
end
|
19
18
|
|
File without changes
|
data/test/test_whatlanguage.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
require "test/unit"
|
2
3
|
|
3
|
-
require
|
4
|
+
require 'whatlanguage'
|
4
5
|
|
5
6
|
class TestWhatLanguage < Test::Unit::TestCase
|
6
7
|
def setup
|
@@ -10,7 +11,11 @@ class TestWhatLanguage < Test::Unit::TestCase
|
|
10
11
|
def test_string_method
|
11
12
|
assert_equal :english, "This is a test".language
|
12
13
|
end
|
13
|
-
|
14
|
+
|
15
|
+
def test_dutch
|
16
|
+
assert_equal :dutch, @wl.language("Als hadden geweest is, is hebben te laat.")
|
17
|
+
end
|
18
|
+
|
14
19
|
def test_french
|
15
20
|
assert_equal :french, @wl.language("Bonjour, je m'appelle Sandrine. Voila ma chatte.")
|
16
21
|
end
|
@@ -18,6 +23,14 @@ class TestWhatLanguage < Test::Unit::TestCase
|
|
18
23
|
def test_spanish
|
19
24
|
assert_equal :spanish, @wl.language("La palabra mezquita se usa en español para referirse a todo tipo de edificios dedicados.")
|
20
25
|
end
|
26
|
+
|
27
|
+
def test_swedish
|
28
|
+
assert_equal :swedish, @wl.language("Den spanska räven rev en annan räv alldeles lagom.")
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_russian
|
32
|
+
assert_equal :russian, @wl.language("Все новости в хронологическом порядке")
|
33
|
+
end
|
21
34
|
|
22
35
|
def test_nothing
|
23
36
|
assert_nil @wl.language("")
|
@@ -30,4 +43,9 @@ class TestWhatLanguage < Test::Unit::TestCase
|
|
30
43
|
def test_processor
|
31
44
|
assert_kind_of Hash, @wl.process_text("this is a test")
|
32
45
|
end
|
33
|
-
|
46
|
+
|
47
|
+
def test_italian
|
48
|
+
assert_equal :italian, @wl.language("Roma, capitale dell'impero romano, è stata per secoli il centro politico e culturale della civiltà occidentale.")
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
data/whatlanguage.gemspec
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'whatlanguage/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |gem|
|
7
|
+
gem.name = "whatlanguage"
|
8
|
+
gem.version = WhatLanguage::VERSION
|
9
|
+
gem.authors = ["Peter Cooper"]
|
10
|
+
gem.email = ["git@peterc.org"]
|
11
|
+
gem.description = %q{WhatLanguage rapidly detects the language of a sample of text}
|
12
|
+
gem.summary = %q{Natural language detection for text samples}
|
13
|
+
gem.homepage = "https://github.com/peterc/whatlanguage"
|
14
|
+
|
15
|
+
gem.files = `git ls-files`.split($/).reject { |f| f.start_with?("wordlists") }
|
16
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
|
+
gem.require_paths = ["lib"]
|
19
|
+
end
|
metadata
CHANGED
@@ -1,83 +1,72 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: whatlanguage
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.4
|
5
5
|
platform: ruby
|
6
|
-
authors:
|
6
|
+
authors:
|
7
7
|
- Peter Cooper
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
-
|
16
|
-
name: hoe
|
17
|
-
type: :development
|
18
|
-
version_requirement:
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">="
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 1.7.0
|
24
|
-
version:
|
25
|
-
description: "== FEATURES/PROBLEMS: * Only does French, English and Spanish out of the box. Very easy to train new languages though. * It can be made far more efficient at the comparison stage, but all in good time..! It still beats literal dictionary approaches. * No filter selection yet, you get 'em all loaded. * Tests are reasonably light. == SYNOPSIS: Full Example require 'whatlanguage' texts = [] texts << %q{Deux autres personnes ont \xC3\xA9t\xC3\xA9 arr\xC3\xAAt\xC3\xA9es durant la nuit} texts << %q{The links between the attempted car bombings in Glasgow and London are becoming clearer} texts << %q{En estado de m\xC3\xA1xima alertaen su nivel de cr\xC3\xADtico} texts << %q{Returns the object in enum with the maximum value.} texts << %q{Propose des donn\xC3\xA9es au sujet de la langue espagnole.} texts << %q{La palabra \"mezquita\" se usa en espa\xC3\xB1ol para referirse a todo tipo de edificios dedicados.} texts.each { |text| puts \"#{text[0..18]}... is in #{text.language.to_s.capitalize}\" } Initialize WhatLanguage with all filters wl = WhatLanguage.new(:all)"
|
26
|
-
email: whatlanguage@peterc.org
|
11
|
+
date: 2013-03-07 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: WhatLanguage rapidly detects the language of a sample of text
|
14
|
+
email:
|
15
|
+
- git@peterc.org
|
27
16
|
executables: []
|
28
|
-
|
29
17
|
extensions: []
|
30
|
-
|
31
|
-
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- .gitignore
|
21
|
+
- Gemfile
|
32
22
|
- History.txt
|
23
|
+
- LICENSE.txt
|
33
24
|
- Manifest.txt
|
34
|
-
- README.
|
35
|
-
files:
|
36
|
-
- History.txt
|
37
|
-
- Manifest.txt
|
38
|
-
- README.txt
|
25
|
+
- README.md
|
39
26
|
- Rakefile
|
40
27
|
- build_filter.rb
|
28
|
+
- build_lang_from_wordlists.rb
|
29
|
+
- copyright-en
|
41
30
|
- example.rb
|
42
31
|
- lang/dutch.lang
|
32
|
+
- lang/english.lang
|
43
33
|
- lang/farsi.lang
|
34
|
+
- lang/french.lang
|
44
35
|
- lang/german.lang
|
36
|
+
- lang/italian.lang
|
45
37
|
- lang/pinyin.lang
|
46
|
-
- lang/russian.lang
|
47
|
-
- lang/english.lang
|
48
38
|
- lang/portuguese.lang
|
49
|
-
- lang/
|
39
|
+
- lang/russian.lang
|
50
40
|
- lang/spanish.lang
|
51
|
-
-
|
52
|
-
- lib/bloominsimple.rb
|
41
|
+
- lang/swedish.lang
|
53
42
|
- lib/whatlanguage.rb
|
43
|
+
- lib/whatlanguage/bitfield.rb
|
44
|
+
- lib/whatlanguage/bloominsimple.rb
|
45
|
+
- lib/whatlanguage/version.rb
|
54
46
|
- test/test_whatlanguage.rb
|
55
|
-
|
56
|
-
homepage:
|
47
|
+
- whatlanguage.gemspec
|
48
|
+
homepage: https://github.com/peterc/whatlanguage
|
49
|
+
licenses: []
|
50
|
+
metadata: {}
|
57
51
|
post_install_message:
|
58
|
-
rdoc_options:
|
59
|
-
|
60
|
-
- README.txt
|
61
|
-
require_paths:
|
52
|
+
rdoc_options: []
|
53
|
+
require_paths:
|
62
54
|
- lib
|
63
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
64
|
-
requirements:
|
65
|
-
- -
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version:
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
version: "0"
|
74
|
-
version:
|
55
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - '>='
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0'
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
+
requirements:
|
62
|
+
- - '>='
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: '0'
|
75
65
|
requirements: []
|
76
|
-
|
77
|
-
|
78
|
-
rubygems_version: 1.2.0
|
66
|
+
rubyforge_project:
|
67
|
+
rubygems_version: 2.0.0
|
79
68
|
signing_key:
|
80
|
-
specification_version:
|
81
|
-
summary:
|
82
|
-
test_files:
|
69
|
+
specification_version: 4
|
70
|
+
summary: Natural language detection for text samples
|
71
|
+
test_files:
|
83
72
|
- test/test_whatlanguage.rb
|