whatlanguage 1.0.5 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +41 -0
- data/Gemfile +2 -2
- data/LICENSE.txt +42 -9
- data/README.md +50 -76
- data/Rakefile +9 -3
- data/lib/whatlanguage/languages.rb +180 -0
- data/lib/whatlanguage/trigrams.json +1 -0
- data/lib/whatlanguage/version.rb +3 -1
- data/lib/whatlanguage.rb +264 -50
- data/whatlanguage.gemspec +24 -13
- metadata +43 -47
- data/.gitignore +0 -17
- data/History.txt +0 -15
- data/Manifest.txt +0 -19
- data/build_filter.rb +0 -9
- data/build_lang_from_wordlists.rb +0 -13
- data/copyright-en +0 -243
- data/example.rb +0 -51
- data/lang/arabic.lang +0 -0
- data/lang/dutch.lang +0 -0
- data/lang/english.lang +0 -0
- data/lang/farsi.lang +0 -0
- data/lang/finnish.lang +0 -0
- data/lang/french.lang +0 -0
- data/lang/german.lang +0 -0
- data/lang/greek.lang +0 -0
- data/lang/hebrew.lang +0 -0
- data/lang/hungarian.lang +0 -0
- data/lang/italian.lang +0 -0
- data/lang/korean.lang +0 -0
- data/lang/norwegian.lang +0 -0
- data/lang/pinyin.lang +0 -0
- data/lang/polish.lang +0 -0
- data/lang/portuguese.lang +0 -0
- data/lang/russian.lang +0 -0
- data/lang/spanish.lang +0 -0
- data/lang/swedish.lang +0 -0
- data/lib/whatlanguage/bitfield.rb +0 -64
- data/lib/whatlanguage/bloominsimple.rb +0 -88
- data/test/test_whatlanguage.rb +0 -113
data/lib/whatlanguage.rb
CHANGED
|
@@ -1,66 +1,280 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
require '
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require_relative 'whatlanguage/languages'
|
|
5
|
+
require_relative 'whatlanguage/version'
|
|
6
|
+
|
|
7
|
+
class WhatLanguage
|
|
8
|
+
MAX_TRIGRAM_DISTANCE = 300
|
|
9
|
+
MAX_TOTAL_DISTANCE = MAX_TRIGRAM_DISTANCE * MAX_TRIGRAM_DISTANCE # 90_000
|
|
10
|
+
TEXT_TRIGRAMS_SIZE = 600
|
|
11
|
+
DEFAULT_MIN_CHARS = 10
|
|
12
|
+
|
|
13
|
+
Result = Struct.new(:language, :iso, :score, :ranked, keyword_init: true) do
|
|
14
|
+
alias scores ranked
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# Scripts that resolve to a single language by their Unicode block alone.
|
|
18
|
+
# (Hiragana and Katakana both indicate Japanese.) Scripts NOT listed here but
|
|
19
|
+
# present in the trigram dataset are disambiguated statistically instead.
|
|
20
|
+
DETERMINISTIC = {
|
|
21
|
+
'Mandarin' => 'cmn', 'Bengali' => 'ben', 'Hangul' => 'kor',
|
|
22
|
+
'Georgian' => 'kat', 'Greek' => 'ell', 'Kannada' => 'kan',
|
|
23
|
+
'Tamil' => 'tam', 'Thai' => 'tha', 'Gujarati' => 'guj',
|
|
24
|
+
'Gurmukhi' => 'pan', 'Telugu' => 'tel', 'Malayalam'=> 'mal',
|
|
25
|
+
'Oriya' => 'ori', 'Myanmar' => 'mya', 'Sinhala' => 'sin',
|
|
26
|
+
'Khmer' => 'khm', 'Armenian'=> 'hye', 'Hiragana' => 'jpn',
|
|
27
|
+
'Katakana' => 'jpn'
|
|
28
|
+
}.freeze
|
|
29
|
+
|
|
30
|
+
# Unicode ranges per script, in detection priority order (mirrors whatlang's
|
|
31
|
+
# scripts/detect.rs). The first script whose range contains a character claims
|
|
32
|
+
# that character; the script with the most characters wins.
|
|
33
|
+
SCRIPT_RANGES = [
|
|
34
|
+
['Latin', [[0x61,0x7A],[0x41,0x5A],[0x80,0xFF],[0x100,0x17F],[0x180,0x24F],
|
|
35
|
+
[0x250,0x2AF],[0x1D00,0x1D7F],[0x1D80,0x1DBF],[0x1E00,0x1EFF],
|
|
36
|
+
[0x2100,0x214F],[0x2C60,0x2C7F],[0xA720,0xA7FF],[0xAB30,0xAB6F]]],
|
|
37
|
+
['Cyrillic', [[0x400,0x484],[0x487,0x52F],[0x2DE0,0x2DFF],[0xA640,0xA69D],
|
|
38
|
+
[0x1D2B,0x1D2B],[0x1D78,0x1D78],[0xA69F,0xA69F]]],
|
|
39
|
+
['Arabic', [[0x600,0x6FF],[0x750,0x7FF],[0x8A0,0x8FF],[0xFB50,0xFDFF],
|
|
40
|
+
[0xFE70,0xFEFF],[0x10E60,0x10E7F],[0x1EE00,0x1EEFF]]],
|
|
41
|
+
['Mandarin', [[0x2E80,0x2E99],[0x2E9B,0x2EF3],[0x2F00,0x2FD5],[0x3005,0x3005],
|
|
42
|
+
[0x3007,0x3007],[0x3021,0x3029],[0x3038,0x303B],[0x3400,0x4DB5],
|
|
43
|
+
[0x4E00,0x9FCC],[0xF900,0xFA6D],[0xFA70,0xFAD9]]],
|
|
44
|
+
['Devanagari', [[0x900,0x97F],[0xA8E0,0xA8FF],[0x1CD0,0x1CFF]]],
|
|
45
|
+
['Hebrew', [[0x590,0x5FF]]],
|
|
46
|
+
['Ethiopic', [[0x1200,0x139F],[0x2D80,0x2DDF],[0xAB00,0xAB2F]]],
|
|
47
|
+
['Georgian', [[0x10A0,0x10FF]]],
|
|
48
|
+
['Bengali', [[0x980,0x9FF]]],
|
|
49
|
+
['Hangul', [[0xAC00,0xD7AF],[0x1100,0x11FF],[0x3130,0x318F],[0x3200,0x32FF],
|
|
50
|
+
[0xA960,0xA97F],[0xD7B0,0xD7FF]]],
|
|
51
|
+
['Hiragana', [[0x3040,0x309F]]],
|
|
52
|
+
['Katakana', [[0x30A0,0x30FF]]],
|
|
53
|
+
['Greek', [[0x370,0x3FF]]],
|
|
54
|
+
['Kannada', [[0xC80,0xCFF]]],
|
|
55
|
+
['Tamil', [[0xB80,0xBFF]]],
|
|
56
|
+
['Thai', [[0xE00,0xE7F]]],
|
|
57
|
+
['Gujarati', [[0xA80,0xAFF]]],
|
|
58
|
+
['Gurmukhi', [[0xA00,0xA7F]]],
|
|
59
|
+
['Telugu', [[0xC00,0xC7F]]],
|
|
60
|
+
['Malayalam', [[0xD00,0xD7F]]],
|
|
61
|
+
['Oriya', [[0xB00,0xB7F]]],
|
|
62
|
+
['Myanmar', [[0x1000,0x109F]]],
|
|
63
|
+
['Sinhala', [[0xD80,0xDFF]]],
|
|
64
|
+
['Khmer', [[0x1780,0x17FF],[0x19E0,0x19FF]]],
|
|
65
|
+
['Armenian', [[0x530,0x58F],[0xFB13,0xFB17]]]
|
|
66
|
+
].freeze
|
|
67
|
+
|
|
68
|
+
# ISO 639-1 (with 639-3 fallback) lookup by language-name symbol, plus the
|
|
69
|
+
# historical nil => nil entry. Internal; kept for backward compatibility.
|
|
70
|
+
ISO_CODES = CODE_INFO.each_with_object(nil => nil) do |(_code, (name, iso)), h|
|
|
71
|
+
h[name] = iso
|
|
72
|
+
end.freeze
|
|
73
|
+
|
|
74
|
+
NAME_TO_CODE = CODE_INFO.each_with_object({}) do |(code, (name, _iso)), h|
|
|
75
|
+
h[name] ||= code
|
|
76
|
+
end.freeze
|
|
77
|
+
|
|
78
|
+
private_constant :MAX_TRIGRAM_DISTANCE, :MAX_TOTAL_DISTANCE, :TEXT_TRIGRAMS_SIZE,
|
|
79
|
+
:DEFAULT_MIN_CHARS, :DETERMINISTIC, :SCRIPT_RANGES, :ISO_CODES,
|
|
80
|
+
:NAME_TO_CODE
|
|
81
|
+
|
|
82
|
+
class << self
|
|
83
|
+
def detect(text)
|
|
84
|
+
default_detector.detect(text)
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def language(text)
|
|
88
|
+
default_detector.language(text)
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
def language_iso(text)
|
|
92
|
+
default_detector.language_iso(text)
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
def ranked(text)
|
|
96
|
+
default_detector.ranked(text)
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
def score_hash(text)
|
|
100
|
+
default_detector.score_hash(text)
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
alias scores score_hash
|
|
104
|
+
alias process_text score_hash
|
|
105
|
+
|
|
106
|
+
def languages
|
|
107
|
+
NAME_TO_CODE.keys
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
# script name => [[code, [trigram, ...]], ...], loaded once and memoized.
|
|
111
|
+
def profiles
|
|
112
|
+
@profiles ||= JSON.parse(File.read(File.join(__dir__, 'whatlanguage', 'trigrams.json')))
|
|
113
|
+
.transform_values { |langs| langs.map { |code, str| [code, str.split('|')] } }
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
private
|
|
117
|
+
|
|
118
|
+
def default_detector
|
|
119
|
+
@default_detector ||= new
|
|
17
120
|
end
|
|
18
121
|
end
|
|
19
|
-
|
|
20
|
-
# Very inefficient method for now.. but still beats the non-Bloom alternatives.
|
|
21
|
-
# Change to better bit comparison technique later..
|
|
22
|
-
def process_text(text)
|
|
23
|
-
results = Hash.new(0)
|
|
24
|
-
it = 0
|
|
25
|
-
text.downcase.split.each do |word|
|
|
26
|
-
it += 1
|
|
27
122
|
|
|
123
|
+
def initialize(*selection, only: nil, min_chars: DEFAULT_MIN_CHARS)
|
|
124
|
+
@selection = Array(only || (selection.empty? ? [:all] : selection))
|
|
125
|
+
validate_selection!
|
|
126
|
+
@min_chars = min_chars
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
# Language-name symbols this instance scores against: every supported language
|
|
130
|
+
# for :all, otherwise the requested selection intersected with the supported
|
|
131
|
+
# set (legacy aliases such as :pinyin resolved to their modern names).
|
|
132
|
+
def languages
|
|
133
|
+
@languages ||=
|
|
28
134
|
if @selection.include?(:all)
|
|
29
|
-
languages
|
|
135
|
+
self.class.languages
|
|
30
136
|
else
|
|
31
|
-
|
|
137
|
+
wanted = @selection.map { |s| NAME_ALIASES.fetch(s, s) }
|
|
138
|
+
self.class.languages & wanted
|
|
32
139
|
end
|
|
140
|
+
end
|
|
33
141
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
142
|
+
# Per-language scores for the text (higher = more likely). Languages outside
|
|
143
|
+
# the current selection, or not under the detected script, are absent; the
|
|
144
|
+
# hash defaults to 0. Only the relative ranking is meaningful.
|
|
145
|
+
def score_hash(text)
|
|
146
|
+
results = Hash.new(0)
|
|
147
|
+
text = normalize_text(text)
|
|
148
|
+
script = detect_script(text)
|
|
149
|
+
return results unless script
|
|
150
|
+
|
|
151
|
+
if (code = DETERMINISTIC[script])
|
|
152
|
+
name = CODE_INFO[code].first
|
|
153
|
+
results[name] = MAX_TOTAL_DISTANCE if allowed?(name)
|
|
154
|
+
return results
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
candidates = self.class.profiles[script]
|
|
158
|
+
return results unless candidates
|
|
159
|
+
return results if significant_char_count(text) < @min_chars
|
|
160
|
+
|
|
161
|
+
positions = trigram_positions(text)
|
|
162
|
+
candidates.each do |code, trigrams|
|
|
163
|
+
name = CODE_INFO[code].first
|
|
164
|
+
next unless allowed?(name)
|
|
165
|
+
|
|
166
|
+
results[name] = MAX_TOTAL_DISTANCE - distance(trigrams, positions)
|
|
47
167
|
end
|
|
48
168
|
results
|
|
49
169
|
end
|
|
50
|
-
|
|
170
|
+
|
|
171
|
+
alias scores score_hash
|
|
172
|
+
alias process_text score_hash
|
|
173
|
+
|
|
174
|
+
# Per-language scores as an array sorted from most likely to least likely.
|
|
175
|
+
def ranked(text)
|
|
176
|
+
score_hash(text).sort_by { |_name, score| -score }
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
# Detection result with the winning language, ISO code, winning score, and
|
|
180
|
+
# full ranked scores. Returns nil when the text is too short or unrecognized.
|
|
181
|
+
def detect(text)
|
|
182
|
+
ranked_scores = ranked(text)
|
|
183
|
+
return nil if ranked_scores.empty?
|
|
184
|
+
|
|
185
|
+
name, score = ranked_scores.first
|
|
186
|
+
Result.new(language: name, iso: ISO_CODES[name], score: score, ranked: ranked_scores)
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
# Most likely language as a name symbol, or nil when no language is detected.
|
|
51
190
|
def language(text)
|
|
52
|
-
|
|
191
|
+
detect(text)&.language
|
|
53
192
|
end
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
193
|
+
|
|
194
|
+
# Most likely language as an ISO 639-1 symbol (639-3 fallback), or nil.
|
|
195
|
+
def language_iso(text)
|
|
196
|
+
detect(text)&.iso
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
private
|
|
200
|
+
|
|
201
|
+
def normalize_text(text)
|
|
202
|
+
text.to_s.unicode_normalize(:nfkc)
|
|
203
|
+
end
|
|
204
|
+
|
|
205
|
+
def allowed?(name)
|
|
206
|
+
@selection.include?(:all) || languages.include?(name)
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
def validate_selection!
|
|
210
|
+
requested = @selection.reject { |name| name == :all }
|
|
211
|
+
unknown = requested.reject { |name| self.class.languages.include?(NAME_ALIASES.fetch(name, name)) }
|
|
212
|
+
return if unknown.empty?
|
|
213
|
+
|
|
214
|
+
raise ArgumentError, "Unknown language selection: #{unknown.map(&:inspect).join(', ')}"
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
def significant_char_count(text)
|
|
218
|
+
text.each_char.count { |ch| !stop_char?(ch.ord) }
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
# Dominant Unicode script of the text, or nil if it has no script characters.
|
|
222
|
+
def detect_script(text)
|
|
223
|
+
counts = Hash.new(0)
|
|
224
|
+
text.each_char do |ch|
|
|
225
|
+
cp = ch.ord
|
|
226
|
+
next if stop_char?(cp)
|
|
227
|
+
|
|
228
|
+
SCRIPT_RANGES.each do |name, ranges|
|
|
229
|
+
if ranges.any? { |lo, hi| cp >= lo && cp <= hi }
|
|
230
|
+
counts[name] += 1
|
|
231
|
+
break
|
|
232
|
+
end
|
|
233
|
+
end
|
|
234
|
+
end
|
|
235
|
+
return nil if counts.empty?
|
|
236
|
+
|
|
237
|
+
counts.max_by { |_name, n| n }.first
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
# Text trigrams ranked by descending frequency, mapped to their rank index.
|
|
241
|
+
# Mirrors whatlang's trigram extraction: punctuation/digits become spaces,
|
|
242
|
+
# the stream is bounded by spaces, and runs of spaces are collapsed.
|
|
243
|
+
def trigram_positions(text)
|
|
244
|
+
chars = text.downcase.each_char.map { |c| stop_char?(c.ord) ? ' ' : c }
|
|
245
|
+
return {} if chars.empty?
|
|
246
|
+
|
|
247
|
+
occurrences = Hash.new(0)
|
|
248
|
+
c1 = ' '
|
|
249
|
+
c2 = chars[0]
|
|
250
|
+
(chars[1..] + [' ']).each do |c3|
|
|
251
|
+
occurrences[c1 + c2 + c3] += 1 unless c2 == ' ' && (c1 == ' ' || c3 == ' ')
|
|
252
|
+
c1 = c2
|
|
253
|
+
c2 = c3
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
ranked = occurrences.to_a.sort { |a, b| [b[1], b[0]] <=> [a[1], a[0]] }.first(TEXT_TRIGRAMS_SIZE)
|
|
257
|
+
positions = {}
|
|
258
|
+
ranked.each_with_index { |(trigram, _count), i| positions[trigram] = i }
|
|
259
|
+
positions
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
# Out-of-place distance between a language's ordered trigram profile and the
|
|
263
|
+
# text's ranked trigrams. Lower means a closer match.
|
|
264
|
+
def distance(profile, positions)
|
|
265
|
+
total = 0
|
|
266
|
+
profile.each_with_index do |trigram, i|
|
|
267
|
+
pos = positions[trigram]
|
|
268
|
+
total += pos ? (pos - i).abs : MAX_TRIGRAM_DISTANCE
|
|
269
|
+
end
|
|
270
|
+
|
|
271
|
+
count = positions.size
|
|
272
|
+
total -= (MAX_TRIGRAM_DISTANCE - count) * MAX_TRIGRAM_DISTANCE if MAX_TRIGRAM_DISTANCE > count
|
|
273
|
+
total.clamp(0, MAX_TOTAL_DISTANCE)
|
|
59
274
|
end
|
|
60
|
-
end
|
|
61
275
|
|
|
62
|
-
|
|
63
|
-
def
|
|
64
|
-
|
|
276
|
+
# Space, ASCII punctuation, or digit: no value for script/language detection.
|
|
277
|
+
def stop_char?(codepoint)
|
|
278
|
+
codepoint <= 0x40 || (codepoint >= 0x5B && codepoint <= 0x60) || (codepoint >= 0x7B && codepoint <= 0x7E)
|
|
65
279
|
end
|
|
66
280
|
end
|
data/whatlanguage.gemspec
CHANGED
|
@@ -1,19 +1,30 @@
|
|
|
1
|
-
#
|
|
2
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
|
3
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
5
|
require 'whatlanguage/version'
|
|
5
6
|
|
|
6
7
|
Gem::Specification.new do |gem|
|
|
7
|
-
gem.name =
|
|
8
|
+
gem.name = 'whatlanguage'
|
|
8
9
|
gem.version = WhatLanguage::VERSION
|
|
9
|
-
gem.authors = [
|
|
10
|
-
gem.email = [
|
|
11
|
-
gem.description =
|
|
12
|
-
gem.summary =
|
|
13
|
-
gem.homepage =
|
|
10
|
+
gem.authors = ['Peter Cooper']
|
|
11
|
+
gem.email = ['git@peterc.org']
|
|
12
|
+
gem.description = 'WhatLanguage rapidly detects the language of a sample of text'
|
|
13
|
+
gem.summary = 'Natural language detection for text samples'
|
|
14
|
+
gem.homepage = 'https://github.com/peterc/whatlanguage'
|
|
15
|
+
gem.license = 'MIT'
|
|
16
|
+
gem.required_ruby_version = '>= 3.0'
|
|
17
|
+
|
|
18
|
+
gem.files = Dir['lib/**/*'] + [
|
|
19
|
+
'README.md',
|
|
20
|
+
'CHANGELOG.md',
|
|
21
|
+
'LICENSE.txt',
|
|
22
|
+
'Gemfile',
|
|
23
|
+
'Rakefile',
|
|
24
|
+
'whatlanguage.gemspec'
|
|
25
|
+
]
|
|
26
|
+
gem.require_paths = ['lib']
|
|
14
27
|
|
|
15
|
-
gem.
|
|
16
|
-
gem.
|
|
17
|
-
|
|
18
|
-
gem.require_paths = ["lib"]
|
|
19
|
-
end
|
|
28
|
+
gem.add_development_dependency 'minitest', '~> 5.0'
|
|
29
|
+
gem.add_development_dependency 'rake'
|
|
30
|
+
end
|
metadata
CHANGED
|
@@ -1,16 +1,42 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: whatlanguage
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
5
|
-
prerelease:
|
|
4
|
+
version: 2.0.0
|
|
6
5
|
platform: ruby
|
|
7
6
|
authors:
|
|
8
7
|
- Peter Cooper
|
|
9
|
-
autorequire:
|
|
10
8
|
bindir: bin
|
|
11
9
|
cert_chain: []
|
|
12
|
-
date:
|
|
13
|
-
dependencies:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: minitest
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '5.0'
|
|
19
|
+
type: :development
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '5.0'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: rake
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '0'
|
|
33
|
+
type: :development
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '0'
|
|
14
40
|
description: WhatLanguage rapidly detects the language of a sample of text
|
|
15
41
|
email:
|
|
16
42
|
- git@peterc.org
|
|
@@ -18,65 +44,35 @@ executables: []
|
|
|
18
44
|
extensions: []
|
|
19
45
|
extra_rdoc_files: []
|
|
20
46
|
files:
|
|
21
|
-
- .
|
|
47
|
+
- CHANGELOG.md
|
|
22
48
|
- Gemfile
|
|
23
|
-
- History.txt
|
|
24
49
|
- LICENSE.txt
|
|
25
|
-
- Manifest.txt
|
|
26
50
|
- README.md
|
|
27
51
|
- Rakefile
|
|
28
|
-
- build_filter.rb
|
|
29
|
-
- build_lang_from_wordlists.rb
|
|
30
|
-
- copyright-en
|
|
31
|
-
- example.rb
|
|
32
|
-
- lang/arabic.lang
|
|
33
|
-
- lang/dutch.lang
|
|
34
|
-
- lang/english.lang
|
|
35
|
-
- lang/farsi.lang
|
|
36
|
-
- lang/finnish.lang
|
|
37
|
-
- lang/french.lang
|
|
38
|
-
- lang/german.lang
|
|
39
|
-
- lang/greek.lang
|
|
40
|
-
- lang/hebrew.lang
|
|
41
|
-
- lang/hungarian.lang
|
|
42
|
-
- lang/italian.lang
|
|
43
|
-
- lang/korean.lang
|
|
44
|
-
- lang/norwegian.lang
|
|
45
|
-
- lang/pinyin.lang
|
|
46
|
-
- lang/polish.lang
|
|
47
|
-
- lang/portuguese.lang
|
|
48
|
-
- lang/russian.lang
|
|
49
|
-
- lang/spanish.lang
|
|
50
|
-
- lang/swedish.lang
|
|
51
52
|
- lib/whatlanguage.rb
|
|
52
|
-
- lib/whatlanguage/
|
|
53
|
-
- lib/whatlanguage/
|
|
53
|
+
- lib/whatlanguage/languages.rb
|
|
54
|
+
- lib/whatlanguage/trigrams.json
|
|
54
55
|
- lib/whatlanguage/version.rb
|
|
55
|
-
- test/test_whatlanguage.rb
|
|
56
56
|
- whatlanguage.gemspec
|
|
57
57
|
homepage: https://github.com/peterc/whatlanguage
|
|
58
|
-
licenses:
|
|
59
|
-
|
|
58
|
+
licenses:
|
|
59
|
+
- MIT
|
|
60
|
+
metadata: {}
|
|
60
61
|
rdoc_options: []
|
|
61
62
|
require_paths:
|
|
62
63
|
- lib
|
|
63
64
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
64
|
-
none: false
|
|
65
65
|
requirements:
|
|
66
|
-
- -
|
|
66
|
+
- - ">="
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
|
-
version: '0'
|
|
68
|
+
version: '3.0'
|
|
69
69
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
70
|
-
none: false
|
|
71
70
|
requirements:
|
|
72
|
-
- -
|
|
71
|
+
- - ">="
|
|
73
72
|
- !ruby/object:Gem::Version
|
|
74
73
|
version: '0'
|
|
75
74
|
requirements: []
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
signing_key:
|
|
79
|
-
specification_version: 3
|
|
75
|
+
rubygems_version: 4.0.6
|
|
76
|
+
specification_version: 4
|
|
80
77
|
summary: Natural language detection for text samples
|
|
81
|
-
test_files:
|
|
82
|
-
- test/test_whatlanguage.rb
|
|
78
|
+
test_files: []
|
data/.gitignore
DELETED
data/History.txt
DELETED
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
== 1.0.5 / 2013-10-05
|
|
2
|
-
|
|
3
|
-
* Many more languages supported
|
|
4
|
-
|
|
5
|
-
== 1.0.4 / 2013-03-07
|
|
6
|
-
|
|
7
|
-
== 1.0.1 / 2008-08-22
|
|
8
|
-
|
|
9
|
-
* Public release
|
|
10
|
-
* Removed wordlists from distribution to reduce size
|
|
11
|
-
|
|
12
|
-
== 1.0.0 / 2007-07-02
|
|
13
|
-
|
|
14
|
-
* First version with pre-built English, French, and Spanish filters
|
|
15
|
-
|
data/Manifest.txt
DELETED
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
History.txt
|
|
2
|
-
Manifest.txt
|
|
3
|
-
README.txt
|
|
4
|
-
Rakefile
|
|
5
|
-
build_filter.rb
|
|
6
|
-
example.rb
|
|
7
|
-
lang/dutch.lang
|
|
8
|
-
lang/farsi.lang
|
|
9
|
-
lang/german.lang
|
|
10
|
-
lang/pinyin.lang
|
|
11
|
-
lang/russian.lang
|
|
12
|
-
lang/english.lang
|
|
13
|
-
lang/portuguese.lang
|
|
14
|
-
lang/french.lang
|
|
15
|
-
lang/spanish.lang
|
|
16
|
-
lib/bitfield.rb
|
|
17
|
-
lib/bloominsimple.rb
|
|
18
|
-
lib/whatlanguage.rb
|
|
19
|
-
test/test_whatlanguage.rb
|
data/build_filter.rb
DELETED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
# Use this to build new filters (for other languages, ideally) from /usr/share/dict/words style dictionaries..
|
|
2
|
-
#
|
|
3
|
-
# Call like so..
|
|
4
|
-
# ruby build_filter.rb /usr/share/dict/words lang/english.lang
|
|
5
|
-
# (replace params as necessary)
|
|
6
|
-
|
|
7
|
-
require 'lib/whatlanguage'
|
|
8
|
-
filter = WhatLanguage.filter_from_dictionary(ARGV[0])
|
|
9
|
-
File.open(ARGV[1], 'wb') { |f| f.write filter.dump }
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
# Builds all of the word lists in ./wordlists/ into filter files in ./lang/
|
|
2
|
-
|
|
3
|
-
require 'lib/whatlanguage'
|
|
4
|
-
|
|
5
|
-
languages_folder = File.join(File.dirname(__FILE__), "lang")
|
|
6
|
-
wordlists_folder = File.join(File.dirname(__FILE__), "wordlists")
|
|
7
|
-
|
|
8
|
-
Dir.entries(wordlists_folder).grep(/\w/).each do |lang|
|
|
9
|
-
next if lang == 'generators'
|
|
10
|
-
puts "Doing #{lang}"
|
|
11
|
-
filter = WhatLanguage.filter_from_dictionary(File.join(wordlists_folder, lang))
|
|
12
|
-
File.open(File.join(languages_folder, lang + ".lang"), 'wb') { |f| f.write filter.dump }
|
|
13
|
-
end
|