opener-language-identifier 4.1.0 → 4.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -3
- data/core/target/LanguageDetection-0.0.1.jar +0 -0
- data/core/target/LanguageDetection-1.0.0.jar +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/Detector.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/DetectorFactory.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/LangProfile.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/NGram.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/messages.properties +7 -0
- data/core/target/classes/profiles/af +1 -0
- data/core/target/classes/profiles/ko +1 -1
- data/core/target/classes/profiles/ro +1 -1
- data/core/target/classes/profiles/vi +1 -1
- data/core/target/classes/short_profiles/ar +1 -0
- data/core/target/classes/short_profiles/bg +1 -0
- data/core/target/classes/short_profiles/bn +1 -0
- data/core/target/classes/short_profiles/ca +1 -0
- data/core/target/classes/short_profiles/cs +1 -0
- data/core/target/classes/short_profiles/da +1 -0
- data/core/target/classes/short_profiles/de +1 -0
- data/core/target/classes/short_profiles/el +1 -0
- data/core/target/classes/short_profiles/en +1 -0
- data/core/target/classes/short_profiles/es +1 -0
- data/core/target/classes/short_profiles/et +1 -0
- data/core/target/classes/short_profiles/fa +1 -0
- data/core/target/classes/short_profiles/fi +1 -0
- data/core/target/classes/short_profiles/fr +1 -0
- data/core/target/classes/short_profiles/gu +1 -0
- data/core/target/classes/short_profiles/he +1 -0
- data/core/target/classes/short_profiles/hi +1 -0
- data/core/target/classes/short_profiles/hr +1 -0
- data/core/target/classes/short_profiles/hu +1 -0
- data/core/target/classes/short_profiles/id +1 -0
- data/core/target/classes/short_profiles/it +1 -0
- data/core/target/classes/short_profiles/ja +1 -0
- data/core/target/classes/short_profiles/ko +1 -0
- data/core/target/classes/short_profiles/lt +1 -0
- data/core/target/classes/short_profiles/lv +1 -0
- data/core/target/classes/short_profiles/mk +1 -0
- data/core/target/classes/short_profiles/ml +1 -0
- data/core/target/classes/short_profiles/nl +1 -0
- data/core/target/classes/short_profiles/no +1 -0
- data/core/target/classes/short_profiles/pa +1 -0
- data/core/target/classes/short_profiles/pl +1 -0
- data/core/target/classes/short_profiles/pt +1 -0
- data/core/target/classes/short_profiles/ro +1 -0
- data/core/target/classes/short_profiles/ru +1 -0
- data/core/target/classes/short_profiles/si +1 -0
- data/core/target/classes/short_profiles/sq +1 -0
- data/core/target/classes/short_profiles/sv +1 -0
- data/core/target/classes/short_profiles/ta +1 -0
- data/core/target/classes/short_profiles/te +1 -0
- data/core/target/classes/short_profiles/th +1 -0
- data/core/target/classes/short_profiles/tl +1 -0
- data/core/target/classes/short_profiles/tr +1 -0
- data/core/target/classes/short_profiles/uk +1 -0
- data/core/target/classes/short_profiles/ur +1 -0
- data/core/target/classes/short_profiles/vi +1 -0
- data/core/target/classes/short_profiles/zh-cn +1 -0
- data/core/target/classes/short_profiles/zh-tw +1 -0
- data/lib/opener/language_identifier/detector.rb +143 -24
- data/lib/opener/language_identifier/version.rb +1 -1
- data/lib/opener/language_identifier.rb +3 -5
- data/opener-language-identifier.gemspec +0 -1
- metadata +51 -21
- data/core/target/classes/com/cybozu/labs/langdetect/Command.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/GenProfile.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/TagExtractor.class +0 -0
- data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/CybozuDetector.class +0 -0
- data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/Main.class +0 -0
@@ -1,41 +1,160 @@
|
|
1
|
-
require 'singleton'
|
2
|
-
|
3
|
-
import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
|
4
|
-
|
5
1
|
module Opener
|
6
2
|
class LanguageIdentifier
|
7
3
|
##
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
4
|
+
# Ruby wrapper around the Cybozu DetectorFactory and Detector classes. This
|
5
|
+
# class automatically handles switching of profiles based on input sizes,
|
6
|
+
# assigning priorities to languages, etc.
|
11
7
|
#
|
12
8
|
class Detector
|
13
|
-
attr_reader :
|
9
|
+
attr_reader :profiles_path, :short_profiles_path
|
10
|
+
|
11
|
+
##
|
12
|
+
# Path to the directory containing the default profiles.
|
13
|
+
#
|
14
|
+
# @return [String]
|
15
|
+
#
|
16
|
+
DEFAULT_PROFILES_PATH = File.expand_path(
|
17
|
+
'../../../../core/target/classes/profiles',
|
18
|
+
__FILE__
|
19
|
+
)
|
20
|
+
|
21
|
+
##
|
22
|
+
# Path to the directory containing the default short profiles.
|
23
|
+
#
|
24
|
+
# @return [String]
|
25
|
+
#
|
26
|
+
DEFAULT_SHORT_PROFILES_PATH = File.expand_path(
|
27
|
+
'../../../../core/target/classes/short_profiles',
|
28
|
+
__FILE__
|
29
|
+
)
|
30
|
+
|
31
|
+
##
|
32
|
+
# The amount of characters after which the detector should switch to using
|
33
|
+
# the longer profiles set.
|
34
|
+
#
|
35
|
+
# @return [Fixnum]
|
36
|
+
#
|
37
|
+
SHORT_THRESHOLD = 15
|
38
|
+
|
39
|
+
##
|
40
|
+
# Prioritize OpeNER languages over the rest. Languages not covered by this
|
41
|
+
# list are automatically given a default priority.
|
42
|
+
#
|
43
|
+
# @return [Hash]
|
44
|
+
#
|
45
|
+
PRIORITIES = {
|
46
|
+
'en' => 1.0,
|
47
|
+
'es' => 0.9,
|
48
|
+
'it' => 0.9,
|
49
|
+
'fr' => 0.9,
|
50
|
+
'de' => 0.9,
|
51
|
+
'nl' => 0.9
|
52
|
+
}
|
14
53
|
|
15
|
-
|
54
|
+
##
|
55
|
+
# The default priority for non OpeNER languages.
|
56
|
+
#
|
57
|
+
# @return [Float]
|
58
|
+
#
|
59
|
+
DEFAULT_PRIORITY = 0.5
|
16
60
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
61
|
+
##
|
62
|
+
# @param [Hash] options
|
63
|
+
#
|
64
|
+
# @option options [String] :profiles_path
|
65
|
+
# @option options [String] :short_profiles_path
|
66
|
+
#
|
67
|
+
def initialize(options = {})
|
68
|
+
options.each do |key, value|
|
69
|
+
instance_variable_set("@#{key}", value) if respond_to?(key)
|
70
|
+
end
|
71
|
+
|
72
|
+
@profiles_path ||= DEFAULT_PROFILES_PATH
|
73
|
+
@short_profiles_path ||= DEFAULT_SHORT_PROFILES_PATH
|
21
74
|
end
|
22
75
|
|
76
|
+
##
|
77
|
+
# @return [String]
|
78
|
+
#
|
23
79
|
def detect(input)
|
24
|
-
|
25
|
-
@detector.detect(input)
|
26
|
-
end
|
80
|
+
return new_detector(input).detect
|
27
81
|
end
|
28
82
|
|
83
|
+
##
|
84
|
+
# @return [Array]
|
85
|
+
#
|
29
86
|
def probabilities(input)
|
30
|
-
|
31
|
-
|
87
|
+
return new_detector(input).get_probabilities.to_array
|
88
|
+
end
|
89
|
+
|
90
|
+
##
|
91
|
+
# Returns a new detector with the profiles set based on the input.
|
92
|
+
#
|
93
|
+
# This method analyses a lowercased version of the input as this yields
|
94
|
+
# better results for short text.
|
95
|
+
#
|
96
|
+
# @param [String] input
|
97
|
+
# @return [CybozuDetector]
|
98
|
+
#
|
99
|
+
def new_detector(input)
|
100
|
+
factory = com.cybozu.labs.langdetect.DetectorFactory.new
|
101
|
+
|
102
|
+
factory.load_profile(determine_profiles(input))
|
103
|
+
factory.set_seed(1)
|
104
|
+
|
105
|
+
priorities = build_priorities(input, factory.langlist)
|
106
|
+
detector = com.cybozu.labs.langdetect.Detector.new(factory)
|
107
|
+
|
108
|
+
detector.set_prior_map(priorities)
|
109
|
+
detector.append(input.downcase)
|
110
|
+
|
111
|
+
return detector
|
112
|
+
end
|
113
|
+
|
114
|
+
##
|
115
|
+
# Builds a Java Hash mapping the priorities for all OpeNER and non OpeNER
|
116
|
+
# languages.
|
117
|
+
#
|
118
|
+
# If the input size is smaller than the short profiles threshold non
|
119
|
+
# OpeNER languages are _disabled_. This is to ensure that these languages
|
120
|
+
# are detected properly when analysing only 1-2 words.
|
121
|
+
#
|
122
|
+
# @param [String] input
|
123
|
+
# @param [Array<String>] languages
|
124
|
+
# @return [java.util.HashMap]
|
125
|
+
#
|
126
|
+
def build_priorities(input, languages)
|
127
|
+
priorities = java.util.HashMap.new
|
128
|
+
priority = short_input?(input) ? 0.0 : DEFAULT_PRIORITY
|
129
|
+
|
130
|
+
PRIORITIES.each do |lang, val|
|
131
|
+
priorities.put(lang, val)
|
32
132
|
end
|
133
|
+
|
134
|
+
languages.each do |language|
|
135
|
+
unless priorities.contains_key(language)
|
136
|
+
priorities.put(language, priority)
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
return priorities
|
141
|
+
end
|
142
|
+
|
143
|
+
##
|
144
|
+
# @param [String] input
|
145
|
+
# @return [String]
|
146
|
+
#
|
147
|
+
def determine_profiles(input)
|
148
|
+
return short_input?(input) ? short_profiles_path : profiles_path
|
33
149
|
end
|
34
150
|
|
35
|
-
|
36
|
-
|
37
|
-
|
151
|
+
##
|
152
|
+
# @param [String] input
|
153
|
+
# @return [TrueClass|FalseClass]
|
154
|
+
#
|
155
|
+
def short_input?(input)
|
156
|
+
return input.length <= SHORT_THRESHOLD
|
38
157
|
end
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|
158
|
+
end # Detector
|
159
|
+
end # LanguageIdentifier
|
160
|
+
end # Opener
|
@@ -1,10 +1,8 @@
|
|
1
|
-
require 'java'
|
2
1
|
require 'open3'
|
3
2
|
require 'slop'
|
4
3
|
require 'builder'
|
5
4
|
|
6
|
-
require_relative '../../core/target/LanguageDetection-0.0.
|
7
|
-
import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
|
5
|
+
require_relative '../../core/target/LanguageDetection-1.0.0.jar'
|
8
6
|
|
9
7
|
require_relative 'language_identifier/version'
|
10
8
|
require_relative 'language_identifier/kaf_builder'
|
@@ -46,8 +44,8 @@ module Opener
|
|
46
44
|
# are returned instead of the language/KAF.
|
47
45
|
#
|
48
46
|
def initialize(options = {})
|
49
|
-
@options
|
50
|
-
@detector = Detector.
|
47
|
+
@options = DEFAULT_OPTIONS.merge(options)
|
48
|
+
@detector = Detector.new
|
51
49
|
end
|
52
50
|
|
53
51
|
##
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-language-identifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.
|
4
|
+
version: 4.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-01-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opener-daemons
|
@@ -94,20 +94,6 @@ dependencies:
|
|
94
94
|
version: '3.0'
|
95
95
|
prerelease: false
|
96
96
|
type: :development
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: cucumber
|
99
|
-
version_requirements: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - '>='
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
104
|
-
requirement: !ruby/object:Gem::Requirement
|
105
|
-
requirements:
|
106
|
-
- - '>='
|
107
|
-
- !ruby/object:Gem::Version
|
108
|
-
version: '0'
|
109
|
-
prerelease: false
|
110
|
-
type: :development
|
111
97
|
- !ruby/object:Gem::Dependency
|
112
98
|
name: rake
|
113
99
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -145,9 +131,8 @@ executables:
|
|
145
131
|
extensions: []
|
146
132
|
extra_rdoc_files: []
|
147
133
|
files:
|
134
|
+
- core/target/LanguageDetection-1.0.0.jar
|
148
135
|
- core/target/LanguageDetection-0.0.1.jar
|
149
|
-
- core/target/classes/org/vicomtech/opennlp/LanguageDetection/CybozuDetector.class
|
150
|
-
- core/target/classes/org/vicomtech/opennlp/LanguageDetection/Main.class
|
151
136
|
- core/target/classes/profiles/fi
|
152
137
|
- core/target/classes/profiles/ta
|
153
138
|
- core/target/classes/profiles/mr
|
@@ -193,6 +178,7 @@ files:
|
|
193
178
|
- core/target/classes/profiles/vi
|
194
179
|
- core/target/classes/profiles/et
|
195
180
|
- core/target/classes/profiles/hi
|
181
|
+
- core/target/classes/profiles/af
|
196
182
|
- core/target/classes/profiles/gu
|
197
183
|
- core/target/classes/profiles/zh-cn
|
198
184
|
- core/target/classes/profiles/mk
|
@@ -202,15 +188,59 @@ files:
|
|
202
188
|
- core/target/classes/profiles/cs
|
203
189
|
- core/target/classes/profiles/bn
|
204
190
|
- core/target/classes/profiles/tl
|
191
|
+
- core/target/classes/short_profiles/fi
|
192
|
+
- core/target/classes/short_profiles/ta
|
193
|
+
- core/target/classes/short_profiles/ml
|
194
|
+
- core/target/classes/short_profiles/hr
|
195
|
+
- core/target/classes/short_profiles/id
|
196
|
+
- core/target/classes/short_profiles/es
|
197
|
+
- core/target/classes/short_profiles/no
|
198
|
+
- core/target/classes/short_profiles/ca
|
199
|
+
- core/target/classes/short_profiles/en
|
200
|
+
- core/target/classes/short_profiles/ru
|
201
|
+
- core/target/classes/short_profiles/te
|
202
|
+
- core/target/classes/short_profiles/lt
|
203
|
+
- core/target/classes/short_profiles/pa
|
204
|
+
- core/target/classes/short_profiles/ja
|
205
|
+
- core/target/classes/short_profiles/he
|
206
|
+
- core/target/classes/short_profiles/nl
|
207
|
+
- core/target/classes/short_profiles/tr
|
208
|
+
- core/target/classes/short_profiles/pl
|
209
|
+
- core/target/classes/short_profiles/si
|
210
|
+
- core/target/classes/short_profiles/fa
|
211
|
+
- core/target/classes/short_profiles/de
|
212
|
+
- core/target/classes/short_profiles/bg
|
213
|
+
- core/target/classes/short_profiles/it
|
214
|
+
- core/target/classes/short_profiles/fr
|
215
|
+
- core/target/classes/short_profiles/el
|
216
|
+
- core/target/classes/short_profiles/pt
|
217
|
+
- core/target/classes/short_profiles/uk
|
218
|
+
- core/target/classes/short_profiles/da
|
219
|
+
- core/target/classes/short_profiles/ar
|
220
|
+
- core/target/classes/short_profiles/zh-tw
|
221
|
+
- core/target/classes/short_profiles/sq
|
222
|
+
- core/target/classes/short_profiles/th
|
223
|
+
- core/target/classes/short_profiles/ko
|
224
|
+
- core/target/classes/short_profiles/ro
|
225
|
+
- core/target/classes/short_profiles/lv
|
226
|
+
- core/target/classes/short_profiles/sv
|
227
|
+
- core/target/classes/short_profiles/vi
|
228
|
+
- core/target/classes/short_profiles/et
|
229
|
+
- core/target/classes/short_profiles/hi
|
230
|
+
- core/target/classes/short_profiles/gu
|
231
|
+
- core/target/classes/short_profiles/zh-cn
|
232
|
+
- core/target/classes/short_profiles/mk
|
233
|
+
- core/target/classes/short_profiles/ur
|
234
|
+
- core/target/classes/short_profiles/hu
|
235
|
+
- core/target/classes/short_profiles/cs
|
236
|
+
- core/target/classes/short_profiles/bn
|
237
|
+
- core/target/classes/short_profiles/tl
|
205
238
|
- core/target/classes/com/cybozu/labs/langdetect/ErrorCode.class
|
206
|
-
- core/target/classes/com/cybozu/labs/langdetect/GenProfile.class
|
207
|
-
- core/target/classes/com/cybozu/labs/langdetect/Command.class
|
208
239
|
- core/target/classes/com/cybozu/labs/langdetect/LangDetectException.class
|
209
240
|
- core/target/classes/com/cybozu/labs/langdetect/Language.class
|
210
241
|
- core/target/classes/com/cybozu/labs/langdetect/Detector.class
|
211
242
|
- core/target/classes/com/cybozu/labs/langdetect/af
|
212
243
|
- core/target/classes/com/cybozu/labs/langdetect/DetectorFactory.class
|
213
|
-
- core/target/classes/com/cybozu/labs/langdetect/util/TagExtractor.class
|
214
244
|
- core/target/classes/com/cybozu/labs/langdetect/util/NGram.class
|
215
245
|
- core/target/classes/com/cybozu/labs/langdetect/util/LangProfile.class
|
216
246
|
- core/target/classes/com/cybozu/labs/langdetect/util/Messages.class
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|