opener-language-identifier 4.1.0 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -3
- data/core/target/LanguageDetection-0.0.1.jar +0 -0
- data/core/target/LanguageDetection-1.0.0.jar +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/Detector.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/DetectorFactory.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/LangProfile.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/NGram.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/messages.properties +7 -0
- data/core/target/classes/profiles/af +1 -0
- data/core/target/classes/profiles/ko +1 -1
- data/core/target/classes/profiles/ro +1 -1
- data/core/target/classes/profiles/vi +1 -1
- data/core/target/classes/short_profiles/ar +1 -0
- data/core/target/classes/short_profiles/bg +1 -0
- data/core/target/classes/short_profiles/bn +1 -0
- data/core/target/classes/short_profiles/ca +1 -0
- data/core/target/classes/short_profiles/cs +1 -0
- data/core/target/classes/short_profiles/da +1 -0
- data/core/target/classes/short_profiles/de +1 -0
- data/core/target/classes/short_profiles/el +1 -0
- data/core/target/classes/short_profiles/en +1 -0
- data/core/target/classes/short_profiles/es +1 -0
- data/core/target/classes/short_profiles/et +1 -0
- data/core/target/classes/short_profiles/fa +1 -0
- data/core/target/classes/short_profiles/fi +1 -0
- data/core/target/classes/short_profiles/fr +1 -0
- data/core/target/classes/short_profiles/gu +1 -0
- data/core/target/classes/short_profiles/he +1 -0
- data/core/target/classes/short_profiles/hi +1 -0
- data/core/target/classes/short_profiles/hr +1 -0
- data/core/target/classes/short_profiles/hu +1 -0
- data/core/target/classes/short_profiles/id +1 -0
- data/core/target/classes/short_profiles/it +1 -0
- data/core/target/classes/short_profiles/ja +1 -0
- data/core/target/classes/short_profiles/ko +1 -0
- data/core/target/classes/short_profiles/lt +1 -0
- data/core/target/classes/short_profiles/lv +1 -0
- data/core/target/classes/short_profiles/mk +1 -0
- data/core/target/classes/short_profiles/ml +1 -0
- data/core/target/classes/short_profiles/nl +1 -0
- data/core/target/classes/short_profiles/no +1 -0
- data/core/target/classes/short_profiles/pa +1 -0
- data/core/target/classes/short_profiles/pl +1 -0
- data/core/target/classes/short_profiles/pt +1 -0
- data/core/target/classes/short_profiles/ro +1 -0
- data/core/target/classes/short_profiles/ru +1 -0
- data/core/target/classes/short_profiles/si +1 -0
- data/core/target/classes/short_profiles/sq +1 -0
- data/core/target/classes/short_profiles/sv +1 -0
- data/core/target/classes/short_profiles/ta +1 -0
- data/core/target/classes/short_profiles/te +1 -0
- data/core/target/classes/short_profiles/th +1 -0
- data/core/target/classes/short_profiles/tl +1 -0
- data/core/target/classes/short_profiles/tr +1 -0
- data/core/target/classes/short_profiles/uk +1 -0
- data/core/target/classes/short_profiles/ur +1 -0
- data/core/target/classes/short_profiles/vi +1 -0
- data/core/target/classes/short_profiles/zh-cn +1 -0
- data/core/target/classes/short_profiles/zh-tw +1 -0
- data/lib/opener/language_identifier/detector.rb +143 -24
- data/lib/opener/language_identifier/version.rb +1 -1
- data/lib/opener/language_identifier.rb +3 -5
- data/opener-language-identifier.gemspec +0 -1
- metadata +51 -21
- data/core/target/classes/com/cybozu/labs/langdetect/Command.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/GenProfile.class +0 -0
- data/core/target/classes/com/cybozu/labs/langdetect/util/TagExtractor.class +0 -0
- data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/CybozuDetector.class +0 -0
- data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/Main.class +0 -0
@@ -1,41 +1,160 @@
|
|
1
|
-
require 'singleton'
|
2
|
-
|
3
|
-
import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
|
4
|
-
|
5
1
|
module Opener
|
6
2
|
class LanguageIdentifier
|
7
3
|
##
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
4
|
+
# Ruby wrapper around the Cybozu DetectorFactory and Detector classes. This
|
5
|
+
# class automatically handles switching of profiles based on input sizes,
|
6
|
+
# assigning priorities to languages, etc.
|
11
7
|
#
|
12
8
|
class Detector
|
13
|
-
attr_reader :
|
9
|
+
attr_reader :profiles_path, :short_profiles_path
|
10
|
+
|
11
|
+
##
|
12
|
+
# Path to the directory containing the default profiles.
|
13
|
+
#
|
14
|
+
# @return [String]
|
15
|
+
#
|
16
|
+
DEFAULT_PROFILES_PATH = File.expand_path(
|
17
|
+
'../../../../core/target/classes/profiles',
|
18
|
+
__FILE__
|
19
|
+
)
|
20
|
+
|
21
|
+
##
|
22
|
+
# Path to the directory containing the default short profiles.
|
23
|
+
#
|
24
|
+
# @return [String]
|
25
|
+
#
|
26
|
+
DEFAULT_SHORT_PROFILES_PATH = File.expand_path(
|
27
|
+
'../../../../core/target/classes/short_profiles',
|
28
|
+
__FILE__
|
29
|
+
)
|
30
|
+
|
31
|
+
##
|
32
|
+
# The amount of characters after which the detector should switch to using
|
33
|
+
# the longer profiles set.
|
34
|
+
#
|
35
|
+
# @return [Fixnum]
|
36
|
+
#
|
37
|
+
SHORT_THRESHOLD = 15
|
38
|
+
|
39
|
+
##
|
40
|
+
# Prioritize OpeNER languages over the rest. Languages not covered by this
|
41
|
+
# list are automatically given a default priority.
|
42
|
+
#
|
43
|
+
# @return [Hash]
|
44
|
+
#
|
45
|
+
PRIORITIES = {
|
46
|
+
'en' => 1.0,
|
47
|
+
'es' => 0.9,
|
48
|
+
'it' => 0.9,
|
49
|
+
'fr' => 0.9,
|
50
|
+
'de' => 0.9,
|
51
|
+
'nl' => 0.9
|
52
|
+
}
|
14
53
|
|
15
|
-
|
54
|
+
##
|
55
|
+
# The default priority for non OpeNER languages.
|
56
|
+
#
|
57
|
+
# @return [Float]
|
58
|
+
#
|
59
|
+
DEFAULT_PRIORITY = 0.5
|
16
60
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
61
|
+
##
|
62
|
+
# @param [Hash] options
|
63
|
+
#
|
64
|
+
# @option options [String] :profiles_path
|
65
|
+
# @option options [String] :short_profiles_path
|
66
|
+
#
|
67
|
+
def initialize(options = {})
|
68
|
+
options.each do |key, value|
|
69
|
+
instance_variable_set("@#{key}", value) if respond_to?(key)
|
70
|
+
end
|
71
|
+
|
72
|
+
@profiles_path ||= DEFAULT_PROFILES_PATH
|
73
|
+
@short_profiles_path ||= DEFAULT_SHORT_PROFILES_PATH
|
21
74
|
end
|
22
75
|
|
76
|
+
##
|
77
|
+
# @return [String]
|
78
|
+
#
|
23
79
|
def detect(input)
|
24
|
-
|
25
|
-
@detector.detect(input)
|
26
|
-
end
|
80
|
+
return new_detector(input).detect
|
27
81
|
end
|
28
82
|
|
83
|
+
##
|
84
|
+
# @return [Array]
|
85
|
+
#
|
29
86
|
def probabilities(input)
|
30
|
-
|
31
|
-
|
87
|
+
return new_detector(input).get_probabilities.to_array
|
88
|
+
end
|
89
|
+
|
90
|
+
##
|
91
|
+
# Returns a new detector with the profiles set based on the input.
|
92
|
+
#
|
93
|
+
# This method analyses a lowercased version of the input as this yields
|
94
|
+
# better results for short text.
|
95
|
+
#
|
96
|
+
# @param [String] input
|
97
|
+
# @return [CybozuDetector]
|
98
|
+
#
|
99
|
+
def new_detector(input)
|
100
|
+
factory = com.cybozu.labs.langdetect.DetectorFactory.new
|
101
|
+
|
102
|
+
factory.load_profile(determine_profiles(input))
|
103
|
+
factory.set_seed(1)
|
104
|
+
|
105
|
+
priorities = build_priorities(input, factory.langlist)
|
106
|
+
detector = com.cybozu.labs.langdetect.Detector.new(factory)
|
107
|
+
|
108
|
+
detector.set_prior_map(priorities)
|
109
|
+
detector.append(input.downcase)
|
110
|
+
|
111
|
+
return detector
|
112
|
+
end
|
113
|
+
|
114
|
+
##
|
115
|
+
# Builds a Java Hash mapping the priorities for all OpeNER and non OpeNER
|
116
|
+
# languages.
|
117
|
+
#
|
118
|
+
# If the input size is smaller than the short profiles threshold non
|
119
|
+
# OpeNER languages are _disabled_. This is to ensure that these languages
|
120
|
+
# are detected properly when analysing only 1-2 words.
|
121
|
+
#
|
122
|
+
# @param [String] input
|
123
|
+
# @param [Array<String>] languages
|
124
|
+
# @return [java.util.HashMap]
|
125
|
+
#
|
126
|
+
def build_priorities(input, languages)
|
127
|
+
priorities = java.util.HashMap.new
|
128
|
+
priority = short_input?(input) ? 0.0 : DEFAULT_PRIORITY
|
129
|
+
|
130
|
+
PRIORITIES.each do |lang, val|
|
131
|
+
priorities.put(lang, val)
|
32
132
|
end
|
133
|
+
|
134
|
+
languages.each do |language|
|
135
|
+
unless priorities.contains_key(language)
|
136
|
+
priorities.put(language, priority)
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
return priorities
|
141
|
+
end
|
142
|
+
|
143
|
+
##
|
144
|
+
# @param [String] input
|
145
|
+
# @return [String]
|
146
|
+
#
|
147
|
+
def determine_profiles(input)
|
148
|
+
return short_input?(input) ? short_profiles_path : profiles_path
|
33
149
|
end
|
34
150
|
|
35
|
-
|
36
|
-
|
37
|
-
|
151
|
+
##
|
152
|
+
# @param [String] input
|
153
|
+
# @return [TrueClass|FalseClass]
|
154
|
+
#
|
155
|
+
def short_input?(input)
|
156
|
+
return input.length <= SHORT_THRESHOLD
|
38
157
|
end
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|
158
|
+
end # Detector
|
159
|
+
end # LanguageIdentifier
|
160
|
+
end # Opener
|
@@ -1,10 +1,8 @@
|
|
1
|
-
require 'java'
|
2
1
|
require 'open3'
|
3
2
|
require 'slop'
|
4
3
|
require 'builder'
|
5
4
|
|
6
|
-
require_relative '../../core/target/LanguageDetection-0.0.1.jar'
|
7
|
-
import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
|
5
|
+
require_relative '../../core/target/LanguageDetection-1.0.0.jar'
|
8
6
|
|
9
7
|
require_relative 'language_identifier/version'
|
10
8
|
require_relative 'language_identifier/kaf_builder'
|
@@ -46,8 +44,8 @@ module Opener
|
|
46
44
|
# are returned instead of the language/KAF.
|
47
45
|
#
|
48
46
|
def initialize(options = {})
|
49
|
-
@options
|
50
|
-
@detector = Detector.
|
47
|
+
@options = DEFAULT_OPTIONS.merge(options)
|
48
|
+
@detector = Detector.new
|
51
49
|
end
|
52
50
|
|
53
51
|
##
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-language-identifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.1.0
|
4
|
+
version: 4.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-01-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opener-daemons
|
@@ -94,20 +94,6 @@ dependencies:
|
|
94
94
|
version: '3.0'
|
95
95
|
prerelease: false
|
96
96
|
type: :development
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: cucumber
|
99
|
-
version_requirements: !ruby/object:Gem::Requirement
|
100
|
-
requirements:
|
101
|
-
- - '>='
|
102
|
-
- !ruby/object:Gem::Version
|
103
|
-
version: '0'
|
104
|
-
requirement: !ruby/object:Gem::Requirement
|
105
|
-
requirements:
|
106
|
-
- - '>='
|
107
|
-
- !ruby/object:Gem::Version
|
108
|
-
version: '0'
|
109
|
-
prerelease: false
|
110
|
-
type: :development
|
111
97
|
- !ruby/object:Gem::Dependency
|
112
98
|
name: rake
|
113
99
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -145,9 +131,8 @@ executables:
|
|
145
131
|
extensions: []
|
146
132
|
extra_rdoc_files: []
|
147
133
|
files:
|
134
|
+
- core/target/LanguageDetection-1.0.0.jar
|
148
135
|
- core/target/LanguageDetection-0.0.1.jar
|
149
|
-
- core/target/classes/org/vicomtech/opennlp/LanguageDetection/CybozuDetector.class
|
150
|
-
- core/target/classes/org/vicomtech/opennlp/LanguageDetection/Main.class
|
151
136
|
- core/target/classes/profiles/fi
|
152
137
|
- core/target/classes/profiles/ta
|
153
138
|
- core/target/classes/profiles/mr
|
@@ -193,6 +178,7 @@ files:
|
|
193
178
|
- core/target/classes/profiles/vi
|
194
179
|
- core/target/classes/profiles/et
|
195
180
|
- core/target/classes/profiles/hi
|
181
|
+
- core/target/classes/profiles/af
|
196
182
|
- core/target/classes/profiles/gu
|
197
183
|
- core/target/classes/profiles/zh-cn
|
198
184
|
- core/target/classes/profiles/mk
|
@@ -202,15 +188,59 @@ files:
|
|
202
188
|
- core/target/classes/profiles/cs
|
203
189
|
- core/target/classes/profiles/bn
|
204
190
|
- core/target/classes/profiles/tl
|
191
|
+
- core/target/classes/short_profiles/fi
|
192
|
+
- core/target/classes/short_profiles/ta
|
193
|
+
- core/target/classes/short_profiles/ml
|
194
|
+
- core/target/classes/short_profiles/hr
|
195
|
+
- core/target/classes/short_profiles/id
|
196
|
+
- core/target/classes/short_profiles/es
|
197
|
+
- core/target/classes/short_profiles/no
|
198
|
+
- core/target/classes/short_profiles/ca
|
199
|
+
- core/target/classes/short_profiles/en
|
200
|
+
- core/target/classes/short_profiles/ru
|
201
|
+
- core/target/classes/short_profiles/te
|
202
|
+
- core/target/classes/short_profiles/lt
|
203
|
+
- core/target/classes/short_profiles/pa
|
204
|
+
- core/target/classes/short_profiles/ja
|
205
|
+
- core/target/classes/short_profiles/he
|
206
|
+
- core/target/classes/short_profiles/nl
|
207
|
+
- core/target/classes/short_profiles/tr
|
208
|
+
- core/target/classes/short_profiles/pl
|
209
|
+
- core/target/classes/short_profiles/si
|
210
|
+
- core/target/classes/short_profiles/fa
|
211
|
+
- core/target/classes/short_profiles/de
|
212
|
+
- core/target/classes/short_profiles/bg
|
213
|
+
- core/target/classes/short_profiles/it
|
214
|
+
- core/target/classes/short_profiles/fr
|
215
|
+
- core/target/classes/short_profiles/el
|
216
|
+
- core/target/classes/short_profiles/pt
|
217
|
+
- core/target/classes/short_profiles/uk
|
218
|
+
- core/target/classes/short_profiles/da
|
219
|
+
- core/target/classes/short_profiles/ar
|
220
|
+
- core/target/classes/short_profiles/zh-tw
|
221
|
+
- core/target/classes/short_profiles/sq
|
222
|
+
- core/target/classes/short_profiles/th
|
223
|
+
- core/target/classes/short_profiles/ko
|
224
|
+
- core/target/classes/short_profiles/ro
|
225
|
+
- core/target/classes/short_profiles/lv
|
226
|
+
- core/target/classes/short_profiles/sv
|
227
|
+
- core/target/classes/short_profiles/vi
|
228
|
+
- core/target/classes/short_profiles/et
|
229
|
+
- core/target/classes/short_profiles/hi
|
230
|
+
- core/target/classes/short_profiles/gu
|
231
|
+
- core/target/classes/short_profiles/zh-cn
|
232
|
+
- core/target/classes/short_profiles/mk
|
233
|
+
- core/target/classes/short_profiles/ur
|
234
|
+
- core/target/classes/short_profiles/hu
|
235
|
+
- core/target/classes/short_profiles/cs
|
236
|
+
- core/target/classes/short_profiles/bn
|
237
|
+
- core/target/classes/short_profiles/tl
|
205
238
|
- core/target/classes/com/cybozu/labs/langdetect/ErrorCode.class
|
206
|
-
- core/target/classes/com/cybozu/labs/langdetect/GenProfile.class
|
207
|
-
- core/target/classes/com/cybozu/labs/langdetect/Command.class
|
208
239
|
- core/target/classes/com/cybozu/labs/langdetect/LangDetectException.class
|
209
240
|
- core/target/classes/com/cybozu/labs/langdetect/Language.class
|
210
241
|
- core/target/classes/com/cybozu/labs/langdetect/Detector.class
|
211
242
|
- core/target/classes/com/cybozu/labs/langdetect/af
|
212
243
|
- core/target/classes/com/cybozu/labs/langdetect/DetectorFactory.class
|
213
|
-
- core/target/classes/com/cybozu/labs/langdetect/util/TagExtractor.class
|
214
244
|
- core/target/classes/com/cybozu/labs/langdetect/util/NGram.class
|
215
245
|
- core/target/classes/com/cybozu/labs/langdetect/util/LangProfile.class
|
216
246
|
- core/target/classes/com/cybozu/labs/langdetect/util/Messages.class
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|