opener-language-identifier 4.1.0 → 4.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -3
  3. data/core/target/LanguageDetection-0.0.1.jar +0 -0
  4. data/core/target/LanguageDetection-1.0.0.jar +0 -0
  5. data/core/target/classes/com/cybozu/labs/langdetect/Detector.class +0 -0
  6. data/core/target/classes/com/cybozu/labs/langdetect/DetectorFactory.class +0 -0
  7. data/core/target/classes/com/cybozu/labs/langdetect/util/LangProfile.class +0 -0
  8. data/core/target/classes/com/cybozu/labs/langdetect/util/NGram.class +0 -0
  9. data/core/target/classes/com/cybozu/labs/langdetect/util/messages.properties +7 -0
  10. data/core/target/classes/profiles/af +1 -0
  11. data/core/target/classes/profiles/ko +1 -1
  12. data/core/target/classes/profiles/ro +1 -1
  13. data/core/target/classes/profiles/vi +1 -1
  14. data/core/target/classes/short_profiles/ar +1 -0
  15. data/core/target/classes/short_profiles/bg +1 -0
  16. data/core/target/classes/short_profiles/bn +1 -0
  17. data/core/target/classes/short_profiles/ca +1 -0
  18. data/core/target/classes/short_profiles/cs +1 -0
  19. data/core/target/classes/short_profiles/da +1 -0
  20. data/core/target/classes/short_profiles/de +1 -0
  21. data/core/target/classes/short_profiles/el +1 -0
  22. data/core/target/classes/short_profiles/en +1 -0
  23. data/core/target/classes/short_profiles/es +1 -0
  24. data/core/target/classes/short_profiles/et +1 -0
  25. data/core/target/classes/short_profiles/fa +1 -0
  26. data/core/target/classes/short_profiles/fi +1 -0
  27. data/core/target/classes/short_profiles/fr +1 -0
  28. data/core/target/classes/short_profiles/gu +1 -0
  29. data/core/target/classes/short_profiles/he +1 -0
  30. data/core/target/classes/short_profiles/hi +1 -0
  31. data/core/target/classes/short_profiles/hr +1 -0
  32. data/core/target/classes/short_profiles/hu +1 -0
  33. data/core/target/classes/short_profiles/id +1 -0
  34. data/core/target/classes/short_profiles/it +1 -0
  35. data/core/target/classes/short_profiles/ja +1 -0
  36. data/core/target/classes/short_profiles/ko +1 -0
  37. data/core/target/classes/short_profiles/lt +1 -0
  38. data/core/target/classes/short_profiles/lv +1 -0
  39. data/core/target/classes/short_profiles/mk +1 -0
  40. data/core/target/classes/short_profiles/ml +1 -0
  41. data/core/target/classes/short_profiles/nl +1 -0
  42. data/core/target/classes/short_profiles/no +1 -0
  43. data/core/target/classes/short_profiles/pa +1 -0
  44. data/core/target/classes/short_profiles/pl +1 -0
  45. data/core/target/classes/short_profiles/pt +1 -0
  46. data/core/target/classes/short_profiles/ro +1 -0
  47. data/core/target/classes/short_profiles/ru +1 -0
  48. data/core/target/classes/short_profiles/si +1 -0
  49. data/core/target/classes/short_profiles/sq +1 -0
  50. data/core/target/classes/short_profiles/sv +1 -0
  51. data/core/target/classes/short_profiles/ta +1 -0
  52. data/core/target/classes/short_profiles/te +1 -0
  53. data/core/target/classes/short_profiles/th +1 -0
  54. data/core/target/classes/short_profiles/tl +1 -0
  55. data/core/target/classes/short_profiles/tr +1 -0
  56. data/core/target/classes/short_profiles/uk +1 -0
  57. data/core/target/classes/short_profiles/ur +1 -0
  58. data/core/target/classes/short_profiles/vi +1 -0
  59. data/core/target/classes/short_profiles/zh-cn +1 -0
  60. data/core/target/classes/short_profiles/zh-tw +1 -0
  61. data/lib/opener/language_identifier/detector.rb +143 -24
  62. data/lib/opener/language_identifier/version.rb +1 -1
  63. data/lib/opener/language_identifier.rb +3 -5
  64. data/opener-language-identifier.gemspec +0 -1
  65. metadata +51 -21
  66. data/core/target/classes/com/cybozu/labs/langdetect/Command.class +0 -0
  67. data/core/target/classes/com/cybozu/labs/langdetect/GenProfile.class +0 -0
  68. data/core/target/classes/com/cybozu/labs/langdetect/util/TagExtractor.class +0 -0
  69. data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/CybozuDetector.class +0 -0
  70. data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/Main.class +0 -0
@@ -1,41 +1,160 @@
1
- require 'singleton'
2
-
3
- import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
4
-
5
1
  module Opener
6
2
  class LanguageIdentifier
7
3
  ##
8
- # Singleton class wrapped around the Cybozu detector. The Cybozu code uses
9
- # the factory pattern and stores a bunch of things on class level. As such
10
- # the Cybozu code is *not* thread-safe.
4
+ # Ruby wrapper around the Cybozu DetectorFactory and Detector classes. This
5
+ # class automatically handles switching of profiles based on input sizes,
6
+ # assigning priorities to languages, etc.
11
7
  #
12
8
  class Detector
13
- attr_reader :options
9
+ attr_reader :profiles_path, :short_profiles_path
10
+
11
+ ##
12
+ # Path to the directory containing the default profiles.
13
+ #
14
+ # @return [String]
15
+ #
16
+ DEFAULT_PROFILES_PATH = File.expand_path(
17
+ '../../../../core/target/classes/profiles',
18
+ __FILE__
19
+ )
20
+
21
+ ##
22
+ # Path to the directory containing the default short profiles.
23
+ #
24
+ # @return [String]
25
+ #
26
+ DEFAULT_SHORT_PROFILES_PATH = File.expand_path(
27
+ '../../../../core/target/classes/short_profiles',
28
+ __FILE__
29
+ )
30
+
31
+ ##
32
+ # The number of characters after which the detector should switch to using
33
+ # the longer profiles set.
34
+ #
35
+ # @return [Fixnum]
36
+ #
37
+ SHORT_THRESHOLD = 15
38
+
39
+ ##
40
+ # Prioritize OpeNER languages over the rest. Languages not covered by this
41
+ # list are automatically given a default priority.
42
+ #
43
+ # @return [Hash]
44
+ #
45
+ PRIORITIES = {
46
+ 'en' => 1.0,
47
+ 'es' => 0.9,
48
+ 'it' => 0.9,
49
+ 'fr' => 0.9,
50
+ 'de' => 0.9,
51
+ 'nl' => 0.9
52
+ }
14
53
 
15
- include Singleton
54
+ ##
55
+ # The default priority for non OpeNER languages.
56
+ #
57
+ # @return [Float]
58
+ #
59
+ DEFAULT_PRIORITY = 0.5
16
60
 
17
- def initialize(options={})
18
- @options = options
19
- @detector = CybozuDetector.new(profiles_path)
20
- @semaphore = Mutex.new
61
+ ##
62
+ # @param [Hash] options
63
+ #
64
+ # @option options [String] :profiles_path
65
+ # @option options [String] :short_profiles_path
66
+ #
67
+ def initialize(options = {})
68
+ options.each do |key, value|
69
+ instance_variable_set("@#{key}", value) if respond_to?(key)
70
+ end
71
+
72
+ @profiles_path ||= DEFAULT_PROFILES_PATH
73
+ @short_profiles_path ||= DEFAULT_SHORT_PROFILES_PATH
21
74
  end
22
75
 
76
+ ##
77
+ # @return [String]
78
+ #
23
79
  def detect(input)
24
- @semaphore.synchronize do
25
- @detector.detect(input)
26
- end
80
+ return new_detector(input).detect
27
81
  end
28
82
 
83
+ ##
84
+ # @return [Array]
85
+ #
29
86
  def probabilities(input)
30
- @semaphore.synchronize do
31
- result = @detector.detect_langs(input)
87
+ return new_detector(input).get_probabilities.to_array
88
+ end
89
+
90
+ ##
91
+ # Returns a new detector with the profiles set based on the input.
92
+ #
93
+ # This method analyses a lowercased version of the input as this yields
94
+ # better results for short text.
95
+ #
96
+ # @param [String] input
97
+ # @return [com.cybozu.labs.langdetect.Detector]
98
+ #
99
+ def new_detector(input)
100
+ factory = com.cybozu.labs.langdetect.DetectorFactory.new
101
+
102
+ factory.load_profile(determine_profiles(input))
103
+ factory.set_seed(1)
104
+
105
+ priorities = build_priorities(input, factory.langlist)
106
+ detector = com.cybozu.labs.langdetect.Detector.new(factory)
107
+
108
+ detector.set_prior_map(priorities)
109
+ detector.append(input.downcase)
110
+
111
+ return detector
112
+ end
113
+
114
+ ##
115
+ # Builds a Java Hash mapping the priorities for all OpeNER and non OpeNER
116
+ # languages.
117
+ #
118
+ # If the input size does not exceed the short profiles threshold, non
119
+ # OpeNER languages are _disabled_. This is to ensure that OpeNER languages
120
+ # are detected properly when analysing only 1-2 words.
121
+ #
122
+ # @param [String] input
123
+ # @param [Array<String>] languages
124
+ # @return [java.util.HashMap]
125
+ #
126
+ def build_priorities(input, languages)
127
+ priorities = java.util.HashMap.new
128
+ priority = short_input?(input) ? 0.0 : DEFAULT_PRIORITY
129
+
130
+ PRIORITIES.each do |lang, val|
131
+ priorities.put(lang, val)
32
132
  end
133
+
134
+ languages.each do |language|
135
+ unless priorities.contains_key(language)
136
+ priorities.put(language, priority)
137
+ end
138
+ end
139
+
140
+ return priorities
141
+ end
142
+
143
+ ##
144
+ # @param [String] input
145
+ # @return [String]
146
+ #
147
+ def determine_profiles(input)
148
+ return short_input?(input) ? short_profiles_path : profiles_path
33
149
  end
34
150
 
35
- def profiles_path
36
- default_path = File.expand_path("../../../../core/target/classes/profiles", __FILE__)
37
- options.fetch(:profiles_path, default_path)
151
+ ##
152
+ # @param [String] input
153
+ # @return [TrueClass|FalseClass]
154
+ #
155
+ def short_input?(input)
156
+ return input.length <= SHORT_THRESHOLD
38
157
  end
39
- end
40
- end
41
- end
158
+ end # Detector
159
+ end # LanguageIdentifier
160
+ end # Opener
@@ -1,5 +1,5 @@
1
1
  module Opener
2
2
  class LanguageIdentifier
3
- VERSION = "4.1.0"
3
+ VERSION = '4.2.0'
4
4
  end
5
5
  end
@@ -1,10 +1,8 @@
1
- require 'java'
2
1
  require 'open3'
3
2
  require 'slop'
4
3
  require 'builder'
5
4
 
6
- require_relative '../../core/target/LanguageDetection-0.0.1.jar'
7
- import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
5
+ require_relative '../../core/target/LanguageDetection-1.0.0.jar'
8
6
 
9
7
  require_relative 'language_identifier/version'
10
8
  require_relative 'language_identifier/kaf_builder'
@@ -46,8 +44,8 @@ module Opener
46
44
  # are returned instead of the language/KAF.
47
45
  #
48
46
  def initialize(options = {})
49
- @options = DEFAULT_OPTIONS.merge(options)
50
- @detector = Detector.instance
47
+ @options = DEFAULT_OPTIONS.merge(options)
48
+ @detector = Detector.new
51
49
  end
52
50
 
53
51
  ##
@@ -33,7 +33,6 @@ Gem::Specification.new do |gem|
33
33
  gem.add_dependency 'slop', '~> 3.5'
34
34
 
35
35
  gem.add_development_dependency 'rspec', '~> 3.0'
36
- gem.add_development_dependency 'cucumber'
37
36
  gem.add_development_dependency 'rake'
38
37
  gem.add_development_dependency 'cliver'
39
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-language-identifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.1.0
4
+ version: 4.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-25 00:00:00.000000000 Z
11
+ date: 2015-01-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opener-daemons
@@ -94,20 +94,6 @@ dependencies:
94
94
  version: '3.0'
95
95
  prerelease: false
96
96
  type: :development
97
- - !ruby/object:Gem::Dependency
98
- name: cucumber
99
- version_requirements: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - '>='
102
- - !ruby/object:Gem::Version
103
- version: '0'
104
- requirement: !ruby/object:Gem::Requirement
105
- requirements:
106
- - - '>='
107
- - !ruby/object:Gem::Version
108
- version: '0'
109
- prerelease: false
110
- type: :development
111
97
  - !ruby/object:Gem::Dependency
112
98
  name: rake
113
99
  version_requirements: !ruby/object:Gem::Requirement
@@ -145,9 +131,8 @@ executables:
145
131
  extensions: []
146
132
  extra_rdoc_files: []
147
133
  files:
134
+ - core/target/LanguageDetection-1.0.0.jar
148
135
  - core/target/LanguageDetection-0.0.1.jar
149
- - core/target/classes/org/vicomtech/opennlp/LanguageDetection/CybozuDetector.class
150
- - core/target/classes/org/vicomtech/opennlp/LanguageDetection/Main.class
151
136
  - core/target/classes/profiles/fi
152
137
  - core/target/classes/profiles/ta
153
138
  - core/target/classes/profiles/mr
@@ -193,6 +178,7 @@ files:
193
178
  - core/target/classes/profiles/vi
194
179
  - core/target/classes/profiles/et
195
180
  - core/target/classes/profiles/hi
181
+ - core/target/classes/profiles/af
196
182
  - core/target/classes/profiles/gu
197
183
  - core/target/classes/profiles/zh-cn
198
184
  - core/target/classes/profiles/mk
@@ -202,15 +188,59 @@ files:
202
188
  - core/target/classes/profiles/cs
203
189
  - core/target/classes/profiles/bn
204
190
  - core/target/classes/profiles/tl
191
+ - core/target/classes/short_profiles/fi
192
+ - core/target/classes/short_profiles/ta
193
+ - core/target/classes/short_profiles/ml
194
+ - core/target/classes/short_profiles/hr
195
+ - core/target/classes/short_profiles/id
196
+ - core/target/classes/short_profiles/es
197
+ - core/target/classes/short_profiles/no
198
+ - core/target/classes/short_profiles/ca
199
+ - core/target/classes/short_profiles/en
200
+ - core/target/classes/short_profiles/ru
201
+ - core/target/classes/short_profiles/te
202
+ - core/target/classes/short_profiles/lt
203
+ - core/target/classes/short_profiles/pa
204
+ - core/target/classes/short_profiles/ja
205
+ - core/target/classes/short_profiles/he
206
+ - core/target/classes/short_profiles/nl
207
+ - core/target/classes/short_profiles/tr
208
+ - core/target/classes/short_profiles/pl
209
+ - core/target/classes/short_profiles/si
210
+ - core/target/classes/short_profiles/fa
211
+ - core/target/classes/short_profiles/de
212
+ - core/target/classes/short_profiles/bg
213
+ - core/target/classes/short_profiles/it
214
+ - core/target/classes/short_profiles/fr
215
+ - core/target/classes/short_profiles/el
216
+ - core/target/classes/short_profiles/pt
217
+ - core/target/classes/short_profiles/uk
218
+ - core/target/classes/short_profiles/da
219
+ - core/target/classes/short_profiles/ar
220
+ - core/target/classes/short_profiles/zh-tw
221
+ - core/target/classes/short_profiles/sq
222
+ - core/target/classes/short_profiles/th
223
+ - core/target/classes/short_profiles/ko
224
+ - core/target/classes/short_profiles/ro
225
+ - core/target/classes/short_profiles/lv
226
+ - core/target/classes/short_profiles/sv
227
+ - core/target/classes/short_profiles/vi
228
+ - core/target/classes/short_profiles/et
229
+ - core/target/classes/short_profiles/hi
230
+ - core/target/classes/short_profiles/gu
231
+ - core/target/classes/short_profiles/zh-cn
232
+ - core/target/classes/short_profiles/mk
233
+ - core/target/classes/short_profiles/ur
234
+ - core/target/classes/short_profiles/hu
235
+ - core/target/classes/short_profiles/cs
236
+ - core/target/classes/short_profiles/bn
237
+ - core/target/classes/short_profiles/tl
205
238
  - core/target/classes/com/cybozu/labs/langdetect/ErrorCode.class
206
- - core/target/classes/com/cybozu/labs/langdetect/GenProfile.class
207
- - core/target/classes/com/cybozu/labs/langdetect/Command.class
208
239
  - core/target/classes/com/cybozu/labs/langdetect/LangDetectException.class
209
240
  - core/target/classes/com/cybozu/labs/langdetect/Language.class
210
241
  - core/target/classes/com/cybozu/labs/langdetect/Detector.class
211
242
  - core/target/classes/com/cybozu/labs/langdetect/af
212
243
  - core/target/classes/com/cybozu/labs/langdetect/DetectorFactory.class
213
- - core/target/classes/com/cybozu/labs/langdetect/util/TagExtractor.class
214
244
  - core/target/classes/com/cybozu/labs/langdetect/util/NGram.class
215
245
  - core/target/classes/com/cybozu/labs/langdetect/util/LangProfile.class
216
246
  - core/target/classes/com/cybozu/labs/langdetect/util/Messages.class