opener-language-identifier 4.1.0 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -3
  3. data/core/target/LanguageDetection-0.0.1.jar +0 -0
  4. data/core/target/LanguageDetection-1.0.0.jar +0 -0
  5. data/core/target/classes/com/cybozu/labs/langdetect/Detector.class +0 -0
  6. data/core/target/classes/com/cybozu/labs/langdetect/DetectorFactory.class +0 -0
  7. data/core/target/classes/com/cybozu/labs/langdetect/util/LangProfile.class +0 -0
  8. data/core/target/classes/com/cybozu/labs/langdetect/util/NGram.class +0 -0
  9. data/core/target/classes/com/cybozu/labs/langdetect/util/messages.properties +7 -0
  10. data/core/target/classes/profiles/af +1 -0
  11. data/core/target/classes/profiles/ko +1 -1
  12. data/core/target/classes/profiles/ro +1 -1
  13. data/core/target/classes/profiles/vi +1 -1
  14. data/core/target/classes/short_profiles/ar +1 -0
  15. data/core/target/classes/short_profiles/bg +1 -0
  16. data/core/target/classes/short_profiles/bn +1 -0
  17. data/core/target/classes/short_profiles/ca +1 -0
  18. data/core/target/classes/short_profiles/cs +1 -0
  19. data/core/target/classes/short_profiles/da +1 -0
  20. data/core/target/classes/short_profiles/de +1 -0
  21. data/core/target/classes/short_profiles/el +1 -0
  22. data/core/target/classes/short_profiles/en +1 -0
  23. data/core/target/classes/short_profiles/es +1 -0
  24. data/core/target/classes/short_profiles/et +1 -0
  25. data/core/target/classes/short_profiles/fa +1 -0
  26. data/core/target/classes/short_profiles/fi +1 -0
  27. data/core/target/classes/short_profiles/fr +1 -0
  28. data/core/target/classes/short_profiles/gu +1 -0
  29. data/core/target/classes/short_profiles/he +1 -0
  30. data/core/target/classes/short_profiles/hi +1 -0
  31. data/core/target/classes/short_profiles/hr +1 -0
  32. data/core/target/classes/short_profiles/hu +1 -0
  33. data/core/target/classes/short_profiles/id +1 -0
  34. data/core/target/classes/short_profiles/it +1 -0
  35. data/core/target/classes/short_profiles/ja +1 -0
  36. data/core/target/classes/short_profiles/ko +1 -0
  37. data/core/target/classes/short_profiles/lt +1 -0
  38. data/core/target/classes/short_profiles/lv +1 -0
  39. data/core/target/classes/short_profiles/mk +1 -0
  40. data/core/target/classes/short_profiles/ml +1 -0
  41. data/core/target/classes/short_profiles/nl +1 -0
  42. data/core/target/classes/short_profiles/no +1 -0
  43. data/core/target/classes/short_profiles/pa +1 -0
  44. data/core/target/classes/short_profiles/pl +1 -0
  45. data/core/target/classes/short_profiles/pt +1 -0
  46. data/core/target/classes/short_profiles/ro +1 -0
  47. data/core/target/classes/short_profiles/ru +1 -0
  48. data/core/target/classes/short_profiles/si +1 -0
  49. data/core/target/classes/short_profiles/sq +1 -0
  50. data/core/target/classes/short_profiles/sv +1 -0
  51. data/core/target/classes/short_profiles/ta +1 -0
  52. data/core/target/classes/short_profiles/te +1 -0
  53. data/core/target/classes/short_profiles/th +1 -0
  54. data/core/target/classes/short_profiles/tl +1 -0
  55. data/core/target/classes/short_profiles/tr +1 -0
  56. data/core/target/classes/short_profiles/uk +1 -0
  57. data/core/target/classes/short_profiles/ur +1 -0
  58. data/core/target/classes/short_profiles/vi +1 -0
  59. data/core/target/classes/short_profiles/zh-cn +1 -0
  60. data/core/target/classes/short_profiles/zh-tw +1 -0
  61. data/lib/opener/language_identifier/detector.rb +143 -24
  62. data/lib/opener/language_identifier/version.rb +1 -1
  63. data/lib/opener/language_identifier.rb +3 -5
  64. data/opener-language-identifier.gemspec +0 -1
  65. metadata +51 -21
  66. data/core/target/classes/com/cybozu/labs/langdetect/Command.class +0 -0
  67. data/core/target/classes/com/cybozu/labs/langdetect/GenProfile.class +0 -0
  68. data/core/target/classes/com/cybozu/labs/langdetect/util/TagExtractor.class +0 -0
  69. data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/CybozuDetector.class +0 -0
  70. data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/Main.class +0 -0
@@ -1,41 +1,160 @@
1
- require 'singleton'
2
-
3
- import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
4
-
5
1
  module Opener
6
2
  class LanguageIdentifier
7
3
  ##
8
- # Singleton class wrapped around the Cybozu detector. The Cybozu code uses
9
- # the factory pattern and stores a bunch of things on class level. As such
10
- # the Cybozu code is *not* thread-safe.
4
+ # Ruby wrapper around the Cybozu DetectorFactory and Detector classes. This
5
+ # class automatically handles switching of profiles based on input sizes,
6
+ # assigning priorities to languages, etc.
11
7
  #
12
8
  class Detector
13
- attr_reader :options
9
+ attr_reader :profiles_path, :short_profiles_path
10
+
11
+ ##
12
+ # Path to the directory containing the default profiles.
13
+ #
14
+ # @return [String]
15
+ #
16
+ DEFAULT_PROFILES_PATH = File.expand_path(
17
+ '../../../../core/target/classes/profiles',
18
+ __FILE__
19
+ )
20
+
21
+ ##
22
+ # Path to the directory containing the default short profiles.
23
+ #
24
+ # @return [String]
25
+ #
26
+ DEFAULT_SHORT_PROFILES_PATH = File.expand_path(
27
+ '../../../../core/target/classes/short_profiles',
28
+ __FILE__
29
+ )
30
+
31
+ ##
32
+ # The number of characters after which the detector should switch to using
33
+ # the longer profiles set.
34
+ #
35
+ # @return [Fixnum]
36
+ #
37
+ SHORT_THRESHOLD = 15
38
+
39
+ ##
40
+ # Prioritize OpeNER languages over the rest. Languages not covered by this
41
+ # list are automatically given a default priority.
42
+ #
43
+ # @return [Hash]
44
+ #
45
+ PRIORITIES = {
46
+ 'en' => 1.0,
47
+ 'es' => 0.9,
48
+ 'it' => 0.9,
49
+ 'fr' => 0.9,
50
+ 'de' => 0.9,
51
+ 'nl' => 0.9
52
+ }
14
53
 
15
- include Singleton
54
+ ##
55
+ # The default priority for non OpeNER languages.
56
+ #
57
+ # @return [Float]
58
+ #
59
+ DEFAULT_PRIORITY = 0.5
16
60
 
17
- def initialize(options={})
18
- @options = options
19
- @detector = CybozuDetector.new(profiles_path)
20
- @semaphore = Mutex.new
61
+ ##
62
+ # @param [Hash] options
63
+ #
64
+ # @option options [String] :profiles_path
65
+ # @option options [String] :short_profiles_path
66
+ #
67
+ def initialize(options = {})
68
+ options.each do |key, value|
69
+ instance_variable_set("@#{key}", value) if respond_to?(key)
70
+ end
71
+
72
+ @profiles_path ||= DEFAULT_PROFILES_PATH
73
+ @short_profiles_path ||= DEFAULT_SHORT_PROFILES_PATH
21
74
  end
22
75
 
76
+ ##
77
+ # @return [String]
78
+ #
23
79
  def detect(input)
24
- @semaphore.synchronize do
25
- @detector.detect(input)
26
- end
80
+ return new_detector(input).detect
27
81
  end
28
82
 
83
+ ##
84
+ # @return [Array]
85
+ #
29
86
  def probabilities(input)
30
- @semaphore.synchronize do
31
- result = @detector.detect_langs(input)
87
+ return new_detector(input).get_probabilities.to_array
88
+ end
89
+
90
+ ##
91
+ # Returns a new detector with the profiles set based on the input.
92
+ #
93
+ # This method analyses a lowercased version of the input as this yields
94
+ # better results for short text.
95
+ #
96
+ # @param [String] input
97
+ # @return [com.cybozu.labs.langdetect.Detector]
98
+ #
99
+ def new_detector(input)
100
+ factory = com.cybozu.labs.langdetect.DetectorFactory.new
101
+
102
+ factory.load_profile(determine_profiles(input))
103
+ factory.set_seed(1)
104
+
105
+ priorities = build_priorities(input, factory.langlist)
106
+ detector = com.cybozu.labs.langdetect.Detector.new(factory)
107
+
108
+ detector.set_prior_map(priorities)
109
+ detector.append(input.downcase)
110
+
111
+ return detector
112
+ end
113
+
114
+ ##
115
+ # Builds a Java Hash mapping the priorities for all OpeNER and non OpeNER
116
+ # languages.
117
+ #
118
+ # If the input size does not exceed the short profiles threshold, non
119
+ # OpeNER languages are _disabled_. This is to ensure that OpeNER languages
120
+ # are detected properly when analysing only 1-2 words.
121
+ #
122
+ # @param [String] input
123
+ # @param [Array<String>] languages
124
+ # @return [java.util.HashMap]
125
+ #
126
+ def build_priorities(input, languages)
127
+ priorities = java.util.HashMap.new
128
+ priority = short_input?(input) ? 0.0 : DEFAULT_PRIORITY
129
+
130
+ PRIORITIES.each do |lang, val|
131
+ priorities.put(lang, val)
32
132
  end
133
+
134
+ languages.each do |language|
135
+ unless priorities.contains_key(language)
136
+ priorities.put(language, priority)
137
+ end
138
+ end
139
+
140
+ return priorities
141
+ end
142
+
143
+ ##
144
+ # @param [String] input
145
+ # @return [String]
146
+ #
147
+ def determine_profiles(input)
148
+ return short_input?(input) ? short_profiles_path : profiles_path
33
149
  end
34
150
 
35
- def profiles_path
36
- default_path = File.expand_path("../../../../core/target/classes/profiles", __FILE__)
37
- options.fetch(:profiles_path, default_path)
151
+ ##
152
+ # @param [String] input
153
+ # @return [TrueClass|FalseClass]
154
+ #
155
+ def short_input?(input)
156
+ return input.length <= SHORT_THRESHOLD
38
157
  end
39
- end
40
- end
41
- end
158
+ end # Detector
159
+ end # LanguageIdentifier
160
+ end # Opener
@@ -1,5 +1,5 @@
1
1
  module Opener
2
2
  class LanguageIdentifier
3
- VERSION = "4.1.0"
3
+ VERSION = '4.2.0'
4
4
  end
5
5
  end
@@ -1,10 +1,8 @@
1
- require 'java'
2
1
  require 'open3'
3
2
  require 'slop'
4
3
  require 'builder'
5
4
 
6
- require_relative '../../core/target/LanguageDetection-0.0.1.jar'
7
- import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
5
+ require_relative '../../core/target/LanguageDetection-1.0.0.jar'
8
6
 
9
7
  require_relative 'language_identifier/version'
10
8
  require_relative 'language_identifier/kaf_builder'
@@ -46,8 +44,8 @@ module Opener
46
44
  # are returned instead of the language/KAF.
47
45
  #
48
46
  def initialize(options = {})
49
- @options = DEFAULT_OPTIONS.merge(options)
50
- @detector = Detector.instance
47
+ @options = DEFAULT_OPTIONS.merge(options)
48
+ @detector = Detector.new
51
49
  end
52
50
 
53
51
  ##
@@ -33,7 +33,6 @@ Gem::Specification.new do |gem|
33
33
  gem.add_dependency 'slop', '~> 3.5'
34
34
 
35
35
  gem.add_development_dependency 'rspec', '~> 3.0'
36
- gem.add_development_dependency 'cucumber'
37
36
  gem.add_development_dependency 'rake'
38
37
  gem.add_development_dependency 'cliver'
39
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: opener-language-identifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.1.0
4
+ version: 4.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - development@olery.com
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-25 00:00:00.000000000 Z
11
+ date: 2015-01-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: opener-daemons
@@ -94,20 +94,6 @@ dependencies:
94
94
  version: '3.0'
95
95
  prerelease: false
96
96
  type: :development
97
- - !ruby/object:Gem::Dependency
98
- name: cucumber
99
- version_requirements: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - '>='
102
- - !ruby/object:Gem::Version
103
- version: '0'
104
- requirement: !ruby/object:Gem::Requirement
105
- requirements:
106
- - - '>='
107
- - !ruby/object:Gem::Version
108
- version: '0'
109
- prerelease: false
110
- type: :development
111
97
  - !ruby/object:Gem::Dependency
112
98
  name: rake
113
99
  version_requirements: !ruby/object:Gem::Requirement
@@ -145,9 +131,8 @@ executables:
145
131
  extensions: []
146
132
  extra_rdoc_files: []
147
133
  files:
134
+ - core/target/LanguageDetection-1.0.0.jar
148
135
  - core/target/LanguageDetection-0.0.1.jar
149
- - core/target/classes/org/vicomtech/opennlp/LanguageDetection/CybozuDetector.class
150
- - core/target/classes/org/vicomtech/opennlp/LanguageDetection/Main.class
151
136
  - core/target/classes/profiles/fi
152
137
  - core/target/classes/profiles/ta
153
138
  - core/target/classes/profiles/mr
@@ -193,6 +178,7 @@ files:
193
178
  - core/target/classes/profiles/vi
194
179
  - core/target/classes/profiles/et
195
180
  - core/target/classes/profiles/hi
181
+ - core/target/classes/profiles/af
196
182
  - core/target/classes/profiles/gu
197
183
  - core/target/classes/profiles/zh-cn
198
184
  - core/target/classes/profiles/mk
@@ -202,15 +188,59 @@ files:
202
188
  - core/target/classes/profiles/cs
203
189
  - core/target/classes/profiles/bn
204
190
  - core/target/classes/profiles/tl
191
+ - core/target/classes/short_profiles/fi
192
+ - core/target/classes/short_profiles/ta
193
+ - core/target/classes/short_profiles/ml
194
+ - core/target/classes/short_profiles/hr
195
+ - core/target/classes/short_profiles/id
196
+ - core/target/classes/short_profiles/es
197
+ - core/target/classes/short_profiles/no
198
+ - core/target/classes/short_profiles/ca
199
+ - core/target/classes/short_profiles/en
200
+ - core/target/classes/short_profiles/ru
201
+ - core/target/classes/short_profiles/te
202
+ - core/target/classes/short_profiles/lt
203
+ - core/target/classes/short_profiles/pa
204
+ - core/target/classes/short_profiles/ja
205
+ - core/target/classes/short_profiles/he
206
+ - core/target/classes/short_profiles/nl
207
+ - core/target/classes/short_profiles/tr
208
+ - core/target/classes/short_profiles/pl
209
+ - core/target/classes/short_profiles/si
210
+ - core/target/classes/short_profiles/fa
211
+ - core/target/classes/short_profiles/de
212
+ - core/target/classes/short_profiles/bg
213
+ - core/target/classes/short_profiles/it
214
+ - core/target/classes/short_profiles/fr
215
+ - core/target/classes/short_profiles/el
216
+ - core/target/classes/short_profiles/pt
217
+ - core/target/classes/short_profiles/uk
218
+ - core/target/classes/short_profiles/da
219
+ - core/target/classes/short_profiles/ar
220
+ - core/target/classes/short_profiles/zh-tw
221
+ - core/target/classes/short_profiles/sq
222
+ - core/target/classes/short_profiles/th
223
+ - core/target/classes/short_profiles/ko
224
+ - core/target/classes/short_profiles/ro
225
+ - core/target/classes/short_profiles/lv
226
+ - core/target/classes/short_profiles/sv
227
+ - core/target/classes/short_profiles/vi
228
+ - core/target/classes/short_profiles/et
229
+ - core/target/classes/short_profiles/hi
230
+ - core/target/classes/short_profiles/gu
231
+ - core/target/classes/short_profiles/zh-cn
232
+ - core/target/classes/short_profiles/mk
233
+ - core/target/classes/short_profiles/ur
234
+ - core/target/classes/short_profiles/hu
235
+ - core/target/classes/short_profiles/cs
236
+ - core/target/classes/short_profiles/bn
237
+ - core/target/classes/short_profiles/tl
205
238
  - core/target/classes/com/cybozu/labs/langdetect/ErrorCode.class
206
- - core/target/classes/com/cybozu/labs/langdetect/GenProfile.class
207
- - core/target/classes/com/cybozu/labs/langdetect/Command.class
208
239
  - core/target/classes/com/cybozu/labs/langdetect/LangDetectException.class
209
240
  - core/target/classes/com/cybozu/labs/langdetect/Language.class
210
241
  - core/target/classes/com/cybozu/labs/langdetect/Detector.class
211
242
  - core/target/classes/com/cybozu/labs/langdetect/af
212
243
  - core/target/classes/com/cybozu/labs/langdetect/DetectorFactory.class
213
- - core/target/classes/com/cybozu/labs/langdetect/util/TagExtractor.class
214
244
  - core/target/classes/com/cybozu/labs/langdetect/util/NGram.class
215
245
  - core/target/classes/com/cybozu/labs/langdetect/util/LangProfile.class
216
246
  - core/target/classes/com/cybozu/labs/langdetect/util/Messages.class