opener-language-identifier 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +138 -0
  3. data/bin/language-identifier +6 -0
  4. data/bin/language-identifier-daemon +10 -0
  5. data/bin/language-identifier-server +8 -0
  6. data/config.ru +4 -0
  7. data/core/target/LanguageDetection-0.0.1.jar +0 -0
  8. data/core/target/classes/com/cybozu/labs/langdetect/Command.class +0 -0
  9. data/core/target/classes/com/cybozu/labs/langdetect/Detector.class +0 -0
  10. data/core/target/classes/com/cybozu/labs/langdetect/DetectorFactory.class +0 -0
  11. data/core/target/classes/com/cybozu/labs/langdetect/ErrorCode.class +0 -0
  12. data/core/target/classes/com/cybozu/labs/langdetect/GenProfile.class +0 -0
  13. data/core/target/classes/com/cybozu/labs/langdetect/LangDetectException.class +0 -0
  14. data/core/target/classes/com/cybozu/labs/langdetect/Language.class +0 -0
  15. data/core/target/classes/com/cybozu/labs/langdetect/util/LangProfile.class +0 -0
  16. data/core/target/classes/com/cybozu/labs/langdetect/util/Messages.class +0 -0
  17. data/core/target/classes/com/cybozu/labs/langdetect/util/NGram.class +0 -0
  18. data/core/target/classes/com/cybozu/labs/langdetect/util/TagExtractor.class +0 -0
  19. data/core/target/classes/com/cybozu/labs/langdetect/util/messages.properties +128 -0
  20. data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/CybozuDetector.class +0 -0
  21. data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/Main.class +0 -0
  22. data/exec/language-identifier.rb +9 -0
  23. data/lib/opener/language_identifier.rb +89 -0
  24. data/lib/opener/language_identifier/cli.rb +139 -0
  25. data/lib/opener/language_identifier/detector.rb +36 -0
  26. data/lib/opener/language_identifier/kaf_builder.rb +62 -0
  27. data/lib/opener/language_identifier/public/markdown.css +283 -0
  28. data/lib/opener/language_identifier/server.rb +32 -0
  29. data/lib/opener/language_identifier/version.rb +5 -0
  30. data/lib/opener/language_identifier/views/index.erb +110 -0
  31. data/lib/opener/language_identifier/views/result.erb +15 -0
  32. data/opener-language-identifier.gemspec +37 -0
  33. metadata +231 -0
@@ -0,0 +1,139 @@
1
module Opener
  class LanguageIdentifier
    ##
    # CLI wrapper around {Opener::LanguageIdentifier} using OptionParser.
    #
    # @!attribute [r] options
    #  @return [Hash]
    #
    # @!attribute [r] option_parser
    #  @return [OptionParser]
    #
    class CLI
      attr_reader :options, :option_parser

      ##
      # Merges the given options over `DEFAULT_OPTIONS` and builds the
      # OptionParser that understands the `--version`, `--[no-]kaf`,
      # `--probs` and `--help` switches. Nothing is parsed until {#run} is
      # called.
      #
      # @param [Hash] options
      #
      def initialize(options = {})
        @options = DEFAULT_OPTIONS.merge(options)

        @option_parser = OptionParser.new do |opts|
          opts.program_name   = 'language-identifier'
          opts.summary_indent = ' '

          opts.on('-v', '--version', 'Shows the current version') do
            show_version
          end

          opts.on('-k', '--[no-]kaf', 'Output the language as KAF') do |v|
            @options[:kaf] = v
          end

          # Probabilities are never wrapped in KAF, so this switch also turns
          # the KAF output off.
          opts.on('-p', '--probs', 'Provide probabilities, assumes --no-kaf') do
            @options[:kaf]   = false
            @options[:probs] = true
          end

          opts.separator <<-EOF

Examples:

cat example_text.txt | #{opts.program_name} # Basic detection

Languages:

* ar Arabic
* bg Bulgarian
* bn Bengali
* cs Czech
* da Danish
* de German
* el Greek
* en English
* es Spanish
* et Estonian
* fa Persian
* fi Finnish
* fr French
* gu Gujarati
* he Hebrew
* hi Hindi
* hr Croatian
* hu Hungarian
* id Indonesian
* it Italian
* ja Japanese
* kn Kannada
* ko Korean
* lt Lithuanian
* lv Latvian
* mk Macedonian
* ml Malayalam
* mr Marathi
* ne Nepali
* nl Dutch
* no Norwegian
* pa Punjabi
* pl Polish
* pt Portuguese
* ro Romanian
* ru Russian
* sk Slovak
* sl Slovene
* so Somali
* sq Albanian
* sv Swedish
* sw Swahili
* ta Tamil
* te Telugu
* th Thai
* tl Tagalog
* tr Turkish
* uk Ukrainian
* ur Urdu
* vi Vietnamese
* zh-cn Simplified Chinese
* zh-tw Traditional Chinese
          EOF

          opts.separator ""
          opts.separator "Common options:"

          # Registered with on_tail so the help switch is listed last in the
          # options summary.
          opts.on_tail("-h", "--help", "Show this message.") do
            show_help
          end
        end
      end

      ##
      # Parses the arguments stored in `options[:args]` (the option handlers
      # update `options` in place), runs the language identifier on the input
      # and prints its result to STDOUT.
      #
      # @param [String] input
      #
      def run(input)
        option_parser.parse!(options[:args])

        identifier = LanguageIdentifier.new(options)

        puts identifier.run(input)
      end

      private

      ##
      # Shows the help message and exits the program.
      #
      # The message goes to STDOUT and the exit status is 0: asking for help
      # is not an error, so `abort` (STDERR, status 1) is not appropriate.
      #
      def show_help
        puts option_parser
        exit
      end

      ##
      # Shows the version and exits the program.
      #
      # The message goes to STDOUT and the exit status is 0: asking for the
      # version is not an error, so `abort` (STDERR, status 1) is not
      # appropriate.
      #
      def show_version
        puts "#{option_parser.program_name} v#{VERSION} on #{RUBY_DESCRIPTION}"
        exit
      end
    end # CLI
  end # LanguageIdentifier
end # Opener
@@ -0,0 +1,36 @@
1
+ require 'singleton'
2
+
3
+ import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
4
+
5
module Opener
  class LanguageIdentifier
    ##
    # Process-wide wrapper around the Java `CybozuDetector`.
    #
    # The class includes Singleton, so it is obtained through
    # `Detector.instance`; Singleton builds that instance without arguments,
    # which means `options` is the empty default unless the object is set up
    # some other way. Every call into the underlying detector is serialized
    # through a Mutex (presumably because the Java detector is not safe for
    # concurrent use; confirm against the Java side).
    #
    # @!attribute [r] options
    #  @return [Hash]
    #
    class Detector
      include Singleton

      attr_reader :options

      ##
      # @param [Hash] options Supports `:profiles_path` to override the
      #  directory handed to the Java detector.
      #
      def initialize(options = {})
        @options   = options
        @detector  = CybozuDetector.new(profiles_path)
        @semaphore = Mutex.new
      end

      ##
      # Forwards the input to the underlying detector's `detect`, one caller
      # at a time.
      #
      # @param [String] input
      # @return whatever the underlying detector's `detect` returns
      #
      def detect(input)
        @semaphore.synchronize { @detector.detect(input) }
      end

      ##
      # Forwards the input to the underlying detector's `detect_langs`, one
      # caller at a time.
      #
      # @param [String] input
      # @return whatever the underlying detector's `detect_langs` returns
      #
      def probabilities(input)
        @semaphore.synchronize { @detector.detect_langs(input) }
      end

      ##
      # Directory passed to the Java detector: `options[:profiles_path]` when
      # given, otherwise `core/target/classes/profiles` resolved relative to
      # this file.
      #
      # @return [String]
      #
      def profiles_path
        default_path = File.expand_path("../../../../core/target/classes/profiles", __FILE__)

        options.fetch(:profiles_path, default_path)
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
module Opener
  class LanguageIdentifier
    ##
    # Class for building basic KAF documents that contain the correct language
    # tag and the raw input that was analyzed.
    #
    # @!attribute [r] xml
    #  @return [Builder::XmlMarkup]
    #
    # @!attribute [r] original_text
    #  @return [String]
    #
    # @!attribute [r] language
    #  @return [String]
    #
    class KafBuilder
      attr_reader :xml, :original_text, :language

      ##
      # @param [String] text The input text that was analyzed.
      # @param [String] language The language of the text. Surrounding
      #  whitespace is stripped before it is stored.
      #
      def initialize(text, language)
        @xml           = Builder::XmlMarkup.new(:indent => 2)
        @language      = language.strip
        @original_text = text
      end

      ##
      # Builds the KAF document: an XML declaration followed by a `KAF` root
      # element carrying the `xml:lang` and `version` attributes.
      #
      # The markup is appended to {#xml}; read it back with {#to_s}.
      #
      def build
        xml.instruct!(
          :xml,
          :version    => '1.0',
          :encoding   => 'UTF-8',
          :standalone => 'yes'
        )

        xml.KAF('xml:lang' => language, 'version' => version) do |node|
          # NOTE(review): `raw` looks like one of Builder's dynamic tag
          # methods, i.e. this should emit a <raw> element holding the
          # original text. Confirm against the Builder documentation.
          node.raw(original_text)
        end
      end

      ##
      # Returns the XML document as a String.
      #
      # @return [String]
      #
      def to_s
        xml.target!
      end

      ##
      # The value written to the `version` attribute of the `KAF` element.
      #
      # @return [String]
      #
      def version
        "2.1"
      end
    end # KafBuilder
  end # LanguageIdentifier
end # Opener
@@ -0,0 +1,283 @@
1
/*
 * Stylesheet for the rendered Markdown pages and the input form of the
 * language identifier web views.
 */

/* Form controls */
input[type="text"], textarea {
  width: 500px;
}

/* Page defaults. The `padding` shorthand sets all four sides. */
body {
  font-family: Helvetica, arial, sans-serif;
  font-size: 14px;
  line-height: 1.6;
  background-color: white;
  padding: 30px;
}

body > *:first-child {
  margin-top: 0 !important;
}

body > *:last-child {
  margin-bottom: 0 !important;
}

/* Links */
a {
  color: #4183C4;
}

a.absent {
  color: #cc0000;
}

a.anchor {
  display: block;
  padding-left: 30px;
  margin-left: -30px;
  cursor: pointer;
  position: absolute;
  top: 0;
  left: 0;
  bottom: 0;
}

/* Headings */
h1, h2, h3, h4, h5, h6 {
  margin: 20px 0 10px;
  padding: 0;
  font-weight: bold;
  -webkit-font-smoothing: antialiased;
  cursor: text;
  position: relative;
}

/* NOTE(review): para.png does not appear among the files listed for this
   package; confirm the image exists where this relative URL points. */
h1:hover a.anchor, h2:hover a.anchor, h3:hover a.anchor, h4:hover a.anchor, h5:hover a.anchor, h6:hover a.anchor {
  background: url("../../images/modules/styleguide/para.png") no-repeat 10px center;
  text-decoration: none;
}

/* Inline code inside a heading keeps the heading's font size. */
h1 tt, h1 code,
h2 tt, h2 code,
h3 tt, h3 code,
h4 tt, h4 code,
h5 tt, h5 code,
h6 tt, h6 code {
  font-size: inherit;
}

h1 {
  font-size: 28px;
  color: black;
}

h2 {
  font-size: 24px;
  border-bottom: 1px solid #cccccc;
  color: black;
}

h3 {
  font-size: 18px;
}

h4 {
  font-size: 16px;
}

h5 {
  font-size: 14px;
}

h6 {
  color: #777777;
  font-size: 14px;
}

/* Block elements */
p, blockquote, ul, ol, dl, li, table, pre {
  margin: 15px 0;
}

/* NOTE(review): dirty-shade.png does not appear among the files listed for
   this package; confirm the image exists where this relative URL points. */
hr {
  background: transparent url("../../images/modules/pulls/dirty-shade.png") repeat-x 0 0;
  border: 0 none;
  color: #cccccc;
  height: 4px;
  padding: 0;
}

/* A heading that opens the page (or directly follows the opening h1) gets no
   top spacing. */
body > h2:first-child,
body > h1:first-child,
body > h1:first-child + h2,
body > h3:first-child,
body > h4:first-child,
body > h5:first-child,
body > h6:first-child {
  margin-top: 0;
  padding-top: 0;
}

a:first-child h1, a:first-child h2, a:first-child h3, a:first-child h4, a:first-child h5, a:first-child h6 {
  margin-top: 0;
  padding-top: 0;
}

h1 p, h2 p, h3 p, h4 p, h5 p, h6 p {
  margin-top: 0;
}

/* Lists */
li p.first {
  display: inline-block;
}

ul, ol {
  padding-left: 30px;
}

ul :first-child, ol :first-child {
  margin-top: 0;
}

ul :last-child, ol :last-child {
  margin-bottom: 0;
}

/* Definition lists */
dl {
  padding: 0;
}

dl dt {
  font-size: 14px;
  font-weight: bold;
  font-style: italic;
  padding: 0;
  margin: 15px 0 5px;
}

dl dt:first-child {
  padding: 0;
}

dl dt > :first-child {
  margin-top: 0;
}

dl dt > :last-child {
  margin-bottom: 0;
}

dl dd {
  margin: 0 0 15px;
  padding: 0 15px;
}

dl dd > :first-child {
  margin-top: 0;
}

dl dd > :last-child {
  margin-bottom: 0;
}

/* Block quotes */
blockquote {
  border-left: 4px solid #dddddd;
  padding: 0 15px;
  color: #777777;
}

blockquote > :first-child {
  margin-top: 0;
}

blockquote > :last-child {
  margin-bottom: 0;
}

/* Tables */
table {
  padding: 0;
}

table tr {
  border-top: 1px solid #cccccc;
  background-color: white;
  margin: 0;
  padding: 0;
}

table tr:nth-child(2n) {
  background-color: #f8f8f8;
}

table tr th {
  font-weight: bold;
  border: 1px solid #cccccc;
  text-align: left;
  margin: 0;
  padding: 6px 13px;
}

table tr td {
  border: 1px solid #cccccc;
  text-align: left;
  margin: 0;
  padding: 6px 13px;
}

table tr th :first-child, table tr td :first-child {
  margin-top: 0;
}

table tr th :last-child, table tr td :last-child {
  margin-bottom: 0;
}

/* Images and framed/aligned image wrappers */
img {
  max-width: 100%;
}

span.frame {
  display: block;
  overflow: hidden;
}

span.frame > span {
  border: 1px solid #dddddd;
  display: block;
  float: left;
  overflow: hidden;
  margin: 13px 0 0;
  padding: 7px;
  width: auto;
}

span.frame span img {
  display: block;
  float: left;
}

span.frame span span {
  clear: both;
  color: #333333;
  display: block;
  padding: 5px 0 0;
}

span.align-center {
  display: block;
  overflow: hidden;
  clear: both;
}

span.align-center > span {
  display: block;
  overflow: hidden;
  margin: 13px auto 0;
  text-align: center;
}

span.align-center span img {
  margin: 0 auto;
  text-align: center;
}

span.align-right {
  display: block;
  overflow: hidden;
  clear: both;
}

span.align-right > span {
  display: block;
  overflow: hidden;
  margin: 13px 0 0;
  text-align: right;
}

span.align-right span img {
  margin: 0;
  text-align: right;
}

span.float-left {
  display: block;
  margin-right: 13px;
  overflow: hidden;
  float: left;
}

span.float-left span {
  margin: 13px 0 0;
}

span.float-right {
  display: block;
  margin-left: 13px;
  overflow: hidden;
  float: right;
}

span.float-right > span {
  display: block;
  overflow: hidden;
  margin: 13px auto 0;
  text-align: right;
}

/* Inline code */
code, tt {
  margin: 0 2px;
  padding: 0 5px;
  white-space: nowrap;
  border: 1px solid #eaeaea;
  background-color: #f8f8f8;
  border-radius: 3px;
}

/* Code blocks */
pre code {
  margin: 0;
  padding: 0;
  white-space: pre;
  border: none;
  background: transparent;
}

.highlight pre,
pre {
  background-color: #f8f8f8;
  border: 1px solid #cccccc;
  font-size: 13px;
  line-height: 19px;
  overflow: auto;
  padding: 6px 10px;
  border-radius: 3px;
}

pre code, pre tt {
  background-color: transparent;
  border: none;
}
+