opener-language-identifier 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +138 -0
  3. data/bin/language-identifier +6 -0
  4. data/bin/language-identifier-daemon +10 -0
  5. data/bin/language-identifier-server +8 -0
  6. data/config.ru +4 -0
  7. data/core/target/LanguageDetection-0.0.1.jar +0 -0
  8. data/core/target/classes/com/cybozu/labs/langdetect/Command.class +0 -0
  9. data/core/target/classes/com/cybozu/labs/langdetect/Detector.class +0 -0
  10. data/core/target/classes/com/cybozu/labs/langdetect/DetectorFactory.class +0 -0
  11. data/core/target/classes/com/cybozu/labs/langdetect/ErrorCode.class +0 -0
  12. data/core/target/classes/com/cybozu/labs/langdetect/GenProfile.class +0 -0
  13. data/core/target/classes/com/cybozu/labs/langdetect/LangDetectException.class +0 -0
  14. data/core/target/classes/com/cybozu/labs/langdetect/Language.class +0 -0
  15. data/core/target/classes/com/cybozu/labs/langdetect/util/LangProfile.class +0 -0
  16. data/core/target/classes/com/cybozu/labs/langdetect/util/Messages.class +0 -0
  17. data/core/target/classes/com/cybozu/labs/langdetect/util/NGram.class +0 -0
  18. data/core/target/classes/com/cybozu/labs/langdetect/util/TagExtractor.class +0 -0
  19. data/core/target/classes/com/cybozu/labs/langdetect/util/messages.properties +128 -0
  20. data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/CybozuDetector.class +0 -0
  21. data/core/target/classes/org/vicomtech/opennlp/LanguageDetection/Main.class +0 -0
  22. data/exec/language-identifier.rb +9 -0
  23. data/lib/opener/language_identifier.rb +89 -0
  24. data/lib/opener/language_identifier/cli.rb +139 -0
  25. data/lib/opener/language_identifier/detector.rb +36 -0
  26. data/lib/opener/language_identifier/kaf_builder.rb +62 -0
  27. data/lib/opener/language_identifier/public/markdown.css +283 -0
  28. data/lib/opener/language_identifier/server.rb +32 -0
  29. data/lib/opener/language_identifier/version.rb +5 -0
  30. data/lib/opener/language_identifier/views/index.erb +110 -0
  31. data/lib/opener/language_identifier/views/result.erb +15 -0
  32. data/opener-language-identifier.gemspec +37 -0
  33. metadata +231 -0
@@ -0,0 +1,139 @@
1
module Opener
  class LanguageIdentifier
    ##
    # CLI wrapper around {Opener::LanguageIdentifier} using OptionParser.
    #
    # Flags found in `options[:args]` are written back into {#options}, and
    # that Hash is then used to construct the {Opener::LanguageIdentifier}
    # instance that does the actual work.
    #
    # @!attribute [r] options
    #  @return [Hash]
    #
    # @!attribute [r] option_parser
    #  @return [OptionParser]
    #
    class CLI
      attr_reader :options, :option_parser

      ##
      # Builds the option parser. Nothing is parsed until {#run} is called.
      #
      # @param [Hash] options Values merged on top of `DEFAULT_OPTIONS`.
      #
      def initialize(options = {})
        @options = DEFAULT_OPTIONS.merge(options)

        @option_parser = OptionParser.new do |opts|
          opts.program_name   = 'language-identifier'
          opts.summary_indent = ' '

          opts.on('-v', '--version', 'Shows the current version') do
            show_version
          end

          opts.on('-k', '--[no-]kaf', 'Output the language as KAF') do |v|
            @options[:kaf] = v
          end

          # Probabilities are never rendered as KAF, so --probs also switches
          # KAF output off.
          opts.on('-p', '--probs', 'Provide probabilities, assumes --no-kaf') do
            @options[:kaf]   = false
            @options[:probs] = true
          end

          # The heredoc body is emitted verbatim in the help output, hence it
          # is not indented along with the surrounding code.
          opts.separator <<-EOF

Examples:

cat example_text.txt | #{opts.program_name} # Basic detection

Languages:

* ar Arabic
* bg Bulgarian
* bn Bengali
* cs Czech
* da Danish
* de German
* el Greek
* en English
* es Spanish
* et Estonian
* fa Persian
* fi Finnish
* fr French
* gu Gujarati
* he Hebrew
* hi Hindi
* hr Croatian
* hu Hungarian
* id Indonesian
* it Italian
* ja Japanese
* kn Kannada
* ko Korean
* lt Lithuanian
* lv Latvian
* mk Macedonian
* ml Malayalam
* mr Marathi
* ne Nepali
* nl Dutch
* no Norwegian
* pa Punjabi
* pl Polish
* pt Portuguese
* ro Romanian
* ru Russian
* sk Slovak
* sl Slovene
* so Somali
* sq Albanian
* sv Swedish
* sw Swahili
* ta Tamil
* te Telugu
* th Thai
* tl Tagalog
* tr Turkish
* uk Ukrainian
* ur Urdu
* vi Vietnamese
* zh-cn Simplified Chinese
* zh-tw Traditional Chinese
          EOF

          opts.separator ""
          opts.separator "Common options:"

          # Registered with on_tail so it is listed last in the summary.
          opts.on_tail("-h", "--help", "Show this message.") do
            show_help
          end
        end
      end

      ##
      # Parses the flags in `options[:args]` (destructively, via `parse!`),
      # runs the language identifier on the input and prints its output to
      # STDOUT.
      #
      # @param [String] input
      #
      def run(input)
        option_parser.parse!(options[:args])

        puts LanguageIdentifier.new(options).run(input)
      end

      private

      ##
      # Prints the help message to STDOUT and exits the program with a
      # success status.
      #
      def show_help
        puts option_parser
        exit
      end

      ##
      # Prints the version to STDOUT and exits the program with a success
      # status. Asking for the version is not an error, so `abort` (STDERR and
      # exit status 1) is deliberately not used.
      #
      def show_version
        puts "#{option_parser.program_name} v#{VERSION} on #{RUBY_DESCRIPTION}"
        exit
      end
    end # CLI
  end # LanguageIdentifier
end # Opener
@@ -0,0 +1,36 @@
1
+ require 'singleton'
2
+
3
+ import 'org.vicomtech.opennlp.LanguageDetection.CybozuDetector'
4
+
5
module Opener
  class LanguageIdentifier
    ##
    # Singleton wrapper around `CybozuDetector`. Every call into the wrapped
    # detector is serialized through a Mutex.
    #
    # Note that `Detector.instance` (from Singleton) constructs the object
    # without arguments, so {#options} is an empty Hash for the shared
    # instance and {#profiles_path} resolves to the bundled default.
    #
    # @!attribute [r] options
    #  @return [Hash]
    #
    class Detector
      include Singleton

      attr_reader :options

      ##
      # @param [Hash] options
      # @option options [String] :profiles_path Path handed to
      #  `CybozuDetector.new` instead of the default.
      #
      def initialize(options = {})
        @options   = options
        @detector  = CybozuDetector.new(profiles_path)
        @semaphore = Mutex.new
      end

      ##
      # Runs `detect` on the wrapped detector while holding the lock.
      #
      # @param [String] input
      # @return The value returned by the wrapped detector.
      #
      def detect(input)
        @semaphore.synchronize { @detector.detect(input) }
      end

      ##
      # Runs `detect_langs` on the wrapped detector while holding the lock.
      #
      # @param [String] input
      # @return The value returned by the wrapped detector.
      #
      def probabilities(input)
        @semaphore.synchronize { @detector.detect_langs(input) }
      end

      ##
      # Returns `options[:profiles_path]` when given, otherwise the
      # `core/target/classes/profiles` directory resolved relative to this
      # file.
      #
      # @return [String]
      #
      def profiles_path
        options.fetch(:profiles_path) do
          File.expand_path("../../../../core/target/classes/profiles", __FILE__)
        end
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
module Opener
  class LanguageIdentifier
    ##
    # Class for building basic KAF documents that contain the correct language
    # tag and the raw input that was analyzed.
    #
    # @!attribute [r] xml
    #  @return [Builder::XmlMarkup]
    #
    # @!attribute [r] original_text
    #  @return [String]
    #
    # @!attribute [r] language
    #  @return [String]
    #
    class KafBuilder
      attr_reader :xml, :original_text, :language

      ##
      # @param [String] text The input text that was analyzed.
      # @param [String] language The language of the text. Surrounding
      #  whitespace is stripped.
      #
      def initialize(text, language)
        @xml           = Builder::XmlMarkup.new(:indent => 2)
        @language      = language.strip
        @original_text = text
      end

      ##
      # Builds the KAF document: an XML declaration followed by a `KAF` root
      # element carrying the language and KAF version as attributes. The
      # original text is added through `node.raw`, which Builder renders as a
      # `raw` child element.
      #
      # The markup is appended to {#xml}; use {#to_s} to retrieve it.
      #
      def build
        xml.instruct!(
          :xml,
          :version    => '1.0',
          :encoding   => 'UTF-8',
          :standalone => 'yes'
        )

        xml.KAF('xml:lang' => language, 'version' => version) do |node|
          node.raw(original_text)
        end
      end

      ##
      # Returns the XML document as a String.
      #
      # @return [String]
      #
      def to_s
        xml.target!
      end

      ##
      # Returns the value written to the `version` attribute of the `KAF`
      # element.
      #
      # @return [String]
      #
      def version
        "2.1"
      end
    end # KafBuilder
  end # LanguageIdentifier
end # Opener
@@ -0,0 +1,283 @@
1
/*
 * Form-field sizing followed by Markdown-style typography rules
 * (headings, lists, definition lists, tables, framed images, code blocks).
 */

input[type="text"], textarea
{
  width: 500px;
}

/* Padding is set once through the shorthand; the former padding-top and
 * padding-bottom declarations were overridden by it and had no effect. */
body {
  font-family: Helvetica, arial, sans-serif;
  font-size: 14px;
  line-height: 1.6;
  background-color: white;
  padding: 30px; }

body > *:first-child {
  margin-top: 0 !important; }
body > *:last-child {
  margin-bottom: 0 !important; }

a {
  color: #4183C4; }
a.absent {
  color: #cc0000; }
a.anchor {
  display: block;
  padding-left: 30px;
  margin-left: -30px;
  cursor: pointer;
  position: absolute;
  top: 0;
  left: 0;
  bottom: 0; }

h1, h2, h3, h4, h5, h6 {
  margin: 20px 0 10px;
  padding: 0;
  font-weight: bold;
  -webkit-font-smoothing: antialiased;
  cursor: text;
  position: relative; }

h1:hover a.anchor, h2:hover a.anchor, h3:hover a.anchor, h4:hover a.anchor, h5:hover a.anchor, h6:hover a.anchor {
  background: url("../../images/modules/styleguide/para.png") no-repeat 10px center;
  text-decoration: none; }

/* Inline code inside any heading inherits the heading's font size. */
h1 tt, h1 code,
h2 tt, h2 code,
h3 tt, h3 code,
h4 tt, h4 code,
h5 tt, h5 code,
h6 tt, h6 code {
  font-size: inherit; }

h1 {
  font-size: 28px;
  color: black; }

h2 {
  font-size: 24px;
  border-bottom: 1px solid #cccccc;
  color: black; }

h3 {
  font-size: 18px; }

h4 {
  font-size: 16px; }

h5 {
  font-size: 14px; }

h6 {
  color: #777777;
  font-size: 14px; }

p, blockquote, ul, ol, dl, li, table, pre {
  margin: 15px 0; }

hr {
  background: transparent url("../../images/modules/pulls/dirty-shade.png") repeat-x 0 0;
  border: 0 none;
  color: #cccccc;
  height: 4px;
  padding: 0; }

body > h2:first-child {
  margin-top: 0;
  padding-top: 0; }
body > h1:first-child {
  margin-top: 0;
  padding-top: 0; }
body > h1:first-child + h2 {
  margin-top: 0;
  padding-top: 0; }
body > h3:first-child, body > h4:first-child, body > h5:first-child, body > h6:first-child {
  margin-top: 0;
  padding-top: 0; }

a:first-child h1, a:first-child h2, a:first-child h3, a:first-child h4, a:first-child h5, a:first-child h6 {
  margin-top: 0;
  padding-top: 0; }

h1 p, h2 p, h3 p, h4 p, h5 p, h6 p {
  margin-top: 0; }

li p.first {
  display: inline-block; }

ul, ol {
  padding-left: 30px; }

ul :first-child, ol :first-child {
  margin-top: 0; }

ul :last-child, ol :last-child {
  margin-bottom: 0; }

dl {
  padding: 0; }
dl dt {
  font-size: 14px;
  font-weight: bold;
  font-style: italic;
  padding: 0;
  margin: 15px 0 5px; }
dl dt:first-child {
  padding: 0; }
dl dt > :first-child {
  margin-top: 0; }
dl dt > :last-child {
  margin-bottom: 0; }
dl dd {
  margin: 0 0 15px;
  padding: 0 15px; }
dl dd > :first-child {
  margin-top: 0; }
dl dd > :last-child {
  margin-bottom: 0; }

blockquote {
  border-left: 4px solid #dddddd;
  padding: 0 15px;
  color: #777777; }
blockquote > :first-child {
  margin-top: 0; }
blockquote > :last-child {
  margin-bottom: 0; }

table {
  padding: 0; }
table tr {
  border-top: 1px solid #cccccc;
  background-color: white;
  margin: 0;
  padding: 0; }
table tr:nth-child(2n) {
  background-color: #f8f8f8; }
table tr th {
  font-weight: bold;
  border: 1px solid #cccccc;
  text-align: left;
  margin: 0;
  padding: 6px 13px; }
table tr td {
  border: 1px solid #cccccc;
  text-align: left;
  margin: 0;
  padding: 6px 13px; }
table tr th :first-child, table tr td :first-child {
  margin-top: 0; }
table tr th :last-child, table tr td :last-child {
  margin-bottom: 0; }

img {
  max-width: 100%; }

span.frame {
  display: block;
  overflow: hidden; }
span.frame > span {
  border: 1px solid #dddddd;
  display: block;
  float: left;
  overflow: hidden;
  margin: 13px 0 0;
  padding: 7px;
  width: auto; }
span.frame span img {
  display: block;
  float: left; }
span.frame span span {
  clear: both;
  color: #333333;
  display: block;
  padding: 5px 0 0; }
span.align-center {
  display: block;
  overflow: hidden;
  clear: both; }
span.align-center > span {
  display: block;
  overflow: hidden;
  margin: 13px auto 0;
  text-align: center; }
span.align-center span img {
  margin: 0 auto;
  text-align: center; }
span.align-right {
  display: block;
  overflow: hidden;
  clear: both; }
span.align-right > span {
  display: block;
  overflow: hidden;
  margin: 13px 0 0;
  text-align: right; }
span.align-right span img {
  margin: 0;
  text-align: right; }
span.float-left {
  display: block;
  margin-right: 13px;
  overflow: hidden;
  float: left; }
span.float-left span {
  margin: 13px 0 0; }
span.float-right {
  display: block;
  margin-left: 13px;
  overflow: hidden;
  float: right; }
span.float-right > span {
  display: block;
  overflow: hidden;
  margin: 13px auto 0;
  text-align: right; }

code, tt {
  margin: 0 2px;
  padding: 0 5px;
  white-space: nowrap;
  border: 1px solid #eaeaea;
  background-color: #f8f8f8;
  border-radius: 3px; }

pre code {
  margin: 0;
  padding: 0;
  white-space: pre;
  border: none;
  background: transparent; }

/* Highlighted and plain code blocks share the same box styling. */
.highlight pre,
pre {
  background-color: #f8f8f8;
  border: 1px solid #cccccc;
  font-size: 13px;
  line-height: 19px;
  overflow: auto;
  padding: 6px 10px;
  border-radius: 3px; }
pre code, pre tt {
  background-color: transparent;
  border: none; }
+