twkorean 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +32 -36
- data/lib/jars/korean-text-4.4.jar +0 -0
- data/lib/jars/twitter-text-1.13.3.jar +0 -0
- data/lib/twkorean.rb +1 -1
- data/lib/twkorean/twitter_korean_text.rb +17 -17
- data/lib/twkorean/version.rb +2 -2
- data/test/test_helper.rb +1 -1
- data/test/twkorean.rb +13 -25
- metadata +4 -4
- data/lib/jars/korean-text-3.0.jar +0 -0
- data/lib/jars/twitter-text-1.11.1.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 36d821bc142a63e34c0e143cb53ed2e72f0f72f9
|
4
|
+
data.tar.gz: 274df73ac337c3f01090f942c4b5f3948498470c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b3eb5720642faf5de2e04cc2c2d035f15da4889ed6f723facdd2771f9f4f404d52d09dffc1cc8f39a911d4456c62041c51a697c45c7410b8a4e8ae69c151b46f
|
7
|
+
data.tar.gz: 9c80732be97e998f09ff640fc7bbedbb6ddc6f412805386cb1b7b3c8988597331814a6af66ab4e6247b404f7f2525dae9264c11eb972afff3e2517b8d8a6dc1a
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
## Compatibility
|
4
4
|
|
5
|
-
Currently wraps [twitter-korean-text
|
5
|
+
Currently wraps [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4) / 현재 이 프로젝트는 [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4)을 사용중입니다.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -29,41 +29,37 @@ Or install it yourself as:
|
|
29
29
|
|
30
30
|
## Usage
|
31
31
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
# ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
|
64
|
-
end
|
65
|
-
end
|
66
|
-
## Contributing
|
32
|
+
describe "Twkorean" do
|
33
|
+
text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
|
34
|
+
twkorean = Twkorean::TwitterKoreanText.new
|
35
|
+
text = twkorean.normalize(text)
|
36
|
+
tokens = twkorean.tokenize(text)
|
37
|
+
|
38
|
+
it "Tokenize" do
|
39
|
+
p "#Tokenize"
|
40
|
+
p twkorean.tokens_to_string_list(tokens)
|
41
|
+
# ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
|
42
|
+
p twkorean.tokens_to_token_list(tokens)
|
43
|
+
# ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
|
44
|
+
end
|
45
|
+
|
46
|
+
it "Stemming" do
|
47
|
+
p "#Stemming"
|
48
|
+
stem = twkorean.stem(tokens)
|
49
|
+
p twkorean.tokens_to_string_list(stem)
|
50
|
+
# ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
|
51
|
+
p twkorean.tokens_to_token_list(stem)
|
52
|
+
# ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
|
53
|
+
end
|
54
|
+
|
55
|
+
it "Phrase extraction" do
|
56
|
+
p "Phrase extraction"
|
57
|
+
p twkorean.extract_phrases(tokens)
|
58
|
+
# ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
end## Contributing
|
67
63
|
|
68
64
|
1. Fork it ( https://github.com/[my-github-username]/twkorean/fork )
|
69
65
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
Binary file
|
Binary file
|
data/lib/twkorean.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# @name twkorean-ruby
|
2
2
|
# @author JunSangPil
|
3
|
-
# @version 0.0.3
|
3
|
+
# @version 0.0.4
|
4
4
|
# @url https://github.com/jun85664396/twkorean-ruby
|
5
5
|
# @license Apache License 2.0
|
6
6
|
module Twkorean
|
@@ -11,14 +11,7 @@ module Twkorean
|
|
11
11
|
def initialize(normalization = true, stemming = true)
|
12
12
|
jars = Dir.glob(File.dirname(__FILE__)+"/../jars/*.jar").join(':')
|
13
13
|
Rjb::load(jars, ['-Xmx512M'])
|
14
|
-
korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava
|
15
|
-
unless normalization
|
16
|
-
korean_processor.disableNormalizer
|
17
|
-
end
|
18
|
-
unless stemming
|
19
|
-
korean_processor.disableStemmer
|
20
|
-
end
|
21
|
-
self.korean_processor = korean_processor.build
|
14
|
+
self.korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava')
|
22
15
|
end
|
23
16
|
|
24
17
|
def normalize(text)
|
@@ -27,19 +20,26 @@ module Twkorean
|
|
27
20
|
|
28
21
|
def tokenize(text)
|
29
22
|
tokens = self.korean_processor.tokenize(text)
|
30
|
-
|
31
|
-
tokens.toArray.map{|x| x.toString}
|
23
|
+
tokens
|
32
24
|
end
|
33
25
|
|
34
|
-
def
|
35
|
-
tokens = self.korean_processor.
|
36
|
-
return [] unless tokens
|
26
|
+
def tokens_to_string_list(tokens)
|
27
|
+
tokens = self.korean_processor.tokensToJavaStringList(tokens)
|
37
28
|
tokens.toArray.map{|x| x.toString}
|
38
29
|
end
|
39
30
|
|
40
|
-
def
|
41
|
-
|
42
|
-
|
31
|
+
def tokens_to_token_list(tokens)
|
32
|
+
tokens = self.korean_processor.tokensToJavaKoreanTokenList(tokens)
|
33
|
+
tokens.toArray.map{|x| self.parser(x.toString)}
|
34
|
+
end
|
35
|
+
|
36
|
+
def stem(tokens)
|
37
|
+
stemmed = self.korean_processor.stem(tokens)
|
38
|
+
stemmed
|
39
|
+
end
|
40
|
+
|
41
|
+
def extract_phrases(tokens)
|
42
|
+
phrases = self.korean_processor.extractPhrases(tokens, true, true)
|
43
43
|
phrases.toArray.map{|x| x.toString}
|
44
44
|
end
|
45
45
|
|
data/lib/twkorean/version.rb
CHANGED
data/test/test_helper.rb
CHANGED
data/test/twkorean.rb
CHANGED
@@ -1,50 +1,38 @@
|
|
1
1
|
# @name twkorean-ruby
|
2
2
|
# @author JunSangPil
|
3
|
-
# @version 0.0.3
|
3
|
+
# @version 0.0.4
|
4
4
|
# @url https://github.com/jun85664396/twkorean-ruby
|
5
5
|
# @license Apache License 2.0
|
6
6
|
require_relative 'test_helper'
|
7
7
|
require 'twkorean'
|
8
8
|
|
9
9
|
describe "Twkorean" do
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
it "Normalize" do
|
16
|
-
twkorean = Twkorean::TwitterKoreanText.new
|
17
|
-
p "Normlize"
|
18
|
-
p twkorean.normalize(TEXT)
|
19
|
-
# 한국어를 처리하는 예시입니다ㅋㅋ #한국어
|
20
|
-
end
|
10
|
+
text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
|
11
|
+
twkorean = Twkorean::TwitterKoreanText.new
|
12
|
+
text = twkorean.normalize(text)
|
13
|
+
tokens = twkorean.tokenize(text)
|
21
14
|
|
22
15
|
it "Tokenize" do
|
23
|
-
twkorean = Twkorean::TwitterKoreanText.new(true, false)
|
24
16
|
p "#Tokenize"
|
25
|
-
p twkorean.
|
17
|
+
p twkorean.tokens_to_string_list(tokens)
|
18
|
+
# ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
|
19
|
+
p twkorean.tokens_to_token_list(tokens)
|
26
20
|
# ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
|
27
21
|
end
|
28
22
|
|
29
23
|
it "Stemming" do
|
30
|
-
twkorean = Twkorean::TwitterKoreanText.new
|
31
24
|
p "#Stemming"
|
32
|
-
|
25
|
+
stem = twkorean.stem(tokens)
|
26
|
+
p twkorean.tokens_to_string_list(stem)
|
27
|
+
# ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
|
28
|
+
p twkorean.tokens_to_token_list(stem)
|
33
29
|
# ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
|
34
30
|
end
|
35
31
|
|
36
32
|
it "Phrase extraction" do
|
37
|
-
twkorean = Twkorean::TwitterKoreanText.new
|
38
33
|
p "Phrase extraction"
|
39
|
-
p twkorean.extract_phrases(
|
34
|
+
p twkorean.extract_phrases(tokens)
|
40
35
|
# ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
|
41
36
|
end
|
42
37
|
|
43
|
-
it "Parser" do
|
44
|
-
twkorean = Twkorean::TwitterKoreanText.new(true, false)
|
45
|
-
p "#Tokenize Parser"
|
46
|
-
p twkorean.tokenize(TEXT).map{|x| twkorean.parser(x) }
|
47
|
-
# ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
|
48
|
-
end
|
49
|
-
|
50
38
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twkorean
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- JunSangPil
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-02-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -78,9 +78,9 @@ files:
|
|
78
78
|
- LICENSE.txt
|
79
79
|
- README.md
|
80
80
|
- Rakefile
|
81
|
-
- lib/jars/korean-text-3.0.jar
|
81
|
+
- lib/jars/korean-text-4.4.jar
|
82
82
|
- lib/jars/scala-library-2.11.6.jar
|
83
|
-
- lib/jars/twitter-text-1.11.1.jar
|
83
|
+
- lib/jars/twitter-text-1.13.3.jar
|
84
84
|
- lib/twkorean.rb
|
85
85
|
- lib/twkorean/twitter_korean_text.rb
|
86
86
|
- lib/twkorean/version.rb
|
Binary file
|
Binary file
|