twkorean 0.0.3 → 0.0.4
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +32 -36
- data/lib/jars/korean-text-4.4.jar +0 -0
- data/lib/jars/twitter-text-1.13.3.jar +0 -0
- data/lib/twkorean.rb +1 -1
- data/lib/twkorean/twitter_korean_text.rb +17 -17
- data/lib/twkorean/version.rb +2 -2
- data/test/test_helper.rb +1 -1
- data/test/twkorean.rb +13 -25
- metadata +4 -4
- data/lib/jars/korean-text-3.0.jar +0 -0
- data/lib/jars/twitter-text-1.11.1.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 36d821bc142a63e34c0e143cb53ed2e72f0f72f9
|
4
|
+
data.tar.gz: 274df73ac337c3f01090f942c4b5f3948498470c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b3eb5720642faf5de2e04cc2c2d035f15da4889ed6f723facdd2771f9f4f404d52d09dffc1cc8f39a911d4456c62041c51a697c45c7410b8a4e8ae69c151b46f
|
7
|
+
data.tar.gz: 9c80732be97e998f09ff640fc7bbedbb6ddc6f412805386cb1b7b3c8988597331814a6af66ab4e6247b404f7f2525dae9264c11eb972afff3e2517b8d8a6dc1a
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
## Compatibility
|
4
4
|
|
5
|
-
Currently wraps [twitter-korean-text
|
5
|
+
Currently wraps [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4) / 현재 이 프로젝트는 [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4)을 사용중입니다.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -29,41 +29,37 @@ Or install it yourself as:
|
|
29
29
|
|
30
30
|
## Usage
|
31
31
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
# ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
|
64
|
-
end
|
65
|
-
end
|
66
|
-
## Contributing
|
32
|
+
describe "Twkorean" do
|
33
|
+
text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
|
34
|
+
twkorean = Twkorean::TwitterKoreanText.new
|
35
|
+
text = twkorean.normalize(text)
|
36
|
+
tokens = twkorean.tokenize(text)
|
37
|
+
|
38
|
+
it "Tokenize" do
|
39
|
+
p "#Tokenize"
|
40
|
+
p twkorean.tokens_to_string_list(tokens)
|
41
|
+
# ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
|
42
|
+
p twkorean.tokens_to_token_list(tokens)
|
43
|
+
# ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
|
44
|
+
end
|
45
|
+
|
46
|
+
it "Stemming" do
|
47
|
+
p "#Stemming"
|
48
|
+
stem = twkorean.stem(tokens)
|
49
|
+
p twkorean.tokens_to_string_list(stem)
|
50
|
+
# ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
|
51
|
+
p twkorean.tokens_to_token_list(stem)
|
52
|
+
# ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
|
53
|
+
end
|
54
|
+
|
55
|
+
it "Phrase extraction" do
|
56
|
+
p "Phrase extraction"
|
57
|
+
p twkorean.extract_phrases(tokens)
|
58
|
+
# ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
end## Contributing
|
67
63
|
|
68
64
|
1. Fork it ( https://github.com/[my-github-username]/twkorean/fork )
|
69
65
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
Binary file
|
Binary file
|
data/lib/twkorean.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# @name twkorean-ruby
|
2
2
|
# @author JunSangPil
|
3
|
-
# @version 0.0.
|
3
|
+
# @version 0.0.4
|
4
4
|
# @url https://github.com/jun85664396/twkorean-ruby
|
5
5
|
# @license Apache License 2.0
|
6
6
|
module Twkorean
|
@@ -11,14 +11,7 @@ module Twkorean
|
|
11
11
|
def initialize(normalization = true, stemming = true)
|
12
12
|
jars = Dir.glob(File.dirname(__FILE__)+"/../jars/*.jar").join(':')
|
13
13
|
Rjb::load(jars, ['-Xmx512M'])
|
14
|
-
korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava
|
15
|
-
unless normalization
|
16
|
-
korean_processor.disableNormalizer
|
17
|
-
end
|
18
|
-
unless stemming
|
19
|
-
korean_processor.disableStemmer
|
20
|
-
end
|
21
|
-
self.korean_processor = korean_processor.build
|
14
|
+
self.korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava')
|
22
15
|
end
|
23
16
|
|
24
17
|
def normalize(text)
|
@@ -27,19 +20,26 @@ module Twkorean
|
|
27
20
|
|
28
21
|
def tokenize(text)
|
29
22
|
tokens = self.korean_processor.tokenize(text)
|
30
|
-
|
31
|
-
tokens.toArray.map{|x| x.toString}
|
23
|
+
tokens
|
32
24
|
end
|
33
25
|
|
34
|
-
def
|
35
|
-
tokens = self.korean_processor.
|
36
|
-
return [] unless tokens
|
26
|
+
def tokens_to_string_list(tokens)
|
27
|
+
tokens = self.korean_processor.tokensToJavaStringList(tokens)
|
37
28
|
tokens.toArray.map{|x| x.toString}
|
38
29
|
end
|
39
30
|
|
40
|
-
def
|
41
|
-
|
42
|
-
|
31
|
+
def tokens_to_token_list(tokens)
|
32
|
+
tokens = self.korean_processor.tokensToJavaKoreanTokenList(tokens)
|
33
|
+
tokens.toArray.map{|x| self.parser(x.toString)}
|
34
|
+
end
|
35
|
+
|
36
|
+
def stem(tokens)
|
37
|
+
stemmed = self.korean_processor.stem(tokens)
|
38
|
+
stemmed
|
39
|
+
end
|
40
|
+
|
41
|
+
def extract_phrases(tokens)
|
42
|
+
phrases = self.korean_processor.extractPhrases(tokens, true, true)
|
43
43
|
phrases.toArray.map{|x| x.toString}
|
44
44
|
end
|
45
45
|
|
data/lib/twkorean/version.rb
CHANGED
data/test/test_helper.rb
CHANGED
data/test/twkorean.rb
CHANGED
@@ -1,50 +1,38 @@
|
|
1
1
|
# @name twkorean-ruby
|
2
2
|
# @author JunSangPil
|
3
|
-
# @version 0.0.
|
3
|
+
# @version 0.0.4
|
4
4
|
# @url https://github.com/jun85664396/twkorean-ruby
|
5
5
|
# @license Apache License 2.0
|
6
6
|
require_relative 'test_helper'
|
7
7
|
require 'twkorean'
|
8
8
|
|
9
9
|
describe "Twkorean" do
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
it "Normalize" do
|
16
|
-
twkorean = Twkorean::TwitterKoreanText.new
|
17
|
-
p "Normlize"
|
18
|
-
p twkorean.normalize(TEXT)
|
19
|
-
# 한국어를 처리하는 예시입니다ㅋㅋ #한국어
|
20
|
-
end
|
10
|
+
text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
|
11
|
+
twkorean = Twkorean::TwitterKoreanText.new
|
12
|
+
text = twkorean.normalize(text)
|
13
|
+
tokens = twkorean.tokenize(text)
|
21
14
|
|
22
15
|
it "Tokenize" do
|
23
|
-
twkorean = Twkorean::TwitterKoreanText.new(true, false)
|
24
16
|
p "#Tokenize"
|
25
|
-
p twkorean.
|
17
|
+
p twkorean.tokens_to_string_list(tokens)
|
18
|
+
# ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
|
19
|
+
p twkorean.tokens_to_token_list(tokens)
|
26
20
|
# ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
|
27
21
|
end
|
28
22
|
|
29
23
|
it "Stemming" do
|
30
|
-
twkorean = Twkorean::TwitterKoreanText.new
|
31
24
|
p "#Stemming"
|
32
|
-
|
25
|
+
stem = twkorean.stem(tokens)
|
26
|
+
p twkorean.tokens_to_string_list(stem)
|
27
|
+
# ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
|
28
|
+
p twkorean.tokens_to_token_list(stem)
|
33
29
|
# ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
|
34
30
|
end
|
35
31
|
|
36
32
|
it "Phrase extraction" do
|
37
|
-
twkorean = Twkorean::TwitterKoreanText.new
|
38
33
|
p "Phrase extraction"
|
39
|
-
p twkorean.extract_phrases(
|
34
|
+
p twkorean.extract_phrases(tokens)
|
40
35
|
# ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
|
41
36
|
end
|
42
37
|
|
43
|
-
it "Parser" do
|
44
|
-
twkorean = Twkorean::TwitterKoreanText.new(true, false)
|
45
|
-
p "#Tokenize Parser"
|
46
|
-
p twkorean.tokenize(TEXT).map{|x| twkorean.parser(x) }
|
47
|
-
# ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
|
48
|
-
end
|
49
|
-
|
50
38
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twkorean
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- JunSangPil
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-02-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -78,9 +78,9 @@ files:
|
|
78
78
|
- LICENSE.txt
|
79
79
|
- README.md
|
80
80
|
- Rakefile
|
81
|
-
- lib/jars/korean-text-3.0.jar
|
81
|
+
- lib/jars/korean-text-4.4.jar
|
82
82
|
- lib/jars/scala-library-2.11.6.jar
|
83
|
-
- lib/jars/twitter-text-1.11.1.jar
|
83
|
+
- lib/jars/twitter-text-1.13.3.jar
|
84
84
|
- lib/twkorean.rb
|
85
85
|
- lib/twkorean/twitter_korean_text.rb
|
86
86
|
- lib/twkorean/version.rb
|
Binary file
|
Binary file
|