twkorean 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9fa5c2b3f783010b5b67c23db599ffd3ec75ef61
4
- data.tar.gz: 2117abd053395906001c822082aef413c91441c2
3
+ metadata.gz: 36d821bc142a63e34c0e143cb53ed2e72f0f72f9
4
+ data.tar.gz: 274df73ac337c3f01090f942c4b5f3948498470c
5
5
  SHA512:
6
- metadata.gz: 7ed15b2886e42e367c0652e1a86aad07b98c09279b37e0cd02b1e733499076e69c9481573e91c51d01ae70ed4e3feec4d0610e8dd0bed059d5a536c9193ce776
7
- data.tar.gz: 854b177257cf8252e1a81916a02ea721e01ef94ea3e3d4a4d02b02e65a9872bb30934d46c0b7c24a3cb8a8ae5fbca06dceb688496807a4d0f24a6c65ece08198
6
+ metadata.gz: b3eb5720642faf5de2e04cc2c2d035f15da4889ed6f723facdd2771f9f4f404d52d09dffc1cc8f39a911d4456c62041c51a697c45c7410b8a4e8ae69c151b46f
7
+ data.tar.gz: 9c80732be97e998f09ff640fc7bbedbb6ddc6f412805386cb1b7b3c8988597331814a6af66ab4e6247b404f7f2525dae9264c11eb972afff3e2517b8d8a6dc1a
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  ## Compatibility
4
4
 
5
- Currently wraps [twitter-korean-text 3.0](https://github.com/twitter/twitter-korean-text/tree/korean-text-3.0) / 현재 이 프로젝트는 [twitter-korean-text 3.0](https://github.com/twitter/twitter-korean-text/tree/korean-text-3.0)을 사용중입니다.
5
+ Currently wraps [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4) / 현재 이 프로젝트는 [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4)을 사용중입니다.
6
6
 
7
7
  ## Installation
8
8
 
@@ -29,41 +29,37 @@ Or install it yourself as:
29
29
 
30
30
  ## Usage
31
31
 
32
- describe "Twkorean" do
33
- TEXT = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
34
- before do
35
-
36
- end
37
-
38
- it "Normalize" do
39
- twkorean = Twkorean::TwitterKoreanText.new
40
- p "Normlize"
41
- p twkorean.normalize(TEXT)
42
- # 한국어를 처리하는 예시입니다ㅋㅋ #한국어
43
- end
44
-
45
- it "Tokenize" do
46
- twkorean = Twkorean::TwitterKoreanText.new(true, false)
47
- p "#Tokenize"
48
- p twkorean.tokenize(TEXT)
49
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
50
- end
51
-
52
- it "Stemming" do
53
- twkorean = Twkorean::TwitterKoreanText.new
54
- p "#Stemming"
55
- p twkorean.tokenize(TEXT)
56
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
57
- end
58
-
59
- it "Phrase extraction" do
60
- twkorean = Twkorean::TwitterKoreanText.new
61
- p "Phrase extraction"
62
- p twkorean.extract_phrases(TEXT)
63
- # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
64
- end
65
- end
66
- ## Contributing
32
+ describe "Twkorean" do
33
+ text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
34
+ twkorean = Twkorean::TwitterKoreanText.new
35
+ text = twkorean.normalize(text)
36
+ tokens = twkorean.tokenize(text)
37
+
38
+ it "Tokenize" do
39
+ p "#Tokenize"
40
+ p twkorean.tokens_to_string_list(tokens)
41
+ # ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
42
+ p twkorean.tokens_to_token_list(tokens)
43
+ # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
44
+ end
45
+
46
+ it "Stemming" do
47
+ p "#Stemming"
48
+ stem = twkorean.stem(tokens)
49
+ p twkorean.tokens_to_string_list(stem)
50
+ # ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
51
+ p twkorean.tokens_to_token_list(stem)
52
+ # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
53
+ end
54
+
55
+ it "Phrase extraction" do
56
+ p "Phrase extraction"
57
+ p twkorean.extract_phrases(tokens)
58
+ # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
59
+ end
60
+ end
61
+
62
+ ## Contributing
67
63
 
68
64
  1. Fork it ( https://github.com/[my-github-username]/twkorean/fork )
69
65
  2. Create your feature branch (`git checkout -b my-new-feature`)
@@ -1,6 +1,6 @@
1
1
  # @name twkorean-ruby
2
2
  # @author JunSangPil
3
- # @version 0.0.3
3
+ # @version 0.0.4
4
4
  # @url https://github.com/jun85664396/twkorean-ruby
5
5
  # @license Apache License 2.0
6
6
  require "twkorean/version"
@@ -1,6 +1,6 @@
1
1
  # @name twkorean-ruby
2
2
  # @author JunSangPil
3
- # @version 0.0.3
3
+ # @version 0.0.4
4
4
  # @url https://github.com/jun85664396/twkorean-ruby
5
5
  # @license Apache License 2.0
6
6
  module Twkorean
@@ -11,14 +11,7 @@ module Twkorean
11
11
  def initialize(normalization = true, stemming = true)
12
12
  jars = Dir.glob(File.dirname(__FILE__)+"/../jars/*.jar").join(':')
13
13
  Rjb::load(jars, ['-Xmx512M'])
14
- korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava$Builder').new
15
- unless normalization
16
- korean_processor.disableNormalizer
17
- end
18
- unless stemming
19
- korean_processor.disableStemmer
20
- end
21
- self.korean_processor = korean_processor.build
14
+ self.korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava')
22
15
  end
23
16
 
24
17
  def normalize(text)
@@ -27,19 +20,26 @@ module Twkorean
27
20
 
28
21
  def tokenize(text)
29
22
  tokens = self.korean_processor.tokenize(text)
30
- return [] unless tokens
31
- tokens.toArray.map{|x| x.toString}
23
+ tokens
32
24
  end
33
25
 
34
- def tokenize_to_strings(text)
35
- tokens = self.korean_processor.tokenizeToStrings(text)
36
- return [] unless tokens
26
+ def tokens_to_string_list(tokens)
27
+ tokens = self.korean_processor.tokensToJavaStringList(tokens)
37
28
  tokens.toArray.map{|x| x.toString}
38
29
  end
39
30
 
40
- def extract_phrases(text)
41
- phrases = self.korean_processor.extractPhrases(text)
42
- return [] unless phrases
31
+ def tokens_to_token_list(tokens)
32
+ tokens = self.korean_processor.tokensToJavaKoreanTokenList(tokens)
33
+ tokens.toArray.map{|x| self.parser(x.toString)}
34
+ end
35
+
36
+ def stem(tokens)
37
+ stemmed = self.korean_processor.stem(tokens)
38
+ stemmed
39
+ end
40
+
41
+ def extract_phrases(tokens)
42
+ phrases = self.korean_processor.extractPhrases(tokens, true, true)
43
43
  phrases.toArray.map{|x| x.toString}
44
44
  end
45
45
 
@@ -1,8 +1,8 @@
1
1
  # @name twkorean-ruby
2
2
  # @author JunSangPil
3
- # @version 0.0.3
3
+ # @version 0.0.4
4
4
  # @url https://github.com/jun85664396/twkorean-ruby
5
5
  # @license Apache License 2.0
6
6
  module Twkorean
7
- VERSION = "0.0.3"
7
+ VERSION = "0.0.4"
8
8
  end
@@ -1,6 +1,6 @@
1
1
  # @name twkorean-ruby
2
2
  # @author JunSangPil
3
- # @version 0.0.3
3
+ # @version 0.0.4
4
4
  # @url https://github.com/jun85664396/twkorean-ruby
5
5
  # @license Apache License 2.0
6
6
  require 'minitest/autorun'
@@ -1,50 +1,38 @@
1
1
  # @name twkorean-ruby
2
2
  # @author JunSangPil
3
- # @version 0.0.3
3
+ # @version 0.0.4
4
4
  # @url https://github.com/jun85664396/twkorean-ruby
5
5
  # @license Apache License 2.0
6
6
  require_relative 'test_helper'
7
7
  require 'twkorean'
8
8
 
9
9
  describe "Twkorean" do
10
- TEXT = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
11
- before do
12
-
13
- end
14
-
15
- it "Normalize" do
16
- twkorean = Twkorean::TwitterKoreanText.new
17
- p "Normlize"
18
- p twkorean.normalize(TEXT)
19
- # 한국어를 처리하는 예시입니다ㅋㅋ #한국어
20
- end
10
+ text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
11
+ twkorean = Twkorean::TwitterKoreanText.new
12
+ text = twkorean.normalize(text)
13
+ tokens = twkorean.tokenize(text)
21
14
 
22
15
  it "Tokenize" do
23
- twkorean = Twkorean::TwitterKoreanText.new(true, false)
24
16
  p "#Tokenize"
25
- p twkorean.tokenize(TEXT)
17
+ p twkorean.tokens_to_string_list(tokens)
18
+ # ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
19
+ p twkorean.tokens_to_token_list(tokens)
26
20
  # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
27
21
  end
28
22
 
29
23
  it "Stemming" do
30
- twkorean = Twkorean::TwitterKoreanText.new
31
24
  p "#Stemming"
32
- p twkorean.tokenize(TEXT)
25
+ stem = twkorean.stem(tokens)
26
+ p twkorean.tokens_to_string_list(stem)
27
+ # ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
28
+ p twkorean.tokens_to_token_list(stem)
33
29
  # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
34
30
  end
35
31
 
36
32
  it "Phrase extraction" do
37
- twkorean = Twkorean::TwitterKoreanText.new
38
33
  p "Phrase extraction"
39
- p twkorean.extract_phrases(TEXT)
34
+ p twkorean.extract_phrases(tokens)
40
35
  # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
41
36
  end
42
37
 
43
- it "Parser" do
44
- twkorean = Twkorean::TwitterKoreanText.new(true, false)
45
- p "#Tokenize Parser"
46
- p twkorean.tokenize(TEXT).map{|x| twkorean.parser(x) }
47
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
48
- end
49
-
50
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twkorean
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - JunSangPil
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-05 00:00:00.000000000 Z
11
+ date: 2016-02-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -78,9 +78,9 @@ files:
78
78
  - LICENSE.txt
79
79
  - README.md
80
80
  - Rakefile
81
- - lib/jars/korean-text-3.0.jar
81
+ - lib/jars/korean-text-4.4.jar
82
82
  - lib/jars/scala-library-2.11.6.jar
83
- - lib/jars/twitter-text-1.11.1.jar
83
+ - lib/jars/twitter-text-1.13.3.jar
84
84
  - lib/twkorean.rb
85
85
  - lib/twkorean/twitter_korean_text.rb
86
86
  - lib/twkorean/version.rb
Binary file