twkorean 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9fa5c2b3f783010b5b67c23db599ffd3ec75ef61
4
- data.tar.gz: 2117abd053395906001c822082aef413c91441c2
3
+ metadata.gz: 36d821bc142a63e34c0e143cb53ed2e72f0f72f9
4
+ data.tar.gz: 274df73ac337c3f01090f942c4b5f3948498470c
5
5
  SHA512:
6
- metadata.gz: 7ed15b2886e42e367c0652e1a86aad07b98c09279b37e0cd02b1e733499076e69c9481573e91c51d01ae70ed4e3feec4d0610e8dd0bed059d5a536c9193ce776
7
- data.tar.gz: 854b177257cf8252e1a81916a02ea721e01ef94ea3e3d4a4d02b02e65a9872bb30934d46c0b7c24a3cb8a8ae5fbca06dceb688496807a4d0f24a6c65ece08198
6
+ metadata.gz: b3eb5720642faf5de2e04cc2c2d035f15da4889ed6f723facdd2771f9f4f404d52d09dffc1cc8f39a911d4456c62041c51a697c45c7410b8a4e8ae69c151b46f
7
+ data.tar.gz: 9c80732be97e998f09ff640fc7bbedbb6ddc6f412805386cb1b7b3c8988597331814a6af66ab4e6247b404f7f2525dae9264c11eb972afff3e2517b8d8a6dc1a
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  ## Compatibility
4
4
 
5
- Currently wraps [twitter-korean-text 3.0](https://github.com/twitter/twitter-korean-text/tree/korean-text-3.0) / 현재 이 프로젝트는 [twitter-korean-text 3.0](https://github.com/twitter/twitter-korean-text/tree/korean-text-3.0)을 사용중입니다.
5
+ Currently wraps [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4) / 현재 이 프로젝트는 [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4)를 사용 중입니다.
6
6
 
7
7
  ## Installation
8
8
 
@@ -29,41 +29,37 @@ Or install it yourself as:
29
29
 
30
30
  ## Usage
31
31
 
32
- describe "Twkorean" do
33
- TEXT = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
34
- before do
35
-
36
- end
37
-
38
- it "Normalize" do
39
- twkorean = Twkorean::TwitterKoreanText.new
40
- p "Normlize"
41
- p twkorean.normalize(TEXT)
42
- # 한국어를 처리하는 예시입니다ㅋㅋ #한국어
43
- end
44
-
45
- it "Tokenize" do
46
- twkorean = Twkorean::TwitterKoreanText.new(true, false)
47
- p "#Tokenize"
48
- p twkorean.tokenize(TEXT)
49
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
50
- end
51
-
52
- it "Stemming" do
53
- twkorean = Twkorean::TwitterKoreanText.new
54
- p "#Stemming"
55
- p twkorean.tokenize(TEXT)
56
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
57
- end
58
-
59
- it "Phrase extraction" do
60
- twkorean = Twkorean::TwitterKoreanText.new
61
- p "Phrase extraction"
62
- p twkorean.extract_phrases(TEXT)
63
- # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
64
- end
65
- end
66
- ## Contributing
32
+ describe "Twkorean" do
33
+ text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
34
+ twkorean = Twkorean::TwitterKoreanText.new
35
+ text = twkorean.normalize(text)
36
+ tokens = twkorean.tokenize(text)
37
+
38
+ it "Tokenize" do
39
+ p "#Tokenize"
40
+ p twkorean.tokens_to_string_list(tokens)
41
+ # ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
42
+ p twkorean.tokens_to_token_list(tokens)
43
+ # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
44
+ end
45
+
46
+ it "Stemming" do
47
+ p "#Stemming"
48
+ stem = twkorean.stem(tokens)
49
+ p twkorean.tokens_to_string_list(stem)
50
+ # ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
51
+ p twkorean.tokens_to_token_list(stem)
52
+ # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
53
+ end
54
+
55
+ it "Phrase extraction" do
56
+ p "Phrase extraction"
57
+ p twkorean.extract_phrases(tokens)
58
+ # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
59
+ end
60
+ end
61
+
62
+ ## Contributing
67
63
 
68
64
  1. Fork it ( https://github.com/[my-github-username]/twkorean/fork )
69
65
  2. Create your feature branch (`git checkout -b my-new-feature`)
@@ -1,6 +1,6 @@
1
1
  # @name twkorean-ruby
2
2
  # @author JunSangPil
3
- # @version 0.0.3
3
+ # @version 0.0.4
4
4
  # @url https://github.com/jun85664396/twkorean-ruby
5
5
  # @license Apache License 2.0
6
6
  require "twkorean/version"
@@ -1,6 +1,6 @@
1
1
  # @name twkorean-ruby
2
2
  # @author JunSangPil
3
- # @version 0.0.3
3
+ # @version 0.0.4
4
4
  # @url https://github.com/jun85664396/twkorean-ruby
5
5
  # @license Apache License 2.0
6
6
  module Twkorean
@@ -11,14 +11,7 @@ module Twkorean
11
11
  def initialize(normalization = true, stemming = true)
12
12
  jars = Dir.glob(File.dirname(__FILE__)+"/../jars/*.jar").join(':')
13
13
  Rjb::load(jars, ['-Xmx512M'])
14
- korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava$Builder').new
15
- unless normalization
16
- korean_processor.disableNormalizer
17
- end
18
- unless stemming
19
- korean_processor.disableStemmer
20
- end
21
- self.korean_processor = korean_processor.build
14
+ self.korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava')
22
15
  end
23
16
 
24
17
  def normalize(text)
@@ -27,19 +20,26 @@ module Twkorean
27
20
 
28
21
  def tokenize(text)
29
22
  tokens = self.korean_processor.tokenize(text)
30
- return [] unless tokens
31
- tokens.toArray.map{|x| x.toString}
23
+ tokens
32
24
  end
33
25
 
34
- def tokenize_to_strings(text)
35
- tokens = self.korean_processor.tokenizeToStrings(text)
36
- return [] unless tokens
26
+ def tokens_to_string_list(tokens)
27
+ tokens = self.korean_processor.tokensToJavaStringList(tokens)
37
28
  tokens.toArray.map{|x| x.toString}
38
29
  end
39
30
 
40
- def extract_phrases(text)
41
- phrases = self.korean_processor.extractPhrases(text)
42
- return [] unless phrases
31
+ def tokens_to_token_list(tokens)
32
+ tokens = self.korean_processor.tokensToJavaKoreanTokenList(tokens)
33
+ tokens.toArray.map{|x| self.parser(x.toString)}
34
+ end
35
+
36
+ def stem(tokens)
37
+ stemmed = self.korean_processor.stem(tokens)
38
+ stemmed
39
+ end
40
+
41
+ def extract_phrases(tokens)
42
+ phrases = self.korean_processor.extractPhrases(tokens, true, true)
43
43
  phrases.toArray.map{|x| x.toString}
44
44
  end
45
45
 
@@ -1,8 +1,8 @@
1
1
  # @name twkorean-ruby
2
2
  # @author JunSangPil
3
- # @version 0.0.3
3
+ # @version 0.0.4
4
4
  # @url https://github.com/jun85664396/twkorean-ruby
5
5
  # @license Apache License 2.0
6
6
  module Twkorean
7
- VERSION = "0.0.3"
7
+ VERSION = "0.0.4"
8
8
  end
@@ -1,6 +1,6 @@
1
1
  # @name twkorean-ruby
2
2
  # @author JunSangPil
3
- # @version 0.0.3
3
+ # @version 0.0.4
4
4
  # @url https://github.com/jun85664396/twkorean-ruby
5
5
  # @license Apache License 2.0
6
6
  require 'minitest/autorun'
@@ -1,50 +1,38 @@
1
1
  # @name twkorean-ruby
2
2
  # @author JunSangPil
3
- # @version 0.0.3
3
+ # @version 0.0.4
4
4
  # @url https://github.com/jun85664396/twkorean-ruby
5
5
  # @license Apache License 2.0
6
6
  require_relative 'test_helper'
7
7
  require 'twkorean'
8
8
 
9
9
  describe "Twkorean" do
10
- TEXT = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
11
- before do
12
-
13
- end
14
-
15
- it "Normalize" do
16
- twkorean = Twkorean::TwitterKoreanText.new
17
- p "Normlize"
18
- p twkorean.normalize(TEXT)
19
- # 한국어를 처리하는 예시입니다ㅋㅋ #한국어
20
- end
10
+ text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
11
+ twkorean = Twkorean::TwitterKoreanText.new
12
+ text = twkorean.normalize(text)
13
+ tokens = twkorean.tokenize(text)
21
14
 
22
15
  it "Tokenize" do
23
- twkorean = Twkorean::TwitterKoreanText.new(true, false)
24
16
  p "#Tokenize"
25
- p twkorean.tokenize(TEXT)
17
+ p twkorean.tokens_to_string_list(tokens)
18
+ # ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
19
+ p twkorean.tokens_to_token_list(tokens)
26
20
  # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
27
21
  end
28
22
 
29
23
  it "Stemming" do
30
- twkorean = Twkorean::TwitterKoreanText.new
31
24
  p "#Stemming"
32
- p twkorean.tokenize(TEXT)
25
+ stem = twkorean.stem(tokens)
26
+ p twkorean.tokens_to_string_list(stem)
27
+ # ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
28
+ p twkorean.tokens_to_token_list(stem)
33
29
  # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
34
30
  end
35
31
 
36
32
  it "Phrase extraction" do
37
- twkorean = Twkorean::TwitterKoreanText.new
38
33
  p "Phrase extraction"
39
- p twkorean.extract_phrases(TEXT)
34
+ p twkorean.extract_phrases(tokens)
40
35
  # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
41
36
  end
42
37
 
43
- it "Parser" do
44
- twkorean = Twkorean::TwitterKoreanText.new(true, false)
45
- p "#Tokenize Parser"
46
- p twkorean.tokenize(TEXT).map{|x| twkorean.parser(x) }
47
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
48
- end
49
-
50
38
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twkorean
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - JunSangPil
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-05 00:00:00.000000000 Z
11
+ date: 2016-02-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -78,9 +78,9 @@ files:
78
78
  - LICENSE.txt
79
79
  - README.md
80
80
  - Rakefile
81
- - lib/jars/korean-text-3.0.jar
81
+ - lib/jars/korean-text-4.4.jar
82
82
  - lib/jars/scala-library-2.11.6.jar
83
- - lib/jars/twitter-text-1.11.1.jar
83
+ - lib/jars/twitter-text-1.13.3.jar
84
84
  - lib/twkorean.rb
85
85
  - lib/twkorean/twitter_korean_text.rb
86
86
  - lib/twkorean/version.rb
Binary file