twitter-korean-text-ruby 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.travis.yml +11 -0
- data/Gemfile +4 -0
- data/README.md +68 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/twitter-korean-text-ruby.rb +7 -0
- data/lib/twitter_korean/jars/korean-text-4.4.jar +0 -0
- data/lib/twitter_korean/jars/scala-library-2.11.7.jar +0 -0
- data/lib/twitter_korean/jars/twitter-text-1.13.3.jar +0 -0
- data/lib/twitter_korean/jvm_bridge.rb +15 -0
- data/lib/twitter_korean/korean_token.rb +53 -0
- data/lib/twitter_korean/processor.rb +55 -0
- data/lib/twitter_korean/version.rb +3 -0
- data/twitter-korean-text-ruby.gemspec +26 -0
- metadata +146 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 66a39cfe7f6759aa6bc9412a49e069817201d08b
|
4
|
+
data.tar.gz: d2d0ffd199077dd5f7d0463764e2d1b2c6bb1f27
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 5522edf916fb34042b99333b141146d450634e7d3503e0716204e4dcf2a74a2361fd3ad77dce961142b077f998e6dff627a0fb0603ad7e446b7c0fce1c64ac8e
|
7
|
+
data.tar.gz: d812c9d805a0957631a7845cde29ec9bbead25a610ef42d19834cd518e9b12648a210d5a071f837fd5669b93dcbb2aff284b200148dd1b425f04514397df8946
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
## twitter-korean-text-ruby
|
2
|
+
[Build Status](https://travis-ci.org/keepcosmos/twitter-korean-text-ruby)
|
3
|
+
[Code Climate](https://codeclimate.com/repos/56d562f8e4ecf4707f00309b/feed)
|
4
|
+
|
5
|
+
Ruby interface to [twitter-korean-text](https://github.com/twitter/twitter-korean-text) by Twitter
|
6
|
+
|
7
|
+
트위터에서 제공하는 한글 형태소 분석기인 [twitter-korean-text](https://github.com/twitter/twitter-korean-text)(Scala)를 Ruby에서 사용가능하도록 Wrapping 하였습니다.
|
8
|
+
|
9
|
+
### install
|
10
|
+
```{ruby}
|
11
|
+
$ gem install twitter-korean-text-ruby
|
12
|
+
```
|
13
|
+
Gemfile을 사용할 경우
|
14
|
+
```{ruby}
|
15
|
+
# Gemfile
|
16
|
+
gem 'twitter-korean-text-ruby'
|
17
|
+
```
|
18
|
+
|
19
|
+
### Usage
|
20
|
+
```ruby
|
21
|
+
require 'twitter-korean-text-ruby'
|
22
|
+
|
23
|
+
processor = TwitterKorean::Processor.new
|
24
|
+
# OR with JVM arguments
|
25
|
+
processor = TwitterKorean::Processor.new('-Xms126M', '-Xmx512M', ...)
|
26
|
+
|
27
|
+
# Normalize
|
28
|
+
processor.normalize("형태소 분석을 합니닼ㅋㅋㅋㅋㅋㅋ")
|
29
|
+
# => "형태소 분석을 합니다ㅋㅋㅋㅋㅋㅋ"
|
30
|
+
|
31
|
+
# Tokenize
|
32
|
+
tokens = processor.tokenize("한국어를 처리하는 예시입니다 ㅋㅋ")
|
33
|
+
puts tokens
|
34
|
+
# => ["한국어", "를", " ", "처리", "하는", " ", "예시", "입니", "다", " ", "ㅋㅋ"]
|
35
|
+
|
36
|
+
# metadata of token, 토큰에 대한 정보
|
37
|
+
metadata = tokens.first.metadata
|
38
|
+
metadata #=> "noun, 0, 3"
|
39
|
+
metadata.pos #=> :noun
|
40
|
+
metadata.offset #=> 0
|
41
|
+
metadata.length #=> 3
|
42
|
+
|
43
|
+
# Stemming
|
44
|
+
tokens = processor.stem("한국어를 처리하는 예시입니다 ㅋㅋ")
|
45
|
+
puts tokens
|
46
|
+
# => ["한국어", "를", " ", "처리", "하다", " ", "예시", "이다", " ", "ㅋㅋ"]
|
47
|
+
|
48
|
+
# extract phrases
|
49
|
+
tokens = processor.extract_phrases("한국어를 처리하는 예시입니다 ㅋㅋ")
|
50
|
+
puts tokens
|
51
|
+
# => ["한국어", "처리", "처리하는 예시", "예시"]
|
52
|
+
|
53
|
+
```
|
54
|
+
|
55
|
+
### Test
|
56
|
+
```{ruby}
|
57
|
+
rake test
|
58
|
+
```
|
59
|
+
|
60
|
+
### Issue
|
61
|
+
JAVA Path를 찾지 못했을 경우,
|
62
|
+
```{bash}
|
63
|
+
export JAVA_HOME=$(java_home_path)
|
64
|
+
```
|
65
|
+
### Contribute
|
66
|
+
이 프로젝트는 [twitter-korean-text](https://github.com/twitter/twitter-korean-text) 프로젝트의 Scala 코드를 Ruby로 Wrapping하는 프로젝트입니다.
|
67
|
+
관련된 범주에 대한 Issue와 Pull Request(테스트 코드가 포함된)는 언제나 환영입니다.
|
68
|
+
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: sets up Bundler, loads the gem, then starts IRB.

require "bundler/setup"
# The gem's entry point is lib/twitter-korean-text-ruby.rb (the same name the
# README tells users to require); the package ships no
# twitter/korean/text/ruby file.
require "twitter-korean-text-ruby"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start
|
data/bin/setup
ADDED
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rjb'
|
2
|
+
|
3
|
+
module TwitterKorean
  # Bridge to the JVM through Rjb: puts the bundled jars on the classpath and
  # exposes the Scala TwitterKoreanProcessor class.
  class JvmBridge
    # Every *.jar in the jars/ directory next to this file, joined with the
    # platform's path separator.
    #
    # The paths are expanded to absolute ones so the value does not depend on
    # the working directory in effect when the classpath is later handed to
    # Rjb, and sorted so the order is the same on every run (Dir.glob only
    # sorts its results by default on newer Rubies).
    CLASS_PATH = Dir.glob(File.expand_path('../jars/*.jar', __FILE__))
                    .sort
                    .join(File::PATH_SEPARATOR)
                    .freeze

    # Calls Rjb.load with the bundled-jar classpath.
    #
    # @param jvmargs [Array<String>] JVM arguments passed through to Rjb.load
    #   unchanged (the README shows values such as '-Xms126M')
    def initialize(jvmargs = [])
      Rjb.load(CLASS_PATH, jvmargs)
    end

    # Imports the Scala processor class through Rjb.
    #
    # @return the object returned by Rjb.import for
    #   com.twitter.penguin.korean.TwitterKoreanProcessor
    def scala_twitter_korean_processor
      Rjb.import('com.twitter.penguin.korean.TwitterKoreanProcessor')
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module TwitterKorean
  # A single token returned by the processor. It is a String holding the
  # token text, with the parsed part-of-speech information attached as
  # +metadata+.
  class KoreanToken < String
    attr_accessor :metadata

    class << self
      # Builds a token from a formatted string such as `한국어(Noun: 0, 3)`.
      #
      # Everything before the last "(" becomes the token text. The part
      # between the last "(" and the last ")" is parsed into a Metadata: the
      # first word is the part of speech, the first number the offset and the
      # trailing number the length.
      #
      # @param str [String] formatted token string
      # @return [KoreanToken] the token with +metadata+ set
      def build_by_formed_str(str)
        open_idx = str.rindex('(')
        close_idx = str.rindex(')')

        token = new(str[0...open_idx])
        infos = str[(open_idx + 1)...close_idx]

        token.metadata = TwitterKorean::KoreanToken::Metadata.new(
          pos: infos[/\w+/].to_s,
          offset: infos[/\d+/].to_i,
          length: infos[/\d+$/].to_i
        )
        token
      end
    end

    # Part-of-speech, offset and length information attached to a token.
    class Metadata
      # NOTE: `unkown` is misspelled but is part of the public interface, so
      # the name is kept as is.
      attr_accessor :pos, :offset, :length, :unkown

      # @param attrs [Hash] attribute name => value; keys without a matching
      #   writer are ignored. +pos+ is converted to an underscored Symbol
      #   (e.g. "ProperNoun" becomes :proper_noun).
      def initialize(attrs = {})
        attrs.each do |key, value|
          writer = "#{key}="
          send(writer, value) if respond_to?(writer)
        end
        symbolize_pos!
      end

      # @return [String] "pos, offset, length"
      def inspect
        "#{pos}, #{offset}, #{length}"
      end

      # Two metadata objects are equal when all four attributes match.
      # Objects that do not expose those attributes (nil, a plain String, ...)
      # compare as unequal instead of raising NoMethodError.
      def ==(other)
        %i[pos offset length unkown].all? do |attr|
          other.respond_to?(attr) && send(attr) == other.send(attr)
        end
      end

      private

      # Replaces +pos+ with its underscored Symbol form; does nothing when it
      # is nil.
      def symbolize_pos!
        return if pos.nil?

        # to_s lets callers pass a Symbol as well as a String.
        @pos = to_underscore(pos.to_s).to_sym
      end

      # Converts CamelCase (and dashes) to snake_case.
      def to_underscore(txt)
        txt.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
           .gsub(/([a-z\d])([A-Z])/, '\1_\2')
           .tr("-", "_")
           .downcase
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'rjb'
|
2
|
+
|
3
|
+
module TwitterKorean
  # Ruby interface to Scala TwitterKoreanProcessor
  class Processor
    attr_reader :jvm_processor, :java_convertor

    # @param jvmargs [Array<String>] JVM arguments forwarded to JvmBridge
    def initialize(*jvmargs)
      bridge = TwitterKorean::JvmBridge.new(jvmargs)
      @jvm_processor = bridge.scala_twitter_korean_processor
    end

    # Normalizes +text+ with the Scala processor.
    #
    # @return the +toString+ value of the normalized text, or nil when
    #   +text+ is nil/false
    def normalize(text)
      return unless text

      jvm_processor.normalize(text).toString
    end

    # Tokenizes +text+.
    #
    # @return [Array<KoreanToken>, nil] nil when +text+ is nil/false
    def tokenize(text)
      return unless text

      converto_to_korean_tokens { jvm_processor.tokenize(text) }
    end

    # Tokenizes +text+ and stems the tokens.
    #
    # @return [Array<KoreanToken>, nil] nil when +text+ is nil/false
    def stem(text)
      return unless text

      converto_to_korean_tokens { jvm_processor.stem(jvm_processor.tokenize(text)) }
    end

    # Tokenizes +text+ and extracts phrases from the tokens.
    #
    # @param options [Hash]
    #   :filter_spam        - defaults to false
    #   :including_hashtags - defaults to true; pass false to turn it off
    # @return [Array<KoreanToken>, nil] nil when +text+ is nil/false
    def extract_phrases(text, options = {})
      return unless text

      filter_spam = options[:filter_spam] || false
      # `options[:including_hashtags] || true` was always true, so an explicit
      # false could never reach the processor. Only an explicit false disables
      # the flag; a missing or nil value keeps the default of true.
      including_hashtags = options[:including_hashtags] != false

      converto_to_korean_tokens do
        jvm_processor.extractPhrases(jvm_processor.tokenize(text), filter_spam, including_hashtags)
      end
    end

    private

    # Calls the block, takes the +toString+ of its result (a Scala list
    # rendered like "List(한국어(Noun: 0, 3), ...)") and turns every entry into
    # a KoreanToken.
    def converto_to_korean_tokens
      scala_list = yield.toString
      scala_list_to_array(scala_list).map do |formed_token_str|
        TwitterKorean::KoreanToken.build_by_formed_str(formed_token_str.first)
      end
    end

    # Splits the textual Scala list into its "text(Pos: offset, length)"
    # entries.
    #
    # The length part must accept several digits (`[0-9]+`): with a single
    # `[0-9]` an entry whose length is 10 or more did not match on its own and
    # was merged with the entries that followed it.
    #
    # @return [Array<Array<String>>] one single-element array per entry (the
    #   shape String#scan produces for a pattern with one capture group)
    def scala_list_to_array(result)
      result.scan(/(?<=List\(|\,\s)(.*?\([a-zA-Z]+\:\s[0-9]+,\s[0-9]+\))/).to_a
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Put lib/ on the load path so the version constant can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'twitter_korean/version'

Gem::Specification.new do |spec|
  spec.name = "twitter-korean-text-ruby"
  spec.version = TwitterKorean::VERSION
  spec.authors = ["Jaehyun Shin"]
  spec.email = ["keepcosmos@gmail.com"]

  spec.summary = "Ruby interface to Twitter Korean Text(written in Scala)"
  spec.description = "Ruby interface to Twitter Korean Text written in Scala(https://github.com/twitter/twitter-korean-text) "
  spec.homepage = 'https://github.com/keepcosmos/twitter-korean-text-ruby'
  # RubyGems expects an SPDX license identifier rather than a free-form name.
  spec.license = "Apache-2.0"

  # Package every git-tracked file except tests/specs/features.
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.require_paths = ["lib"]

  spec.add_dependency "rjb", "~> 1.5"

  spec.add_development_dependency "bundler", "~> 1.11"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "byebug", "~> 8.0"
  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "simplecov", "~> 0.11.0"
end
|
metadata
ADDED
@@ -0,0 +1,146 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: twitter-korean-text-ruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.9.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jaehyun Shin
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-03-01 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rjb
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.5'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.5'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.11'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.11'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '10.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '10.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: byebug
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '8.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '8.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: minitest
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '5.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '5.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: simplecov
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.11.0
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.11.0
|
97
|
+
description: 'Ruby interface to Twitter Korean Text written in Scala(https://github.com/twitter/twitter-korean-text) '
|
98
|
+
email:
|
99
|
+
- keepcosmos@gmail.com
|
100
|
+
executables: []
|
101
|
+
extensions: []
|
102
|
+
extra_rdoc_files: []
|
103
|
+
files:
|
104
|
+
- ".byebug_history"
|
105
|
+
- ".gitignore"
|
106
|
+
- ".travis.yml"
|
107
|
+
- Gemfile
|
108
|
+
- README.md
|
109
|
+
- Rakefile
|
110
|
+
- bin/console
|
111
|
+
- bin/setup
|
112
|
+
- lib/twitter-korean-text-ruby.rb
|
113
|
+
- lib/twitter_korean/jars/korean-text-4.4.jar
|
114
|
+
- lib/twitter_korean/jars/scala-library-2.11.7.jar
|
115
|
+
- lib/twitter_korean/jars/twitter-text-1.13.3.jar
|
116
|
+
- lib/twitter_korean/jvm_bridge.rb
|
117
|
+
- lib/twitter_korean/korean_token.rb
|
118
|
+
- lib/twitter_korean/processor.rb
|
119
|
+
- lib/twitter_korean/version.rb
|
120
|
+
- twitter-korean-text-ruby.gemspec
|
121
|
+
homepage: https://github.com/keepcosmos/twitter-korean-text-ruby
|
122
|
+
licenses:
|
123
|
+
- Apache License 2.0
|
124
|
+
metadata: {}
|
125
|
+
post_install_message:
|
126
|
+
rdoc_options: []
|
127
|
+
require_paths:
|
128
|
+
- lib
|
129
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
130
|
+
requirements:
|
131
|
+
- - ">="
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
requirements: []
|
140
|
+
rubyforge_project:
|
141
|
+
rubygems_version: 2.5.1
|
142
|
+
signing_key:
|
143
|
+
specification_version: 4
|
144
|
+
summary: Ruby interface to Twitter Korean Text(written in Scala)
|
145
|
+
test_files: []
|
146
|
+
has_rdoc:
|