twitter-korean-text-ruby 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 66a39cfe7f6759aa6bc9412a49e069817201d08b
4
+ data.tar.gz: d2d0ffd199077dd5f7d0463764e2d1b2c6bb1f27
5
+ SHA512:
6
+ metadata.gz: 5522edf916fb34042b99333b141146d450634e7d3503e0716204e4dcf2a74a2361fd3ad77dce961142b077f998e6dff627a0fb0603ad7e446b7c0fce1c64ac8e
7
+ data.tar.gz: d812c9d805a0957631a7845cde29ec9bbead25a610ef42d19834cd518e9b12648a210d5a071f837fd5669b93dcbb2aff284b200148dd1b425f04514397df8946
@@ -0,0 +1,12 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ /.byebug_history
11
+ coverage
12
+ /*.gem
@@ -0,0 +1,11 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.3.0
4
+ - 2.2.0
5
+ - 2.1.0
6
+ - 2.0.0
7
+
8
+ before_install:
9
+ - gem install bundler -v 1.11.2
10
+
11
+ script: rake test
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in twitter-korean-text-ruby.gemspec
4
+ gemspec
@@ -0,0 +1,68 @@
1
+ ## twitter-korean-text-ruby
2
+ [![Build Status](https://travis-ci.org/keepcosmos/twitter-korean-text-ruby.svg?branch=master)](https://travis-ci.org/keepcosmos/twitter-korean-text-ruby)
3
+ [![Code Climate](https://codeclimate.com/repos/56d562f8e4ecf4707f00309b/badges/7673319c6a92ab7ace9f/gpa.svg)](https://codeclimate.com/repos/56d562f8e4ecf4707f00309b/feed)
4
+
5
+ Ruby interface to [twitter-korean-text](https://github.com/twitter/twitter-korean-text) by Twitter
6
+
7
+ 트위터에서 제공하는 한글 형태소 분석기인 [twitter-korean-text](https://github.com/twitter/twitter-korean-text)(Scala)를 Ruby에서 사용가능하도록 Wrapping 하였습니다.
8
+
9
+ ### install
10
+ ```{ruby}
11
+ $ gem install twitter-korean-text-ruby
12
+ ```
13
+ Gemfile을 사용할 경우
14
+ ```{ruby}
15
+ # Gemfile
16
+ gem 'twitter-korean-text-ruby'
17
+ ```
18
+
19
+ ### Usage
20
+ ```ruby
21
+ require 'twitter-korean-text-ruby'
22
+
23
+ processor = TwitterKorean::Processor.new
24
+ # OR with JVM arguments
25
+ processor = TwitterKorean::Processor.new('-Xms126M', '-Xmx512M', ...)
26
+
27
+ # Normalize
28
+ processor.normalize("형태소 분석을 합니닼ㅋㅋㅋㅋㅋㅋ")
29
+ # => "형태소 분석을 합니다ㅋㅋㅋㅋㅋㅋ"
30
+
31
+ # Tokenize
32
+ tokens = processor.tokenize("한국어를 처리하는 예시입니다 ㅋㅋ")
33
+ puts tokens
34
+ # => ["한국어", "를", " ", "처리", "하는", " ", "예시", "입니", "다", " ", "ㅋㅋ"]
35
+
36
+ # metadata of token, 토큰에 대한 정보
37
+ metadata = tokens.first.metadata
38
+ metadata #=> "noun, 0, 3"
39
+ metadata.pos #=> :noun
40
+ metadata.offset #=> 0
41
+ metadata.length #=> 3
42
+
43
+ # Stemming
44
+ tokens = processor.stem("한국어를 처리하는 예시입니다 ㅋㅋ")
45
+ puts tokens
46
+ # => ["한국어", "를", " ", "처리", "하다", " ", "예시", "이다", " ", "ㅋㅋ"]
47
+
48
+ # extract phrases
49
+ tokens = processor.extract_phrases("한국어를 처리하는 예시입니다 ㅋㅋ")
50
+ puts tokens
51
+ # => ["한국어", "처리", "처리하는 예시", "예시"]
52
+
53
+ ```
54
+
55
+ ### Test
56
+ ```{ruby}
57
+ rake test
58
+ ```
59
+
60
+ ### Issue
61
+ JAVA Path를 찾지 못했을 경우,
62
+ ```{bash}
63
+ export JAVA_HOME=$(java_home_path)
64
+ ```
65
+ ### Contribute
66
+ 이 프로젝트는 [twitter-korean-text](https://github.com/twitter/twitter-korean-text) 프로젝트의 Scala 코드를 Ruby로 Wrapping하는 프로젝트입니다.
67
+ 관련된 범주에 대한 Issue와 Pull Request(테스트 코드가 포함된)는 언제나 환영입니다.
68
+
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new(:test) do |t|
5
+ t.libs << "test"
6
+ t.libs << "lib"
7
+ t.test_files = FileList['test/**/*_test.rb']
8
+ end
9
+
10
+ task :default => :test
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "twitter/korean/text/ruby"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,7 @@
1
+ require 'twitter_korean/version'
2
+
3
+ module TwitterKorean
4
+ autoload :JvmBridge, 'twitter_korean/jvm_bridge'
5
+ autoload :Processor, 'twitter_korean/processor'
6
+ autoload :KoreanToken, 'twitter_korean/korean_token'
7
+ end
@@ -0,0 +1,15 @@
1
+ require 'rjb'
2
+
3
module TwitterKorean
  # Loads the JVM through Rjb with the bundled jars on the classpath and
  # exposes the Scala TwitterKoreanProcessor class to Ruby.
  class JvmBridge
    # Classpath made of every jar under the `jars` directory next to this
    # file. Paths are expanded to absolute form so the classpath does not
    # depend on the working directory, and sorted so the order is stable.
    CLASS_PATH = Dir.glob(File.expand_path('../jars/*.jar', __FILE__))
                    .sort
                    .join(File::PATH_SEPARATOR)
                    .freeze

    # Loads the JVM with CLASS_PATH.
    #
    # @param jvmargs [Array<String>] arguments handed to Rjb.load as JVM
    #   arguments (e.g. '-Xmx512M')
    def initialize(jvmargs = [])
      Rjb.load(CLASS_PATH, jvmargs)
    end

    # @return the object returned by Rjb.import for the Scala
    #   TwitterKoreanProcessor class
    def scala_twitter_korean_processor
      Rjb.import('com.twitter.penguin.korean.TwitterKoreanProcessor')
    end
  end
end
@@ -0,0 +1,53 @@
1
module TwitterKorean
  # A token produced by the Korean text processor.
  #
  # The object itself is a String holding the token text; the parsed
  # part of speech, offset and length are available through #metadata.
  class KoreanToken < String
    attr_accessor :metadata

    class << self
      # Builds a token from a string formed like `한국어(Noun: 0, 3)`.
      #
      # The text is everything before the LAST opening parenthesis, so token
      # text that itself contains parentheses is preserved.
      #
      # @param str [String] token rendered as `text(Pos: offset, length)`
      # @return [KoreanToken] the token text with its metadata attached
      # @raise [ArgumentError] if the trailing `(...)` section is missing
      def build_by_formed_str(str)
        open_at = str.rindex('(')
        close_at = str.rindex(')')
        if open_at.nil? || close_at.nil? || close_at < open_at
          raise ArgumentError, "malformed token string: #{str.inspect}"
        end

        infos = str[(open_at + 1)...close_at]
        token = new(str[0...open_at])
        token.metadata = Metadata.new(
          pos: infos[/\w+/].to_s,           # first word, e.g. "Noun"
          offset: infos[/\d+/].to_s.to_i,   # first number
          length: infos[/\d+$/].to_s.to_i   # trailing number
        )
        token
      end
    end

    # Part of speech, offset and length attached to a KoreanToken.
    class Metadata
      # Attributes taking part in assignment and equality. `unkown` keeps its
      # historical spelling because it is part of the public interface.
      ATTRIBUTES = [:pos, :offset, :length, :unkown].freeze

      attr_accessor(*ATTRIBUTES)

      # @param attrs [Hash] attribute values; keys without a matching writer
      #   are ignored. `pos` is converted to an underscored Symbol.
      def initialize(attrs = {})
        attrs.each { |k, v| send("#{k}=", v) if respond_to?("#{k}=") }
        symbolize_pos!
      end

      # @return [String] e.g. "noun, 0, 3"
      def inspect
        "#{pos}, #{offset}, #{length}"
      end

      # Two metadata objects are equal when all attributes are equal.
      # Objects that do not expose the attributes (nil, strings, ...) compare
      # as unequal instead of raising NoMethodError.
      def ==(other)
        ATTRIBUTES.all? do |attr|
          other.respond_to?(attr) && send(attr) == other.send(attr)
        end
      end

      private

      # Converts pos ("ProperNoun") into an underscored Symbol (:proper_noun).
      # `to_s` lets an already-symbolized pos pass through unchanged.
      def symbolize_pos!
        return if pos.nil?
        @pos = to_underscore(pos.to_s).to_sym
      end

      # CamelCase / dashed text to snake_case.
      def to_underscore(txt)
        txt.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
           .gsub(/([a-z\d])([A-Z])/, '\1_\2')
           .tr('-', '_')
           .downcase
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'rjb'
2
+
3
module TwitterKorean
  # Ruby interface to Scala TwitterKoreanProcessor
  class Processor
    # Matches one rendered element such as `한국어(Noun: 0, 3)` inside the
    # `List(...)` string form of a result. Both numbers may have several
    # digits (a single-digit-only length would merge or drop long tokens).
    TOKEN_PATTERN = /(?<=List\(|,\s).*?\([a-zA-Z]+:\s[0-9]+,\s[0-9]+\)/

    attr_reader :jvm_processor, :java_convertor

    # @param jvmargs [Array<String>] JVM arguments passed on to JvmBridge
    def initialize(*jvmargs)
      bridge = TwitterKorean::JvmBridge.new(jvmargs)
      @jvm_processor = bridge.scala_twitter_korean_processor
    end

    # @param text [String, nil]
    # @return [String, nil] normalized text; nil when text is nil/false
    def normalize(text)
      return unless text
      jvm_processor.normalize(text).toString
    end

    # @param text [String, nil]
    # @return [Array<KoreanToken>, nil] tokens; nil when text is nil/false
    def tokenize(text)
      return unless text
      converto_to_korean_tokens { jvm_processor.tokenize(text) }
    end

    # @param text [String, nil]
    # @return [Array<KoreanToken>, nil] stemmed tokens; nil when text is nil/false
    def stem(text)
      return unless text
      converto_to_korean_tokens do
        jvm_processor.stem(jvm_processor.tokenize(text))
      end
    end

    # @param text [String, nil]
    # @param options [Hash]
    #   :filter_spam        defaults to false
    #   :including_hashtags defaults to true
    # @return [Array<KoreanToken>, nil] phrases; nil when text is nil/false
    def extract_phrases(text, options = {})
      return unless text
      filter_spam = options[:filter_spam] || false
      # `value || true` can never be false, so only fall back to the default
      # when the option is absent (or nil).
      including_hashtags = options[:including_hashtags]
      including_hashtags = true if including_hashtags.nil?
      converto_to_korean_tokens do
        jvm_processor.extractPhrases(jvm_processor.tokenize(text), filter_spam, including_hashtags)
      end
    end

    private

    # Calls the block, takes the `toString` form of its result and turns
    # every element of it into a KoreanToken.
    def converto_to_korean_tokens
      scala_list = yield.toString
      scala_list_to_array(scala_list).map do |formed_token_str|
        TwitterKorean::KoreanToken.build_by_formed_str(formed_token_str)
      end
    end

    # Splits a string such as `List(한국어(Noun: 0, 3), 를(Josa: 3, 1))` into
    # its elements.
    #
    # @return [Array<String>] one string per element
    def scala_list_to_array(result)
      result.scan(TOKEN_PATTERN)
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module TwitterKorean
2
+ VERSION = '0.9.1'
3
+ end
@@ -0,0 +1,26 @@
1
+ lib = File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require 'twitter_korean/version'
4
+
5
# Gem specification for twitter-korean-text-ruby.
Gem::Specification.new do |spec|
  spec.name          = "twitter-korean-text-ruby"
  spec.version       = TwitterKorean::VERSION
  spec.authors       = ["Jaehyun Shin"]
  spec.email         = ["keepcosmos@gmail.com"]

  spec.summary       = "Ruby interface to Twitter Korean Text(written in Scala)"
  spec.description   = "Ruby interface to Twitter Korean Text written in Scala(https://github.com/twitter/twitter-korean-text) "
  spec.homepage      = 'https://github.com/keepcosmos/twitter-korean-text-ruby'
  # SPDX identifier, as expected by RubyGems for the license field.
  spec.license       = "Apache-2.0"

  # Package every tracked file except tests and the local debugger history,
  # which is a developer artifact and should not ship with the gem.
  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/}) || f == ".byebug_history"
  end
  spec.require_paths = ["lib"]

  spec.add_dependency "rjb", "~> 1.5"

  spec.add_development_dependency "bundler", "~> 1.11"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "byebug", "~> 8.0"
  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "simplecov", "~> 0.11.0"
end
metadata ADDED
@@ -0,0 +1,146 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: twitter-korean-text-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.9.1
5
+ platform: ruby
6
+ authors:
7
+ - Jaehyun Shin
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-03-01 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rjb
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.11'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.11'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: byebug
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '8.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '8.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: minitest
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '5.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '5.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: simplecov
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.11.0
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.11.0
97
+ description: 'Ruby interface to Twitter Korean Text written in Scala(https://github.com/twitter/twitter-korean-text) '
98
+ email:
99
+ - keepcosmos@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".byebug_history"
105
+ - ".gitignore"
106
+ - ".travis.yml"
107
+ - Gemfile
108
+ - README.md
109
+ - Rakefile
110
+ - bin/console
111
+ - bin/setup
112
+ - lib/twitter-korean-text-ruby.rb
113
+ - lib/twitter_korean/jars/korean-text-4.4.jar
114
+ - lib/twitter_korean/jars/scala-library-2.11.7.jar
115
+ - lib/twitter_korean/jars/twitter-text-1.13.3.jar
116
+ - lib/twitter_korean/jvm_bridge.rb
117
+ - lib/twitter_korean/korean_token.rb
118
+ - lib/twitter_korean/processor.rb
119
+ - lib/twitter_korean/version.rb
120
+ - twitter-korean-text-ruby.gemspec
121
+ homepage: https://github.com/keepcosmos/twitter-korean-text-ruby
122
+ licenses:
123
+ - Apache License 2.0
124
+ metadata: {}
125
+ post_install_message:
126
+ rdoc_options: []
127
+ require_paths:
128
+ - lib
129
+ required_ruby_version: !ruby/object:Gem::Requirement
130
+ requirements:
131
+ - - ">="
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ required_rubygems_version: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ requirements: []
140
+ rubyforge_project:
141
+ rubygems_version: 2.5.1
142
+ signing_key:
143
+ specification_version: 4
144
+ summary: Ruby interface to Twitter Korean Text(written in Scala)
145
+ test_files: []
146
+ has_rdoc: