twkorean 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d5b0224b2477336b22d92f4ad59a711b72cb3ed5
4
- data.tar.gz: 96995f69e42be4eb7a21cdbcf63a8e3d5a6bd10c
3
+ metadata.gz: 2f7c6b1a5621ac47e13abb6a548c88887a2f6882
4
+ data.tar.gz: 8d70e9c33a48b8dea26a5467f3a7faba82e7c066
5
5
  SHA512:
6
- metadata.gz: 314d087f4e1d8eca9d14742ae601a844375526849eba96c563b266381370c6082a98939cbb27fe857e8275922560eaf3c24337a37714020d70bdee1bd62f84aa
7
- data.tar.gz: 2e77c37daac91d552e40198221b94f26fd84d123737b61eb67367b454f14690a600051d480167b81424e76b9eb483dd8a03c6f4bc9b9f0f42adde4c6711355e4
6
+ metadata.gz: cd109eafdc8208b0c96c83b619c7a1a125305a1f00a3e0104b05731371917b6c1e7b173436e57b04938364eea82d02035e8f0fb964e6579b8e5a01dc7ce855fb
7
+ data.tar.gz: b51ba84b44cbd8546a959a6dc7395f64bf6f1af6fd7c1d6110cea39d9d8faf807a575cd37410a42cc66c720f37362fc7ecb0ada4a1459a9d61362c6ffbd01a3c
data/.rspec ADDED
@@ -0,0 +1,4 @@
1
+ --color
2
+ --require spec_helper
3
+ --require twkorean
4
+ --require pry
data/Gemfile CHANGED
@@ -2,4 +2,3 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in twkorean.gemspec
4
4
  gemspec
5
- gem 'rjb'
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  ## Compatibility
4
4
 
5
- Currently wraps [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4) / 현재 이 프로젝트는 [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4)을 사용중입니다.
5
+ Currently wraps [open-korean-text 2.0.5](https://github.com/open-korean-text/open-korean-text/releases/tag/open-korean-text-2.0.5) / 현재 이 프로젝트는 [open-korean-text 2.0.5](https://github.com/open-korean-text/open-korean-text/releases/tag/open-korean-text-2.0.5)을 사용중입니다.
6
6
 
7
7
  ## Installation
8
8
 
@@ -10,6 +10,10 @@ Add this line to your application's Gemfile:
10
10
 
11
11
  gem 'twkorean'
12
12
 
13
+ If you are using Java 7, pin the last release that supports it:
14
+
15
+ gem 'twkorean', "~> 0.0.5"
16
+
13
17
  And then execute:
14
18
 
15
19
  $ bundle
@@ -20,12 +24,13 @@ Or install it yourself as:
20
24
 
21
25
  ## Required
22
26
 
27
+ Twkorean supports Java 8 and later.
28
+
23
29
  $ export JAVA_HOME={Your Path}
24
- $ gem install 'rjb'
25
30
 
26
31
  ## Test
27
32
 
28
- $ ruby -v test/twkorean.rb
33
+ $ rake # or: rspec
29
34
 
30
35
  ## Usage
31
36
 
@@ -38,28 +43,28 @@ Or install it yourself as:
38
43
  it "Tokenize" do
39
44
  p "#Tokenize"
40
45
  p twkorean.tokens_to_string_list(tokens)
41
- # ["한국어", "를", "처리", "하는", "예시", "입니", "", "ㅋㅋ", "#한국어"]
46
+ # ["한국어", "를", "처리", "하는", "예시", "입니다", "ㅋㅋㅋ", "#한국어"]
42
47
  p twkorean.tokens_to_token_list(tokens)
43
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
44
- end
45
-
46
- it "Stemming" do
47
- p "#Stemming"
48
- stem = twkorean.stem(tokens)
49
- p twkorean.tokens_to_string_list(stem)
50
- # ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
51
- p twkorean.tokens_to_token_list(stem)
52
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
48
+ # [
49
+ # ["한국어(Noun: 0, 3)", "한국어", "Noun", nil, "0", "3"],
50
+ # ["를(Josa: 3, 1)", "를", "Josa", nil, "3", "1"],
51
+ # ["처리(Noun: 5, 2)", "처리", "Noun", nil, "5", "2"],
52
+ # ["하는(Verb(하다): 7, 2)", "하는", "Verb", "(하다)", "7", "2"],
53
+ # ["예시(Noun: 10, 2)", "예시", "Noun", nil, "10", "2"],
54
+ # ["입니다(Adjective(이다): 12, 3)", "입니다", "Adjective", "(이다)", "12", "3"],
55
+ # ["ㅋㅋㅋ(KoreanParticle: 15, 3)", "ㅋㅋㅋ", "KoreanParticle", nil, "15", "3"],
56
+ # ["#한국어(Hashtag: 19, 4)", "#한국어", "Hashtag", nil, "19", "4"]
57
+ # ]
53
58
  end
54
59
 
55
60
  it "Phrase extraction" do
56
61
  p "Phrase extraction"
57
62
  p twkorean.extract_phrases(tokens)
58
- # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
63
+ # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 19, 4)"]
59
64
  end
60
65
  end
61
66
 
62
- end## Contributing
67
+ ## Contributing
63
68
 
64
69
  1. Fork it ( https://github.com/[my-github-username]/twkorean/fork )
65
70
  2. Create your feature branch (`git checkout -b my-new-feature`)
data/Rakefile CHANGED
@@ -1,2 +1,7 @@
1
1
  require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
2
3
 
4
+ RSpec::Core::RakeTask.new('spec')
5
+
6
+ # If you want to make this the default task
7
+ task :default => :spec
@@ -1,46 +1,48 @@
1
1
  module Twkorean
2
2
  class TwitterKoreanText
3
3
 
4
- attr_accessor :korean_processor
5
-
6
4
  def initialize(normalization = true, stemming = true)
7
5
  jars = Dir.glob(File.dirname(__FILE__)+"/../jars/*.jar").join(File::PATH_SEPARATOR)
8
6
  Rjb::load(jars, ['-Xmx512M'])
9
- self.korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava')
10
7
  end
11
8
 
12
9
  def normalize(text)
13
- self.korean_processor.normalize(text).toString
10
+ korean_processor.normalize(text).toString
14
11
  end
15
12
 
16
13
  def tokenize(text)
17
- tokens = self.korean_processor.tokenize(text)
14
+ tokens = korean_processor.tokenize(text)
18
15
  tokens
19
16
  end
20
17
 
21
18
  def tokens_to_string_list(tokens)
22
- tokens = self.korean_processor.tokensToJavaStringList(tokens)
19
+ tokens = korean_processor.tokensToJavaStringList(tokens)
23
20
  tokens.toArray.map{|x| x.toString}
24
21
  end
25
22
 
26
23
  def tokens_to_token_list(tokens)
27
- tokens = self.korean_processor.tokensToJavaKoreanTokenList(tokens)
28
- tokens.toArray.map{|x| self.parser(x.toString)}
24
+ tokens = korean_processor.tokensToJavaKoreanTokenList(tokens)
25
+ tokens.toArray.map{|x| parser(x.toString)}
29
26
  end
30
27
 
31
28
  def stem(tokens)
32
- stemmed = self.korean_processor.stem(tokens)
33
- stemmed
29
+ # Deprecated method
30
+ # Kept for legacy code written against versions earlier than 0.0.6
31
+ tokens_to_token_list(tokens)
34
32
  end
35
33
 
36
34
  def extract_phrases(tokens)
37
- phrases = self.korean_processor.extractPhrases(tokens, true, true)
35
+ phrases = korean_processor.extractPhrases(tokens, true, true)
38
36
  phrases.toArray.map{|x| x.toString}
39
37
  end
40
38
 
39
+ private
41
40
  def parser(text)
42
- text.match(/(.*)\(([a-zA-Z]*): ([0-9]+), ([0-9]+)\)/).to_a
41
+ text.match(/(.*)\(([a-zA-Z]*)(\(.*\))?: ([0-9]+), ([0-9]+)\)/).to_a
43
42
  end
44
43
 
44
+ def korean_processor
45
+ @korean_processor ||= Rjb::import('org.openkoreantext.processor.OpenKoreanTextProcessorJava')
46
+ end
45
47
  end
46
48
  end
@@ -1,8 +1,8 @@
1
1
  # @name twkorean-ruby
2
2
  # @author JunSangPil
3
- # @version 0.0.5
3
+ # @version 0.0.6
4
4
  # @url https://github.com/jun85664396/twkorean-ruby
5
5
  # @license Apache License 2.0
6
6
  module Twkorean
7
- VERSION = "0.0.5"
7
+ VERSION = "0.0.6"
8
8
  end
@@ -0,0 +1,102 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # The generated `.rspec` file contains `--require spec_helper` which will cause
4
+ # this file to always be loaded, without a need to explicitly require it in any
5
+ # files.
6
+ #
7
+ # Given that it is always loaded, you are encouraged to keep this file as
8
+ # light-weight as possible. Requiring heavyweight dependencies from this file
9
+ # will add to the boot time of your test suite on EVERY test run, even for an
10
+ # individual file that may not need all of that loaded. Instead, consider making
11
+ # a separate helper file that requires the additional dependencies and performs
12
+ # the additional setup, and require it from the spec files that actually need
13
+ # it.
14
+ #
15
+ require 'twkorean'
16
+
17
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
18
+ RSpec.configure do |config|
19
+ # rspec-expectations config goes here. You can use an alternate
20
+ # assertion/expectation library such as wrong or the stdlib/minitest
21
+ # assertions if you prefer.
22
+ config.expect_with :rspec do |expectations|
23
+ # This option will default to `true` in RSpec 4. It makes the `description`
24
+ # and `failure_message` of custom matchers include text for helper methods
25
+ # defined using `chain`, e.g.:
26
+ # be_bigger_than(2).and_smaller_than(4).description
27
+ # # => "be bigger than 2 and smaller than 4"
28
+ # ...rather than:
29
+ # # => "be bigger than 2"
30
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
31
+ end
32
+
33
+ # rspec-mocks config goes here. You can use an alternate test double
34
+ # library (such as bogus or mocha) by changing the `mock_with` option here.
35
+ config.mock_with :rspec do |mocks|
36
+ # Prevents you from mocking or stubbing a method that does not exist on
37
+ # a real object. This is generally recommended, and will default to
38
+ # `true` in RSpec 4.
39
+ mocks.verify_partial_doubles = true
40
+ end
41
+
42
+ # This option will default to `:apply_to_host_groups` in RSpec 4 (and will
43
+ # have no way to turn it off -- the option exists only for backwards
44
+ # compatibility in RSpec 3). It causes shared context metadata to be
45
+ # inherited by the metadata hash of host groups and examples, rather than
46
+ # triggering implicit auto-inclusion in groups with matching metadata.
47
+ config.shared_context_metadata_behavior = :apply_to_host_groups
48
+
49
+ # The settings below are suggested to provide a good initial experience
50
+ # with RSpec, but feel free to customize to your heart's content.
51
+ =begin
52
+ # This allows you to limit a spec run to individual examples or groups
53
+ # you care about by tagging them with `:focus` metadata. When nothing
54
+ # is tagged with `:focus`, all examples get run. RSpec also provides
55
+ # aliases for `it`, `describe`, and `context` that include `:focus`
56
+ # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
57
+ config.filter_run_when_matching :focus
58
+
59
+ # Allows RSpec to persist some state between runs in order to support
60
+ # the `--only-failures` and `--next-failure` CLI options. We recommend
61
+ # you configure your source control system to ignore this file.
62
+ config.example_status_persistence_file_path = "spec/examples.txt"
63
+
64
+ # Limits the available syntax to the non-monkey patched syntax that is
65
+ # recommended. For more details, see:
66
+ # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
67
+ # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
68
+ # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
69
+ config.disable_monkey_patching!
70
+
71
+ # This setting enables warnings. It's recommended, but in some cases may
72
+ # be too noisy due to issues in dependencies.
73
+ config.warnings = true
74
+
75
+ # Many RSpec users commonly either run the entire suite or an individual
76
+ # file, and it's useful to allow more verbose output when running an
77
+ # individual spec file.
78
+ if config.files_to_run.one?
79
+ # Use the documentation formatter for detailed output,
80
+ # unless a formatter has already been configured
81
+ # (e.g. via a command-line flag).
82
+ config.default_formatter = "doc"
83
+ end
84
+
85
+ # Print the 10 slowest examples and example groups at the
86
+ # end of the spec run, to help surface which specs are running
87
+ # particularly slow.
88
+ config.profile_examples = 10
89
+
90
+ # Run specs in random order to surface order dependencies. If you find an
91
+ # order dependency and want to debug it, you can fix the order by providing
92
+ # the seed, which is printed after each run.
93
+ # --seed 1234
94
+ config.order = :random
95
+
96
+ # Seed global randomization in this process using the `--seed` CLI option.
97
+ # Setting this allows you to use `--seed` to deterministically reproduce
98
+ # test failures related to randomization by passing the same `--seed` value
99
+ # as the one that triggered the failure.
100
+ Kernel.srand config.seed
101
+ =end
102
+ end
@@ -0,0 +1,40 @@
1
+ RSpec.describe Twkorean do
2
+ let(:twkorean) { Twkorean::TwitterKoreanText.new }
3
+ let(:text) { twkorean.normalize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어") }
4
+ let(:tokens) { twkorean.tokenize(text) }
5
+
6
+ context "Tokenize" do
7
+ it "should split a sentence to list of strings " do
8
+ expect(twkorean.tokens_to_string_list(tokens)).to eq(
9
+ ["한국어", "를", "처리", "하는", "예시", "입니다", "ㅋㅋㅋ", "#한국어"]
10
+ )
11
+ end
12
+
13
+ it "should split a sentence to list of tokens" do
14
+ expect(twkorean.tokens_to_token_list(tokens)).to eq([
15
+ ["한국어(Noun: 0, 3)", "한국어", "Noun", nil, "0", "3"],
16
+ ["를(Josa: 3, 1)", "를", "Josa", nil, "3", "1"],
17
+ ["처리(Noun: 5, 2)", "처리", "Noun", nil, "5", "2"],
18
+ ["하는(Verb(하다): 7, 2)", "하는", "Verb", "(하다)", "7", "2"],
19
+ ["예시(Noun: 10, 2)", "예시", "Noun", nil, "10", "2"],
20
+ ["입니다(Adjective(이다): 12, 3)", "입니다", "Adjective", "(이다)", "12", "3"],
21
+ ["ㅋㅋㅋ(KoreanParticle: 15, 3)", "ㅋㅋㅋ", "KoreanParticle", nil, "15", "3"],
22
+ ["#한국어(Hashtag: 19, 4)", "#한국어", "Hashtag", nil, "19", "4"]
23
+ ])
24
+ end
25
+ end
26
+
27
+ context "Phrase extraction" do
28
+ it "should extract phrases from a sentence" do
29
+ expect(twkorean.extract_phrases(tokens)).to eq(
30
+ [
31
+ "한국어(Noun: 0, 3)",
32
+ "처리(Noun: 5, 2)",
33
+ "처리하는 예시(Noun: 5, 7)",
34
+ "예시(Noun: 10, 2)",
35
+ "#한국어(Hashtag: 19, 4)"
36
+ ]
37
+ )
38
+ end
39
+ end
40
+ end
@@ -18,8 +18,9 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ["lib"]
20
20
 
21
+ spec.add_dependency "rjb"
21
22
  spec.add_development_dependency "bundler", "~> 1.6"
22
23
  spec.add_development_dependency "rake"
23
- spec.add_development_dependency "rjb"
24
- spec.add_development_dependency 'minitest'
24
+ spec.add_development_dependency "rspec"
25
+ spec.add_development_dependency "pry"
25
26
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twkorean
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - JunSangPil
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-08 00:00:00.000000000 Z
11
+ date: 2017-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rjb
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: bundler
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -39,7 +53,7 @@ dependencies:
39
53
  - !ruby/object:Gem::Version
40
54
  version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
- name: rjb
56
+ name: rspec
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - ">="
@@ -53,7 +67,7 @@ dependencies:
53
67
  - !ruby/object:Gem::Version
54
68
  version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
- name: minitest
70
+ name: pry
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
73
  - - ">="
@@ -74,18 +88,19 @@ extensions: []
74
88
  extra_rdoc_files: []
75
89
  files:
76
90
  - ".gitignore"
91
+ - ".rspec"
77
92
  - Gemfile
78
93
  - LICENSE.txt
79
94
  - README.md
80
95
  - Rakefile
81
- - lib/jars/korean-text-4.4.jar
82
- - lib/jars/scala-library-2.11.6.jar
83
- - lib/jars/twitter-text-1.13.3.jar
96
+ - lib/jars/open-korean-text-2.0.6-SNAPSHOT.jar
97
+ - lib/jars/scala-library-2.12.0.jar
98
+ - lib/jars/twitter-text-1.14.3.jar
84
99
  - lib/twkorean.rb
85
100
  - lib/twkorean/twitter_korean_text.rb
86
101
  - lib/twkorean/version.rb
87
- - test/test_helper.rb
88
- - test/twkorean.rb
102
+ - spec/spec_helper.rb
103
+ - spec/twkorean_spec.rb
89
104
  - twkorean.gemspec
90
105
  homepage: https://github.com/jun85664396/twkorean-ruby
91
106
  licenses:
@@ -112,5 +127,5 @@ signing_key:
112
127
  specification_version: 4
113
128
  summary: Ruby interface to twitter-korean-text
114
129
  test_files:
115
- - test/test_helper.rb
116
- - test/twkorean.rb
130
+ - spec/spec_helper.rb
131
+ - spec/twkorean_spec.rb
Binary file
@@ -1,6 +0,0 @@
1
- # @name twkorean-ruby
2
- # @author JunSangPil
3
- # @version 0.0.4
4
- # @url https://github.com/jun85664396/twkorean-ruby
5
- # @license Apache License 2.0
6
- require 'minitest/autorun'
@@ -1,38 +0,0 @@
1
- # @name twkorean-ruby
2
- # @author JunSangPil
3
- # @version 0.0.4
4
- # @url https://github.com/jun85664396/twkorean-ruby
5
- # @license Apache License 2.0
6
- require_relative 'test_helper'
7
- require 'twkorean'
8
-
9
- describe "Twkorean" do
10
- text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
11
- twkorean = Twkorean::TwitterKoreanText.new
12
- text = twkorean.normalize(text)
13
- tokens = twkorean.tokenize(text)
14
-
15
- it "Tokenize" do
16
- p "#Tokenize"
17
- p twkorean.tokens_to_string_list(tokens)
18
- # ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
19
- p twkorean.tokens_to_token_list(tokens)
20
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
21
- end
22
-
23
- it "Stemming" do
24
- p "#Stemming"
25
- stem = twkorean.stem(tokens)
26
- p twkorean.tokens_to_string_list(stem)
27
- # ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
28
- p twkorean.tokens_to_token_list(stem)
29
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
30
- end
31
-
32
- it "Phrase extraction" do
33
- p "Phrase extraction"
34
- p twkorean.extract_phrases(tokens)
35
- # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
36
- end
37
-
38
- end