twkorean 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d5b0224b2477336b22d92f4ad59a711b72cb3ed5
4
- data.tar.gz: 96995f69e42be4eb7a21cdbcf63a8e3d5a6bd10c
3
+ metadata.gz: 2f7c6b1a5621ac47e13abb6a548c88887a2f6882
4
+ data.tar.gz: 8d70e9c33a48b8dea26a5467f3a7faba82e7c066
5
5
  SHA512:
6
- metadata.gz: 314d087f4e1d8eca9d14742ae601a844375526849eba96c563b266381370c6082a98939cbb27fe857e8275922560eaf3c24337a37714020d70bdee1bd62f84aa
7
- data.tar.gz: 2e77c37daac91d552e40198221b94f26fd84d123737b61eb67367b454f14690a600051d480167b81424e76b9eb483dd8a03c6f4bc9b9f0f42adde4c6711355e4
6
+ metadata.gz: cd109eafdc8208b0c96c83b619c7a1a125305a1f00a3e0104b05731371917b6c1e7b173436e57b04938364eea82d02035e8f0fb964e6579b8e5a01dc7ce855fb
7
+ data.tar.gz: b51ba84b44cbd8546a959a6dc7395f64bf6f1af6fd7c1d6110cea39d9d8faf807a575cd37410a42cc66c720f37362fc7ecb0ada4a1459a9d61362c6ffbd01a3c
data/.rspec ADDED
@@ -0,0 +1,4 @@
1
+ --color
2
+ --require spec_helper
3
+ --require twkorean
4
+ --require pry
data/Gemfile CHANGED
@@ -2,4 +2,3 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in twkorean.gemspec
4
4
  gemspec
5
- gem 'rjb'
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  ## Compatibility
4
4
 
5
- Currently wraps [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4) / 현재 이 프로젝트는 [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4)을 사용중입니다.
5
+ Currently wraps [open-korean-text 2.0.5](https://github.com/open-korean-text/open-korean-text/releases/tag/open-korean-text-2.0.5) / 현재 이 프로젝트는 [open-korean-text 2.0.5](https://github.com/open-korean-text/open-korean-text/releases/tag/open-korean-text-2.0.5)을 사용중입니다.
6
6
 
7
7
  ## Installation
8
8
 
@@ -10,6 +10,10 @@ Add this line to your application's Gemfile:
10
10
 
11
11
  gem 'twkorean'
12
12
 
13
+ If you are using Java 7, use the earlier release instead:
14
+
15
+ gem 'twkorean', "0.0.5"
16
+
13
17
  And then execute:
14
18
 
15
19
  $ bundle
@@ -20,12 +24,13 @@ Or install it yourself as:
20
24
 
21
25
  ## Required
22
26
 
27
+ Twkorean requires Java 8 or later.
28
+
23
29
  $ export JAVA_HOME={Your Path}
24
- $ gem install 'rjb'
25
30
 
26
31
  ## Test
27
32
 
28
- $ ruby -v test/twkorean.rb
33
+ $ rake   # or: rspec
29
34
 
30
35
  ## Usage
31
36
 
@@ -38,28 +43,28 @@ Or install it yourself as:
38
43
  it "Tokenize" do
39
44
  p "#Tokenize"
40
45
  p twkorean.tokens_to_string_list(tokens)
41
- # ["한국어", "를", "처리", "하는", "예시", "입니", "", "ㅋㅋ", "#한국어"]
46
+ # ["한국어", "를", "처리", "하는", "예시", "입니다", "ㅋㅋㅋ", "#한국어"]
42
47
  p twkorean.tokens_to_token_list(tokens)
43
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
44
- end
45
-
46
- it "Stemming" do
47
- p "#Stemming"
48
- stem = twkorean.stem(tokens)
49
- p twkorean.tokens_to_string_list(stem)
50
- # ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
51
- p twkorean.tokens_to_token_list(stem)
52
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
48
+ # [
49
+ # ["한국어(Noun: 0, 3)", "한국어", "Noun", nil, "0", "3"],
50
+ # ["를(Josa: 3, 1)", "를", "Josa", nil, "3", "1"],
51
+ # ["처리(Noun: 5, 2)", "처리", "Noun", nil, "5", "2"],
52
+ # ["하는(Verb(하다): 7, 2)", "하는", "Verb", "(하다)", "7", "2"],
53
+ # ["예시(Noun: 10, 2)", "예시", "Noun", nil, "10", "2"],
54
+ # ["입니다(Adjective(이다): 12, 3)", "입니다", "Adjective", "(이다)", "12", "3"],
55
+ # ["ㅋㅋㅋ(KoreanParticle: 15, 3)", "ㅋㅋㅋ", "KoreanParticle", nil, "15", "3"],
56
+ # ["#한국어(Hashtag: 19, 4)", "#한국어", "Hashtag", nil, "19", "4"]
57
+ # ]
53
58
  end
54
59
 
55
60
  it "Phrase extraction" do
56
61
  p "Phrase extraction"
57
62
  p twkorean.extract_phrases(tokens)
58
- # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
63
+ # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 19, 4)"]
59
64
  end
60
65
  end
61
66
 
62
- end## Contributing
67
+ ## Contributing
63
68
 
64
69
  1. Fork it ( https://github.com/[my-github-username]/twkorean/fork )
65
70
  2. Create your feature branch (`git checkout -b my-new-feature`)
data/Rakefile CHANGED
@@ -1,2 +1,7 @@
1
1
  require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
2
3
 
4
+ RSpec::Core::RakeTask.new('spec')
5
+
6
+ # If you want to make this the default task
7
+ task :default => :spec
@@ -1,46 +1,48 @@
1
1
  module Twkorean
2
2
  class TwitterKoreanText
3
3
 
4
- attr_accessor :korean_processor
5
-
6
4
  def initialize(normalization = true, stemming = true)
7
5
  jars = Dir.glob(File.dirname(__FILE__)+"/../jars/*.jar").join(File::PATH_SEPARATOR)
8
6
  Rjb::load(jars, ['-Xmx512M'])
9
- self.korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava')
10
7
  end
11
8
 
12
9
  def normalize(text)
13
- self.korean_processor.normalize(text).toString
10
+ korean_processor.normalize(text).toString
14
11
  end
15
12
 
16
13
  def tokenize(text)
17
- tokens = self.korean_processor.tokenize(text)
14
+ tokens = korean_processor.tokenize(text)
18
15
  tokens
19
16
  end
20
17
 
21
18
  def tokens_to_string_list(tokens)
22
- tokens = self.korean_processor.tokensToJavaStringList(tokens)
19
+ tokens = korean_processor.tokensToJavaStringList(tokens)
23
20
  tokens.toArray.map{|x| x.toString}
24
21
  end
25
22
 
26
23
  def tokens_to_token_list(tokens)
27
- tokens = self.korean_processor.tokensToJavaKoreanTokenList(tokens)
28
- tokens.toArray.map{|x| self.parser(x.toString)}
24
+ tokens = korean_processor.tokensToJavaKoreanTokenList(tokens)
25
+ tokens.toArray.map{|x| parser(x.toString)}
29
26
  end
30
27
 
31
28
  def stem(tokens)
32
- stemmed = self.korean_processor.stem(tokens)
33
- stemmed
29
+ # Deprecated method
30
+ # Kept for legacy code written against versions earlier than 0.0.6
31
+ tokens_to_token_list(tokens)
34
32
  end
35
33
 
36
34
  def extract_phrases(tokens)
37
- phrases = self.korean_processor.extractPhrases(tokens, true, true)
35
+ phrases = korean_processor.extractPhrases(tokens, true, true)
38
36
  phrases.toArray.map{|x| x.toString}
39
37
  end
40
38
 
39
+ private
41
40
  def parser(text)
42
- text.match(/(.*)\(([a-zA-Z]*): ([0-9]+), ([0-9]+)\)/).to_a
41
+ text.match(/(.*)\(([a-zA-Z]*)(\(.*\))?: ([0-9]+), ([0-9]+)\)/).to_a
43
42
  end
44
43
 
44
+ def korean_processor
45
+ @korean_processor ||= Rjb::import('org.openkoreantext.processor.OpenKoreanTextProcessorJava')
46
+ end
45
47
  end
46
48
  end
@@ -1,8 +1,8 @@
1
1
  # @name twkorean-ruby
2
2
  # @author JunSangPil
3
- # @version 0.0.5
3
+ # @version 0.0.6
4
4
  # @url https://github.com/jun85664396/twkorean-ruby
5
5
  # @license Apache License 2.0
6
6
  module Twkorean
7
- VERSION = "0.0.5"
7
+ VERSION = "0.0.6"
8
8
  end
@@ -0,0 +1,102 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # The generated `.rspec` file contains `--require spec_helper` which will cause
4
+ # this file to always be loaded, without a need to explicitly require it in any
5
+ # files.
6
+ #
7
+ # Given that it is always loaded, you are encouraged to keep this file as
8
+ # light-weight as possible. Requiring heavyweight dependencies from this file
9
+ # will add to the boot time of your test suite on EVERY test run, even for an
10
+ # individual file that may not need all of that loaded. Instead, consider making
11
+ # a separate helper file that requires the additional dependencies and performs
12
+ # the additional setup, and require it from the spec files that actually need
13
+ # it.
14
+ #
15
+ require 'twkorean'
16
+
17
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
18
+ RSpec.configure do |config|
19
+ # rspec-expectations config goes here. You can use an alternate
20
+ # assertion/expectation library such as wrong or the stdlib/minitest
21
+ # assertions if you prefer.
22
+ config.expect_with :rspec do |expectations|
23
+ # This option will default to `true` in RSpec 4. It makes the `description`
24
+ # and `failure_message` of custom matchers include text for helper methods
25
+ # defined using `chain`, e.g.:
26
+ # be_bigger_than(2).and_smaller_than(4).description
27
+ # # => "be bigger than 2 and smaller than 4"
28
+ # ...rather than:
29
+ # # => "be bigger than 2"
30
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
31
+ end
32
+
33
+ # rspec-mocks config goes here. You can use an alternate test double
34
+ # library (such as bogus or mocha) by changing the `mock_with` option here.
35
+ config.mock_with :rspec do |mocks|
36
+ # Prevents you from mocking or stubbing a method that does not exist on
37
+ # a real object. This is generally recommended, and will default to
38
+ # `true` in RSpec 4.
39
+ mocks.verify_partial_doubles = true
40
+ end
41
+
42
+ # This option will default to `:apply_to_host_groups` in RSpec 4 (and will
43
+ # have no way to turn it off -- the option exists only for backwards
44
+ # compatibility in RSpec 3). It causes shared context metadata to be
45
+ # inherited by the metadata hash of host groups and examples, rather than
46
+ # triggering implicit auto-inclusion in groups with matching metadata.
47
+ config.shared_context_metadata_behavior = :apply_to_host_groups
48
+
49
+ # The settings below are suggested to provide a good initial experience
50
+ # with RSpec, but feel free to customize to your heart's content.
51
+ =begin
52
+ # This allows you to limit a spec run to individual examples or groups
53
+ # you care about by tagging them with `:focus` metadata. When nothing
54
+ # is tagged with `:focus`, all examples get run. RSpec also provides
55
+ # aliases for `it`, `describe`, and `context` that include `:focus`
56
+ # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
57
+ config.filter_run_when_matching :focus
58
+
59
+ # Allows RSpec to persist some state between runs in order to support
60
+ # the `--only-failures` and `--next-failure` CLI options. We recommend
61
+ # you configure your source control system to ignore this file.
62
+ config.example_status_persistence_file_path = "spec/examples.txt"
63
+
64
+ # Limits the available syntax to the non-monkey patched syntax that is
65
+ # recommended. For more details, see:
66
+ # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
67
+ # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
68
+ # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
69
+ config.disable_monkey_patching!
70
+
71
+ # This setting enables warnings. It's recommended, but in some cases may
72
+ # be too noisy due to issues in dependencies.
73
+ config.warnings = true
74
+
75
+ # Many RSpec users commonly either run the entire suite or an individual
76
+ # file, and it's useful to allow more verbose output when running an
77
+ # individual spec file.
78
+ if config.files_to_run.one?
79
+ # Use the documentation formatter for detailed output,
80
+ # unless a formatter has already been configured
81
+ # (e.g. via a command-line flag).
82
+ config.default_formatter = "doc"
83
+ end
84
+
85
+ # Print the 10 slowest examples and example groups at the
86
+ # end of the spec run, to help surface which specs are running
87
+ # particularly slow.
88
+ config.profile_examples = 10
89
+
90
+ # Run specs in random order to surface order dependencies. If you find an
91
+ # order dependency and want to debug it, you can fix the order by providing
92
+ # the seed, which is printed after each run.
93
+ # --seed 1234
94
+ config.order = :random
95
+
96
+ # Seed global randomization in this process using the `--seed` CLI option.
97
+ # Setting this allows you to use `--seed` to deterministically reproduce
98
+ # test failures related to randomization by passing the same `--seed` value
99
+ # as the one that triggered the failure.
100
+ Kernel.srand config.seed
101
+ =end
102
+ end
@@ -0,0 +1,40 @@
1
+ RSpec.describe Twkorean do
2
+ let(:twkorean) { Twkorean::TwitterKoreanText.new }
3
+ let(:text) { twkorean.normalize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어") }
4
+ let(:tokens) { twkorean.tokenize(text) }
5
+
6
+ context "Tokenize" do
7
+ it "should split a sentence to list of strings " do
8
+ expect(twkorean.tokens_to_string_list(tokens)).to eq(
9
+ ["한국어", "를", "처리", "하는", "예시", "입니다", "ㅋㅋㅋ", "#한국어"]
10
+ )
11
+ end
12
+
13
+ it "should split a sentence to list of tokens" do
14
+ expect(twkorean.tokens_to_token_list(tokens)).to eq([
15
+ ["한국어(Noun: 0, 3)", "한국어", "Noun", nil, "0", "3"],
16
+ ["를(Josa: 3, 1)", "를", "Josa", nil, "3", "1"],
17
+ ["처리(Noun: 5, 2)", "처리", "Noun", nil, "5", "2"],
18
+ ["하는(Verb(하다): 7, 2)", "하는", "Verb", "(하다)", "7", "2"],
19
+ ["예시(Noun: 10, 2)", "예시", "Noun", nil, "10", "2"],
20
+ ["입니다(Adjective(이다): 12, 3)", "입니다", "Adjective", "(이다)", "12", "3"],
21
+ ["ㅋㅋㅋ(KoreanParticle: 15, 3)", "ㅋㅋㅋ", "KoreanParticle", nil, "15", "3"],
22
+ ["#한국어(Hashtag: 19, 4)", "#한국어", "Hashtag", nil, "19", "4"]
23
+ ])
24
+ end
25
+ end
26
+
27
+ context "Phrase extraction" do
28
+ it "should extract phrases from a sentence" do
29
+ expect(twkorean.extract_phrases(tokens)).to eq(
30
+ [
31
+ "한국어(Noun: 0, 3)",
32
+ "처리(Noun: 5, 2)",
33
+ "처리하는 예시(Noun: 5, 7)",
34
+ "예시(Noun: 10, 2)",
35
+ "#한국어(Hashtag: 19, 4)"
36
+ ]
37
+ )
38
+ end
39
+ end
40
+ end
@@ -18,8 +18,9 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ["lib"]
20
20
 
21
+ spec.add_dependency "rjb"
21
22
  spec.add_development_dependency "bundler", "~> 1.6"
22
23
  spec.add_development_dependency "rake"
23
- spec.add_development_dependency "rjb"
24
- spec.add_development_dependency 'minitest'
24
+ spec.add_development_dependency "rspec"
25
+ spec.add_development_dependency "pry"
25
26
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twkorean
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - JunSangPil
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-08 00:00:00.000000000 Z
11
+ date: 2017-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rjb
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: bundler
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -39,7 +53,7 @@ dependencies:
39
53
  - !ruby/object:Gem::Version
40
54
  version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
- name: rjb
56
+ name: rspec
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - ">="
@@ -53,7 +67,7 @@ dependencies:
53
67
  - !ruby/object:Gem::Version
54
68
  version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
- name: minitest
70
+ name: pry
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
73
  - - ">="
@@ -74,18 +88,19 @@ extensions: []
74
88
  extra_rdoc_files: []
75
89
  files:
76
90
  - ".gitignore"
91
+ - ".rspec"
77
92
  - Gemfile
78
93
  - LICENSE.txt
79
94
  - README.md
80
95
  - Rakefile
81
- - lib/jars/korean-text-4.4.jar
82
- - lib/jars/scala-library-2.11.6.jar
83
- - lib/jars/twitter-text-1.13.3.jar
96
+ - lib/jars/open-korean-text-2.0.6-SNAPSHOT.jar
97
+ - lib/jars/scala-library-2.12.0.jar
98
+ - lib/jars/twitter-text-1.14.3.jar
84
99
  - lib/twkorean.rb
85
100
  - lib/twkorean/twitter_korean_text.rb
86
101
  - lib/twkorean/version.rb
87
- - test/test_helper.rb
88
- - test/twkorean.rb
102
+ - spec/spec_helper.rb
103
+ - spec/twkorean_spec.rb
89
104
  - twkorean.gemspec
90
105
  homepage: https://github.com/jun85664396/twkorean-ruby
91
106
  licenses:
@@ -112,5 +127,5 @@ signing_key:
112
127
  specification_version: 4
113
128
  summary: Ruby interface to twitter-korean-text
114
129
  test_files:
115
- - test/test_helper.rb
116
- - test/twkorean.rb
130
+ - spec/spec_helper.rb
131
+ - spec/twkorean_spec.rb
Binary file
@@ -1,6 +0,0 @@
1
- # @name twkorean-ruby
2
- # @author JunSangPil
3
- # @version 0.0.4
4
- # @url https://github.com/jun85664396/twkorean-ruby
5
- # @license Apache License 2.0
6
- require 'minitest/autorun'
@@ -1,38 +0,0 @@
1
- # @name twkorean-ruby
2
- # @author JunSangPil
3
- # @version 0.0.4
4
- # @url https://github.com/jun85664396/twkorean-ruby
5
- # @license Apache License 2.0
6
- require_relative 'test_helper'
7
- require 'twkorean'
8
-
9
- describe "Twkorean" do
10
- text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
11
- twkorean = Twkorean::TwitterKoreanText.new
12
- text = twkorean.normalize(text)
13
- tokens = twkorean.tokenize(text)
14
-
15
- it "Tokenize" do
16
- p "#Tokenize"
17
- p twkorean.tokens_to_string_list(tokens)
18
- # ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
19
- p twkorean.tokens_to_token_list(tokens)
20
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
21
- end
22
-
23
- it "Stemming" do
24
- p "#Stemming"
25
- stem = twkorean.stem(tokens)
26
- p twkorean.tokens_to_string_list(stem)
27
- # ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
28
- p twkorean.tokens_to_token_list(stem)
29
- # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
30
- end
31
-
32
- it "Phrase extraction" do
33
- p "Phrase extraction"
34
- p twkorean.extract_phrases(tokens)
35
- # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
36
- end
37
-
38
- end