RubyGems - twkorean - Versions diffs - 0.0.5 → 0.0.6 - Mend

twkorean 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

checksums.yaml +4 -4
data/.rspec +4 -0
data/Gemfile +0 -1
data/README.md +21 -16
data/Rakefile +5 -0
data/lib/jars/open-korean-text-2.0.6-SNAPSHOT.jar +0 -0
data/lib/jars/scala-library-2.12.0.jar +0 -0
data/lib/jars/twitter-text-1.14.3.jar +0 -0
data/lib/twkorean/twitter_korean_text.rb +14 -12
data/lib/twkorean/version.rb +2 -2
data/spec/spec_helper.rb +102 -0
data/spec/twkorean_spec.rb +40 -0
data/twkorean.gemspec +3 -2
metadata +26 -11
data/lib/jars/korean-text-4.4.jar +0 -0
data/lib/jars/scala-library-2.11.6.jar +0 -0
data/lib/jars/twitter-text-1.13.3.jar +0 -0
data/test/test_helper.rb +0 -6
data/test/twkorean.rb +0 -38

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d5b0224b2477336b22d92f4ad59a711b72cb3ed5
-  data.tar.gz: 96995f69e42be4eb7a21cdbcf63a8e3d5a6bd10c
+  metadata.gz: 2f7c6b1a5621ac47e13abb6a548c88887a2f6882
+  data.tar.gz: 8d70e9c33a48b8dea26a5467f3a7faba82e7c066
 SHA512:
-  metadata.gz: 314d087f4e1d8eca9d14742ae601a844375526849eba96c563b266381370c6082a98939cbb27fe857e8275922560eaf3c24337a37714020d70bdee1bd62f84aa
-  data.tar.gz: 2e77c37daac91d552e40198221b94f26fd84d123737b61eb67367b454f14690a600051d480167b81424e76b9eb483dd8a03c6f4bc9b9f0f42adde4c6711355e4
+  metadata.gz: cd109eafdc8208b0c96c83b619c7a1a125305a1f00a3e0104b05731371917b6c1e7b173436e57b04938364eea82d02035e8f0fb964e6579b8e5a01dc7ce855fb
+  data.tar.gz: b51ba84b44cbd8546a959a6dc7395f64bf6f1af6fd7c1d6110cea39d9d8faf807a575cd37410a42cc66c720f37362fc7ecb0ada4a1459a9d61362c6ffbd01a3c

data/.rspec ADDED

@@ -0,0 +1,4 @@
+--color
+--require spec_helper
+--require twkorean
+--require pry

data/Gemfile CHANGED

@@ -2,4 +2,3 @@ source 'https://rubygems.org'
 # Specify your gem's dependencies in twkorean.gemspec
 gemspec
-gem 'rjb'

data/README.md CHANGED

@@ -2,7 +2,7 @@
 ## Compatibility
-Currently wraps [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4) / 현재 이 프로젝트는 [twitter-korean-text 4.4](https://github.com/twitter/twitter-korean-text/tree/korean-text-4.4)을 사용중입니다.
+Currently wraps [open-korean-text 2.0.5](https://github.com/open-korean-text/open-korean-text/releases/tag/open-korean-text-2.0.5) / 현재 이 프로젝트는 [open-korean-text 2.0.5](https://github.com/open-korean-text/open-korean-text/releases/tag/open-korean-text-2.0.5)을 사용중입니다.
 ## Installation
@@ -10,6 +10,10 @@ Add this line to your application's Gemfile:
     gem 'twkorean'
+If you are using Java 7
+    gem 'twkorean', "~> 0.0.5"
 And then execute:
     $ bundle
@@ -20,12 +24,13 @@ Or install it yourself as:
 ## Required
+Twkorean supports java8+
     $ export JAVA_HOME={Your Path}
-    $ gem install 'rjb'
 ## Test
-    $ ruby -v test/twkorean.rb
+    $ rake or rspec
 ## Usage
@@ -38,28 +43,28 @@ Or install it yourself as:
       it "Tokenize" do
         p "#Tokenize"
         p twkorean.tokens_to_string_list(tokens)
-        # ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
+        # ["한국어", "를", "처리", "하는", "예시", "입니다", "ㅋㅋㅋ", "#한국어"]
         p twkorean.tokens_to_token_list(tokens)
-        # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
-      end
-      it "Stemming" do
-        p "#Stemming"
-        stem = twkorean.stem(tokens)
-        p twkorean.tokens_to_string_list(stem)
-        # ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
-        p twkorean.tokens_to_token_list(stem)
-        # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
+        # [
+        #    ["한국어(Noun: 0, 3)", "한국어", "Noun", nil, "0", "3"],
+        #    ["를(Josa: 3, 1)", "를", "Josa", nil, "3", "1"],
+        #    ["처리(Noun: 5, 2)", "처리", "Noun", nil, "5", "2"],
+        #    ["하는(Verb(하다): 7, 2)", "하는", "Verb", "(하다)", "7", "2"],
+        #    ["예시(Noun: 10, 2)", "예시", "Noun", nil, "10", "2"],
+        #    ["입니다(Adjective(이다): 12, 3)", "입니다", "Adjective", "(이다)", "12", "3"],
+        #    ["ㅋㅋㅋ(KoreanParticle: 15, 3)", "ㅋㅋㅋ", "KoreanParticle", nil, "15", "3"],
+        #    ["#한국어(Hashtag: 19, 4)", "#한국어", "Hashtag", nil, "19", "4"]
+        # ]
       end
       it "Phrase extraction" do
         p "Phrase extraction"
         p twkorean.extract_phrases(tokens)
-        # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
+        # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 19, 4)"]
       end
     end
-end## Contributing
+## Contributing
 1. Fork it ( https://github.com/[my-github-username]/twkorean/fork )
 2. Create your feature branch (`git checkout -b my-new-feature`)

data/Rakefile CHANGED

@@ -1,2 +1,7 @@
 require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new('spec')
+# If you want to make this the default task
+task :default => :spec

data/lib/jars/open-korean-text-2.0.6-SNAPSHOT.jar ADDED

Binary file

data/lib/jars/scala-library-2.12.0.jar ADDED

Binary file

data/lib/jars/twitter-text-1.14.3.jar ADDED

Binary file

data/lib/twkorean/twitter_korean_text.rb CHANGED

@@ -1,46 +1,48 @@
 module Twkorean
   class TwitterKoreanText
-    attr_accessor :korean_processor
     def initialize(normalization = true, stemming = true)
       jars = Dir.glob(File.dirname(__FILE__)+"/../jars/*.jar").join(File::PATH_SEPARATOR)
       Rjb::load(jars, ['-Xmx512M'])
-      self.korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava')
     end
     def normalize(text)
-      self.korean_processor.normalize(text).toString
+      korean_processor.normalize(text).toString
     end
     def tokenize(text)
-      tokens = self.korean_processor.tokenize(text)
+      tokens = korean_processor.tokenize(text)
       tokens
     end
     def tokens_to_string_list(tokens)
-      tokens = self.korean_processor.tokensToJavaStringList(tokens)
+      tokens = korean_processor.tokensToJavaStringList(tokens)
       tokens.toArray.map{|x| x.toString}
     end
     def tokens_to_token_list(tokens)
-      tokens = self.korean_processor.tokensToJavaKoreanTokenList(tokens)
-      tokens.toArray.map{|x| self.parser(x.toString)}
+      tokens = korean_processor.tokensToJavaKoreanTokenList(tokens)
+      tokens.toArray.map{|x| parser(x.toString)}
     end
     def stem(tokens)
-      stemmed = self.korean_processor.stem(tokens)
-      stemmed
+      # Deprecated method
+      # For legacy Code, Version less 0.0.6
+      tokens_to_token_list(tokens)
     end
     def extract_phrases(tokens)
-      phrases = self.korean_processor.extractPhrases(tokens, true, true)
+      phrases = korean_processor.extractPhrases(tokens, true, true)
       phrases.toArray.map{|x| x.toString}
     end
+    private
     def parser(text)
-      text.match(/(.*)\(([a-zA-Z]*): ([0-9]+), ([0-9]+)\)/).to_a
+      text.match(/(.*)\(([a-zA-Z]*)(\(.*\))?: ([0-9]+), ([0-9]+)\)/).to_a
     end
+    def korean_processor
+      @korean_processor ||= Rjb::import('org.openkoreantext.processor.OpenKoreanTextProcessorJava')
+    end
   end
 end

data/lib/twkorean/version.rb CHANGED

@@ -1,8 +1,8 @@
 # @name                twkorean-ruby
 # @author              JunSangPil
-# @version             0.0.5
+# @version             0.0.6
 # @url                 https://github.com/jun85664396/twkorean-ruby
 # @license             Apache License 2.0
 module Twkorean
-  VERSION = "0.0.5"
+  VERSION = "0.0.6"
 end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,102 @@
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# The generated `.rspec` file contains `--require spec_helper` which will cause
+# this file to always be loaded, without a need to explicitly require it in any
+# files.
+#
+# Given that it is always loaded, you are encouraged to keep this file as
+# light-weight as possible. Requiring heavyweight dependencies from this file
+# will add to the boot time of your test suite on EVERY test run, even for an
+# individual file that may not need all of that loaded. Instead, consider making
+# a separate helper file that requires the additional dependencies and performs
+# the additional setup, and require it from the spec files that actually need
+# it.
+#
+require 'twkorean'
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+RSpec.configure do |config|
+  # rspec-expectations config goes here. You can use an alternate
+  # assertion/expectation library such as wrong or the stdlib/minitest
+  # assertions if you prefer.
+  config.expect_with :rspec do |expectations|
+    # This option will default to `true` in RSpec 4. It makes the `description`
+    # and `failure_message` of custom matchers include text for helper methods
+    # defined using `chain`, e.g.:
+    #     be_bigger_than(2).and_smaller_than(4).description
+    #     # => "be bigger than 2 and smaller than 4"
+    # ...rather than:
+    #     # => "be bigger than 2"
+    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+  end
+  # rspec-mocks config goes here. You can use an alternate test double
+  # library (such as bogus or mocha) by changing the `mock_with` option here.
+  config.mock_with :rspec do |mocks|
+    # Prevents you from mocking or stubbing a method that does not exist on
+    # a real object. This is generally recommended, and will default to
+    # `true` in RSpec 4.
+    mocks.verify_partial_doubles = true
+  end
+  # This option will default to `:apply_to_host_groups` in RSpec 4 (and will
+  # have no way to turn it off -- the option exists only for backwards
+  # compatibility in RSpec 3). It causes shared context metadata to be
+  # inherited by the metadata hash of host groups and examples, rather than
+  # triggering implicit auto-inclusion in groups with matching metadata.
+  config.shared_context_metadata_behavior = :apply_to_host_groups
+# The settings below are suggested to provide a good initial experience
+# with RSpec, but feel free to customize to your heart's content.
+=begin
+  # This allows you to limit a spec run to individual examples or groups
+  # you care about by tagging them with `:focus` metadata. When nothing
+  # is tagged with `:focus`, all examples get run. RSpec also provides
+  # aliases for `it`, `describe`, and `context` that include `:focus`
+  # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
+  config.filter_run_when_matching :focus
+  # Allows RSpec to persist some state between runs in order to support
+  # the `--only-failures` and `--next-failure` CLI options. We recommend
+  # you configure your source control system to ignore this file.
+  config.example_status_persistence_file_path = "spec/examples.txt"
+  # Limits the available syntax to the non-monkey patched syntax that is
+  # recommended. For more details, see:
+  #   - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
+  #   - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
+  #   - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
+  config.disable_monkey_patching!
+  # This setting enables warnings. It's recommended, but in some cases may
+  # be too noisy due to issues in dependencies.
+  config.warnings = true
+  # Many RSpec users commonly either run the entire suite or an individual
+  # file, and it's useful to allow more verbose output when running an
+  # individual spec file.
+  if config.files_to_run.one?
+    # Use the documentation formatter for detailed output,
+    # unless a formatter has already been configured
+    # (e.g. via a command-line flag).
+    config.default_formatter = "doc"
+  end
+  # Print the 10 slowest examples and example groups at the
+  # end of the spec run, to help surface which specs are running
+  # particularly slow.
+  config.profile_examples = 10
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = :random
+  # Seed global randomization in this process using the `--seed` CLI option.
+  # Setting this allows you to use `--seed` to deterministically reproduce
+  # test failures related to randomization by passing the same `--seed` value
+  # as the one that triggered the failure.
+  Kernel.srand config.seed
+=end
+end

data/spec/twkorean_spec.rb ADDED

@@ -0,0 +1,40 @@
+RSpec.describe Twkorean do
+  let(:twkorean) { Twkorean::TwitterKoreanText.new }
+  let(:text) { twkorean.normalize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어") }
+  let(:tokens) { twkorean.tokenize(text) }
+  context "Tokenize" do
+    it "should split a sentence to list of strings " do
+      expect(twkorean.tokens_to_string_list(tokens)).to eq(
+        ["한국어", "를", "처리", "하는", "예시", "입니다", "ㅋㅋㅋ", "#한국어"]
+      )
+    end
+    it "should split a sentence to list of tokens" do
+      expect(twkorean.tokens_to_token_list(tokens)).to eq([
+        ["한국어(Noun: 0, 3)", "한국어", "Noun", nil, "0", "3"],
+        ["를(Josa: 3, 1)", "를", "Josa", nil, "3", "1"],
+        ["처리(Noun: 5, 2)", "처리", "Noun", nil, "5", "2"],
+        ["하는(Verb(하다): 7, 2)", "하는", "Verb", "(하다)", "7", "2"],
+        ["예시(Noun: 10, 2)", "예시", "Noun", nil, "10", "2"],
+        ["입니다(Adjective(이다): 12, 3)", "입니다", "Adjective", "(이다)", "12", "3"],
+        ["ㅋㅋㅋ(KoreanParticle: 15, 3)", "ㅋㅋㅋ", "KoreanParticle", nil, "15", "3"],
+        ["#한국어(Hashtag: 19, 4)", "#한국어", "Hashtag", nil, "19", "4"]
+      ])
+    end
+  end
+  context "Phrase extraction" do
+    it "should extract phrases from a sentence" do
+      expect(twkorean.extract_phrases(tokens)).to eq(
+        [
+          "한국어(Noun: 0, 3)",
+          "처리(Noun: 5, 2)",
+          "처리하는 예시(Noun: 5, 7)",
+          "예시(Noun: 10, 2)",
+          "#한국어(Hashtag: 19, 4)"
+        ]
+      )
+    end
+  end
+end

data/twkorean.gemspec CHANGED

@@ -18,8 +18,9 @@ Gem::Specification.new do |spec|
   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ["lib"]
+  spec.add_dependency "rjb"
   spec.add_development_dependency "bundler", "~> 1.6"
   spec.add_development_dependency "rake"
-  spec.add_development_dependency "rjb"
-  spec.add_development_dependency 'minitest'
+  spec.add_development_dependency "rspec"
+  spec.add_development_dependency "pry"
 end

metadata CHANGED

@@ -1,15 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: twkorean
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - JunSangPil
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-07-08 00:00:00.000000000 Z
+date: 2017-07-19 00:00:00.000000000 Z
 dependencies:
+- !ruby/object:Gem::Dependency
+  name: rjb
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -39,7 +53,7 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
-  name: rjb
+  name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -53,7 +67,7 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
-  name: minitest
+  name: pry
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -74,18 +88,19 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".gitignore"
+- ".rspec"
 - Gemfile
 - LICENSE.txt
 - README.md
 - Rakefile
-- lib/jars/korean-text-4.4.jar
-- lib/jars/scala-library-2.11.6.jar
-- lib/jars/twitter-text-1.13.3.jar
+- lib/jars/open-korean-text-2.0.6-SNAPSHOT.jar
+- lib/jars/scala-library-2.12.0.jar
+- lib/jars/twitter-text-1.14.3.jar
 - lib/twkorean.rb
 - lib/twkorean/twitter_korean_text.rb
 - lib/twkorean/version.rb
-- test/test_helper.rb
-- test/twkorean.rb
+- spec/spec_helper.rb
+- spec/twkorean_spec.rb
 - twkorean.gemspec
 homepage: https://github.com/jun85664396/twkorean-ruby
 licenses:
@@ -112,5 +127,5 @@ signing_key:
 specification_version: 4
 summary: Ruby interface to twitter-korean-text
 test_files:
-- test/test_helper.rb
-- test/twkorean.rb
+- spec/spec_helper.rb
+- spec/twkorean_spec.rb

data/lib/jars/korean-text-4.4.jar DELETED

Binary file

data/lib/jars/scala-library-2.11.6.jar DELETED

Binary file

data/lib/jars/twitter-text-1.13.3.jar DELETED

Binary file

data/test/test_helper.rb DELETED

@@ -1,6 +0,0 @@
-# @name                twkorean-ruby
-# @author              JunSangPil
-# @version             0.0.4
-# @url                 https://github.com/jun85664396/twkorean-ruby
-# @license             Apache License 2.0
-require 'minitest/autorun'

data/test/twkorean.rb DELETED

@@ -1,38 +0,0 @@
-# @name                twkorean-ruby
-# @author              JunSangPil
-# @version             0.0.4
-# @url                 https://github.com/jun85664396/twkorean-ruby
-# @license             Apache License 2.0
-require_relative 'test_helper'
-require 'twkorean'
-describe "Twkorean" do
-  text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
-  twkorean = Twkorean::TwitterKoreanText.new
-  text = twkorean.normalize(text)
-  tokens = twkorean.tokenize(text)
-  it "Tokenize" do
-    p "#Tokenize"
-    p twkorean.tokens_to_string_list(tokens)
-    # ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
-    p twkorean.tokens_to_token_list(tokens)
-    # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
-  end
-  it "Stemming" do
-    p "#Stemming"
-    stem = twkorean.stem(tokens)
-    p twkorean.tokens_to_string_list(stem)
-    # ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
-    p twkorean.tokens_to_token_list(stem)
-    # ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
-  end
-  it "Phrase extraction" do
-    p "Phrase extraction"
-    p twkorean.extract_phrases(tokens)
-    # ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
-  end
-end