twkorean 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec +4 -0
- data/Gemfile +0 -1
- data/README.md +21 -16
- data/Rakefile +5 -0
- data/lib/jars/open-korean-text-2.0.6-SNAPSHOT.jar +0 -0
- data/lib/jars/scala-library-2.12.0.jar +0 -0
- data/lib/jars/twitter-text-1.14.3.jar +0 -0
- data/lib/twkorean/twitter_korean_text.rb +14 -12
- data/lib/twkorean/version.rb +2 -2
- data/spec/spec_helper.rb +102 -0
- data/spec/twkorean_spec.rb +40 -0
- data/twkorean.gemspec +3 -2
- metadata +26 -11
- data/lib/jars/korean-text-4.4.jar +0 -0
- data/lib/jars/scala-library-2.11.6.jar +0 -0
- data/lib/jars/twitter-text-1.13.3.jar +0 -0
- data/test/test_helper.rb +0 -6
- data/test/twkorean.rb +0 -38
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2f7c6b1a5621ac47e13abb6a548c88887a2f6882
|
4
|
+
data.tar.gz: 8d70e9c33a48b8dea26a5467f3a7faba82e7c066
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd109eafdc8208b0c96c83b619c7a1a125305a1f00a3e0104b05731371917b6c1e7b173436e57b04938364eea82d02035e8f0fb964e6579b8e5a01dc7ce855fb
|
7
|
+
data.tar.gz: b51ba84b44cbd8546a959a6dc7395f64bf6f1af6fd7c1d6110cea39d9d8faf807a575cd37410a42cc66c720f37362fc7ecb0ada4a1459a9d61362c6ffbd01a3c
|
data/.rspec
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
## Compatibility
|
4
4
|
|
5
|
-
Currently wraps [
|
5
|
+
Currently wraps [open-korean-text 2.0.5](https://github.com/open-korean-text/open-korean-text/releases/tag/open-korean-text-2.0.5) / 현재 이 프로젝트는 [open-korean-text 2.0.5](https://github.com/open-korean-text/open-korean-text/releases/tag/open-korean-text-2.0.5)을 사용중입니다.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -10,6 +10,10 @@ Add this line to your application's Gemfile:
|
|
10
10
|
|
11
11
|
gem 'twkorean'
|
12
12
|
|
13
|
+
If you are using Java 7, use the 0.0.5 release instead:
|
14
|
+
|
15
|
+
gem 'twkorean', "0.0.5"
|
16
|
+
|
13
17
|
And then execute:
|
14
18
|
|
15
19
|
$ bundle
|
@@ -20,12 +24,13 @@ Or install it yourself as:
|
|
20
24
|
|
21
25
|
## Required
|
22
26
|
|
27
|
+
Twkorean requires Java 8 or later.
|
28
|
+
|
23
29
|
$ export JAVA_HOME={Your Path}
|
24
|
-
$ gem install 'rjb'
|
25
30
|
|
26
31
|
## Test
|
27
32
|
|
28
|
-
$
|
33
|
+
$ rake    # or: rspec
|
29
34
|
|
30
35
|
## Usage
|
31
36
|
|
@@ -38,28 +43,28 @@ Or install it yourself as:
|
|
38
43
|
it "Tokenize" do
|
39
44
|
p "#Tokenize"
|
40
45
|
p twkorean.tokens_to_string_list(tokens)
|
41
|
-
# ["한국어", "를", "처리", "하는", "예시", "
|
46
|
+
# ["한국어", "를", "처리", "하는", "예시", "입니다", "ㅋㅋㅋ", "#한국어"]
|
42
47
|
p twkorean.tokens_to_token_list(tokens)
|
43
|
-
# [
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
#
|
51
|
-
|
52
|
-
#
|
48
|
+
# [
|
49
|
+
# ["한국어(Noun: 0, 3)", "한국어", "Noun", nil, "0", "3"],
|
50
|
+
# ["를(Josa: 3, 1)", "를", "Josa", nil, "3", "1"],
|
51
|
+
# ["처리(Noun: 5, 2)", "처리", "Noun", nil, "5", "2"],
|
52
|
+
# ["하는(Verb(하다): 7, 2)", "하는", "Verb", "(하다)", "7", "2"],
|
53
|
+
# ["예시(Noun: 10, 2)", "예시", "Noun", nil, "10", "2"],
|
54
|
+
# ["입니다(Adjective(이다): 12, 3)", "입니다", "Adjective", "(이다)", "12", "3"],
|
55
|
+
# ["ㅋㅋㅋ(KoreanParticle: 15, 3)", "ㅋㅋㅋ", "KoreanParticle", nil, "15", "3"],
|
56
|
+
# ["#한국어(Hashtag: 19, 4)", "#한국어", "Hashtag", nil, "19", "4"]
|
57
|
+
# ]
|
53
58
|
end
|
54
59
|
|
55
60
|
it "Phrase extraction" do
|
56
61
|
p "Phrase extraction"
|
57
62
|
p twkorean.extract_phrases(tokens)
|
58
|
-
# ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag:
|
63
|
+
# ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 19, 4)"]
|
59
64
|
end
|
60
65
|
end
|
61
66
|
|
62
|
-
|
67
|
+
## Contributing
|
63
68
|
|
64
69
|
1. Fork it ( https://github.com/[my-github-username]/twkorean/fork )
|
65
70
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
data/Rakefile
CHANGED
Binary file
|
Binary file
|
Binary file
|
@@ -1,46 +1,48 @@
|
|
1
1
|
module Twkorean
|
2
2
|
class TwitterKoreanText
|
3
3
|
|
4
|
-
attr_accessor :korean_processor
|
5
|
-
|
6
4
|
def initialize(normalization = true, stemming = true)
|
7
5
|
jars = Dir.glob(File.dirname(__FILE__)+"/../jars/*.jar").join(File::PATH_SEPARATOR)
|
8
6
|
Rjb::load(jars, ['-Xmx512M'])
|
9
|
-
self.korean_processor = Rjb::import('com.twitter.penguin.korean.TwitterKoreanProcessorJava')
|
10
7
|
end
|
11
8
|
|
12
9
|
def normalize(text)
|
13
|
-
|
10
|
+
korean_processor.normalize(text).toString
|
14
11
|
end
|
15
12
|
|
16
13
|
def tokenize(text)
|
17
|
-
tokens =
|
14
|
+
tokens = korean_processor.tokenize(text)
|
18
15
|
tokens
|
19
16
|
end
|
20
17
|
|
21
18
|
def tokens_to_string_list(tokens)
|
22
|
-
tokens =
|
19
|
+
tokens = korean_processor.tokensToJavaStringList(tokens)
|
23
20
|
tokens.toArray.map{|x| x.toString}
|
24
21
|
end
|
25
22
|
|
26
23
|
def tokens_to_token_list(tokens)
|
27
|
-
tokens =
|
28
|
-
tokens.toArray.map{|x|
|
24
|
+
tokens = korean_processor.tokensToJavaKoreanTokenList(tokens)
|
25
|
+
tokens.toArray.map{|x| parser(x.toString)}
|
29
26
|
end
|
30
27
|
|
31
28
|
def stem(tokens)
|
32
|
-
|
33
|
-
|
29
|
+
# Deprecated method
|
30
|
+
# Kept for legacy code written against versions earlier than 0.0.6
|
31
|
+
tokens_to_token_list(tokens)
|
34
32
|
end
|
35
33
|
|
36
34
|
def extract_phrases(tokens)
|
37
|
-
phrases =
|
35
|
+
phrases = korean_processor.extractPhrases(tokens, true, true)
|
38
36
|
phrases.toArray.map{|x| x.toString}
|
39
37
|
end
|
40
38
|
|
39
|
+
private
|
41
40
|
def parser(text)
|
42
|
-
text.match(/(.*)\(([a-zA-Z]*)
|
41
|
+
text.match(/(.*)\(([a-zA-Z]*)(\(.*\))?: ([0-9]+), ([0-9]+)\)/).to_a
|
43
42
|
end
|
44
43
|
|
44
|
+
def korean_processor
|
45
|
+
@korean_processor ||= Rjb::import('org.openkoreantext.processor.OpenKoreanTextProcessorJava')
|
46
|
+
end
|
45
47
|
end
|
46
48
|
end
|
data/lib/twkorean/version.rb
CHANGED
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# The generated `.rspec` file contains `--require spec_helper` which will cause
|
4
|
+
# this file to always be loaded, without a need to explicitly require it in any
|
5
|
+
# files.
|
6
|
+
#
|
7
|
+
# Given that it is always loaded, you are encouraged to keep this file as
|
8
|
+
# light-weight as possible. Requiring heavyweight dependencies from this file
|
9
|
+
# will add to the boot time of your test suite on EVERY test run, even for an
|
10
|
+
# individual file that may not need all of that loaded. Instead, consider making
|
11
|
+
# a separate helper file that requires the additional dependencies and performs
|
12
|
+
# the additional setup, and require it from the spec files that actually need
|
13
|
+
# it.
|
14
|
+
#
|
15
|
+
require 'twkorean'
|
16
|
+
|
17
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
18
|
+
RSpec.configure do |config|
|
19
|
+
# rspec-expectations config goes here. You can use an alternate
|
20
|
+
# assertion/expectation library such as wrong or the stdlib/minitest
|
21
|
+
# assertions if you prefer.
|
22
|
+
config.expect_with :rspec do |expectations|
|
23
|
+
# This option will default to `true` in RSpec 4. It makes the `description`
|
24
|
+
# and `failure_message` of custom matchers include text for helper methods
|
25
|
+
# defined using `chain`, e.g.:
|
26
|
+
# be_bigger_than(2).and_smaller_than(4).description
|
27
|
+
# # => "be bigger than 2 and smaller than 4"
|
28
|
+
# ...rather than:
|
29
|
+
# # => "be bigger than 2"
|
30
|
+
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
|
31
|
+
end
|
32
|
+
|
33
|
+
# rspec-mocks config goes here. You can use an alternate test double
|
34
|
+
# library (such as bogus or mocha) by changing the `mock_with` option here.
|
35
|
+
config.mock_with :rspec do |mocks|
|
36
|
+
# Prevents you from mocking or stubbing a method that does not exist on
|
37
|
+
# a real object. This is generally recommended, and will default to
|
38
|
+
# `true` in RSpec 4.
|
39
|
+
mocks.verify_partial_doubles = true
|
40
|
+
end
|
41
|
+
|
42
|
+
# This option will default to `:apply_to_host_groups` in RSpec 4 (and will
|
43
|
+
# have no way to turn it off -- the option exists only for backwards
|
44
|
+
# compatibility in RSpec 3). It causes shared context metadata to be
|
45
|
+
# inherited by the metadata hash of host groups and examples, rather than
|
46
|
+
# triggering implicit auto-inclusion in groups with matching metadata.
|
47
|
+
config.shared_context_metadata_behavior = :apply_to_host_groups
|
48
|
+
|
49
|
+
# The settings below are suggested to provide a good initial experience
|
50
|
+
# with RSpec, but feel free to customize to your heart's content.
|
51
|
+
=begin
|
52
|
+
# This allows you to limit a spec run to individual examples or groups
|
53
|
+
# you care about by tagging them with `:focus` metadata. When nothing
|
54
|
+
# is tagged with `:focus`, all examples get run. RSpec also provides
|
55
|
+
# aliases for `it`, `describe`, and `context` that include `:focus`
|
56
|
+
# metadata: `fit`, `fdescribe` and `fcontext`, respectively.
|
57
|
+
config.filter_run_when_matching :focus
|
58
|
+
|
59
|
+
# Allows RSpec to persist some state between runs in order to support
|
60
|
+
# the `--only-failures` and `--next-failure` CLI options. We recommend
|
61
|
+
# you configure your source control system to ignore this file.
|
62
|
+
config.example_status_persistence_file_path = "spec/examples.txt"
|
63
|
+
|
64
|
+
# Limits the available syntax to the non-monkey patched syntax that is
|
65
|
+
# recommended. For more details, see:
|
66
|
+
# - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
|
67
|
+
# - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
|
68
|
+
# - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
|
69
|
+
config.disable_monkey_patching!
|
70
|
+
|
71
|
+
# This setting enables warnings. It's recommended, but in some cases may
|
72
|
+
# be too noisy due to issues in dependencies.
|
73
|
+
config.warnings = true
|
74
|
+
|
75
|
+
# Many RSpec users commonly either run the entire suite or an individual
|
76
|
+
# file, and it's useful to allow more verbose output when running an
|
77
|
+
# individual spec file.
|
78
|
+
if config.files_to_run.one?
|
79
|
+
# Use the documentation formatter for detailed output,
|
80
|
+
# unless a formatter has already been configured
|
81
|
+
# (e.g. via a command-line flag).
|
82
|
+
config.default_formatter = "doc"
|
83
|
+
end
|
84
|
+
|
85
|
+
# Print the 10 slowest examples and example groups at the
|
86
|
+
# end of the spec run, to help surface which specs are running
|
87
|
+
# particularly slow.
|
88
|
+
config.profile_examples = 10
|
89
|
+
|
90
|
+
# Run specs in random order to surface order dependencies. If you find an
|
91
|
+
# order dependency and want to debug it, you can fix the order by providing
|
92
|
+
# the seed, which is printed after each run.
|
93
|
+
# --seed 1234
|
94
|
+
config.order = :random
|
95
|
+
|
96
|
+
# Seed global randomization in this process using the `--seed` CLI option.
|
97
|
+
# Setting this allows you to use `--seed` to deterministically reproduce
|
98
|
+
# test failures related to randomization by passing the same `--seed` value
|
99
|
+
# as the one that triggered the failure.
|
100
|
+
Kernel.srand config.seed
|
101
|
+
=end
|
102
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
RSpec.describe Twkorean do
|
2
|
+
let(:twkorean) { Twkorean::TwitterKoreanText.new }
|
3
|
+
let(:text) { twkorean.normalize("한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어") }
|
4
|
+
let(:tokens) { twkorean.tokenize(text) }
|
5
|
+
|
6
|
+
context "Tokenize" do
|
7
|
+
it "should split a sentence to list of strings " do
|
8
|
+
expect(twkorean.tokens_to_string_list(tokens)).to eq(
|
9
|
+
["한국어", "를", "처리", "하는", "예시", "입니다", "ㅋㅋㅋ", "#한국어"]
|
10
|
+
)
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should split a sentence to list of tokens" do
|
14
|
+
expect(twkorean.tokens_to_token_list(tokens)).to eq([
|
15
|
+
["한국어(Noun: 0, 3)", "한국어", "Noun", nil, "0", "3"],
|
16
|
+
["를(Josa: 3, 1)", "를", "Josa", nil, "3", "1"],
|
17
|
+
["처리(Noun: 5, 2)", "처리", "Noun", nil, "5", "2"],
|
18
|
+
["하는(Verb(하다): 7, 2)", "하는", "Verb", "(하다)", "7", "2"],
|
19
|
+
["예시(Noun: 10, 2)", "예시", "Noun", nil, "10", "2"],
|
20
|
+
["입니다(Adjective(이다): 12, 3)", "입니다", "Adjective", "(이다)", "12", "3"],
|
21
|
+
["ㅋㅋㅋ(KoreanParticle: 15, 3)", "ㅋㅋㅋ", "KoreanParticle", nil, "15", "3"],
|
22
|
+
["#한국어(Hashtag: 19, 4)", "#한국어", "Hashtag", nil, "19", "4"]
|
23
|
+
])
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
context "Phrase extraction" do
|
28
|
+
it "should extract phrases from a sentence" do
|
29
|
+
expect(twkorean.extract_phrases(tokens)).to eq(
|
30
|
+
[
|
31
|
+
"한국어(Noun: 0, 3)",
|
32
|
+
"처리(Noun: 5, 2)",
|
33
|
+
"처리하는 예시(Noun: 5, 7)",
|
34
|
+
"예시(Noun: 10, 2)",
|
35
|
+
"#한국어(Hashtag: 19, 4)"
|
36
|
+
]
|
37
|
+
)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
data/twkorean.gemspec
CHANGED
@@ -18,8 +18,9 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
+
spec.add_dependency "rjb"
|
21
22
|
spec.add_development_dependency "bundler", "~> 1.6"
|
22
23
|
spec.add_development_dependency "rake"
|
23
|
-
spec.add_development_dependency "
|
24
|
-
spec.add_development_dependency
|
24
|
+
spec.add_development_dependency "rspec"
|
25
|
+
spec.add_development_dependency "pry"
|
25
26
|
end
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twkorean
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- JunSangPil
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-07-
|
11
|
+
date: 2017-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rjb
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: bundler
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,7 +53,7 @@ dependencies:
|
|
39
53
|
- !ruby/object:Gem::Version
|
40
54
|
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
56
|
+
name: rspec
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - ">="
|
@@ -53,7 +67,7 @@ dependencies:
|
|
53
67
|
- !ruby/object:Gem::Version
|
54
68
|
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
70
|
+
name: pry
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
73
|
- - ">="
|
@@ -74,18 +88,19 @@ extensions: []
|
|
74
88
|
extra_rdoc_files: []
|
75
89
|
files:
|
76
90
|
- ".gitignore"
|
91
|
+
- ".rspec"
|
77
92
|
- Gemfile
|
78
93
|
- LICENSE.txt
|
79
94
|
- README.md
|
80
95
|
- Rakefile
|
81
|
-
- lib/jars/korean-text-
|
82
|
-
- lib/jars/scala-library-2.
|
83
|
-
- lib/jars/twitter-text-1.
|
96
|
+
- lib/jars/open-korean-text-2.0.6-SNAPSHOT.jar
|
97
|
+
- lib/jars/scala-library-2.12.0.jar
|
98
|
+
- lib/jars/twitter-text-1.14.3.jar
|
84
99
|
- lib/twkorean.rb
|
85
100
|
- lib/twkorean/twitter_korean_text.rb
|
86
101
|
- lib/twkorean/version.rb
|
87
|
-
-
|
88
|
-
-
|
102
|
+
- spec/spec_helper.rb
|
103
|
+
- spec/twkorean_spec.rb
|
89
104
|
- twkorean.gemspec
|
90
105
|
homepage: https://github.com/jun85664396/twkorean-ruby
|
91
106
|
licenses:
|
@@ -112,5 +127,5 @@ signing_key:
|
|
112
127
|
specification_version: 4
|
113
128
|
summary: Ruby interface to twitter-korean-text
|
114
129
|
test_files:
|
115
|
-
-
|
116
|
-
-
|
130
|
+
- spec/spec_helper.rb
|
131
|
+
- spec/twkorean_spec.rb
|
Binary file
|
Binary file
|
Binary file
|
data/test/test_helper.rb
DELETED
data/test/twkorean.rb
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
# @name twkorean-ruby
|
2
|
-
# @author JunSangPil
|
3
|
-
# @version 0.0.4
|
4
|
-
# @url https://github.com/jun85664396/twkorean-ruby
|
5
|
-
# @license Apache License 2.0
|
6
|
-
require_relative 'test_helper'
|
7
|
-
require 'twkorean'
|
8
|
-
|
9
|
-
describe "Twkorean" do
|
10
|
-
text = "한국어를 처리하는 예시입니닼ㅋㅋㅋㅋㅋ #한국어"
|
11
|
-
twkorean = Twkorean::TwitterKoreanText.new
|
12
|
-
text = twkorean.normalize(text)
|
13
|
-
tokens = twkorean.tokenize(text)
|
14
|
-
|
15
|
-
it "Tokenize" do
|
16
|
-
p "#Tokenize"
|
17
|
-
p twkorean.tokens_to_string_list(tokens)
|
18
|
-
# ["한국어", "를", "처리", "하는", "예시", "입니", "다", "ㅋㅋ", "#한국어"]
|
19
|
-
p twkorean.tokens_to_token_list(tokens)
|
20
|
-
# ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하는(Verb: 7, 2)", "예시(Noun: 10, 2)", "입니(Adjective: 12, 2)", "다(Eomi: 14, 1)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
|
21
|
-
end
|
22
|
-
|
23
|
-
it "Stemming" do
|
24
|
-
p "#Stemming"
|
25
|
-
stem = twkorean.stem(tokens)
|
26
|
-
p twkorean.tokens_to_string_list(stem)
|
27
|
-
# ["한국어", "를", "처리", "하다", "예시", "이다", "ㅋㅋ", "#한국어"]
|
28
|
-
p twkorean.tokens_to_token_list(stem)
|
29
|
-
# ["한국어(Noun: 0, 3)", "를(Josa: 3, 1)", "처리(Noun: 5, 2)", "하다(Verb: 7, 2)", "예시(Noun: 10, 2)", "이다(Adjective: 12, 3)", "ㅋㅋ(KoreanParticle: 15, 2)", "#한국어(Hashtag: 18, 4)"]
|
30
|
-
end
|
31
|
-
|
32
|
-
it "Phrase extraction" do
|
33
|
-
p "Phrase extraction"
|
34
|
-
p twkorean.extract_phrases(tokens)
|
35
|
-
# ["한국어(Noun: 0, 3)", "처리(Noun: 5, 2)", "처리하는 예시(Noun: 5, 7)", "예시(Noun: 10, 2)", "#한국어(Hashtag: 18, 4)"]
|
36
|
-
end
|
37
|
-
|
38
|
-
end
|