suika 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rspec +3 -0
- data/.rubocop.yml +118 -0
- data/.travis.yml +6 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +9 -0
- data/LICENSE.txt +27 -0
- data/NOTICE.txt +79 -0
- data/README.md +77 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/dict/ipadic.gz +0 -0
- data/lib/suika.rb +6 -0
- data/lib/suika/char_def.rb +201 -0
- data/lib/suika/lattice.rb +29 -0
- data/lib/suika/tagger.rb +120 -0
- data/lib/suika/version.rb +7 -0
- data/suika.gemspec +32 -0
- metadata +80 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 41135bca7252d6e52d8c13c52ca08446ebe79ffe388f585a1739976e5b26ad4f
|
4
|
+
data.tar.gz: 22e021383f5169c264392cf4e1c4cd185319f810d86c2c8e85000b80ac493ccd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 534e3c4c67612fcf0c052057dbb85b649cba037517c9c500c2a3e20e8f0cae97e734b71fcce7ff47ba1a55d4b99fd5fcdab0be26f14addddc3728c784d1e8966
|
7
|
+
data.tar.gz: 8f929702297098e8c3439170323205e9c58f6f5f9735192604bd08566f4f20c8ae1d603185126636097846df45e45c6e7141380fea3e2b449bea3e6c73bd983d
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
require:
|
2
|
+
- rubocop-performance
|
3
|
+
- rubocop-rspec
|
4
|
+
|
5
|
+
AllCops:
|
6
|
+
TargetRubyVersion: 2.4
|
7
|
+
DisplayCopNames: true
|
8
|
+
DisplayStyleGuide: true
|
9
|
+
Exclude:
|
10
|
+
- 'bin/*'
|
11
|
+
- 'suika.gemspec'
|
12
|
+
- 'Rakefile'
|
13
|
+
- 'Gemfile'
|
14
|
+
|
15
|
+
Layout/EmptyLineAfterGuardClause:
|
16
|
+
Enabled: false
|
17
|
+
|
18
|
+
Layout/EmptyLinesAroundAttributeAccessor:
|
19
|
+
Enabled: true
|
20
|
+
|
21
|
+
Layout/LineLength:
|
22
|
+
Max: 145
|
23
|
+
IgnoredPatterns: ['(\A|\s)#']
|
24
|
+
|
25
|
+
Layout/SpaceAroundMethodCallOperator:
|
26
|
+
Enabled: true
|
27
|
+
|
28
|
+
Lint/DeprecatedOpenSSLConstant:
|
29
|
+
Enabled: true
|
30
|
+
|
31
|
+
Lint/MixedRegexpCaptureTypes:
|
32
|
+
Enabled: true
|
33
|
+
|
34
|
+
Lint/RaiseException:
|
35
|
+
Enabled: true
|
36
|
+
|
37
|
+
Lint/StructNewOverride:
|
38
|
+
Enabled: true
|
39
|
+
|
40
|
+
Metrics/ModuleLength:
|
41
|
+
Max: 200
|
42
|
+
|
43
|
+
Metrics/ClassLength:
|
44
|
+
Max: 200
|
45
|
+
|
46
|
+
Metrics/MethodLength:
|
47
|
+
Max: 50
|
48
|
+
|
49
|
+
Metrics/AbcSize:
|
50
|
+
Max: 60
|
51
|
+
|
52
|
+
Metrics/CyclomaticComplexity:
|
53
|
+
Max: 16
|
54
|
+
|
55
|
+
Metrics/PerceivedComplexity:
|
56
|
+
Max: 16
|
57
|
+
|
58
|
+
Metrics/BlockLength:
|
59
|
+
Max: 40
|
60
|
+
Exclude:
|
61
|
+
- 'spec/**/*'
|
62
|
+
|
63
|
+
Metrics/ParameterLists:
|
64
|
+
Max: 12
|
65
|
+
|
66
|
+
Naming/MethodParameterName:
|
67
|
+
Enabled: false
|
68
|
+
|
69
|
+
Naming/ConstantName:
|
70
|
+
Enabled: false
|
71
|
+
|
72
|
+
Security/MarshalLoad:
|
73
|
+
Enabled: false
|
74
|
+
|
75
|
+
Style/AsciiComments:
|
76
|
+
Enabled: false
|
77
|
+
|
78
|
+
Style/Documentation:
|
79
|
+
Enabled: false
|
80
|
+
|
81
|
+
Style/ExponentialNotation:
|
82
|
+
Enabled: true
|
83
|
+
|
84
|
+
Style/HashEachMethods:
|
85
|
+
Enabled: true
|
86
|
+
|
87
|
+
Style/HashTransformKeys:
|
88
|
+
Enabled: true
|
89
|
+
|
90
|
+
Style/HashTransformValues:
|
91
|
+
Enabled: true
|
92
|
+
|
93
|
+
Style/RedundantRegexpCharacterClass:
|
94
|
+
Enabled: true
|
95
|
+
|
96
|
+
Style/RedundantRegexpEscape:
|
97
|
+
Enabled: true
|
98
|
+
|
99
|
+
Style/SlicingWithRange:
|
100
|
+
Enabled: true
|
101
|
+
|
102
|
+
Style/FormatStringToken:
|
103
|
+
Enabled: false
|
104
|
+
|
105
|
+
Style/NumericLiterals:
|
106
|
+
Enabled: false
|
107
|
+
|
108
|
+
RSpec/MultipleExpectations:
|
109
|
+
Enabled: false
|
110
|
+
|
111
|
+
RSpec/ExampleLength:
|
112
|
+
Max: 40
|
113
|
+
|
114
|
+
RSpec/InstanceVariable:
|
115
|
+
Enabled: false
|
116
|
+
|
117
|
+
RSpec/LeakyConstantDeclaration:
|
118
|
+
Enabled: false
|
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# Contributor Covenant Code of Conduct
|
2
|
+
|
3
|
+
## Our Pledge
|
4
|
+
|
5
|
+
In the interest of fostering an open and welcoming environment, we as
|
6
|
+
contributors and maintainers pledge to making participation in our project and
|
7
|
+
our community a harassment-free experience for everyone, regardless of age, body
|
8
|
+
size, disability, ethnicity, gender identity and expression, level of experience,
|
9
|
+
nationality, personal appearance, race, religion, or sexual identity and
|
10
|
+
orientation.
|
11
|
+
|
12
|
+
## Our Standards
|
13
|
+
|
14
|
+
Examples of behavior that contributes to creating a positive environment
|
15
|
+
include:
|
16
|
+
|
17
|
+
* Using welcoming and inclusive language
|
18
|
+
* Being respectful of differing viewpoints and experiences
|
19
|
+
* Gracefully accepting constructive criticism
|
20
|
+
* Focusing on what is best for the community
|
21
|
+
* Showing empathy towards other community members
|
22
|
+
|
23
|
+
Examples of unacceptable behavior by participants include:
|
24
|
+
|
25
|
+
* The use of sexualized language or imagery and unwelcome sexual attention or
|
26
|
+
advances
|
27
|
+
* Trolling, insulting/derogatory comments, and personal or political attacks
|
28
|
+
* Public or private harassment
|
29
|
+
* Publishing others' private information, such as a physical or electronic
|
30
|
+
address, without explicit permission
|
31
|
+
* Other conduct which could reasonably be considered inappropriate in a
|
32
|
+
professional setting
|
33
|
+
|
34
|
+
## Our Responsibilities
|
35
|
+
|
36
|
+
Project maintainers are responsible for clarifying the standards of acceptable
|
37
|
+
behavior and are expected to take appropriate and fair corrective action in
|
38
|
+
response to any instances of unacceptable behavior.
|
39
|
+
|
40
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
41
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
42
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
43
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
44
|
+
threatening, offensive, or harmful.
|
45
|
+
|
46
|
+
## Scope
|
47
|
+
|
48
|
+
This Code of Conduct applies both within project spaces and in public spaces
|
49
|
+
when an individual is representing the project or its community. Examples of
|
50
|
+
representing a project or community include using an official project e-mail
|
51
|
+
address, posting via an official social media account, or acting as an appointed
|
52
|
+
representative at an online or offline event. Representation of a project may be
|
53
|
+
further defined and clarified by project maintainers.
|
54
|
+
|
55
|
+
## Enforcement
|
56
|
+
|
57
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
58
|
+
reported by contacting the project team at yoshoku@outlook.com. All
|
59
|
+
complaints will be reviewed and investigated and will result in a response that
|
60
|
+
is deemed necessary and appropriate to the circumstances. The project team is
|
61
|
+
obligated to maintain confidentiality with regard to the reporter of an incident.
|
62
|
+
Further details of specific enforcement policies may be posted separately.
|
63
|
+
|
64
|
+
Project maintainers who do not follow or enforce the Code of Conduct in good
|
65
|
+
faith may face temporary or permanent repercussions as determined by other
|
66
|
+
members of the project's leadership.
|
67
|
+
|
68
|
+
## Attribution
|
69
|
+
|
70
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
71
|
+
available at [https://contributor-covenant.org/version/1/4][version]
|
72
|
+
|
73
|
+
[homepage]: https://contributor-covenant.org
|
74
|
+
[version]: https://contributor-covenant.org/version/1/4/
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
Copyright (c) 2020 Atsushi Tatsuma
|
2
|
+
All rights reserved.
|
3
|
+
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
5
|
+
modification, are permitted provided that the following conditions are met:
|
6
|
+
|
7
|
+
* Redistributions of source code must retain the above copyright notice, this
|
8
|
+
list of conditions and the following disclaimer.
|
9
|
+
|
10
|
+
* Redistributions in binary form must reproduce the above copyright notice,
|
11
|
+
this list of conditions and the following disclaimer in the documentation
|
12
|
+
and/or other materials provided with the distribution.
|
13
|
+
|
14
|
+
* Neither the name of the copyright holder nor the names of its
|
15
|
+
contributors may be used to endorse or promote products derived from
|
16
|
+
this software without specific prior written permission.
|
17
|
+
|
18
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
19
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
20
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
21
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
22
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
23
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
24
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
25
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
26
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
27
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/NOTICE.txt
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
Suika includes source code and binary data which are generated from IPAdic.
|
2
|
+
The license of IPAdic is found in the following file.
|
3
|
+
|
4
|
+
mecab-ipadic-2.7.0-20070801
|
5
|
+
https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM
|
6
|
+
|
7
|
+
---
|
8
|
+
IPAdic is licensed as follows:
|
9
|
+
|
10
|
+
Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
|
11
|
+
and Technology. All Rights Reserved.
|
12
|
+
|
13
|
+
Use, reproduction, and distribution of this software is permitted.
|
14
|
+
Any copy of this software, whether in its original form or modified,
|
15
|
+
must include both the above copyright notice and the following
|
16
|
+
paragraphs.
|
17
|
+
|
18
|
+
Nara Institute of Science and Technology (NAIST),
|
19
|
+
the copyright holders, disclaims all warranties with regard to this
|
20
|
+
software, including all implied warranties of merchantability and
|
21
|
+
fitness, in no event shall NAIST be liable for
|
22
|
+
any special, indirect or consequential damages or any damages
|
23
|
+
whatsoever resulting from loss of use, data or profits, whether in an
|
24
|
+
action of contract, negligence or other tortuous action, arising out
|
25
|
+
of or in connection with the use or performance of this software.
|
26
|
+
|
27
|
+
A large portion of the dictionary entries
|
28
|
+
originate from ICOT Free Software. The following conditions for ICOT
|
29
|
+
Free Software applies to the current dictionary as well.
|
30
|
+
|
31
|
+
Each User may also freely distribute the Program, whether in its
|
32
|
+
original form or modified, to any third party or parties, PROVIDED
|
33
|
+
that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
|
34
|
+
on, or be attached to, the Program, which is distributed substantially
|
35
|
+
in the same form as set out herein and that such intended
|
36
|
+
distribution, if actually made, will neither violate or otherwise
|
37
|
+
contravene any of the laws and regulations of the countries having
|
38
|
+
jurisdiction over the User or the intended distribution itself.
|
39
|
+
|
40
|
+
NO WARRANTY
|
41
|
+
|
42
|
+
The program was produced on an experimental basis in the course of the
|
43
|
+
research and development conducted during the project and is provided
|
44
|
+
to users as so produced on an experimental basis. Accordingly, the
|
45
|
+
program is provided without any warranty whatsoever, whether express,
|
46
|
+
implied, statutory or otherwise. The term "warranty" used herein
|
47
|
+
includes, but is not limited to, any warranty of the quality,
|
48
|
+
performance, merchantability and fitness for a particular purpose of
|
49
|
+
the program and the nonexistence of any infringement or violation of
|
50
|
+
any right of any third party.
|
51
|
+
|
52
|
+
Each user of the program will agree and understand, and be deemed to
|
53
|
+
have agreed and understood, that there is no warranty whatsoever for
|
54
|
+
the program and, accordingly, the entire risk arising from or
|
55
|
+
otherwise connected with the program is assumed by the user.
|
56
|
+
|
57
|
+
Therefore, neither ICOT, the copyright holder, or any other
|
58
|
+
organization that participated in or was otherwise related to the
|
59
|
+
development of the program and their respective officials, directors,
|
60
|
+
officers and other employees shall be held liable for any and all
|
61
|
+
damages, including, without limitation, general, special, incidental
|
62
|
+
and consequential damages, arising out of or otherwise in connection
|
63
|
+
with the use or inability to use the program or any product, material
|
64
|
+
or result produced or otherwise obtained by using the program,
|
65
|
+
regardless of whether they have been advised of, or otherwise had
|
66
|
+
knowledge of, the possibility of such damages at any time during the
|
67
|
+
project or thereafter. Each user will be deemed to have agreed to the
|
68
|
+
foregoing by his or her commencement of use of the program. The term
|
69
|
+
"use" as used herein includes, but is not limited to, the use,
|
70
|
+
modification, copying and distribution of the program and the
|
71
|
+
production of secondary products from the program.
|
72
|
+
|
73
|
+
In the case where the program, whether in its original form or
|
74
|
+
modified, was distributed or delivered to or received by a user from
|
75
|
+
any person, organization or entity other than ICOT, unless it makes or
|
76
|
+
grants independently of ICOT any specific warranty to the user in
|
77
|
+
writing, such person, organization or entity, will also be exempted
|
78
|
+
from and not be held liable to the user for any such damages as noted
|
79
|
+
above as far as the program is concerned.
|
data/README.md
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
# Suika
|
2
|
+
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/suika.svg)](https://badge.fury.io/rb/suika)
|
4
|
+
[![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
|
5
|
+
[![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://rubydoc.info/gems/suika)
|
6
|
+
|
7
|
+
Suika 🍉 is a Japanese morphological analyzer written in pure Ruby.
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
```ruby
|
14
|
+
gem 'suika'
|
15
|
+
```
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle install
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install suika
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
require 'suika'
|
29
|
+
|
30
|
+
tagger = Suika::Tagger.new
|
31
|
+
tagger.parse('すもももももももものうち').each { |token| puts token }
|
32
|
+
|
33
|
+
# すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
|
34
|
+
# も 助詞, 係助詞, *, *, *, *, も, モ, モ
|
35
|
+
# もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
|
36
|
+
# も 助詞, 係助詞, *, *, *, *, も, モ, モ
|
37
|
+
# もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
|
38
|
+
# の 助詞, 連体化, *, *, *, *, の, ノ, ノ
|
39
|
+
# うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
|
40
|
+
```
|
41
|
+
|
42
|
+
Since the Tagger class loads the binary dictionary at initialization, it is recommended to reuse the instance.
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
tagger = Suika::Tagger.new
|
46
|
+
|
47
|
+
sentences.each do |sentence|
|
48
|
+
result = tagger.parse(sentence)
|
49
|
+
|
50
|
+
# ...
|
51
|
+
end
|
52
|
+
```
|
53
|
+
|
54
|
+
## Contributing
|
55
|
+
|
56
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/suika.
|
57
|
+
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/master/CODE_OF_CONDUCT.md).
|
58
|
+
|
59
|
+
## License
|
60
|
+
|
61
|
+
The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
|
62
|
+
In addition, the gem includes binary data generated from mecab-ipadic.
|
63
|
+
The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
|
64
|
+
and [NOTICE.txt](https://github.com/yoshoku/suika/blob/master/NOTICE.txt).
|
65
|
+
|
66
|
+
## Respect
|
67
|
+
|
68
|
+
- [Taku Kudo](https://github.com/taku910) is the author of [MeCab](https://taku910.github.io/mecab/), the most famous morphological analyzer in Japan.
|
69
|
+
MeCab is one of the greatest pieces of software in natural language processing.
|
70
|
+
Suika is created with reference to [the book on morphological analysis](https://www.kindaikagaku.co.jp/information/kd0577.htm) written by Dr. Kudo.
|
71
|
+
- [Tomoko Uchida](https://github.com/mocobeta) is the author of [Janome](https://github.com/mocobeta/janome), a Japanese morphological analysis engine written in pure Python.
|
72
|
+
Suika is heavily influenced by Janome's idea to include the built-in dictionary and language model.
|
73
|
+
Janome, a morphological analyzer written in a scripting language, gave me the courage to develop Suika.
|
74
|
+
|
75
|
+
## Code of Conduct
|
76
|
+
|
77
|
+
Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/master/CODE_OF_CONDUCT.md).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "suika"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/dict/ipadic.gz
ADDED
Binary file
|
data/lib/suika.rb
ADDED
data/lib/suika/char_def.rb
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Suika
|
4
|
+
# @!visibility private
|
5
|
+
class CharDef
|
6
|
+
# @!visibility private
|
7
|
+
# Determine the character type of the first code point of +ch+.
#
# The types listed in CHAR_TYPES are tried in order and the first one whose
# code point ranges include the character wins.
#
# @param ch [String] the character to classify (only its first code point is inspected).
# @return [String] the matching name from CHAR_TYPES, or 'DEFAULT' when no range matches.
def self.char_type(ch)
  code = ch.unpack1('U*')
  # Look the range table up on this class itself. The previous
  # Object.const_get("CharDef::...") searched from the top level, where
  # CharDef (nested inside Suika) is not defined, and raised NameError.
  matched = CHAR_TYPES.find do |ctype|
    const_get(ctype).any? { |r| r.include?(code) }
  end
  # CHAR_CATEGORY has a 'DEFAULT' entry; use it for unclassified characters
  # instead of returning nil.
  matched || 'DEFAULT'
end
|
13
|
+
|
14
|
+
# @!visibility private
|
15
|
+
# Fetch the CHAR_CATEGORY entry (:invoke, :group and :length settings)
# registered for the character type of +ch+.
#
# @param ch [String] the character to look up.
# @return [Hash, nil] the category settings, or nil when the type has no entry.
def self.char_category(ch)
  ctype = char_type(ch)
  CHAR_CATEGORY[ctype]
end
|
18
|
+
|
19
|
+
CHAR_CATEGORY = {
|
20
|
+
'DEFAULT' => {
|
21
|
+
invoke: 0, group: 1, length: 0
|
22
|
+
},
|
23
|
+
'SPACE' => {
|
24
|
+
invoke: 0, group: 1, length: 0
|
25
|
+
},
|
26
|
+
'KANJI' => {
|
27
|
+
invoke: 0, group: 0, length: 2
|
28
|
+
},
|
29
|
+
'SYMBOL' => {
|
30
|
+
invoke: 1, group: 1, length: 0
|
31
|
+
},
|
32
|
+
'NUMERIC' => {
|
33
|
+
invoke: 1, group: 1, length: 0
|
34
|
+
},
|
35
|
+
'ALPHA' => {
|
36
|
+
invoke: 1, group: 1, length: 0
|
37
|
+
},
|
38
|
+
'HIRAGANA' => {
|
39
|
+
invoke: 0, group: 1, length: 2
|
40
|
+
},
|
41
|
+
'KATAKANA' => {
|
42
|
+
invoke: 1, group: 1, length: 2
|
43
|
+
},
|
44
|
+
'KANJINUMERIC' => {
|
45
|
+
invoke: 1, group: 1, length: 0
|
46
|
+
},
|
47
|
+
'GREEK' => {
|
48
|
+
invoke: 1, group: 1, length: 0
|
49
|
+
},
|
50
|
+
'CYRILLIC' => {
|
51
|
+
invoke: 1, group: 1, length: 0
|
52
|
+
}
|
53
|
+
}.freeze
|
54
|
+
|
55
|
+
CHAR_TYPES = %w[
|
56
|
+
SPACE
|
57
|
+
NUMERIC
|
58
|
+
SYMBOL
|
59
|
+
ALPHA
|
60
|
+
CYRILLIC
|
61
|
+
GREEK
|
62
|
+
HIRAGANA
|
63
|
+
KATAKANA
|
64
|
+
KANJI
|
65
|
+
KANJINUMERIC
|
66
|
+
].freeze
|
67
|
+
|
68
|
+
SPACE = [
|
69
|
+
0x0020..0x0020,
|
70
|
+
0x00D0..0x00D0,
|
71
|
+
0x0009..0x0009,
|
72
|
+
0x000B..0x000B,
|
73
|
+
0x000A..0x000A
|
74
|
+
].freeze
|
75
|
+
|
76
|
+
NUMERIC = [
|
77
|
+
0x0030..0x0039, # ASCII
|
78
|
+
0xFF10..0xFF19, # ZENKAKU
|
79
|
+
# OTHER SYMBOLS
|
80
|
+
0x2070..0x209F, # Superscripts and Subscripts
|
81
|
+
0x2150..0x218F # Number forms
|
82
|
+
].freeze
|
83
|
+
|
84
|
+
SYMBOL = [
|
85
|
+
# ASCII
|
86
|
+
0x0021..0x002F,
|
87
|
+
0x003A..0x0040,
|
88
|
+
0x005B..0x0060,
|
89
|
+
0x007B..0x007E,
|
90
|
+
# Latin 1
|
91
|
+
0x00A1..0x00BF,
|
92
|
+
# ZENKAKU
|
93
|
+
0xFF01..0xFF0F,
|
94
|
+
0xFF1A..0xFF1F,
|
95
|
+
0xFF3B..0xFF40,
|
96
|
+
0xFF5B..0xFF65,
|
97
|
+
0xFFE0..0xFFEF, # HalfWidth and Full width Form
|
98
|
+
# OTHER SYMBOLS
|
99
|
+
0x2000..0x206F, # General Punctuation
|
100
|
+
0x20A0..0x20CF, # Currency Symbols
|
101
|
+
0x20D0..0x20FF, # Combining Diacritical Marks for Symbols
|
102
|
+
0x2100..0x214F, # Letterlike Symbols
|
103
|
+
0x2100..0x214B, # Letterlike Symbols
|
104
|
+
0x2190..0x21FF, # Arrow
|
105
|
+
0x2200..0x22FF, # Mathematical Operators
|
106
|
+
0x2300..0x23FF, # Miscellaneous Technical
|
107
|
+
0x2460..0x24FF, # Enclosed NUMERICs
|
108
|
+
0x2501..0x257F, # Box Drawing
|
109
|
+
0x2580..0x259F, # Block Elements
|
110
|
+
0x25A0..0x25FF, # Geometric Shapes
|
111
|
+
0x2600..0x26FE, # Miscellaneous Symbols
|
112
|
+
0x2700..0x27BF, # Dingbats
|
113
|
+
0x27F0..0x27FF, # Supplemental Arrows A
|
114
|
+
0x27C0..0x27EF, # Miscellaneous Mathematical Symbols-A
|
115
|
+
0x2800..0x28FF, # Braille Patterns
|
116
|
+
0x2900..0x297F, # Supplemental Arrows B
|
117
|
+
0x2B00..0x2BFF, # Miscellaneous Symbols and Arrows
|
118
|
+
0x2A00..0x2AFF, # Supplemental Mathematical Operators
|
119
|
+
0x3300..0x33FF,
|
120
|
+
0x3200..0x32FE, # Enclosed CJK Letters and Months
|
121
|
+
0x3000..0x303F, # CJK Symbol and Punctuation
|
122
|
+
0xFE30..0xFE4F, # CJK Compatibility Forms
|
123
|
+
0xFE50..0xFE6B, # Small Form Variants
|
124
|
+
# 0x3007 SYMBOL KANJINUMERIC
|
125
|
+
0x3007..0x3007
|
126
|
+
].freeze
|
127
|
+
|
128
|
+
ALPHA = [
|
129
|
+
# ASCII
|
130
|
+
0x0041..0x005A,
|
131
|
+
0x0061..0x007A,
|
132
|
+
# Latin
|
133
|
+
0x00C0..0x00FF, # Latin 1
|
134
|
+
0x0100..0x017F, # Latin Extended A
|
135
|
+
0x0180..0x0236, # Latin Extended B
|
136
|
+
0x1E00..0x1EF9, # Latin Extended Additional
|
137
|
+
0xFF21..0xFF3A, # ZENKAKU
|
138
|
+
0xFF41..0xFF5A # ZENKAKU
|
139
|
+
].freeze
|
140
|
+
|
141
|
+
# CYRILLIC
|
142
|
+
CYRILLIC = [
|
143
|
+
0x0400..0x04F9,
|
144
|
+
0x0500..0x050F # Cyrillic supplementary
|
145
|
+
].freeze
|
146
|
+
|
147
|
+
# GREEK
|
148
|
+
GREEK = [0x0374..0x03FB].freeze # Greek and Coptic
|
149
|
+
|
150
|
+
# HIRAGANA
|
151
|
+
HIRAGANA = [0x3041..0x309F].freeze
|
152
|
+
|
153
|
+
# KATAKANA
|
154
|
+
KATAKANA = [
|
155
|
+
0x30A1..0x30FF,
|
156
|
+
0x31F0..0x31FF, # Small KU .. Small RO
|
157
|
+
0x30FC..0x30FC,
|
158
|
+
# Half KATAKANA
|
159
|
+
0xFF66..0xFF9D,
|
160
|
+
0xFF9E..0xFF9F
|
161
|
+
].freeze
|
162
|
+
|
163
|
+
# KANJI
|
164
|
+
KANJI = [
|
165
|
+
0x2E80..0x2EF3, # CJK Radicals Supplement
|
166
|
+
0x2F00..0x2FD5,
|
167
|
+
0x3005..0x3005,
|
168
|
+
0x3007..0x3007,
|
169
|
+
0x3400..0x4DB5, # CJK Unified Ideographs Extension
|
170
|
+
0x4E00..0x9FA5,
|
171
|
+
0xF900..0xFA2D,
|
172
|
+
0xFA30..0xFA6A
|
173
|
+
].freeze
|
174
|
+
|
175
|
+
# rubocop:disable Style/AsciiComments
|
176
|
+
# KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆)
|
177
|
+
# 0x4E00 KANJINUMERIC KANJI
|
178
|
+
KANJINUMERIC = [
|
179
|
+
0x4E00..0x4E00,
|
180
|
+
0x4E8C..0x4E8C,
|
181
|
+
0x4E09..0x4E09,
|
182
|
+
0x56DB..0x56DB,
|
183
|
+
0x4E94..0x4E94,
|
184
|
+
0x516D..0x516D,
|
185
|
+
0x4E03..0x4E03,
|
186
|
+
0x516B..0x516B,
|
187
|
+
0x4E5D..0x4E5D,
|
188
|
+
0x5341..0x5341,
|
189
|
+
0x767E..0x767E,
|
190
|
+
0x5343..0x5343,
|
191
|
+
0x4E07..0x4E07,
|
192
|
+
0x5104..0x5104,
|
193
|
+
0x5146..0x5146
|
194
|
+
].freeze
|
195
|
+
# rubocop:enable Style/AsciiComments
|
196
|
+
|
197
|
+
private_constant :CHAR_CATEGORY, :CHAR_TYPES
|
198
|
+
|
199
|
+
private_constant :ALPHA, :CYRILLIC, :GREEK, :HIRAGANA, :KANJI, :KANJINUMERIC, :KATAKANA, :NUMERIC, :SPACE, :SYMBOL
|
200
|
+
end
|
201
|
+
end
|
data/lib/suika/lattice.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Suika
  # Word lattice over a sentence: candidate nodes are indexed both by the
  # position where they begin and by the position where they end.
  # @!visibility private
  class Lattice
    # A candidate token. min_cost and min_prev are left unset here and are
    # filled in by whoever searches the lattice.
    # @!visibility private
    Node = Struct.new(:surface, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)

    attr_reader :begin_nodes, :end_nodes, :length

    # Build an empty lattice with length + 1 slots and place the BOS node at
    # the end-slot 0 and the EOS node at the begin-slot +length+.
    # @!visibility private
    def initialize(length)
      @length = length
      slot_count = length + 1
      @begin_nodes = Array.new(slot_count) { [] }
      @end_nodes = Array.new(slot_count) { [] }
      @end_nodes[0].push(boundary_node('BOS'))
      @begin_nodes[length].push(boundary_node('EOS'))
    end

    # Register one candidate node under both its begin and its end position.
    # @!visibility private
    def insert(begin_id, end_id, surface, left_id, right_id, cost, attrs)
      candidate = Node.new(
        surface: surface,
        left_id: left_id,
        right_id: right_id,
        cost: cost,
        attrs: attrs
      )
      @begin_nodes[begin_id].push(candidate)
      @end_nodes[end_id].push(candidate)
    end

    private

    # Zero-cost sentinel node with connection ids 0 and no attributes.
    def boundary_node(label)
      Node.new(surface: label, left_id: 0, right_id: 0, cost: 0, attrs: [])
    end
  end
end
|
data/lib/suika/tagger.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rambling-trie'
|
4
|
+
require 'zlib'
|
5
|
+
|
6
|
+
module Suika
|
7
|
+
# Tagger is a class that tokenizes Japanese text.
|
8
|
+
#
|
9
|
+
# @example
|
10
|
+
# require 'suika'
|
11
|
+
#
|
12
|
+
# tagger = Suika::Tagger.new
|
13
|
+
# tagger.parse('すもももももももものうち').each { |token| puts token }
|
14
|
+
#
|
15
|
+
# # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
|
16
|
+
# # も 助詞, 係助詞, *, *, *, *, も, モ, モ
|
17
|
+
# # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
|
18
|
+
# # も 助詞, 係助詞, *, *, *, *, も, モ, モ
|
19
|
+
# # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
|
20
|
+
# # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
|
21
|
+
# # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
|
22
|
+
#
|
23
|
+
class Tagger
|
24
|
+
# Create a new tagger by loading the built-in binary dictionary.
#
# The gzip-compressed file dict/ipadic.gz shipped with the gem is read and
# deserialized with Marshal; its :trie, :dictionary, :unknown_dictionary and
# :cost_matrix entries are kept on the instance.
def initialize
  dict_path = __dir__ + '/../../dict/ipadic.gz'
  serialized = Zlib::GzipReader.open(dict_path) { |gz| gz.read }
  ipadic = Marshal.load(serialized)
  @trie = ipadic[:trie]
  @dictionary = ipadic[:dictionary]
  @unknown_dictionary = ipadic[:unknown_dictionary]
  @cost_mat = ipadic[:cost_matrix]
end
|
32
|
+
|
33
|
+
# Parse the given sentence.
#
# For every start position, each dictionary word beginning there is inserted
# into a lattice. When no dictionary word begins at a position, an
# unknown-word candidate is built from the character type of that position
# instead. The lattice is then handed to #viterbi to pick the token sequence.
#
# @param sentence [String] Japanese text to be parsed.
# @return [Array<String>] one "surface\tattr, attr, ..." string per token, in sentence order.
def parse(sentence)
  lattice = Lattice.new(sentence.length)
  start = 0
  terminal = sentence.length

  while start < terminal
    word = sentence[start]
    pos = start
    is_unknown = true
    # Grow the candidate one character at a time for as long as the trie
    # reports a match (presumably a prefix match -- confirm against rambling-trie).
    while @trie.match?(word) && pos < terminal
      if @dictionary.key?(word)
        @dictionary[word].each do |el|
          lattice.insert(start, start + word.length,
                         word, el[0].to_i, el[1].to_i, el[2].to_i,
                         el[3..-1])
        end
        is_unknown = false
      end
      pos += 1
      word = sentence[start..pos]
    end

    unless is_unknown
      start += 1
      next
    end

    # Unknown word: start from the single character at this position.
    word = sentence[start]
    char_type = CharDef.char_type(sentence[start])
    char_cate = CharDef.char_category(sentence[start])
    if char_cate[:group] == 1
      # A :length of 0 means "no limit"; otherwise group at most :length
      # characters, but never read past the end of the sentence.
      unk_terminal = char_cate[:length].zero? ? terminal : [start + char_cate[:length], terminal].min
      pos = start + 1
      # Append following characters of the same type. The original code read
      # `text[t]` here, which referenced undefined locals and raised NameError.
      while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
        word << sentence[pos]
        pos += 1
      end
    end
    @unknown_dictionary[char_type].each do |el|
      lattice.insert(start, start + word.length,
                     word, el[0].to_i, el[1].to_i, el[2].to_i,
                     el[3..-1])
    end
    start += 1
  end

  viterbi(lattice)
end
|
84
|
+
|
85
|
+
private

# Initial "not yet reached" cost assigned to every node before relaxation.
# NOTE(review): evaluates to 2**62 - 1 when the native int is 4 bytes -- confirm intent.
INT_MAX = 2**(([42].pack('i').size * 16) - 2) - 1

private_constant :INT_MAX

# Find the minimum-cost path through the lattice and return its tokens.
#
# Forward pass: for every node beginning at position n, pick the cheapest
# predecessor among the nodes ending at n (accumulated cost + connection cost
# from @cost_mat + the node's own cost). Backward pass: follow min_prev links
# from EOS and collect "surface\tattrs" strings, skipping BOS/EOS.
#
# @param lattice [Lattice] lattice filled with candidate nodes.
# @return [Array<String>] token strings in sentence order.
def viterbi(lattice)
  bos = lattice.end_nodes[0].first
  bos.min_cost = 0
  bos.min_prev = nil

  0.upto(lattice.length) do |n|
    left_nodes = lattice.end_nodes[n]
    lattice.begin_nodes[n].each do |rnode|
      rnode.min_cost = INT_MAX
      rnode.min_prev = nil
      left_nodes.each do |lnode|
        candidate = lnode.min_cost + @cost_mat[lnode.right_id][rnode.left_id] + rnode.cost
        next unless candidate < rnode.min_cost

        rnode.min_cost = candidate
        rnode.min_prev = lnode
      end
    end
  end

  tokens = []
  node = lattice.begin_nodes[-1].first.min_prev
  until node.nil?
    # Walking backwards, so prepend to end up in sentence order.
    tokens.unshift("#{node.surface}\t#{node.attrs.join(', ')}") unless %w[BOS EOS].include?(node.surface)
    node = node.min_prev
  end
  tokens
end
|
119
|
+
end
|
120
|
+
end
|
data/suika.gemspec
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'lib/suika/version'
|
4
|
+
|
5
|
+
Gem::Specification.new do |spec|
  spec.name = 'suika'
  spec.version = Suika::VERSION
  spec.authors = ['yoshoku']
  spec.email = ['yoshoku@outlook.com']

  spec.summary = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
  spec.description = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
  spec.homepage = 'https://github.com/yoshoku/suika'
  spec.license = 'BSD-3-Clause'
  # The library uses Struct.new(keyword_init: true) and Array#append, both of
  # which were introduced in Ruby 2.5, so older rubies cannot load it.
  spec.required_ruby_version = Gem::Requirement.new('>= 2.5.0')

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = spec.homepage
  # Point at this project's changelog (the URL previously referenced the magro repository).
  spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/suika/blob/master/CHANGELOG.md'
  spec.metadata['documentation_uri'] = 'https://rubydoc.info/gems/suika'

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_runtime_dependency 'rambling-trie', '~> 2.1'
end
|
metadata
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: suika
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- yoshoku
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-07-04 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rambling-trie
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.1'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.1'
|
27
|
+
description: Suika is a Japanese morphological analyzer written in pure Ruby.
|
28
|
+
email:
|
29
|
+
- yoshoku@outlook.com
|
30
|
+
executables: []
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- ".gitignore"
|
35
|
+
- ".rspec"
|
36
|
+
- ".rubocop.yml"
|
37
|
+
- ".travis.yml"
|
38
|
+
- CODE_OF_CONDUCT.md
|
39
|
+
- Gemfile
|
40
|
+
- LICENSE.txt
|
41
|
+
- NOTICE.txt
|
42
|
+
- README.md
|
43
|
+
- Rakefile
|
44
|
+
- bin/console
|
45
|
+
- bin/setup
|
46
|
+
- dict/ipadic.gz
|
47
|
+
- lib/suika.rb
|
48
|
+
- lib/suika/char_def.rb
|
49
|
+
- lib/suika/lattice.rb
|
50
|
+
- lib/suika/tagger.rb
|
51
|
+
- lib/suika/version.rb
|
52
|
+
- suika.gemspec
|
53
|
+
homepage: https://github.com/yoshoku/suika
|
54
|
+
licenses:
|
55
|
+
- BSD-3-Clause
|
56
|
+
metadata:
|
57
|
+
homepage_uri: https://github.com/yoshoku/suika
|
58
|
+
source_code_uri: https://github.com/yoshoku/suika
|
59
|
+
changelog_uri: https://github.com/yoshoku/magro/blob/master/CHANGELOG.md
|
60
|
+
documentation_uri: https://rubydoc.info/gems/suika
|
61
|
+
post_install_message:
|
62
|
+
rdoc_options: []
|
63
|
+
require_paths:
|
64
|
+
- lib
|
65
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 2.3.0
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '0'
|
75
|
+
requirements: []
|
76
|
+
rubygems_version: 3.1.2
|
77
|
+
signing_key:
|
78
|
+
specification_version: 4
|
79
|
+
summary: Suika is a Japanese morphological analyzer written in pure Ruby.
|
80
|
+
test_files: []
|