suika 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 41135bca7252d6e52d8c13c52ca08446ebe79ffe388f585a1739976e5b26ad4f
4
+ data.tar.gz: 22e021383f5169c264392cf4e1c4cd185319f810d86c2c8e85000b80ac493ccd
5
+ SHA512:
6
+ metadata.gz: 534e3c4c67612fcf0c052057dbb85b649cba037517c9c500c2a3e20e8f0cae97e734b71fcce7ff47ba1a55d4b99fd5fcdab0be26f14addddc3728c784d1e8966
7
+ data.tar.gz: 8f929702297098e8c3439170323205e9c58f6f5f9735192604bd08566f4f20c8ae1d603185126636097846df45e45c6e7141380fea3e2b449bea3e6c73bd983d
@@ -0,0 +1,16 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+
13
+ *.swp
14
+ tags
15
+ .DS_Store
16
+ .ruby-version
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,118 @@
1
+ require:
2
+ - rubocop-performance
3
+ - rubocop-rspec
4
+
5
+ AllCops:
6
+ TargetRubyVersion: 2.4
7
+ DisplayCopNames: true
8
+ DisplayStyleGuide: true
9
+ Exclude:
10
+ - 'bin/*'
11
+ - 'suika.gemspec'
12
+ - 'Rakefile'
13
+ - 'Gemfile'
14
+
15
+ Layout/EmptyLineAfterGuardClause:
16
+ Enabled: false
17
+
18
+ Layout/EmptyLinesAroundAttributeAccessor:
19
+ Enabled: true
20
+
21
+ Layout/LineLength:
22
+ Max: 145
23
+ IgnoredPatterns: ['(\A|\s)#']
24
+
25
+ Layout/SpaceAroundMethodCallOperator:
26
+ Enabled: true
27
+
28
+ Lint/DeprecatedOpenSSLConstant:
29
+ Enabled: true
30
+
31
+ Lint/MixedRegexpCaptureTypes:
32
+ Enabled: true
33
+
34
+ Lint/RaiseException:
35
+ Enabled: true
36
+
37
+ Lint/StructNewOverride:
38
+ Enabled: true
39
+
40
+ Metrics/ModuleLength:
41
+ Max: 200
42
+
43
+ Metrics/ClassLength:
44
+ Max: 200
45
+
46
+ Metrics/MethodLength:
47
+ Max: 50
48
+
49
+ Metrics/AbcSize:
50
+ Max: 60
51
+
52
+ Metrics/CyclomaticComplexity:
53
+ Max: 16
54
+
55
+ Metrics/PerceivedComplexity:
56
+ Max: 16
57
+
58
+ Metrics/BlockLength:
59
+ Max: 40
60
+ Exclude:
61
+ - 'spec/**/*'
62
+
63
+ Metrics/ParameterLists:
64
+ Max: 12
65
+
66
+ Naming/MethodParameterName:
67
+ Enabled: false
68
+
69
+ Naming/ConstantName:
70
+ Enabled: false
71
+
72
+ Security/MarshalLoad:
73
+ Enabled: false
74
+
75
+ Style/AsciiComments:
76
+ Enabled: false
77
+
78
+ Style/Documentation:
79
+ Enabled: false
80
+
81
+ Style/ExponentialNotation:
82
+ Enabled: true
83
+
84
+ Style/HashEachMethods:
85
+ Enabled: true
86
+
87
+ Style/HashTransformKeys:
88
+ Enabled: true
89
+
90
+ Style/HashTransformValues:
91
+ Enabled: true
92
+
93
+ Style/RedundantRegexpCharacterClass:
94
+ Enabled: true
95
+
96
+ Style/RedundantRegexpEscape:
97
+ Enabled: true
98
+
99
+ Style/SlicingWithRange:
100
+ Enabled: true
101
+
102
+ Style/FormatStringToken:
103
+ Enabled: false
104
+
105
+ Style/NumericLiterals:
106
+ Enabled: false
107
+
108
+ RSpec/MultipleExpectations:
109
+ Enabled: false
110
+
111
+ RSpec/ExampleLength:
112
+ Max: 40
113
+
114
+ RSpec/InstanceVariable:
115
+ Enabled: false
116
+
117
+ RSpec/LeakyConstantDeclaration:
118
+ Enabled: false
@@ -0,0 +1,6 @@
1
+ ---
2
+ language: ruby
3
+ cache: bundler
4
+ rvm:
5
+ - 2.7.0
6
+ before_install: gem install bundler -v 2.1.2
@@ -0,0 +1,74 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to making participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, gender identity and expression, level of experience,
9
+ nationality, personal appearance, race, religion, or sexual identity and
10
+ orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies both within project spaces and in public spaces
49
+ when an individual is representing the project or its community. Examples of
50
+ representing a project or community include using an official project e-mail
51
+ address, posting via an official social media account, or acting as an appointed
52
+ representative at an online or offline event. Representation of a project may be
53
+ further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the project team at yoshoku@outlook.com. All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71
+ available at [https://contributor-covenant.org/version/1/4][version]
72
+
73
+ [homepage]: https://contributor-covenant.org
74
+ [version]: https://contributor-covenant.org/version/1/4/
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in suika.gemspec
6
+ gemspec
7
+
8
+ gem 'rake', '~> 12.0'
9
+ gem 'rspec', '~> 3.0'
@@ -0,0 +1,27 @@
1
+ Copyright (c) 2020 Atsushi Tatsuma
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ * Neither the name of the copyright holder nor the names of its
15
+ contributors may be used to endorse or promote products derived from
16
+ this software without specific prior written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,79 @@
1
+ Suika includes source code and binary data which are generated from IPAdic.
2
+ The license of IPAdic is found in the following file.
3
+
4
+ mecab-ipadic-2.7.0-20070801
5
+ https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM
6
+
7
+ ---
8
+ IPAdic is licensed as follows:
9
+
10
+ Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
11
+ and Technology. All Rights Reserved.
12
+
13
+ Use, reproduction, and distribution of this software is permitted.
14
+ Any copy of this software, whether in its original form or modified,
15
+ must include both the above copyright notice and the following
16
+ paragraphs.
17
+
18
+ Nara Institute of Science and Technology (NAIST),
19
+ the copyright holders, disclaims all warranties with regard to this
20
+ software, including all implied warranties of merchantability and
21
+ fitness, in no event shall NAIST be liable for
22
+ any special, indirect or consequential damages or any damages
23
+ whatsoever resulting from loss of use, data or profits, whether in an
24
+ action of contract, negligence or other tortuous action, arising out
25
+ of or in connection with the use or performance of this software.
26
+
27
+ A large portion of the dictionary entries
28
+ originate from ICOT Free Software. The following conditions for ICOT
29
+ Free Software applies to the current dictionary as well.
30
+
31
+ Each User may also freely distribute the Program, whether in its
32
+ original form or modified, to any third party or parties, PROVIDED
33
+ that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
34
+ on, or be attached to, the Program, which is distributed substantially
35
+ in the same form as set out herein and that such intended
36
+ distribution, if actually made, will neither violate or otherwise
37
+ contravene any of the laws and regulations of the countries having
38
+ jurisdiction over the User or the intended distribution itself.
39
+
40
+ NO WARRANTY
41
+
42
+ The program was produced on an experimental basis in the course of the
43
+ research and development conducted during the project and is provided
44
+ to users as so produced on an experimental basis. Accordingly, the
45
+ program is provided without any warranty whatsoever, whether express,
46
+ implied, statutory or otherwise. The term "warranty" used herein
47
+ includes, but is not limited to, any warranty of the quality,
48
+ performance, merchantability and fitness for a particular purpose of
49
+ the program and the nonexistence of any infringement or violation of
50
+ any right of any third party.
51
+
52
+ Each user of the program will agree and understand, and be deemed to
53
+ have agreed and understood, that there is no warranty whatsoever for
54
+ the program and, accordingly, the entire risk arising from or
55
+ otherwise connected with the program is assumed by the user.
56
+
57
+ Therefore, neither ICOT, the copyright holder, or any other
58
+ organization that participated in or was otherwise related to the
59
+ development of the program and their respective officials, directors,
60
+ officers and other employees shall be held liable for any and all
61
+ damages, including, without limitation, general, special, incidental
62
+ and consequential damages, arising out of or otherwise in connection
63
+ with the use or inability to use the program or any product, material
64
+ or result produced or otherwise obtained by using the program,
65
+ regardless of whether they have been advised of, or otherwise had
66
+ knowledge of, the possibility of such damages at any time during the
67
+ project or thereafter. Each user will be deemed to have agreed to the
68
+ foregoing by his or her commencement of use of the program. The term
69
+ "use" as used herein includes, but is not limited to, the use,
70
+ modification, copying and distribution of the program and the
71
+ production of secondary products from the program.
72
+
73
+ In the case where the program, whether in its original form or
74
+ modified, was distributed or delivered to or received by a user from
75
+ any person, organization or entity other than ICOT, unless it makes or
76
+ grants independently of ICOT any specific warranty to the user in
77
+ writing, such person, organization or entity, will also be exempted
78
+ from and not be held liable to the user for any such damages as noted
79
+ above as far as the program is concerned.
@@ -0,0 +1,77 @@
1
+ # Suika
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/suika.svg)](https://badge.fury.io/rb/suika)
4
+ [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
5
+ [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://rubydoc.info/gems/suika)
6
+
7
+ Suika 🍉 is a Japanese morphological analyzer written in pure Ruby.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'suika'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle install
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install suika
24
+
25
+ ## Usage
26
+
27
+ ```ruby
28
+ require 'suika'
29
+
30
+ tagger = Suika::Tagger.new
31
+ tagger.parse('すもももももももものうち').each { |token| puts token }
32
+
33
+ # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
34
+ # も 助詞, 係助詞, *, *, *, *, も, モ, モ
35
+ # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
36
+ # も 助詞, 係助詞, *, *, *, *, も, モ, モ
37
+ # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
38
+ # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
39
+ # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
40
+ ```
41
+
42
+ Since the Tagger class loads the binary dictionary at initialization, it is recommended to reuse the instance.
43
+
44
+ ```ruby
45
+ tagger = Suika::Tagger.new
46
+
47
+ sentences.each do |sentence|
48
+ result = tagger.parse(sentence)
49
+
50
+ # ...
51
+ end
52
+ ```
53
+
54
+ ## Contributing
55
+
56
+ Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/suika.
57
+ This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/master/CODE_OF_CONDUCT.md).
58
+
59
+ ## License
60
+
61
+ The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
62
+ In addition, the gem includes binary data generated from mecab-ipadic.
63
+ The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
64
+ and [NOTICE.txt](https://github.com/yoshoku/suika/blob/master/NOTICE.txt).
65
+
66
+ ## Respect
67
+
68
+ - [Taku Kudo](https://github.com/taku910) is the author of [MeCab](https://taku910.github.io/mecab/), the most famous morphological analyzer in Japan.
69
+ MeCab is one of the great pieces of software in natural language processing.
70
+ Suika is created with reference to [the book on morphological analysis](https://www.kindaikagaku.co.jp/information/kd0577.htm) written by Dr. Kudo.
71
+ - [Tomoko Uchida](https://github.com/mocobeta) is the author of [Janome](https://github.com/mocobeta/janome) that is a Japanese morphological analysis engine written in pure Python.
72
+ Suika is heavily influenced by Janome's idea to include the built-in dictionary and language model.
73
+ Janome, a morphological analyzer written in a scripting language, gave me the courage to develop Suika.
74
+
75
+ ## Code of Conduct
76
+
77
+ Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/master/CODE_OF_CONDUCT.md).
@@ -0,0 +1,6 @@
1
require 'bundler/gem_tasks'
require 'rspec/core/rake_task'

# Define the :spec task through RSpec's rake integration.
RSpec::Core::RakeTask.new(:spec)

# Running plain `rake` executes the :spec task.
task default: :spec
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Development console: loads Bundler's environment and the suika library,
# then hands control to an IRB session.
require 'bundler/setup'
require 'suika'

# Fixtures and/or initialization code can be added here to make
# experimenting with the gem easier. A different console can be used, too.

# (To use Pry instead, add pry to the Gemfile and enable the lines below.)
# require "pry"
# Pry.start

require 'irb'
IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Development bootstrap script: installs the dependencies with Bundler.

# Abort on command failure, on use of unset variables, and on a failure
# anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail
# Split words on newlines and tabs only, never on spaces.
IFS=$'\n\t'
# Print each input line as it is read and each command as it is executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
Binary file
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'suika/version'
4
+ require 'suika/char_def'
5
+ require 'suika/lattice'
6
+ require 'suika/tagger'
@@ -0,0 +1,201 @@
1
+ # frozen_string_literal: true
2
+
3
module Suika
  # @!visibility private
  #
  # Character-class table: maps a character to a type name (e.g. 'HIRAGANA')
  # by code point, and each type name to its category settings
  # (:invoke, :group, :length).
  class CharDef
    # @!visibility private
    #
    # Returns the name of the first entry of CHAR_TYPES whose code-point
    # ranges contain the first code point of +ch+, or 'DEFAULT' when no
    # listed type matches, so that the result is always a key of
    # CHAR_CATEGORY.
    #
    # @param ch [String] the character to classify.
    # @return [String] the character type name.
    def self.char_type(ch)
      code = ch.unpack1('U*')
      # Resolve the range table relative to this class. The previous
      # Object.const_get("CharDef::...") lookup started at the top level,
      # where no CharDef constant exists, and raised NameError.
      matched = CHAR_TYPES.find do |ctype|
        const_get(ctype).any? { |range| range.include?(code) }
      end
      matched || 'DEFAULT'
    end

    # @!visibility private
    #
    # Returns the category settings of the type that +ch+ belongs to.
    #
    # @param ch [String] the character to classify.
    # @return [Hash] a hash with the keys :invoke, :group and :length.
    def self.char_category(ch)
      CHAR_CATEGORY[char_type(ch)]
    end

    # Category settings per character type.
    CHAR_CATEGORY = {
      'DEFAULT' => { invoke: 0, group: 1, length: 0 },
      'SPACE' => { invoke: 0, group: 1, length: 0 },
      'KANJI' => { invoke: 0, group: 0, length: 2 },
      'SYMBOL' => { invoke: 1, group: 1, length: 0 },
      'NUMERIC' => { invoke: 1, group: 1, length: 0 },
      'ALPHA' => { invoke: 1, group: 1, length: 0 },
      'HIRAGANA' => { invoke: 0, group: 1, length: 2 },
      'KATAKANA' => { invoke: 1, group: 1, length: 2 },
      'KANJINUMERIC' => { invoke: 1, group: 1, length: 0 },
      'GREEK' => { invoke: 1, group: 1, length: 0 },
      'CYRILLIC' => { invoke: 1, group: 1, length: 0 }
    }.freeze

    # Type names in lookup order; char_type returns the first match.
    # NOTE(review): KANJI is listed before KANJINUMERIC and its
    # 0x4E00..0x9FA5 range contains every KANJINUMERIC code point, so
    # 'KANJINUMERIC' can never be returned. Confirm whether the order
    # should be swapped.
    CHAR_TYPES = %w[
      SPACE
      NUMERIC
      SYMBOL
      ALPHA
      CYRILLIC
      GREEK
      HIRAGANA
      KATAKANA
      KANJI
      KANJINUMERIC
    ].freeze

    # NOTE(review): 0x00D0 is LATIN CAPITAL LETTER ETH, not whitespace;
    # it may be a transposition of 0x000D (carriage return). Confirm
    # against the dictionary's character definition before changing it.
    SPACE = [
      0x0020..0x0020,
      0x00D0..0x00D0,
      0x0009..0x0009,
      0x000B..0x000B,
      0x000A..0x000A
    ].freeze

    NUMERIC = [
      0x0030..0x0039, # ASCII
      0xFF10..0xFF19, # ZENKAKU
      # OTHER SYMBOLS
      0x2070..0x209F, # Superscripts and Subscripts
      0x2150..0x218F  # Number Forms
    ].freeze

    SYMBOL = [
      # ASCII
      0x0021..0x002F,
      0x003A..0x0040,
      0x005B..0x0060,
      0x007B..0x007E,
      # Latin 1
      0x00A1..0x00BF,
      # ZENKAKU
      0xFF01..0xFF0F,
      0xFF1A..0xFF1F,
      0xFF3B..0xFF40,
      0xFF5B..0xFF65,
      0xFFE0..0xFFEF, # Halfwidth and Fullwidth Forms
      # OTHER SYMBOLS
      0x2000..0x206F, # General Punctuation
      0x20A0..0x20CF, # Currency Symbols
      0x20D0..0x20FF, # Combining Diacritical Marks for Symbols
      0x2100..0x214F, # Letterlike Symbols
      0x2100..0x214B, # Letterlike Symbols
      0x2190..0x21FF, # Arrows
      0x2200..0x22FF, # Mathematical Operators
      0x2300..0x23FF, # Miscellaneous Technical
      0x2460..0x24FF, # Enclosed Alphanumerics
      0x2501..0x257F, # Box Drawing
      0x2580..0x259F, # Block Elements
      0x25A0..0x25FF, # Geometric Shapes
      0x2600..0x26FE, # Miscellaneous Symbols
      0x2700..0x27BF, # Dingbats
      0x27F0..0x27FF, # Supplemental Arrows-A
      0x27C0..0x27EF, # Miscellaneous Mathematical Symbols-A
      0x2800..0x28FF, # Braille Patterns
      0x2900..0x297F, # Supplemental Arrows-B
      0x2B00..0x2BFF, # Miscellaneous Symbols and Arrows
      0x2A00..0x2AFF, # Supplemental Mathematical Operators
      0x3300..0x33FF,
      0x3200..0x32FE, # Enclosed CJK Letters and Months
      0x3000..0x303F, # CJK Symbols and Punctuation
      0xFE30..0xFE4F, # CJK Compatibility Forms
      0xFE50..0xFE6B, # Small Form Variants
      # 0x3007 SYMBOL KANJINUMERIC
      0x3007..0x3007
    ].freeze

    ALPHA = [
      # ASCII
      0x0041..0x005A,
      0x0061..0x007A,
      # Latin
      0x00C0..0x00FF, # Latin 1
      0x0100..0x017F, # Latin Extended A
      0x0180..0x0236, # Latin Extended B
      0x1E00..0x1EF9, # Latin Extended Additional
      0xFF21..0xFF3A, # ZENKAKU
      0xFF41..0xFF5A  # ZENKAKU
    ].freeze

    # CYRILLIC
    CYRILLIC = [
      0x0400..0x04F9,
      0x0500..0x050F # Cyrillic Supplement
    ].freeze

    # GREEK
    GREEK = [0x0374..0x03FB].freeze # Greek and Coptic

    # HIRAGANA
    HIRAGANA = [0x3041..0x309F].freeze

    # KATAKANA
    KATAKANA = [
      0x30A1..0x30FF,
      0x31F0..0x31FF, # Small KU .. Small RO
      0x30FC..0x30FC,
      # Half KATAKANA
      0xFF66..0xFF9D,
      0xFF9E..0xFF9F
    ].freeze

    # KANJI
    KANJI = [
      0x2E80..0x2EF3, # CJK Radicals Supplement
      0x2F00..0x2FD5,
      0x3005..0x3005,
      0x3007..0x3007,
      0x3400..0x4DB5, # CJK Unified Ideographs Extension
      0x4E00..0x9FA5,
      0xF900..0xFA2D,
      0xFA30..0xFA6A
    ].freeze

    # rubocop:disable Style/AsciiComments
    # KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆)
    # 0x4E00 KANJINUMERIC KANJI
    KANJINUMERIC = [
      0x4E00..0x4E00,
      0x4E8C..0x4E8C,
      0x4E09..0x4E09,
      0x56DB..0x56DB,
      0x4E94..0x4E94,
      0x516D..0x516D,
      0x4E03..0x4E03,
      0x516B..0x516B,
      0x4E5D..0x4E5D,
      0x5341..0x5341,
      0x767E..0x767E,
      0x5343..0x5343,
      0x4E07..0x4E07,
      0x5104..0x5104,
      0x5146..0x5146
    ].freeze
    # rubocop:enable Style/AsciiComments

    private_constant :CHAR_CATEGORY, :CHAR_TYPES

    private_constant :ALPHA, :CYRILLIC, :GREEK, :HIRAGANA, :KANJI, :KANJINUMERIC, :KATAKANA, :NUMERIC, :SPACE, :SYMBOL
  end
end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
module Suika
  # @!visibility private
  #
  # Word lattice over a sentence of +length+ characters. Nodes are indexed
  # twice: by the position where they begin and by the position where they end.
  class Lattice
    # @!visibility private
    #
    # One lattice node. min_cost and min_prev are left unset on creation.
    Node = Struct.new(:surface, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true)

    attr_reader :begin_nodes, :end_nodes, :length

    # @!visibility private
    #
    # Builds the two position indexes (length + 1 slots each), places the
    # 'BOS' node as ending at position 0 and the 'EOS' node as beginning at
    # position +length+.
    #
    # @param length [Integer] number of characters in the sentence.
    def initialize(length)
      @length = length
      @begin_nodes, @end_nodes = Array.new(2) { Array.new(length + 1) { [] } }
      @end_nodes.first << boundary_node('BOS')
      @begin_nodes.last << boundary_node('EOS')
    end

    # @!visibility private
    #
    # Creates a node and registers it under both its begin and end position.
    #
    # @return [Array<Node>] the list of nodes ending at +end_id+.
    def insert(begin_id, end_id, surface, left_id, right_id, cost, attrs)
      created = Node.new(surface: surface, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
      @begin_nodes[begin_id] << created
      @end_nodes[end_id] << created
    end

    private

    # Sentence-boundary node: zero ids, zero cost and no attributes.
    def boundary_node(surface)
      Node.new(surface: surface, left_id: 0, right_id: 0, cost: 0, attrs: [])
    end
  end
end
@@ -0,0 +1,120 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rambling-trie'
4
+ require 'zlib'
5
+
6
module Suika
  # Tagger is a class that tokenizes Japanese text.
  #
  # @example
  #   require 'suika'
  #
  #   tagger = Suika::Tagger.new
  #   tagger.parse('すもももももももものうち').each { |token| puts token }
  #
  #   # すもも  名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
  #   # も      助詞, 係助詞, *, *, *, *, も, モ, モ
  #   # もも    名詞, 一般, *, *, *, *, もも, モモ, モモ
  #   # も      助詞, 係助詞, *, *, *, *, も, モ, モ
  #   # もも    名詞, 一般, *, *, *, *, もも, モモ, モモ
  #   # の      助詞, 連体化, *, *, *, *, の, ノ, ノ
  #   # うち    名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
  #
  class Tagger
    # Upper bound used as the initial path cost of every node.
    INT_MAX = 2**(([42].pack('i').size * 16) - 2) - 1

    private_constant :INT_MAX

    # Create a new tagger by loading the built-in binary dictionary.
    #
    # The dictionary is a gzip-compressed Marshal dump. Marshal.load must
    # only ever be given this bundled file, never data from outside the gem.
    def initialize
      ipadic = Marshal.load(Zlib::GzipReader.open(__dir__ + '/../../dict/ipadic.gz', &:read))
      @trie = ipadic[:trie]
      @dictionary = ipadic[:dictionary]
      @unknown_dictionary = ipadic[:unknown_dictionary]
      @cost_mat = ipadic[:cost_matrix]
    end

    # Parse the given sentence.
    # @param sentence [String] Japanese text to be parsed.
    # @return [Array<String>] one "surface<TAB>attributes" string per token,
    #   in sentence order; the attributes are joined with ', '.
    def parse(sentence)
      lattice = Lattice.new(sentence.length)

      sentence.length.times do |start|
        # The unknown-word handling only runs when no dictionary word
        # starts at this position.
        next if insert_known_words(lattice, sentence, start)

        insert_unknown_word(lattice, sentence, start)
      end

      viterbi(lattice)
    end

    private

    # Inserts every dictionary word that starts at +start+ into the lattice,
    # growing the candidate one character at a time while the trie still
    # matches it. Returns true when at least one word was inserted.
    def insert_known_words(lattice, sentence, start)
      terminal = sentence.length
      word = sentence[start]
      pos = start
      found = false
      while @trie.match?(word) && pos < terminal
        if @dictionary.key?(word)
          @dictionary[word].each do |el|
            lattice.insert(start, start + word.length,
                           word, el[0].to_i, el[1].to_i, el[2].to_i,
                           el[3..-1])
          end
          found = true
        end
        pos += 1
        word = sentence[start..pos]
      end
      found
    end

    # Inserts an unknown word starting at +start+. For grouping categories
    # the word is extended over the following characters of the same type,
    # up to the category's length limit (0 means no limit).
    def insert_unknown_word(lattice, sentence, start)
      terminal = sentence.length
      char_type = CharDef.char_type(sentence[start])
      char_cate = CharDef.char_category(sentence[start])
      pos = start + 1
      if char_cate[:group] == 1
        # Never look past the end of the sentence: sentence[pos] would be
        # nil there and could not be classified.
        unk_terminal = char_cate[:length].zero? ? terminal : [start + char_cate[:length], terminal].min
        # This loop previously read the undefined names `text` and `t`.
        pos += 1 while pos < unk_terminal && char_type == CharDef.char_type(sentence[pos])
      end
      word = sentence[start...pos]
      @unknown_dictionary[char_type].each do |el|
        lattice.insert(start, start + word.length,
                       word, el[0].to_i, el[1].to_i, el[2].to_i,
                       el[3..-1])
      end
    end

    # Finds the minimum-cost path from BOS to EOS and returns its tokens
    # in sentence order.
    def viterbi(lattice)
      bos = lattice.end_nodes[0].first
      bos.min_cost = 0
      bos.min_prev = nil

      # Forward pass: for each node beginning at position n, pick the
      # cheapest predecessor among the nodes ending at n.
      (lattice.length + 1).times do |n|
        lattice.begin_nodes[n].each do |rnode|
          rnode.min_cost = INT_MAX
          rnode.min_prev = nil
          lattice.end_nodes[n].each do |lnode|
            cost = lnode.min_cost + @cost_mat[lnode.right_id][rnode.left_id] + rnode.cost
            if cost < rnode.min_cost
              rnode.min_cost = cost
              rnode.min_prev = lnode
            end
          end
        end
      end

      # Backward pass: walk from EOS towards BOS. BOS is recognised by
      # identity rather than by surface, so input text that literally
      # reads "BOS" or "EOS" is kept in the result.
      eos = lattice.begin_nodes[-1].first
      res = []
      node = eos.min_prev
      until node.nil? || node.equal?(bos)
        res << "#{node.surface}\t#{node.attrs.join(', ')}"
        node = node.min_prev
      end
      res.reverse
    end
  end
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
# Suika is a morphological analyzer for Japanese text, written in pure Ruby.
module Suika
  # Version string of the Suika library in use.
  VERSION = '0.1.0'
end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/suika/version'
4
+
5
Gem::Specification.new do |spec|
  spec.name          = 'suika'
  spec.version       = Suika::VERSION
  spec.authors       = ['yoshoku']
  spec.email         = ['yoshoku@outlook.com']

  spec.summary       = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
  spec.description   = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
  spec.homepage      = 'https://github.com/yoshoku/suika'
  spec.license       = 'BSD-3-Clause'
  # The library uses String#unpack1 (Ruby 2.4) as well as
  # Struct.new(keyword_init: true) and Array#append (both Ruby 2.5),
  # so it cannot run on anything older than 2.5.
  spec.required_ruby_version = Gem::Requirement.new('>= 2.5.0')

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = spec.homepage
  # Point at this project's changelog (the previous URL referred to a
  # different repository).
  spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/suika/blob/master/CHANGELOG.md'
  spec.metadata['documentation_uri'] = 'https://rubydoc.info/gems/suika'

  # Specify which files should be added to the gem when it is released.
  # `git ls-files -z` lists the files tracked by git; test, spec and
  # features directories are left out of the package.
  spec.files         = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_runtime_dependency 'rambling-trie', '~> 2.1'
end
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: suika
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - yoshoku
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-07-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rambling-trie
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.1'
27
+ description: Suika is a Japanese morphological analyzer written in pure Ruby.
28
+ email:
29
+ - yoshoku@outlook.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - ".gitignore"
35
+ - ".rspec"
36
+ - ".rubocop.yml"
37
+ - ".travis.yml"
38
+ - CODE_OF_CONDUCT.md
39
+ - Gemfile
40
+ - LICENSE.txt
41
+ - NOTICE.txt
42
+ - README.md
43
+ - Rakefile
44
+ - bin/console
45
+ - bin/setup
46
+ - dict/ipadic.gz
47
+ - lib/suika.rb
48
+ - lib/suika/char_def.rb
49
+ - lib/suika/lattice.rb
50
+ - lib/suika/tagger.rb
51
+ - lib/suika/version.rb
52
+ - suika.gemspec
53
+ homepage: https://github.com/yoshoku/suika
54
+ licenses:
55
+ - BSD-3-Clause
56
+ metadata:
57
+ homepage_uri: https://github.com/yoshoku/suika
58
+ source_code_uri: https://github.com/yoshoku/suika
59
+ changelog_uri: https://github.com/yoshoku/magro/blob/master/CHANGELOG.md
60
+ documentation_uri: https://rubydoc.info/gems/suika
61
+ post_install_message:
62
+ rdoc_options: []
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: 2.3.0
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ requirements: []
76
+ rubygems_version: 3.1.2
77
+ signing_key:
78
+ specification_version: 4
79
+ summary: Suika is a Japanese morphological analyzer written in pure Ruby.
80
+ test_files: []