suika 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 41135bca7252d6e52d8c13c52ca08446ebe79ffe388f585a1739976e5b26ad4f
4
+ data.tar.gz: 22e021383f5169c264392cf4e1c4cd185319f810d86c2c8e85000b80ac493ccd
5
+ SHA512:
6
+ metadata.gz: 534e3c4c67612fcf0c052057dbb85b649cba037517c9c500c2a3e20e8f0cae97e734b71fcce7ff47ba1a55d4b99fd5fcdab0be26f14addddc3728c784d1e8966
7
+ data.tar.gz: 8f929702297098e8c3439170323205e9c58f6f5f9735192604bd08566f4f20c8ae1d603185126636097846df45e45c6e7141380fea3e2b449bea3e6c73bd983d
@@ -0,0 +1,16 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+
13
+ *.swp
14
+ tags
15
+ .DS_Store
16
+ .ruby-version
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,118 @@
1
+ require:
2
+ - rubocop-performance
3
+ - rubocop-rspec
4
+
5
+ AllCops:
6
+ TargetRubyVersion: 2.4
7
+ DisplayCopNames: true
8
+ DisplayStyleGuide: true
9
+ Exclude:
10
+ - 'bin/*'
11
+ - 'suika.gemspec'
12
+ - 'Rakefile'
13
+ - 'Gemfile'
14
+
15
+ Layout/EmptyLineAfterGuardClause:
16
+ Enabled: false
17
+
18
+ Layout/EmptyLinesAroundAttributeAccessor:
19
+ Enabled: true
20
+
21
+ Layout/LineLength:
22
+ Max: 145
23
+ IgnoredPatterns: ['(\A|\s)#']
24
+
25
+ Layout/SpaceAroundMethodCallOperator:
26
+ Enabled: true
27
+
28
+ Lint/DeprecatedOpenSSLConstant:
29
+ Enabled: true
30
+
31
+ Lint/MixedRegexpCaptureTypes:
32
+ Enabled: true
33
+
34
+ Lint/RaiseException:
35
+ Enabled: true
36
+
37
+ Lint/StructNewOverride:
38
+ Enabled: true
39
+
40
+ Metrics/ModuleLength:
41
+ Max: 200
42
+
43
+ Metrics/ClassLength:
44
+ Max: 200
45
+
46
+ Metrics/MethodLength:
47
+ Max: 50
48
+
49
+ Metrics/AbcSize:
50
+ Max: 60
51
+
52
+ Metrics/CyclomaticComplexity:
53
+ Max: 16
54
+
55
+ Metrics/PerceivedComplexity:
56
+ Max: 16
57
+
58
+ Metrics/BlockLength:
59
+ Max: 40
60
+ Exclude:
61
+ - 'spec/**/*'
62
+
63
+ Metrics/ParameterLists:
64
+ Max: 12
65
+
66
+ Naming/MethodParameterName:
67
+ Enabled: false
68
+
69
+ Naming/ConstantName:
70
+ Enabled: false
71
+
72
+ Security/MarshalLoad:
73
+ Enabled: false
74
+
75
+ Style/AsciiComments:
76
+ Enabled: false
77
+
78
+ Style/Documentation:
79
+ Enabled: false
80
+
81
+ Style/ExponentialNotation:
82
+ Enabled: true
83
+
84
+ Style/HashEachMethods:
85
+ Enabled: true
86
+
87
+ Style/HashTransformKeys:
88
+ Enabled: true
89
+
90
+ Style/HashTransformValues:
91
+ Enabled: true
92
+
93
+ Style/RedundantRegexpCharacterClass:
94
+ Enabled: true
95
+
96
+ Style/RedundantRegexpEscape:
97
+ Enabled: true
98
+
99
+ Style/SlicingWithRange:
100
+ Enabled: true
101
+
102
+ Style/FormatStringToken:
103
+ Enabled: false
104
+
105
+ Style/NumericLiterals:
106
+ Enabled: false
107
+
108
+ RSpec/MultipleExpectations:
109
+ Enabled: false
110
+
111
+ RSpec/ExampleLength:
112
+ Max: 40
113
+
114
+ RSpec/InstanceVariable:
115
+ Enabled: false
116
+
117
+ RSpec/LeakyConstantDeclaration:
118
+ Enabled: false
@@ -0,0 +1,6 @@
1
+ ---
2
+ language: ruby
3
+ cache: bundler
4
+ rvm:
5
+ - 2.7.0
6
+ before_install: gem install bundler -v 2.1.2
@@ -0,0 +1,74 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to making participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, gender identity and expression, level of experience,
9
+ nationality, personal appearance, race, religion, or sexual identity and
10
+ orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies both within project spaces and in public spaces
49
+ when an individual is representing the project or its community. Examples of
50
+ representing a project or community include using an official project e-mail
51
+ address, posting via an official social media account, or acting as an appointed
52
+ representative at an online or offline event. Representation of a project may be
53
+ further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the project team at yoshoku@outlook.com. All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71
+ available at [https://contributor-covenant.org/version/1/4][version]
72
+
73
+ [homepage]: https://contributor-covenant.org
74
+ [version]: https://contributor-covenant.org/version/1/4/
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ source 'https://rubygems.org'
4
+
5
+ # Specify your gem's dependencies in suika.gemspec
6
+ gemspec
7
+
8
+ gem 'rake', '~> 12.0'
9
+ gem 'rspec', '~> 3.0'
@@ -0,0 +1,27 @@
1
+ Copyright (c) 2020 Atsushi Tatsuma
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ * Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+
10
+ * Redistributions in binary form must reproduce the above copyright notice,
11
+ this list of conditions and the following disclaimer in the documentation
12
+ and/or other materials provided with the distribution.
13
+
14
+ * Neither the name of the copyright holder nor the names of its
15
+ contributors may be used to endorse or promote products derived from
16
+ this software without specific prior written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,79 @@
1
+ Suika includes source code and binary data which are generated from IPAdic.
2
+ The license of IPAdic is found in the following file.
3
+
4
+ mecab-ipadic-2.7.0-20070801
5
+ https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM
6
+
7
+ ---
8
+ IPAdic is licensed as follows:
9
+
10
+ Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
11
+ and Technology. All Rights Reserved.
12
+
13
+ Use, reproduction, and distribution of this software is permitted.
14
+ Any copy of this software, whether in its original form or modified,
15
+ must include both the above copyright notice and the following
16
+ paragraphs.
17
+
18
+ Nara Institute of Science and Technology (NAIST),
19
+ the copyright holders, disclaims all warranties with regard to this
20
+ software, including all implied warranties of merchantability and
21
+ fitness, in no event shall NAIST be liable for
22
+ any special, indirect or consequential damages or any damages
23
+ whatsoever resulting from loss of use, data or profits, whether in an
24
+ action of contract, negligence or other tortuous action, arising out
25
+ of or in connection with the use or performance of this software.
26
+
27
+ A large portion of the dictionary entries
28
+ originate from ICOT Free Software. The following conditions for ICOT
29
+ Free Software applies to the current dictionary as well.
30
+
31
+ Each User may also freely distribute the Program, whether in its
32
+ original form or modified, to any third party or parties, PROVIDED
33
+ that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
34
+ on, or be attached to, the Program, which is distributed substantially
35
+ in the same form as set out herein and that such intended
36
+ distribution, if actually made, will neither violate or otherwise
37
+ contravene any of the laws and regulations of the countries having
38
+ jurisdiction over the User or the intended distribution itself.
39
+
40
+ NO WARRANTY
41
+
42
+ The program was produced on an experimental basis in the course of the
43
+ research and development conducted during the project and is provided
44
+ to users as so produced on an experimental basis. Accordingly, the
45
+ program is provided without any warranty whatsoever, whether express,
46
+ implied, statutory or otherwise. The term "warranty" used herein
47
+ includes, but is not limited to, any warranty of the quality,
48
+ performance, merchantability and fitness for a particular purpose of
49
+ the program and the nonexistence of any infringement or violation of
50
+ any right of any third party.
51
+
52
+ Each user of the program will agree and understand, and be deemed to
53
+ have agreed and understood, that there is no warranty whatsoever for
54
+ the program and, accordingly, the entire risk arising from or
55
+ otherwise connected with the program is assumed by the user.
56
+
57
+ Therefore, neither ICOT, the copyright holder, or any other
58
+ organization that participated in or was otherwise related to the
59
+ development of the program and their respective officials, directors,
60
+ officers and other employees shall be held liable for any and all
61
+ damages, including, without limitation, general, special, incidental
62
+ and consequential damages, arising out of or otherwise in connection
63
+ with the use or inability to use the program or any product, material
64
+ or result produced or otherwise obtained by using the program,
65
+ regardless of whether they have been advised of, or otherwise had
66
+ knowledge of, the possibility of such damages at any time during the
67
+ project or thereafter. Each user will be deemed to have agreed to the
68
+ foregoing by his or her commencement of use of the program. The term
69
+ "use" as used herein includes, but is not limited to, the use,
70
+ modification, copying and distribution of the program and the
71
+ production of secondary products from the program.
72
+
73
+ In the case where the program, whether in its original form or
74
+ modified, was distributed or delivered to or received by a user from
75
+ any person, organization or entity other than ICOT, unless it makes or
76
+ grants independently of ICOT any specific warranty to the user in
77
+ writing, such person, organization or entity, will also be exempted
78
+ from and not be held liable to the user for any such damages as noted
79
+ above as far as the program is concerned.
@@ -0,0 +1,77 @@
1
+ # Suika
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/suika.svg)](https://badge.fury.io/rb/suika)
4
+ [![BSD 3-Clause License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
5
+ [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://rubydoc.info/gems/suika)
6
+
7
+ Suika 🍉 is a Japanese morphological analyzer written in pure Ruby.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'suika'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle install
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install suika
24
+
25
+ ## Usage
26
+
27
+ ```ruby
28
+ require 'suika'
29
+
30
+ tagger = Suika::Tagger.new
31
+ tagger.parse('すもももももももものうち').each { |token| puts token }
32
+
33
+ # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
34
+ # も 助詞, 係助詞, *, *, *, *, も, モ, モ
35
+ # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
36
+ # も 助詞, 係助詞, *, *, *, *, も, モ, モ
37
+ # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
38
+ # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
39
+ # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
40
+ ```
41
+
42
+ Since the Tagger class loads the binary dictionary at initialization, it is recommended to reuse the instance.
43
+
44
+ ```ruby
45
+ tagger = Suika::Tagger.new
46
+
47
+ sentences.each do |sentence|
48
+ result = tagger.parse(sentence)
49
+
50
+ # ...
51
+ end
52
+ ```
53
+
54
+ ## Contributing
55
+
56
+ Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/suika.
57
+ This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/yoshoku/suika/blob/master/CODE_OF_CONDUCT.md).
58
+
59
+ ## License
60
+
61
+ The gem is available as open source under the terms of the [BSD-3-Clause License](https://opensource.org/licenses/BSD-3-Clause).
62
+ In addition, the gem includes binary data generated from mecab-ipadic.
63
+ The details of the license can be found in [LICENSE.txt](https://github.com/yoshoku/suika/blob/master/LICENSE.txt)
64
+ and [NOTICE.txt](https://github.com/yoshoku/suika/blob/master/NOTICE.txt).
65
+
66
+ ## Respect
67
+
68
+ - [Taku Kudo](https://github.com/taku910) is the author of [MeCab](https://taku910.github.io/mecab/) that is the most famous morphological analyzer in Japan.
69
+ MeCab is one of the great pieces of software in natural language processing.
70
+ Suika is created with reference to [the book on morphological analysis](https://www.kindaikagaku.co.jp/information/kd0577.htm) written by Dr. Kudo.
71
+ - [Tomoko Uchida](https://github.com/mocobeta) is the author of [Janome](https://github.com/mocobeta/janome) that is a Japanese morphological analysis engine written in pure Python.
72
+ Suika is heavily influenced by Janome's idea to include the built-in dictionary and language model.
73
+ Janome, a morphological analyzer written in a scripting language, gave me the courage to develop Suika.
74
+
75
+ ## Code of Conduct
76
+
77
+ Everyone interacting in the Suika project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/yoshoku/suika/blob/master/CODE_OF_CONDUCT.md).
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "suika"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
Binary file
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'suika/version'
4
+ require 'suika/char_def'
5
+ require 'suika/lattice'
6
+ require 'suika/tagger'
@@ -0,0 +1,201 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Suika
4
+ # @!visibility private
5
+ class CharDef
6
+ # @!visibility private
7
+ def self.char_type(ch)
8
+ code = ch.unpack1('U*')
9
+ CHAR_TYPES.find do |ctype|
10
+ Object.const_get("CharDef::#{ctype}").any? { |r| r.include?(code) }
11
+ end
12
+ end
13
+
14
+ # @!visibility private
15
+     def self.char_category(ch) # Returns the CHAR_CATEGORY settings hash (:invoke, :group, :length) for ch.
16
+       CHAR_CATEGORY[char_type(ch)] # Plain Hash lookup keyed by the type name that char_type returns.
17
+     end
18
+
19
+ CHAR_CATEGORY = {
20
+ 'DEFAULT' => {
21
+ invoke: 0, group: 1, length: 0
22
+ },
23
+ 'SPACE' => {
24
+ invoke: 0, group: 1, length: 0
25
+ },
26
+ 'KANJI' => {
27
+ invoke: 0, group: 0, length: 2
28
+ },
29
+ 'SYMBOL' => {
30
+ invoke: 1, group: 1, length: 0
31
+ },
32
+ 'NUMERIC' => {
33
+ invoke: 1, group: 1, length: 0
34
+ },
35
+ 'ALPHA' => {
36
+ invoke: 1, group: 1, length: 0
37
+ },
38
+ 'HIRAGANA' => {
39
+ invoke: 0, group: 1, length: 2
40
+ },
41
+ 'KATAKANA' => {
42
+ invoke: 1, group: 1, length: 2
43
+ },
44
+ 'KANJINUMERIC' => {
45
+ invoke: 1, group: 1, length: 0
46
+ },
47
+ 'GREEK' => {
48
+ invoke: 1, group: 1, length: 0
49
+ },
50
+ 'CYRILLIC' => {
51
+ invoke: 1, group: 1, length: 0
52
+ }
53
+ }.freeze
54
+
55
+ CHAR_TYPES = %w[
56
+ SPACE
57
+ NUMERIC
58
+ SYMBOL
59
+ ALPHA
60
+ CYRILLIC
61
+ GREEK
62
+ HIRAGANA
63
+ KATAKANA
64
+ KANJI
65
+ KANJINUMERIC
66
+ ].freeze
67
+
68
+     SPACE = [ # Code points classified as SPACE; SPACE is the first type tried in CHAR_TYPES.
69
+       0x0020..0x0020,
70
+       0x00D0..0x00D0, # NOTE(review): U+00D0 also lies in ALPHA's 0x00C0..0x00FF; carriage return is U+000D -- possibly transposed digits, confirm against the dictionary's char.def.
71
+       0x0009..0x0009,
72
+       0x000B..0x000B,
73
+       0x000A..0x000A
74
+     ].freeze
75
+
76
+ NUMERIC = [
77
+ 0x0030..0x0039, # ASCII
78
+ 0xFF10..0xFF19, # ZENKAKU
79
+ # OTHER SYMBOLS
80
+ 0x2070..0x209F, # Superscripts and Subscripts
81
+ 0x2150..0x218F # Number forms
82
+ ].freeze
83
+
84
+     SYMBOL = [ # Code point ranges classified as SYMBOL.
85
+       # ASCII
86
+       0x0021..0x002F,
87
+       0x003A..0x0040,
88
+       0x005B..0x0060,
89
+       0x007B..0x007E,
90
+       # Latin 1
91
+       0x00A1..0x00BF,
92
+       # ZENKAKU
93
+       0xFF01..0xFF0F,
94
+       0xFF1A..0xFF1F,
95
+       0xFF3B..0xFF40,
96
+       0xFF5B..0xFF65,
97
+       0xFFE0..0xFFEF, # Halfwidth and Fullwidth Forms
98
+       # OTHER SYMBOLS
99
+       0x2000..0x206F, # General Punctuation
100
+       0x20A0..0x20CF, # Currency Symbols
101
+       0x20D0..0x20FF, # Combining Diacritical Marks for Symbols
102
+       0x2100..0x214F, # Letterlike Symbols
103
+       0x2100..0x214B, # Letterlike Symbols (subset of the previous range, so redundant)
104
+       0x2190..0x21FF, # Arrows
105
+       0x2200..0x22FF, # Mathematical Operators
106
+       0x2300..0x23FF, # Miscellaneous Technical
107
+       0x2460..0x24FF, # Enclosed NUMERICs
108
+       0x2501..0x257F, # Box Drawing
109
+       0x2580..0x259F, # Block Elements
110
+       0x25A0..0x25FF, # Geometric Shapes
111
+       0x2600..0x26FE, # Miscellaneous Symbols
112
+       0x2700..0x27BF, # Dingbats
113
+       0x27F0..0x27FF, # Supplemental Arrows A
114
+       0x27C0..0x27EF, # Miscellaneous Mathematical Symbols-A
115
+       0x2800..0x28FF, # Braille Patterns
116
+       0x2900..0x297F, # Supplemental Arrows B
117
+       0x2B00..0x2BFF, # Miscellaneous Symbols and Arrows
118
+       0x2A00..0x2AFF, # Supplemental Mathematical Operators
119
+       0x3300..0x33FF,
120
+       0x3200..0x32FE, # Enclosed CJK Letters and Months
121
+       0x3000..0x303F, # CJK Symbols and Punctuation
122
+       0xFE30..0xFE4F, # CJK Compatibility Forms
123
+       0xFE50..0xFE6B, # Small Form Variants
124
+       # 0x3007 SYMBOL KANJINUMERIC
125
+       0x3007..0x3007 # Already covered by 0x3000..0x303F above.
126
+     ].freeze
127
+
128
+ ALPHA = [
129
+ # ASCII
130
+ 0x0041..0x005A,
131
+ 0x0061..0x007A,
132
+ # Latin
133
+ 0x00C0..0x00FF, # Latin 1
134
+ 0x0100..0x017F, # Latin Extended A
135
+ 0x0180..0x0236, # Latin Extended B
136
+ 0x1E00..0x1EF9, # Latin Extended Additional
137
+ 0xFF21..0xFF3A, # ZENKAKU
138
+ 0xFF41..0xFF5A # ZENKAKU
139
+ ].freeze
140
+
141
+ # CYRILLIC
142
+ CYRILLIC = [
143
+ 0x0400..0x04F9,
144
+ 0x0500..0x050F # Cyrillic supplementary
145
+ ].freeze
146
+
147
+ # GREEK
148
+ GREEK = [0x0374..0x03FB].freeze # Greek and Coptic
149
+
150
+ # HIRAGANA
151
+ HIRAGANA = [0x3041..0x309F].freeze
152
+
153
+ # KATAKANA
154
+ KATAKANA = [
155
+ 0x30A1..0x30FF,
156
+ 0x31F0..0x31FF, # Small KU .. Small RO
157
+ 0x30FC..0x30FC,
158
+ # Half KATAKANA
159
+ 0xFF66..0xFF9D,
160
+ 0xFF9E..0xFF9F
161
+ ].freeze
162
+
163
+ # KANJI
164
+     KANJI = [ # Code point ranges classified as KANJI.
165
+       0x2E80..0x2EF3, # CJK Radicals Supplement
166
+       0x2F00..0x2FD5,
167
+       0x3005..0x3005, # NOTE(review): also inside SYMBOL's 0x3000..0x303F, which CHAR_TYPES tries before KANJI.
168
+       0x3007..0x3007, # NOTE(review): shadowed by SYMBOL in the same way as 0x3005.
169
+       0x3400..0x4DB5, # CJK Unified Ideographs Extension
170
+       0x4E00..0x9FA5, # NOTE(review): contains every KANJINUMERIC code point, and KANJI precedes KANJINUMERIC in CHAR_TYPES -- confirm the intended order.
171
+       0xF900..0xFA2D,
172
+       0xFA30..0xFA6A
173
+     ].freeze
174
+
175
+ # rubocop:disable Style/AsciiComments
176
+ # KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆)
177
+ # 0x4E00 KANJINUMERIC KANJI
178
+ KANJINUMERIC = [
179
+ 0x4E00..0x4E00,
180
+ 0x4E8C..0x4E8C,
181
+ 0x4E09..0x4E09,
182
+ 0x56DB..0x56DB,
183
+ 0x4E94..0x4E94,
184
+ 0x516D..0x516D,
185
+ 0x4E03..0x4E03,
186
+ 0x516B..0x516B,
187
+ 0x4E5D..0x4E5D,
188
+ 0x5341..0x5341,
189
+ 0x767E..0x767E,
190
+ 0x5343..0x5343,
191
+ 0x4E07..0x4E07,
192
+ 0x5104..0x5104,
193
+ 0x5146..0x5146
194
+ ].freeze
195
+ # rubocop:enable Style/AsciiComments
196
+
197
+ private_constant :CHAR_CATEGORY, :CHAR_TYPES
198
+
199
+ private_constant :ALPHA, :CYRILLIC, :GREEK, :HIRAGANA, :KANJI, :KANJINUMERIC, :KATAKANA, :NUMERIC, :SPACE, :SYMBOL
200
+ end
201
+ end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Suika
4
+ # @!visibility private
5
+   class Lattice # Word lattice: candidate nodes indexed by the offsets at which they begin and end.
6
+     # @!visibility private
7
+     Node = Struct.new(:surface, :min_cost, :min_prev, :left_id, :right_id, :cost, :attrs, keyword_init: true) # min_cost/min_prev are not set here; Tagger#viterbi assigns them.
8
+
9
+     attr_reader :begin_nodes, :end_nodes, :length
10
+
11
+     # @!visibility private
12
+     def initialize(length) # length: number of offsets to cover (Tagger#parse passes sentence.length).
13
+       @length = length
14
+       @begin_nodes = Array.new(length + 1) { [] } # begin_nodes[i] holds the nodes that start at offset i.
15
+       @end_nodes = Array.new(length + 1) { [] } # end_nodes[i] holds the nodes that finish at offset i.
16
+       bos = Node.new(surface: 'BOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
17
+       @end_nodes[0].append(bos) # BOS "ends" at offset 0, so it is a left neighbour of every node starting there.
18
+       eos = Node.new(surface: 'EOS', left_id: 0, right_id: 0, cost: 0, attrs: [])
19
+       @begin_nodes[length].append(eos) # EOS "begins" at the last offset, so it is a right neighbour of every node ending there.
20
+     end
21
+
22
+     # @!visibility private
23
+     def insert(begin_id, end_id, surface, left_id, right_id, cost, attrs) # Registers one candidate node spanning begin_id...end_id.
24
+       node = Node.new(surface: surface, left_id: left_id, right_id: right_id, cost: cost, attrs: attrs)
25
+       @begin_nodes[begin_id].append(node) # The same Node object is stored in both indexes.
26
+       @end_nodes[end_id].append(node)
27
+     end
28
+   end
29
+ end
@@ -0,0 +1,120 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rambling-trie'
4
+ require 'zlib'
5
+
6
+ module Suika
7
+ # Tagger is a class that tokenizes Japanese text.
8
+ #
9
+ # @example
10
+ # require 'suika'
11
+ #
12
+ # tagger = Suika::Tagger.new
13
+ # tagger.parse('すもももももももものうち').each { |token| puts token }
14
+ #
15
+ # # すもも 名詞, 一般, *, *, *, *, すもも, スモモ, スモモ
16
+ # # も 助詞, 係助詞, *, *, *, *, も, モ, モ
17
+ # # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
18
+ # # も 助詞, 係助詞, *, *, *, *, も, モ, モ
19
+ # # もも 名詞, 一般, *, *, *, *, もも, モモ, モモ
20
+ # # の 助詞, 連体化, *, *, *, *, の, ノ, ノ
21
+ # # うち 名詞, 非自立, 副詞可能, *, *, *, うち, ウチ, ウチ
22
+ #
23
+ class Tagger
24
+     # Create a new tagger by loading the built-in binary dictionary (dict/ipadic.gz, resolved relative to this file).
25
+     def initialize
26
+       ipadic = Marshal.load(Zlib::GzipReader.open(__dir__ + '/../../dict/ipadic.gz', &:read)) # Marshal.load must only ever see this bundled file, never external data.
27
+       @trie = ipadic[:trie] # Queried with match? in #parse to decide whether to keep extending a candidate word.
28
+       @dictionary = ipadic[:dictionary] # Looked up by surface string in #parse; each entry is a list of rows.
29
+       @unknown_dictionary = ipadic[:unknown_dictionary] # Looked up by character type name in #parse.
30
+       @cost_mat = ipadic[:cost_matrix] # Indexed as [right_id][left_id] in #viterbi.
31
+     end
32
+
33
+ # Parse the given sentence.
34
+ # @param sentence [String] Japanese text to be parsed.
35
+ # @return [Array<String>]
36
+ def parse(sentence)
37
+ lattice = Lattice.new(sentence.length)
38
+ start = 0
39
+ terminal = sentence.length
40
+
41
+ while start < terminal
42
+ word = sentence[start]
43
+ pos = start
44
+ is_unknown = true
45
+ while @trie.match?(word) && pos < terminal
46
+ if @dictionary.key?(word)
47
+ @dictionary[word].each do |el|
48
+ lattice.insert(start, start + word.length,
49
+ word, el[0].to_i, el[1].to_i, el[2].to_i,
50
+ el[3..-1])
51
+ end
52
+ is_unknown = false
53
+ end
54
+ pos += 1
55
+ word = sentence[start..pos]
56
+ end
57
+
58
+ unless is_unknown
59
+ start += 1
60
+ next
61
+ end
62
+
63
+ word = sentence[start]
64
+ char_type = CharDef.char_type(sentence[start])
65
+ char_cate = CharDef.char_category(sentence[start])
66
+ if char_cate[:group] == 1
67
+ unk_terminal = char_cate[:length].zero? ? terminal : start + char_cate[:length]
68
+ pos = start + 1
69
+ while pos < unk_terminal && char_type == CharDef.char_type(text[t])
70
+ word << text[t]
71
+ pos += 1
72
+ end
73
+ end
74
+ @unknown_dictionary[char_type].each do |el|
75
+ lattice.insert(start, start + word.length,
76
+ word, el[0].to_i, el[1].to_i, el[2].to_i,
77
+ el[3..-1])
78
+ end
79
+ start += 1
80
+ end
81
+
82
+ viterbi(lattice)
83
+ end
84
+
85
+ private
86
+
87
+ INT_MAX = 2**(([42].pack('i').size * 16) - 2) - 1
88
+
89
+ private_constant :INT_MAX
90
+
91
+ def viterbi(lattice)
92
+ bos = lattice.end_nodes[0].first
93
+ bos.min_cost = 0
94
+ bos.min_prev = nil
95
+
96
+ (lattice.length + 1).times do |n|
97
+ lattice.begin_nodes[n].each do |rnode|
98
+ rnode.min_cost = INT_MAX
99
+ rnode.min_prev = nil
100
+ lattice.end_nodes[n].each do |lnode|
101
+ cost = lnode.min_cost + @cost_mat[lnode.right_id][rnode.left_id] + rnode.cost
102
+ if cost < rnode.min_cost
103
+ rnode.min_cost = cost
104
+ rnode.min_prev = lnode
105
+ end
106
+ end
107
+ end
108
+ end
109
+
110
+ eos = lattice.begin_nodes[-1].first
111
+ prev_node = eos.min_prev
112
+ res = []
113
+ until prev_node.nil?
114
+ res.append("#{prev_node.surface}\t#{prev_node.attrs.join(', ')}") if prev_node.surface != 'BOS' && prev_node.surface != 'EOS'
115
+ prev_node = prev_node.min_prev
116
+ end
117
+ res.reverse
118
+ end
119
+ end
120
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Suika is a Japanese morphological analyzer written in pure Ruby.
4
+ module Suika
5
+ # The version of Suika you are using.
6
+ VERSION = '0.1.0'
7
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/suika/version'
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = 'suika'
7
+ spec.version = Suika::VERSION
8
+ spec.authors = ['yoshoku']
9
+ spec.email = ['yoshoku@outlook.com']
10
+
11
+ spec.summary = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
12
+ spec.description = 'Suika is a Japanese morphological analyzer written in pure Ruby.'
13
+ spec.homepage = 'https://github.com/yoshoku/suika'
14
+ spec.license = 'BSD-3-Clause'
15
+ spec.required_ruby_version = Gem::Requirement.new('>= 2.3.0')
16
+
17
+ spec.metadata['homepage_uri'] = spec.homepage
18
+ spec.metadata['source_code_uri'] = spec.homepage
19
+ spec.metadata['changelog_uri'] = 'https://github.com/yoshoku/magro/blob/master/CHANGELOG.md'
20
+ spec.metadata['documentation_uri'] = 'https://rubydoc.info/gems/suika'
21
+
22
+ # Specify which files should be added to the gem when it is released.
23
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
24
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
25
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
26
+ end
27
+ spec.bindir = 'exe'
28
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
+ spec.require_paths = ['lib']
30
+
31
+ spec.add_runtime_dependency 'rambling-trie', '~> 2.1'
32
+ end
metadata ADDED
@@ -0,0 +1,80 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: suika
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - yoshoku
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-07-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rambling-trie
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.1'
27
+ description: Suika is a Japanese morphological analyzer written in pure Ruby.
28
+ email:
29
+ - yoshoku@outlook.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - ".gitignore"
35
+ - ".rspec"
36
+ - ".rubocop.yml"
37
+ - ".travis.yml"
38
+ - CODE_OF_CONDUCT.md
39
+ - Gemfile
40
+ - LICENSE.txt
41
+ - NOTICE.txt
42
+ - README.md
43
+ - Rakefile
44
+ - bin/console
45
+ - bin/setup
46
+ - dict/ipadic.gz
47
+ - lib/suika.rb
48
+ - lib/suika/char_def.rb
49
+ - lib/suika/lattice.rb
50
+ - lib/suika/tagger.rb
51
+ - lib/suika/version.rb
52
+ - suika.gemspec
53
+ homepage: https://github.com/yoshoku/suika
54
+ licenses:
55
+ - BSD-3-Clause
56
+ metadata:
57
+ homepage_uri: https://github.com/yoshoku/suika
58
+ source_code_uri: https://github.com/yoshoku/suika
59
+ changelog_uri: https://github.com/yoshoku/magro/blob/master/CHANGELOG.md
60
+ documentation_uri: https://rubydoc.info/gems/suika
61
+ post_install_message:
62
+ rdoc_options: []
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: 2.3.0
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ requirements: []
76
+ rubygems_version: 3.1.2
77
+ signing_key:
78
+ specification_version: 4
79
+ summary: Suika is a Japanese morphological analyzer written in pure Ruby.
80
+ test_files: []