zabon 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.bundler-audit.yml +6 -0
- data/.rubocop.yml +41 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/AGENTS.md +51 -0
- data/CHANGELOG.md +34 -0
- data/Gemfile +22 -0
- data/LICENSE.txt +25 -0
- data/README.md +178 -0
- data/Rakefile +22 -0
- data/bin/console +11 -0
- data/bin/setup +8 -0
- data/lib/zabon/analyzer.rb +71 -0
- data/lib/zabon/configuration.rb +9 -0
- data/lib/zabon/constants.rb +39 -0
- data/lib/zabon/helper.rb +39 -0
- data/lib/zabon/railtie.rb +12 -0
- data/lib/zabon/segment.rb +13 -0
- data/lib/zabon/version.rb +5 -0
- data/lib/zabon.rb +29 -0
- data/zabon.gemspec +29 -0
- metadata +90 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: d0f3961a7b9e53f882a480036cfb49a80eb7340f62d4a9c8d1b1e3e3bad0c82c
|
|
4
|
+
data.tar.gz: 779a1fac27148c57b58f79c03920d920b44c07751f9e64c527735b2748fa5bde
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: ec94a2d9f1fee896f2fb3c5deb27913e33971063c7397224c9a5a6d4394ce8a0d6a8d43bc96d1d954969d5d0f5105d3385eabc86b7f6e20e23a1ba33c0e26d8c
|
|
7
|
+
data.tar.gz: 40f2a3fc5c52b24b7346e312c26092435e87fd8f3405ce88440c540742e18270c5c75865e92bcbe339865492369dbf18001053f7cea2fe95642ff0b056cfec30
|
data/.bundler-audit.yml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
ignore:
|
|
2
|
+
# nokogiri GHSA-wx95-c6cv-8532: does not check return value from xmlC14NExecute.
|
|
3
|
+
# Patched in nokogiri >= 1.19.1, which requires Ruby >= 3.2.
|
|
4
|
+
# zabon supports Ruby 3.1+ and does not use nokogiri's C14N functionality directly
|
|
5
|
+
# (nokogiri is a transitive dependency via actionview). Revisit when Ruby 3.1 is EOL.
|
|
6
|
+
- GHSA-wx95-c6cv-8532
|
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
plugins:  # RuboCop >= 1.72 loads these extensions via the plugin API; `require:` is deprecated for them
|
|
2
|
+
- rubocop-minitest
|
|
3
|
+
- rubocop-performance
|
|
4
|
+
- rubocop-rake
|
|
5
|
+
|
|
6
|
+
AllCops:
|
|
7
|
+
TargetRubyVersion: 3.1
|
|
8
|
+
|
|
9
|
+
Layout/LineLength:
|
|
10
|
+
Max: 180
|
|
11
|
+
Exclude:
|
|
12
|
+
- test/zabon_test.rb
|
|
13
|
+
|
|
14
|
+
Style/StringLiterals:
|
|
15
|
+
Enabled: true
|
|
16
|
+
EnforcedStyle: double_quotes
|
|
17
|
+
|
|
18
|
+
Style/StringLiteralsInInterpolation:
|
|
19
|
+
Enabled: true
|
|
20
|
+
EnforcedStyle: double_quotes
|
|
21
|
+
|
|
22
|
+
Style/RedundantRegexpEscape:
|
|
23
|
+
Enabled: false
|
|
24
|
+
|
|
25
|
+
Style/Documentation:
|
|
26
|
+
Enabled: false
|
|
27
|
+
|
|
28
|
+
Metrics/AbcSize:
|
|
29
|
+
Enabled: false
|
|
30
|
+
|
|
31
|
+
Metrics/CyclomaticComplexity:
|
|
32
|
+
Enabled: false
|
|
33
|
+
|
|
34
|
+
Metrics/MethodLength:
|
|
35
|
+
Enabled: false
|
|
36
|
+
|
|
37
|
+
Metrics/BlockLength:
|
|
38
|
+
Enabled: false
|
|
39
|
+
|
|
40
|
+
Metrics/PerceivedComplexity:
|
|
41
|
+
Enabled: false
|
data/.ruby-gemset
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
zabon
|
data/.ruby-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.3
|
data/AGENTS.md
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Agent Guidelines
|
|
2
|
+
|
|
3
|
+
## Repository overview
|
|
4
|
+
|
|
5
|
+
zabon is a Ruby gem for Japanese text segmentation (Kinsoku Shori / 禁則処理).
|
|
6
|
+
It ports the mikan.js algorithm to Ruby and integrates with Rails via `Zabon::Helper`, which a Railtie includes into `ActionView::Base`.
|
|
7
|
+
|
|
8
|
+
## Running checks
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
bundle exec rake # full suite: bundler-audit, rubocop, tests
|
|
12
|
+
bundle exec rake test # tests only
|
|
13
|
+
bundle exec rubocop # linter only
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
All three must be green before committing.
|
|
17
|
+
|
|
18
|
+
## Code conventions
|
|
19
|
+
|
|
20
|
+
- Ruby 3.1+ required; use Ruby 3 idioms: endless methods, `Hash#except`, `Struct keyword_init:`.
|
|
21
|
+
- `frozen_string_literal: true` on every file.
|
|
22
|
+
- Double-quoted strings (`EnforcedStyle: double_quotes` in `.rubocop.yml`).
|
|
23
|
+
- Predicate methods return a plain boolean — use `Regexp#match?`, never `match`.
|
|
24
|
+
|
|
25
|
+
## constants.rb encoding
|
|
26
|
+
|
|
27
|
+
`lib/zabon/constants.rb` stores some voiced hiragana particles in NFD form (e.g. `で` = `\u3066\u3099`).
|
|
28
|
+
This is **load-bearing**: those particles intentionally do not match NFC strings.
|
|
29
|
+
Never normalise this file to NFC. When editing it, use byte-level operations and verify
|
|
30
|
+
`test_sentence6` and `test_sentence8` still pass.
|
|
31
|
+
|
|
32
|
+
## Commits
|
|
33
|
+
|
|
34
|
+
Follow conventional commits (`fix:`, `feat:`, `ref:`, `test:`, `chore:`, `ci:`, `license:`, `meta:`).
|
|
35
|
+
One logical change per commit. Include `Co-Authored-By:` when AI-generated.
|
|
36
|
+
|
|
37
|
+
## CI matrix
|
|
38
|
+
|
|
39
|
+
Ruby 3.1, 3.2, 3.3, 3.4 on ubuntu-latest. Always verify fixes on **all** matrix versions locally
|
|
40
|
+
before pushing — incompatibilities between Ruby versions are not always obvious.
|
|
41
|
+
|
|
42
|
+
## Tests
|
|
43
|
+
|
|
44
|
+
Tests in `test/zabon_test.rb` must satisfy `expected.join == source` — the segmentation must be
|
|
45
|
+
lossless. When adding new test sentences, run `Zabon.split(source)` first to get the real output,
|
|
46
|
+
then copy it into `expected`; do not guess segment boundaries.
|
|
47
|
+
|
|
48
|
+
## Attribution
|
|
49
|
+
|
|
50
|
+
zabon is a port of [mikan.js](https://github.com/trkbt10/mikan.js) by trkbt10 (MIT).
|
|
51
|
+
The upstream copyright notice is preserved in `LICENSE.txt` — do not remove it.
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.2.0 (2026-03-26)
|
|
4
|
+
|
|
5
|
+
### Breaking changes
|
|
6
|
+
|
|
7
|
+
- Ruby >= 3.1 is now required (was >= 2.5)
|
|
8
|
+
|
|
9
|
+
### Features
|
|
10
|
+
|
|
11
|
+
- Add `ので` as a standalone JOSHI particle, correcting an accidental concatenation in the particle list
|
|
12
|
+
- Add optional Sentry context (`Sentry.set_context`) before segmentation — zero hard dependency, guarded by `defined?(Sentry)`
|
|
13
|
+
- Add `reset_config!` to reset configuration to defaults
|
|
14
|
+
- Add `examples/` — a minimal single-file Rails app demonstrating `zabon_translate` with locale switching
|
|
15
|
+
|
|
16
|
+
### Fixes
|
|
17
|
+
|
|
18
|
+
- `Segment#hiragana?` was returning `MatchData` instead of a boolean
|
|
19
|
+
- `Analyzer#segments` had implicit operator precedence on a compound condition; added explicit parentheses
|
|
20
|
+
- `Helper#zabon_translate` was detecting missing translations via a brittle string sniff (`include?("translation_missing")`); replaced with `I18n.exists?`
|
|
21
|
+
- `Helper#zabon_translate` called `strip_tags` via a module method that shadowed `ActionView::Helpers::SanitizeHelper#strip_tags` in the ancestor chain; inlined the `ActionView::Base.full_sanitizer` call directly
|
|
22
|
+
- `require "uri"` added before `require "action_view"` to fix a `NameError` on Ruby 3.1 where `URI` is not pre-loaded
|
|
23
|
+
- `Zabon::Helper` now works correctly when included in a plain controller or any non-`ActionView::Base` context
|
|
24
|
+
|
|
25
|
+
### Changes
|
|
26
|
+
|
|
27
|
+
- Ruby 3 idioms adopted throughout: endless methods (`Segment`), `Hash#except` (`Helper`), `Struct keyword_init:` (`Configuration`)
|
|
28
|
+
- CI matrix updated to Ruby 3.1, 3.2, 3.3, 3.4
|
|
29
|
+
- Upstream copyright notice for mikan.js (trkbt10) added to `LICENSE.txt`
|
|
30
|
+
- `examples/` excluded from the gem package
|
|
31
|
+
|
|
32
|
+
## 0.1.0
|
|
33
|
+
|
|
34
|
+
Initial release.
|
data/Gemfile
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
source "https://rubygems.org"
|
|
4
|
+
|
|
5
|
+
gemspec
|
|
6
|
+
|
|
7
|
+
group :code_quality do
|
|
8
|
+
gem "bundler-audit"
|
|
9
|
+
gem "rubocop"
|
|
10
|
+
gem "rubocop-minitest"
|
|
11
|
+
gem "rubocop-performance"
|
|
12
|
+
gem "rubocop-rake"
|
|
13
|
+
gem "simplecov"
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
group :development do
|
|
17
|
+
gem "rake"
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
group :test do
|
|
21
|
+
gem "minitest"
|
|
22
|
+
end
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 Joesi
|
|
4
|
+
|
|
5
|
+
Portions of this software are derived from mikan.js
|
|
6
|
+
(https://github.com/trkbt10/mikan.js), Copyright (c) trkbt10,
|
|
7
|
+
used under the MIT License.
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
10
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
11
|
+
in the Software without restriction, including without limitation the rights
|
|
12
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
13
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
14
|
+
furnished to do so, subject to the following conditions:
|
|
15
|
+
|
|
16
|
+
The above copyright notice and this permission notice shall be included in
|
|
17
|
+
all copies or substantial portions of the Software.
|
|
18
|
+
|
|
19
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
20
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
21
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
22
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
23
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
24
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
25
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# zabon.ruby 🍊
|
|
2
|
+
|
|
3
|
+
A Ruby gem / Rails helper for dealing with Japanese line-breaking logic. It is basically a port of [mikan.js](https://github.com/trkbt10/mikan.js), which implements a regular expression based algorithm to segment text into semantic chunks. No machine learning needed 🤖☺️. In addition the resulting text segments can be wrapped in a configurable HTML tag. All praise 👏👏👏 for the algorithm goes to [trkbt10](https://github.com/trkbt10).
|
|
4
|
+
|
|
5
|
+
## Usage
|
|
6
|
+
``` ruby
|
|
7
|
+
# split this sentence
|
|
8
|
+
Zabon.split('この文を分割する')
|
|
9
|
+
=> ["この", "文を", "分割する"]
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Configuration
|
|
14
|
+
|
|
15
|
+
Configuration controls the tag that the results are wrapped in. It makes heavy use of Rails tag helpers.
|
|
16
|
+
E.g. put this in an initializer in your Rails app.
|
|
17
|
+
|
|
18
|
+
``` ruby
|
|
19
|
+
Zabon.configure do |config|
|
|
20
|
+
config.tag = :div # default: :span
|
|
21
|
+
config.tag_options = { class: 'zabon_trara', style: 'font-size: 5em' } # default: { class: 'zabon', style: 'display: inline-block' }
|
|
22
|
+
config.strip_tags = false # default true
|
|
23
|
+
end
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Rails
|
|
27
|
+
|
|
28
|
+
The gem ships a Railtie that automatically includes `Zabon::Helper` into `ActionView::Base`, so `zabon_translate` is available in all views without any further setup.
|
|
29
|
+
|
|
30
|
+
Call `zabon_translate` directly in views for strings that need segmentation:
|
|
31
|
+
|
|
32
|
+
```erb
|
|
33
|
+
<%= zabon_translate("page.title") %>
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
To replace the standard `t()` globally, add the following to an initializer. Note that this affects **all** ActionView translation calls — prefer explicit `zabon_translate` for finer control.
|
|
37
|
+
|
|
38
|
+
```ruby
|
|
39
|
+
# config/initializers/zabon.rb
|
|
40
|
+
module ActionView
|
|
41
|
+
module Helpers
|
|
42
|
+
module TranslationHelper
|
|
43
|
+
alias_method :translate_without_zabon, :translate
|
|
44
|
+
|
|
45
|
+
def translate(key, **options)
|
|
46
|
+
zabon_translate(key, orig_translate: :translate_without_zabon, **options)
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
alias t translate
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Japanese grammar 🇯🇵
|
|
56
|
+
|
|
57
|
+
Just enough Japanese to understand the algorithm :)
|
|
58
|
+
|
|
59
|
+
### Writing system ✍️
|
|
60
|
+
|
|
61
|
+
The Japanese writing system uses four different components:
|
|
62
|
+
|
|
63
|
+
* [Hiragana (ひらがな)](https://en.wikipedia.org/wiki/Hiragana), a syllabary alphabet used for Japanese words not covered by kanji and mostly for grammatical inflections
|
|
64
|
+
* [Katakana (カタカナ)](https://en.wikipedia.org/wiki/Katakana), a syllabary alphabet used for transcription of foreign-language words into Japanese; for emphasis; [onomatopoeia](https://en.wikipedia.org/wiki/Onomatopoeia); for scientific terms and often Japanese companies.
|
|
65
|
+
* [Kanji (漢字)](https://en.wikipedia.org/wiki/Kanji), a set of Chinese characters directly incorporated into the written Japanese language, often with a Japanese pronunciation
|
|
66
|
+
* [Romaji](https://en.wikipedia.org/wiki/Romanization_of_Japanese), use of Latin script in Japanese language
|
|
67
|
+
|
|
68
|
+
### Particles
|
|
69
|
+
|
|
70
|
+
[Joshi (助詞)](https://en.wikipedia.org/wiki/Japanese_particles), Japanese particles written in Hiragana, are suffixes or short words that follow a modified noun, verb, adjective, or sentence. Their grammatical range can indicate various meanings and functions:
|
|
71
|
+
|
|
72
|
+
* case markers
|
|
73
|
+
* parallel markers
|
|
74
|
+
* sentence ending particles
|
|
75
|
+
* interjectory particles
|
|
76
|
+
* adverbial particles
|
|
77
|
+
* binding particles
|
|
78
|
+
* conjunctive particles
|
|
79
|
+
* phrasal particles
|
|
80
|
+
|
|
81
|
+
### Line breaking
|
|
82
|
+
|
|
83
|
+
Certain characters in Japanese should not come at the end of a line, certain characters should not come at the start of a line, and some characters should never be split up across two lines. These rules are called [Kinsoku Shori 禁則処理](https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages#Line_breaking_rules_in_Japanese_text_(Kinsoku_Shori)):
|
|
84
|
+
|
|
85
|
+
simplified:
|
|
86
|
+
|
|
87
|
+
| Class | Can't begin a line | Can't finish a line |
|
|
88
|
+
|-------|--------------------|---------------------|
|
|
89
|
+
| small _kana_ | ぁぃぅぇぉっ... | |
|
|
90
|
+
| parentheses | )〉》】... | (〈《【... |
|
|
91
|
+
| quotations | 」』”... | 「『“... |
|
|
92
|
+
| punctuation | 、。・!?... | |
|
|
93
|
+
|
|
94
|
+
### Text segmentation
|
|
95
|
+
|
|
96
|
+
Written Japanese uses no spaces and little punctuation to delimit words. Readers instead depend on grammatical cues (e.g. Japanese, particles and verb endings), the relative frequency of character combinations, and semantic context, in order to determine what words have been written. This is a non trivial problem which is often solved by applying machine learning algorithms. Without a careful approach, breaks can occur randomly and usually in the middle of a word. This is an issue with typography on the web and results in a degradation of readability.
|
|
97
|
+
|
|
98
|
+
### Zabon ???
|
|
99
|
+
|
|
100
|
+
I made a couple of assumptions when choosing the name:
|
|
101
|
+
1. 🍊 The original algorithm name **Mikan** might be a transcription of 蜜柑, a Japanese citrus fruit (Mandarin, Satsuma)
|
|
102
|
+
2. There already is a gem called [mikan](https://rubygems.org/gems/mikan), didn't want to go for **mikan_ruby** or similar b/c of autoloading
|
|
103
|
+
3. 🍇 My guess is the original author chose this name, b/c he was searching for something simpler than Google's **Budou** (葡萄)
|
|
104
|
+
4. 🔪 Both fruits have in common, that they can be easily split apart in segments
|
|
105
|
+
5. So I was searching for another fruit that can be easily split apart, and what can be split apart better than a Pomelo (文旦, ぶんたん) - **Zabon** (derived from Portuguese: zamboa)
|
|
106
|
+
|
|
107
|
+
Who knows if that's how it was 🤷🏻‍♂️😂.
|
|
108
|
+
|
|
109
|
+
## The Algorithm
|
|
110
|
+
|
|
111
|
+
This algorithm does NOT find the most minimal segmentation of unbreakable text segments and probably will have problems if a text is solely written in one alphabet. It also does not support Furigana (yet). It does basic text segmentation and stitches the segments back together in segments which can be made unbreakable. The unbreakability we achieve by wrapping them in a `<span>` tag with certain CSS rules.
|
|
112
|
+
|
|
113
|
+
### Splitting
|
|
114
|
+
|
|
115
|
+
1. Split text across different alphabets used: split text into parts that are written in Kanjis, Hiragana, Katakana, Latin (incl. double width characters). The assumption here is that parts written in the same script should belong together.
|
|
116
|
+
|
|
117
|
+
2. Then split up each element further by splitting on particles or sequences that might be used as particles. The original author of the algorithm has identified the following list (でなければ, について, かしら, くらい, けれど, なのか, ばかり, ながら, ことよ, こそ, こと, さえ, しか, した, たり, だけ, だに, だの, つつ, ても, てよ, でも, とも, から, など, なりので, のに, ほど, まで, もの, やら, より, って, で, と, な, に, ね, の, も, は, ば, へ, や, わ, を, か, が, さ, し, ぞ, て); zabon additionally matches ので as a standalone particle. To me that looks about right, but some may be missing.
|
|
118
|
+
|
|
119
|
+
3. Split along further by splitting up brackets and quotations: ([,〈,《,「,『,「,【,〔,〚,〖,〘,❮,❬,❪,❨,(,<,{,❲,❰,{,❴,] + the matching end brackets and quotations.
|
|
120
|
+
|
|
121
|
+
### Stitching
|
|
122
|
+
|
|
123
|
+
1. Now we have a list of minimal segments and try to stitch them back together in a result set, so that they will fulfil Japanese line breaking rules. We are gonna look at tuples from left to right, looking at the current segment and the previous segment.
|
|
124
|
+
|
|
125
|
+
2. If the current segment is a beginning bracket or quotation; we look at the next segment, we have a definitive start of an unbreakable segment.
|
|
126
|
+
|
|
127
|
+
3. If the current segment is an ending bracket or quotation; we append to the last entry of the result set and don't look back anymore; we've reached the end of a segment and start a new one with the next iteration.
|
|
128
|
+
|
|
129
|
+
4. If the previous segment is a beginning bracket; we stitch it together with the current segment to become a new segment. In the next iteration we don’t need to look at the previous segment anymore and continue.
|
|
130
|
+
|
|
131
|
+
5. If the current segment is a particle or a punctuation mark and we are not looking back (see step 7); we append the current segment to the last entry of the result set.
|
|
132
|
+
|
|
133
|
+
6. If the current segment is a particle or a punctuation mark or if the previous segment is not a bracket, quotation or punctuation mark or a conjunctive particle (と, の, に) and the current segment is in Hiragana; we append to the last entry of the result set.
|
|
134
|
+
|
|
135
|
+
7. If none of the conditions from stitching steps 2–6 match, we can safely add the current segment to the result set.
|
|
136
|
+
|
|
137
|
+
## Other solutions
|
|
138
|
+
### [Google Budou](https://github.com/google/budou)
|
|
139
|
+
|
|
140
|
+
Budou is a Python library which uses word segmenters to analyze input sentences. It can concatenate words into meaningful chunks utilizing part-of-speech tagging and other syntactic information. Processed chunks are wrapped in a SPAN tag. Depending on the text segmentation algorithm used, it also has support for Chinese & Korean. Since this library is written in Python, it cannot simply be used in Ruby, PHP, or Node.js.
|
|
141
|
+
|
|
142
|
+
#### Text segmenter backends
|
|
143
|
+
You can choose different segmenter backends depending on the needs of your environment. Currently, the segmenters below are supported.
|
|
144
|
+
|
|
145
|
+
* [Google Cloud Natural Language API](https://cloud.google.com/natural-language/): external API calls, can be costly
|
|
146
|
+
* [MeCab](https://taku910.github.io/mecab/): Japanese POS tagger & morphological analyzer with lots of language bindings, e.g. also used in Google Japanese Input and Japanese Input on Mac OS X
|
|
147
|
+
* [TinySegmenter](http://chasen.org/~taku/software/TinySegmenter/): extremely compact word separation algorithm in JavaScript which produces MeCab compatible word separation without depending on external APIs, no dictionaries, classifies input
|
|
148
|
+
|
|
149
|
+
[TinySegmenter](http://chasen.org/~taku/software/TinySegmenter/) is an extremely compact word separation algorithm in JavaScript which produces MeCab compatible word separation without depending on external APIs. It classifies the input by using entities like characters, N-Grams, Hiragana, Katakana (Japanese phonetic lettering system / syllabaries) and their combinations as features to determine whether a character is preceded by a word boundary. A [Naive Bayes](https://towardsdatascience.com/naive-bayes-explained-9d2b96f4a9c0) model was trained using the [RWCP corpus](http://research.nii.ac.jp/src/en/list.html) and to make that model even more compact Boosting was used for [L1 norm regularization](https://blog.mlreview.com/l1-norm-regularization-and-sparsity-explained-for-dummies-5b0e4be3938a). Basically it compresses the model and gets rid of redundant features as much as possible.
|
|
150
|
+
|
|
151
|
+
### CSS `line-break: strict`
|
|
152
|
+
|
|
153
|
+
Worth knowing: CSS has had native support for some Kinsoku Shori rules for a while now.
|
|
154
|
+
|
|
155
|
+
```css
|
|
156
|
+
p {
|
|
157
|
+
line-break: strict;
|
|
158
|
+
overflow-wrap: break-word;
|
|
159
|
+
}
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
`line-break: strict` applies character-level Unicode line-breaking rules, which covers small kana, prolonged sound marks, and common punctuation cases. Browser implementations are not perfectly consistent and the spec intentionally leaves the precise rule set up to the user agent, so you may see subtle differences across browsers.
|
|
163
|
+
|
|
164
|
+
zabon takes a fundamentally different approach. Instead of telling the browser where not to break, it wraps each segment in a `display: inline-block` element that the browser cannot split internally. This gives you precise semantic grouping that works the same way in every browser, and also opens up per-segment styling like hover effects, search highlighting, or animations. The trade-off is server-side processing and extra markup.
|
|
165
|
+
|
|
166
|
+
If basic punctuation and small-kana rules are all you need, `line-break: strict` might be enough and has zero runtime cost. If you need guaranteed atomic grouping or per-segment control, zabon is the better fit.
|
|
167
|
+
|
|
168
|
+
## Resources
|
|
169
|
+
|
|
170
|
+
* [Regular Expressions for Japanese characters](https://gist.github.com/terrancesnyder/1345094)
|
|
171
|
+
* [Word breaking in Japanese is Hard](https://docs.microsoft.com/en-us/archive/blogs/jonasbar/word-breaking-japanese-is-hard)
|
|
172
|
+
* [mikan.sharp](https://github.com/YoungjaeKim/mikan.sharp)
|
|
173
|
+
* [mikan.php](https://github.com/sters/mikan.php)
|
|
174
|
+
* [Kinsoku - Japanese line breaking rules for LaTeX](https://github.com/jamesohortle/kinsoku)
|
|
175
|
+
* [Kuromoji - Japanese morphological analyzer written in Java](https://www.atilika.org/)
|
|
176
|
+
* [WrapText CJK - line breaking rules in Lua](https://github.com/subsoap/wraptext)
|
|
177
|
+
* [TinySegmenter - Ruby Port](https://github.com/6/tiny_segmenter)
|
|
178
|
+
* [How to Pomelo](https://github.com/dingsdax/zabon/wiki/How-to-Pomelo!)
|
data/Rakefile
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "bundler/gem_tasks"
|
|
4
|
+
require "bundler/audit/task"
|
|
5
|
+
require "rake/testtask"
|
|
6
|
+
require "rubocop/rake_task"
|
|
7
|
+
|
|
8
|
+
Bundler::Audit::Task.new
|
|
9
|
+
RuboCop::RakeTask.new
|
|
10
|
+
|
|
11
|
+
Rake::TestTask.new(:test) do |t|
|
|
12
|
+
t.libs << "test"
|
|
13
|
+
t.libs << "lib"
|
|
14
|
+
t.warning = false
|
|
15
|
+
t.verbose = true
|
|
16
|
+
t.test_files = FileList["test/**/*_test.rb"]
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
desc "Run code quality checks"
|
|
20
|
+
task code_quality: %i[bundle:audit rubocop]
|
|
21
|
+
|
|
22
|
+
task default: %i[code_quality test]
|
data/bin/console
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require "bundler/setup"
|
|
5
|
+
require "zabon"
|
|
6
|
+
|
|
7
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
|
8
|
+
# with your gem easier. You can also use a different console, if you like.
|
|
9
|
+
|
|
10
|
+
require "irb"
|
|
11
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Zabon
|
|
4
|
+
class Analyzer
|
|
5
|
+
class << self
|
|
6
|
+
def split(text)
|
|
7
|
+
text.split(KEYWORDS)
|
|
8
|
+
.flat_map { |segment| segment.split(JOSHI) }
|
|
9
|
+
.flat_map { |segment| segment.split(BRACKETS_BEGIN) }
|
|
10
|
+
.flat_map { |segment| segment.split(BRACKETS_END) }
|
|
11
|
+
.flatten.reject(&:empty?)
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def segments(text)
|
|
15
|
+
result = [""] # we do this, so we can += to append to the last result item, without checking for nil
|
|
16
|
+
previous_segment = nil
|
|
17
|
+
|
|
18
|
+
split(text).each do |segment|
|
|
19
|
+
current_segment = Segment.new(segment)
|
|
20
|
+
|
|
21
|
+
# if the current segment is a beginning bracket => we look further
|
|
22
|
+
if current_segment.bracket_begin?
|
|
23
|
+
previous_segment = current_segment
|
|
24
|
+
next
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
# if the current segment is an ending bracket =>
|
|
28
|
+
# we append to the last entry of the result set and don't look back anymore,
|
|
29
|
+
# we've reached the end of a segment and start a new one with the next iteration
|
|
30
|
+
if current_segment.bracket_end?
|
|
31
|
+
result[-1] += current_segment
|
|
32
|
+
previous_segment = nil
|
|
33
|
+
next
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# if the previous segment is a beginning bracket =>
|
|
37
|
+
# we stitch together previous segment & current segment to become a new segment
|
|
38
|
+
# we don't look back anymore
|
|
39
|
+
if previous_segment&.bracket_begin?
|
|
40
|
+
current_segment = Segment.new(previous_segment + current_segment)
|
|
41
|
+
previous_segment = nil
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
# if we are not at the start, the current segment is a particle or a period and
|
|
45
|
+
# we are not looking back, we append to the last entry of the result set
|
|
46
|
+
if result.size > 1 && current_segment.joshi_or_period? && previous_segment.nil?
|
|
47
|
+
result[-1] += current_segment
|
|
48
|
+
previous_segment = current_segment
|
|
49
|
+
next
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# if we are not at the start, the current segment is a particle or a period or
|
|
53
|
+
# the previous segment is not a bracket or period or a conjunctive particle and the current segment is hiragana
|
|
54
|
+
# we append to the last entry of the result set
|
|
55
|
+
if (result.size > 2 && current_segment.joshi_or_period?) || (previous_segment&.keyword? && current_segment.hiragana? && !/^[とのに]$/.match?(previous_segment))
|
|
56
|
+
result[-1] += current_segment
|
|
57
|
+
# if the current segment is not a particle, we are no longer looking back, we start a new segment
|
|
58
|
+
previous_segment = current_segment.joshi? ? current_segment : nil
|
|
59
|
+
next
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
# no stitching left, append the current segment to the result set
|
|
63
|
+
result << current_segment
|
|
64
|
+
previous_segment = current_segment
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
result.reject(&:empty?) # we clear out any possible blank strings in the result set
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Zabon
|
|
4
|
+
Configuration = Struct.new(:tag, :tag_options, :strip_tags, keyword_init: true) do
|
|
5
|
+
def initialize(tag: :span, tag_options: { class: "zabon", style: "display: inline-block" }, strip_tags: true)
|
|
6
|
+
super
|
|
7
|
+
end
|
|
8
|
+
end
|
|
9
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Zabon
|
|
4
|
+
# Joshi (助詞), Japanese particles written in Hiragana, are suffixes or short words that follow a modified noun, verb, adjective, or sentence.
|
|
5
|
+
# Some particles can appear in two types. They give pretty reliable cues depending on the following character, whether a line break is allowed or not.
|
|
6
|
+
JOSHI = /
|
|
7
|
+
(でなければ|について|かしら|くらい|けれど|なのか|ばかり|ながら|ことよ|こそ|こと|さえ|しか|した|たり|だけ|だに|だの|つつ|ても|てよ|でも|
|
|
8
|
+
とも|から|など|なりので|ので|のに|ほど|まで|もの|やら|より|って|で|と|な|に|ね|の|も|は|ば|へ|や|わ|を|か|が|さ|し|ぞ|て)
|
|
9
|
+
/x
|
|
10
|
+
|
|
11
|
+
# A simple way to find word segmentations in Japanese is
|
|
12
|
+
# to tokenise by grouping characters continuously by script (Hiragana, Katakana, Kanji, Romaji)
|
|
13
|
+
#
|
|
14
|
+
# The following regular expression matches in this order:
|
|
15
|
+
# * non breaking space
|
|
16
|
+
# * domains
|
|
17
|
+
# * any Japanese Kanji or Chinese character
|
|
18
|
+
# * Hiragana (+ chisai kana)
|
|
19
|
+
# * Katakana (+ chisai kana)
|
|
20
|
+
# * Latin
|
|
21
|
+
# * Latin (double width)
|
|
22
|
+
KEYWORDS = /
|
|
23
|
+
(\ |
|
|
24
|
+
[a-zA-Z0-9]+\.[a-z]{2,}|
|
|
25
|
+
[一-龠々〆ヵヶゝ]+|
|
|
26
|
+
[ぁ-んゝ]+|
|
|
27
|
+
[ァ-ヴー]+|
|
|
28
|
+
[a-zA-Z0-9]+|
|
|
29
|
+
[a-zA-Z0-9]+)
|
|
30
|
+
/x
|
|
31
|
+
|
|
32
|
+
# Brackets & Quotations
|
|
33
|
+
BRACKETS_BEGIN = /([〈《「『「((\[【〔〚〖〘❮❬❪❨(<{❲❰{❴])/
|
|
34
|
+
BRACKETS_END = /([〉》」』」))\]】〕〗〙〛}>\)❩❫❭❯❱❳❵}])/
|
|
35
|
+
|
|
36
|
+
PERIODS = /([\.\,。、!\!?\?]+)$/
|
|
37
|
+
|
|
38
|
+
HIRAGANA = /[ぁ-んゝ]+/
|
|
39
|
+
end
|
data/lib/zabon/helper.rb
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "uri"
|
|
4
|
+
require "action_view"
|
|
5
|
+
|
|
6
|
+
module Zabon
|
|
7
|
+
module Helper
|
|
8
|
+
include ActionView::Helpers::TagHelper
|
|
9
|
+
|
|
10
|
+
# can be used as a replacement for ActionView::Helpers::TranslationHelper.translate
|
|
11
|
+
# will use original translate method if locale is not :ja or translation is missing
|
|
12
|
+
# otherwise it will split the translation into semantic chunks, wrap them in a configurable HTML tag
|
|
13
|
+
# and join again
|
|
14
|
+
def zabon_translate(key, **options)
|
|
15
|
+
orig_translate = options[:orig_translate] || :translate
|
|
16
|
+
translate_options = options.except(:orig_translate)
|
|
17
|
+
|
|
18
|
+
locale = (options[:locale] || I18n.locale || :en).to_sym
|
|
19
|
+
|
|
20
|
+
return public_send(orig_translate, key, **translate_options) if locale != :ja # if locale is not Japanese we use original method
|
|
21
|
+
|
|
22
|
+
return key.map { |k| zabon_translate(k, **options) } if key.is_a?(Array)
|
|
23
|
+
|
|
24
|
+
return public_send(orig_translate, key, **translate_options) unless I18n.exists?(key, locale: :ja)
|
|
25
|
+
|
|
26
|
+
orig_translation = public_send(orig_translate, key, **translate_options)
|
|
27
|
+
|
|
28
|
+
orig_translation = ActionView::Base.full_sanitizer.sanitize(orig_translation, tags: []) if Zabon.config.strip_tags
|
|
29
|
+
|
|
30
|
+
Sentry.set_context("zabon", { key: key, locale: locale }) if defined?(Sentry)
|
|
31
|
+
|
|
32
|
+
translation = Zabon.split(orig_translation).map do |segment|
|
|
33
|
+
content_tag(Zabon.config.tag, segment, Zabon.config.tag_options)
|
|
34
|
+
end.join.html_safe
|
|
35
|
+
|
|
36
|
+
block_given? ? yield(translation, key) : translation
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Zabon
  # A String that can classify itself against the pattern constants
  # HIRAGANA, KEYWORDS, BRACKETS_BEGIN, BRACKETS_END, JOSHI and PERIODS.
  #
  # Every predicate result is cached per instance after its first evaluation,
  # including +false+ results. (A plain +@ivar ||= ...+ would re-run the
  # pattern match on every call whenever the answer is +false+.)
  # Because results are cached, they are not refreshed if the string is
  # mutated afterwards.
  class Segment < String
    # @return [Boolean] whether HIRAGANA matches this segment
    def hiragana?
      memoize_predicate(:@hiragana) { HIRAGANA.match?(self) }
    end

    # @return [Boolean] whether KEYWORDS matches this segment
    def keyword?
      memoize_predicate(:@keyword) { KEYWORDS.match?(self) }
    end

    # @return [Boolean] whether BRACKETS_BEGIN matches this segment
    def bracket_begin?
      memoize_predicate(:@bracket_begin) { BRACKETS_BEGIN.match?(self) }
    end

    # @return [Boolean] whether BRACKETS_END matches this segment
    def bracket_end?
      memoize_predicate(:@bracket_end) { BRACKETS_END.match?(self) }
    end

    # @return [Boolean] whether JOSHI matches this segment
    def joshi?
      memoize_predicate(:@joshi) { JOSHI.match?(self) }
    end

    # @return [Boolean] whether PERIODS matches this segment
    def period?
      memoize_predicate(:@period) { PERIODS.match?(self) }
    end

    # @return [Boolean] true when either joshi? or period? is true
    def joshi_or_period?
      memoize_predicate(:@joshi_or_period) { joshi? || period? }
    end

    private

    # Returns the cached value of +ivar+ when it has been set (even to false);
    # otherwise evaluates the block once and stores its result.
    def memoize_predicate(ivar)
      return instance_variable_get(ivar) if instance_variable_defined?(ivar)

      instance_variable_set(ivar, yield)
    end
  end
end
|
data/lib/zabon.rb
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "zabon/analyzer"
|
|
4
|
+
require "zabon/configuration"
|
|
5
|
+
require "zabon/constants"
|
|
6
|
+
require "zabon/helper"
|
|
7
|
+
require "zabon/segment"
|
|
8
|
+
require "zabon/version"
|
|
9
|
+
require "zabon/railtie" if defined?(Rails::Railtie)
|
|
10
|
+
|
|
11
|
+
# Top-level entry points of the gem: text splitting and global configuration.
module Zabon
  # Splits +text+ into segments by delegating to Analyzer.segments.
  #
  # @param text [Object] value handed to the analyzer unchanged
  # @return [Object] whatever Analyzer.segments returns
  def self.split(text)
    Analyzer.segments(text)
  end

  # Lazily created, module-wide Configuration instance.
  #
  # @return [Configuration]
  def self.config
    return @config if @config

    @config = Configuration.new
  end

  # Yields the current configuration so callers can adjust it in place.
  #
  # @yieldparam config [Configuration]
  def self.configure
    yield(config)
  end

  # Discards the current configuration and installs a fresh one.
  #
  # @return [Configuration] the new configuration
  def self.reset_config!
    @config = Configuration.new
  end
end
|
data/zabon.gemspec
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "lib/zabon/version"
|
|
4
|
+
|
|
5
|
+
# Gem specification for zabon.
Gem::Specification.new do |spec|
  spec.name = "zabon"
  spec.version = Zabon::VERSION
  spec.authors = ["Johannes D."]
  spec.email = ["dingsdax@fastmail.fm"]

  spec.summary = "Japanese line breaking algorithm: Ruby port of mikan.js"
  spec.description = "Splits up a (Japanese) string into semantic segment; wrap result in a HTML tag"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.1"

  spec.metadata["source_code_uri"] = "https://github.com/dingsdax/zabon.git"

  # Specify which files should be added to the gem when it is released.
  # `git ls-files -z` lists the files tracked by git, relative to the
  # directory of this gemspec.
  #
  # The gemspec itself is excluded by comparing against its basename:
  # __FILE__ may be an absolute path depending on how the spec is loaded,
  # in which case a direct `f == __FILE__` comparison with the relative
  # paths printed by git never matches and the gemspec gets packaged.
  gemspec = File.basename(__FILE__)
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == gemspec) || f.match?(%r{\A(?:(?:examples|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.require_paths = ["lib"]

  spec.add_dependency "actionview"
  spec.add_dependency "railties"
end
|
metadata
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: zabon
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.2.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Johannes D.
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: actionview
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '0'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '0'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: railties
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '0'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '0'
|
|
40
|
+
description: Splits up a (Japanese) string into semantic segment; wrap result in a
|
|
41
|
+
HTML tag
|
|
42
|
+
email:
|
|
43
|
+
- dingsdax@fastmail.fm
|
|
44
|
+
executables: []
|
|
45
|
+
extensions: []
|
|
46
|
+
extra_rdoc_files: []
|
|
47
|
+
files:
|
|
48
|
+
- ".bundler-audit.yml"
|
|
49
|
+
- ".rubocop.yml"
|
|
50
|
+
- ".ruby-gemset"
|
|
51
|
+
- ".ruby-version"
|
|
52
|
+
- AGENTS.md
|
|
53
|
+
- CHANGELOG.md
|
|
54
|
+
- Gemfile
|
|
55
|
+
- LICENSE.txt
|
|
56
|
+
- README.md
|
|
57
|
+
- Rakefile
|
|
58
|
+
- bin/console
|
|
59
|
+
- bin/setup
|
|
60
|
+
- lib/zabon.rb
|
|
61
|
+
- lib/zabon/analyzer.rb
|
|
62
|
+
- lib/zabon/configuration.rb
|
|
63
|
+
- lib/zabon/constants.rb
|
|
64
|
+
- lib/zabon/helper.rb
|
|
65
|
+
- lib/zabon/railtie.rb
|
|
66
|
+
- lib/zabon/segment.rb
|
|
67
|
+
- lib/zabon/version.rb
|
|
68
|
+
- zabon.gemspec
|
|
69
|
+
licenses:
|
|
70
|
+
- MIT
|
|
71
|
+
metadata:
|
|
72
|
+
source_code_uri: https://github.com/dingsdax/zabon.git
|
|
73
|
+
rdoc_options: []
|
|
74
|
+
require_paths:
|
|
75
|
+
- lib
|
|
76
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
77
|
+
requirements:
|
|
78
|
+
- - ">="
|
|
79
|
+
- !ruby/object:Gem::Version
|
|
80
|
+
version: '3.1'
|
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
|
+
requirements:
|
|
83
|
+
- - ">="
|
|
84
|
+
- !ruby/object:Gem::Version
|
|
85
|
+
version: '0'
|
|
86
|
+
requirements: []
|
|
87
|
+
rubygems_version: 3.7.1
|
|
88
|
+
specification_version: 4
|
|
89
|
+
summary: 'Japanese line breaking algorithm: Ruby port of mikan.js'
|
|
90
|
+
test_files: []
|