zabon 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d0f3961a7b9e53f882a480036cfb49a80eb7340f62d4a9c8d1b1e3e3bad0c82c
4
+ data.tar.gz: 779a1fac27148c57b58f79c03920d920b44c07751f9e64c527735b2748fa5bde
5
+ SHA512:
6
+ metadata.gz: ec94a2d9f1fee896f2fb3c5deb27913e33971063c7397224c9a5a6d4394ce8a0d6a8d43bc96d1d954969d5d0f5105d3385eabc86b7f6e20e23a1ba33c0e26d8c
7
+ data.tar.gz: 40f2a3fc5c52b24b7346e312c26092435e87fd8f3405ce88440c540742e18270c5c75865e92bcbe339865492369dbf18001053f7cea2fe95642ff0b056cfec30
@@ -0,0 +1,6 @@
1
+ ignore:
2
+ # nokogiri GHSA-wx95-c6cv-8532: does not check return value from xmlC14NExecute.
3
+ # Patched in nokogiri >= 1.19.1, which requires Ruby >= 3.2.
4
+ # zabon supports Ruby 3.1+ and does not use nokogiri's C14N functionality directly
5
+ # (nokogiri is a transitive dependency via actionview). Revisit when Ruby 3.1 is EOL.
6
+ - GHSA-wx95-c6cv-8532
data/.rubocop.yml ADDED
@@ -0,0 +1,41 @@
1
+ require:
2
+ - rubocop-minitest
3
+ - rubocop-performance
4
+ - rubocop-rake
5
+
6
+ AllCops:
7
+ TargetRubyVersion: 3.1
8
+
9
+ Layout/LineLength:
10
+ Max: 180
11
+ Exclude:
12
+ - test/zabon_test.rb
13
+
14
+ Style/StringLiterals:
15
+ Enabled: true
16
+ EnforcedStyle: double_quotes
17
+
18
+ Style/StringLiteralsInInterpolation:
19
+ Enabled: true
20
+ EnforcedStyle: double_quotes
21
+
22
+ Style/RedundantRegexpEscape:
23
+ Enabled: false
24
+
25
+ Style/Documentation:
26
+ Enabled: false
27
+
28
+ Metrics/AbcSize:
29
+ Enabled: false
30
+
31
+ Metrics/CyclomaticComplexity:
32
+ Enabled: false
33
+
34
+ Metrics/MethodLength:
35
+ Enabled: false
36
+
37
+ Metrics/BlockLength:
38
+ Enabled: false
39
+
40
+ Metrics/PerceivedComplexity:
41
+ Enabled: false
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ zabon
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 3.3
data/AGENTS.md ADDED
@@ -0,0 +1,51 @@
1
+ # Agent Guidelines
2
+
3
+ ## Repository overview
4
+
5
+ zabon is a Ruby gem for Japanese text segmentation (Kinsoku Shori / 禁則処理).
6
+ It ports the mikan.js algorithm to Ruby and integrates with Rails via `ActionView::Helper`.
7
+
8
+ ## Running checks
9
+
10
+ ```bash
11
+ bundle exec rake # full suite: bundler-audit, rubocop, tests
12
+ bundle exec rake test # tests only
13
+ bundle exec rubocop # linter only
14
+ ```
15
+
16
+ All three must be green before committing.
17
+
18
+ ## Code conventions
19
+
20
+ - Ruby 3.1+ required; use Ruby 3 idioms: endless methods, `Hash#except`, `Struct keyword_init:`.
21
+ - `frozen_string_literal: true` on every file.
22
+ - Double-quoted strings (`EnforcedStyle: double_quotes` in `.rubocop.yml`).
23
+ - Predicate methods return a plain boolean — use `Regexp#match?`, never `match`.
24
+
25
+ ## constants.rb encoding
26
+
27
+ `lib/zabon/constants.rb` stores some voiced hiragana particles in NFD form (e.g. `で` = `\u3066\u3099`).
28
+ This is **load-bearing**: those particles intentionally do not match NFC strings.
29
+ Never normalise this file to NFC. When editing it, use byte-level operations and verify
30
+ `test_sentence6` and `test_sentence8` still pass.
31
+
32
+ ## Commits
33
+
34
+ Follow conventional commits (`fix:`, `feat:`, `ref:`, `test:`, `chore:`, `ci:`, `license:`, `meta:`).
35
+ One logical change per commit. Include `Co-Authored-By:` when AI-generated.
36
+
37
+ ## CI matrix
38
+
39
+ Ruby 3.1, 3.2, 3.3, 3.4 on ubuntu-latest. Always verify fixes on **all** matrix versions locally
40
+ before pushing — incompatibilities between Ruby versions are not always obvious.
41
+
42
+ ## Tests
43
+
44
+ Tests in `test/zabon_test.rb` must satisfy `expected.join == source` — the segmentation must be
45
+ lossless. When adding new test sentences, run `Zabon.split(source)` first to get the real output,
46
+ then copy it into `expected`; do not guess segment boundaries.
47
+
48
+ ## Attribution
49
+
50
+ zabon is a port of [mikan.js](https://github.com/trkbt10/mikan.js) by trkbt10 (MIT).
51
+ The upstream copyright notice is preserved in `LICENSE.txt` — do not remove it.
data/CHANGELOG.md ADDED
@@ -0,0 +1,34 @@
1
+ # Changelog
2
+
3
+ ## 0.2.0 (2026-03-26)
4
+
5
+ ### Breaking changes
6
+
7
+ - Ruby >= 3.1 is now required (was >= 2.5)
8
+
9
+ ### Features
10
+
11
+ - Add `ので` as a standalone JOSHI particle, correcting an accidental concatenation in the particle list
12
+ - Add optional Sentry context (`Sentry.set_context`) before segmentation — zero hard dependency, guarded by `defined?(Sentry)`
13
+ - Add `reset_config!` to reset configuration to defaults
14
+ - Add `examples/` — a minimal single-file Rails app demonstrating `zabon_translate` with locale switching
15
+
16
+ ### Fixes
17
+
18
+ - `Segment#hiragana?` was returning `MatchData` instead of a boolean
19
+ - `Analyzer#segments` had implicit operator precedence on a compound condition; added explicit parentheses
20
+ - `Helper#zabon_translate` was detecting missing translations via a brittle string sniff (`include?("translation_missing")`); replaced with `I18n.exists?`
21
+ - `Helper#zabon_translate` called `strip_tags` via a module method that shadowed `ActionView::Helpers::SanitizeHelper#strip_tags` in the ancestor chain; inlined the `ActionView::Base.full_sanitizer` call directly
22
+ - `require "uri"` added before `require "action_view"` to fix a `NameError` on Ruby 3.1 where `URI` is not pre-loaded
23
+ - `Zabon::Helper` now works correctly when included in a plain controller or any non-`ActionView::Base` context
24
+
25
+ ### Changes
26
+
27
+ - Ruby 3 idioms adopted throughout: endless methods (`Segment`), `Hash#except` (`Helper`), `Struct keyword_init:` (`Configuration`)
28
+ - CI matrix updated to Ruby 3.1, 3.2, 3.3, 3.4
29
+ - Upstream copyright notice for mikan.js (trkbt10) added to `LICENSE.txt`
30
+ - `examples/` excluded from the gem package
31
+
32
+ ## 0.1.0
33
+
34
+ Initial release.
data/Gemfile ADDED
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gemspec
6
+
7
+ group :code_quality do
8
+ gem "bundler-audit"
9
+ gem "rubocop"
10
+ gem "rubocop-minitest"
11
+ gem "rubocop-performance"
12
+ gem "rubocop-rake"
13
+ gem "simplecov"
14
+ end
15
+
16
+ group :development do
17
+ gem "rake"
18
+ end
19
+
20
+ group :test do
21
+ gem "minitest"
22
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,25 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2022 Joesi
4
+
5
+ Portions of this software are derived from mikan.js
6
+ (https://github.com/trkbt10/mikan.js), Copyright (c) trkbt10,
7
+ used under the MIT License.
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in
17
+ all copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,178 @@
1
+ # zabon.ruby 🍊
2
+
3
+ A Ruby gem / Rails helper for dealing with Japanese line-breaking logic. It is basically a port of [mikan.js](https://github.com/trkbt10/mikan.js), which implements a regular expression based algorithm to segment text into semantic chunks. No machine learning needed 🤖☺️. In addition the resulting text segments can be wrapped in a configurable HTML tag. All praise 👏👏👏 for the algorithm goes to [trkbt10](https://github.com/trkbt10).
4
+
5
+ ## Usage
6
+ ``` ruby
7
+ # split this sentence
8
+ Zabon.split('この文を分割する')
9
+ => ["この", "文を", "分割する"]
10
+
11
+ ```
12
+
13
+ ## Configuration
14
+
15
+ Configuration is used for the tag that the results can be wrapped in. It's making heavy use of Rails tag helpers.
16
+ E.g. put this in an initializer in your Rails app.
17
+
18
+ ``` ruby
19
+ Zabon.configure do |config|
20
+ config.tag = :div # default: :span
21
+ config.tag_options = { class: 'zabon_trara', style: 'font_size: 5em' } # default: { class: 'zabon', style: 'display: inline-block' }
22
+ config.strip_tags = false # default true
23
+ end
24
+ ```
25
+
26
+ ### Rails
27
+
28
+ The gem ships a Railtie that automatically includes `Zabon::Helper` into `ActionView::Base`, so `zabon_translate` is available in all views without any further setup.
29
+
30
+ Call `zabon_translate` directly in views for strings that need segmentation:
31
+
32
+ ```erb
33
+ <%= zabon_translate("page.title") %>
34
+ ```
35
+
36
+ To replace the standard `t()` globally, add the following to an initializer. Note that this affects **all** ActionView translation calls — prefer explicit `zabon_translate` for finer control.
37
+
38
+ ```ruby
39
+ # config/initializers/zabon.rb
40
+ module ActionView
41
+ module Helpers
42
+ module TranslationHelper
43
+ alias_method :translate_without_zabon, :translate
44
+
45
+ def translate(key, **options)
46
+ zabon_translate(key, orig_translate: :translate_without_zabon, **options)
47
+ end
48
+
49
+ alias t translate
50
+ end
51
+ end
52
+ end
53
+ ```
54
+
55
+ ## Japanese grammar 🇯🇵
56
+
57
+ Just enough Japanese to understand the algorithm :)
58
+
59
+ ### Writing system ✍️
60
+
61
+ The Japanese writing system uses for different components:
62
+
63
+ * [Hiragana (ひらがな)](https://en.wikipedia.org/wiki/Hiragana), a syllabary alphabet used for Japanese words not covered by kanji and mostly for grammatical inflections
64
+ * [Katakana (カタカナ)](https://en.wikipedia.org/wiki/Katakana), a syllabary alphabet used for transcription of foreign-language words into Japanese; for emphasis; [onomatopoeia](https://en.wikipedia.org/wiki/Onomatopoeia); for scientific terms and often Japanese companies.
65
+ * [Kanji (漢字)](https://en.wikipedia.org/wiki/Kanji), a set of Chinese characters directly incorporated into the written Japanese language, often with Japanese pronunciation, which can have multiple readings
66
+ * [Romaji](https://en.wikipedia.org/wiki/Romanization_of_Japanese), use of Latin script in Japanese language
67
+
68
+ ### Particles
69
+
70
+ [Joshi (助詞)](https://en.wikipedia.org/wiki/Japanese_particles), Japanese particles written in Hiragana, are suffixes or short words that follow a modified noun, verb, adjective, or sentence. Their grammatical range can indicate various meanings and functions:
71
+
72
+ * case markers
73
+ * parallel markers
74
+ * sentence ending particles
75
+ * interjectory particles
76
+ * adverbial particles
77
+ * binding particles
78
+ * conjunctive particles
79
+ * phrasal particles
80
+
81
+ ### Line breaking
82
+
83
+ Certain characters in Japanese should not come at the end of a line, certain characters should not come at the start of a line, and some characters should never be split up across two lines. These rules are called [Kinsoku Shori 禁則処理](https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages#Line_breaking_rules_in_Japanese_text_(Kinsoku_Shori)):
84
+
85
+ simplified:
86
+
87
+ | Class | Can't begin a line | Can't finish a line |
88
+ |-------|--------------------|---------------------|
89
+ | small _kana_ | ぁぃぅぇぉっ... | |
90
+ | parentheses | )〉》】... | (〈《【... |
91
+ | quotations | 」』”... | 「『“... |
92
+ | punctuation | 、。・!?... | |
93
+
94
+ ### Text segmentation
95
+
96
+ Written Japanese uses no spaces and little punctuation to delimit words. Readers instead depend on grammatical cues (e.g. particles and verb endings), the relative frequency of character combinations, and semantic context, in order to determine what words have been written. This is a non-trivial problem which is often solved by applying machine learning algorithms. Without a careful approach, breaks can occur randomly and usually in the middle of a word. This is an issue with typography on the web and results in a degradation of readability.
97
+
98
+ ### Zabon ???
99
+
100
+ I made a couple of assumptions when choosing the name:
101
+ 1. 🍊 The original algorithm name **Mikan** might be a transcription of 蜜柑, a Japanese citrus fruit (Mandarin, Satsuma)
102
+ 2. There already is a gem called [mikan](https://rubygems.org/gems/mikan), didn't want to go for **mikan_ruby** or similar b/c of autoloading
103
+ 3. 🍇 My guess is the original author chose this name, b/c he was searching for something simpler than Google's **Budou** (葡萄)
104
+ 4. 🔪 Both fruits have in common, that they can be easily split apart in segments
105
+ 5. So I was searching for another fruit that can be easily split apart, and what can be split apart better than a Pomelo (文旦, ぶんたん) - **Zabon** (derived from Portuguese: zamboa)
106
+
107
+ Who knows if that's how it was 🤷🏻‍♂️😂.
108
+
109
+ ## The Algorithm
110
+
111
+ This algorithm does NOT find the most minimal segmentation of unbreakable text segments and probably will have problems if a text is solely written in one alphabet. It also does not support Furigana (yet). It does basic text segmentation and stitches the segments back together in segments which can be made unbreakable. The unbreakability we achieve by wrapping them in a <span> tag with certain CSS rules.
112
+
113
+ ### Splitting
114
+
115
+ 1. Split text across different alphabets used: split text into parts that are written in Kanjis, Hiragana, Katakana, Latin (incl. double width characters). The assumption here is that parts written in the same script should belong together.
116
+
117
+ 2. Then split up each element further by splitting up particles or sequences that might be used as particles. The original author of the algorithm has identified the following list (でなければ, について, かしら, くらい, けれど, なのか, ばかり, ながら, ことよ, こそ, こと, さえ, しか, した, たり, だけ, だに, だの, つつ, ても, てよ, でも, とも, から, など, なり, ので, のに, ほど, まで, もの, やら, より, って, で, と, な, に, ね, の, も, は, ば, へ, や, わ, を, か, が, さ, し, ぞ, て). To me that looks about right, but maybe some are missing.
118
+
119
+ 3. Split along further by splitting up brackets and quotations: ([,〈,《,「,『,「,【,〔,〚,〖,〘,❮,❬,❪,❨,(,<,{,❲,❰,{,❴,] + the matching end brackets and quotations.
120
+
121
+ ### Stitching
122
+
123
+ 1. Now we have a list of minimal segments and try to stitch them back together in a result set, so that they will fulfil Japanese line breaking rules. We are gonna look at tuples from left to right, looking at the current segment and the previous segment.
124
+
125
+ 2. If the current segment is a beginning bracket or quotation; we look at the next segment, we have a definitive start of an unbreakable segment.
126
+
127
+ 3. If the current segment is an ending bracket or quotation; we append to the last entry of the result set and don't look back anymore; we've reached the end of a segment and start a new one with the next iteration.
128
+
129
+ 4. If the previous segment is a beginning bracket; we stitch it together with the current segment to become a new segment. In the next iteration we don’t need to look at the previous segment anymore and continue.
130
+
131
+ 5. If the current segment is a particle or a punctuation mark and we are not looking back (see step 7.); we append the current segment to the last entry of the result set.
132
+
133
+ 6. If the current segment is a particle or a punctuation mark or if the previous segment is not a bracket, quotation or punctuation mark or a conjunctive particle (と, の, に) and the current segment is in Hiragana; we append to the last entry of the result set.
134
+
135
+ 7. If no condition from stitching steps 2–6 matches, we can safely add the current segment to the result set.
136
+
137
+ ## Other solutions
138
+ ### [Google Budou](https://github.com/google/budou)
139
+
140
+ Budou is a Python library, which uses word segmenters to analyze input sentences. It can concatenate words into meaningful chunks utilizing part-of-speech tagging and other syntactic information. Processed chunks are wrapped in a SPAN tag. Depending on the text segmentation algorithm used, it also has support for Chinese & Korean. Since this library is written in Python, it cannot simply be used in Ruby, PHP, or Node.js.
141
+
142
+ #### Text segmenter backends
143
+ You can choose different segmenter backends depending on the needs of your environment. Currently, the segmenters below are supported.
144
+
145
+ * [Google Cloud Natural Language API](https://cloud.google.com/natural-language/): external API calls, can be costly
146
+ * [MeCab](https://taku910.github.io/mecab/): Japanese POS tagger & morphological analyzer with lots of language bindings, e.g. also used in Google Japanese Input and Japanese Input on Mac OS X
147
+ * [TinySegmenter](http://chasen.org/~taku/software/TinySegmenter/): extremely compact word separation algorithm in Javascript which produces MeCab compatible word separation without depending on external APIs, no dictionaries, classifies input
148
+
149
+ [TinySegmenter](http://chasen.org/~taku/software/TinySegmenter/) is an extremely compact word separation algorithm in Javascript which produces MeCab compatible word separation without depending on external APIs. It classifies the input by using entities like characters, N-Grams, Hiragana, Katakana (Japanese phonetic lettering system / syllabaries) and their combinations as features to determine whether a character is preceded by a word boundary. A [Naive Bayes](https://towardsdatascience.com/naive-bayes-explained-9d2b96f4a9c0) model was trained using the [RWCP corpus](http://research.nii.ac.jp/src/en/list.html) and to make that model even more compact Boosting was used for [L1 norm regularization](https://blog.mlreview.com/l1-norm-regularization-and-sparsity-explained-for-dummies-5b0e4be3938a). Basically it compresses the model and gets rid of redundant features as much as possible.
150
+
151
+ ### CSS `line-break: strict`
152
+
153
+ Worth knowing: CSS has had native support for some Kinsoku Shori rules for a while now.
154
+
155
+ ```css
156
+ p {
157
+ line-break: strict;
158
+ overflow-wrap: break-word;
159
+ }
160
+ ```
161
+
162
+ `line-break: strict` applies character-level Unicode line-breaking rules, which covers small kana, prolonged sound marks, and common punctuation cases. Browser implementations are not perfectly consistent and the spec intentionally leaves the precise rule set up to the user agent, so you may see subtle differences across browsers.
163
+
164
+ zabon takes a fundamentally different approach. Instead of telling the browser where not to break, it wraps each segment in a `display: inline-block` element that the browser cannot split internally. This gives you precise semantic grouping that works the same way in every browser, and also opens up per-segment styling like hover effects, search highlighting, or animations. The trade-off is server-side processing and extra markup.
165
+
166
+ If basic punctuation and small-kana rules are all you need, `line-break: strict` might be enough and has zero runtime cost. If you need guaranteed atomic grouping or per-segment control, zabon is the better fit.
167
+
168
+ ## Resources
169
+
170
+ * [Regular Expressions for Japanese characters](https://gist.github.com/terrancesnyder/1345094)
171
+ * [Word breaking in Japanese is Hard](https://docs.microsoft.com/en-us/archive/blogs/jonasbar/word-breaking-japanese-is-hard)
172
+ * [mikan.sharp](https://github.com/YoungjaeKim/mikan.sharp)
173
+ * [mikan.php](https://github.com/sters/mikan.php)
174
+ * [Kinsoku - Japanese line breaking rules for LaTeX](https://github.com/jamesohortle/kinsoku)
175
+ * [Kuromoji - Japanese morphological analyzer written in Java](https://www.atilika.org/)
176
+ * [WrapText CJK - line breaking rules in Lua](https://github.com/subsoap/wraptext)
177
+ * [TinySegmenter - Ruby Port](https://github.com/6/tiny_segmenter)
178
+ * [How to Pomelo](https://github.com/dingsdax/zabon/wiki/How-to-Pomelo!)
data/Rakefile ADDED
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "bundler/audit/task"
5
+ require "rake/testtask"
6
+ require "rubocop/rake_task"
7
+
8
+ Bundler::Audit::Task.new
9
+ RuboCop::RakeTask.new
10
+
11
+ Rake::TestTask.new(:test) do |t|
12
+ t.libs << "test"
13
+ t.libs << "lib"
14
+ t.warning = false
15
+ t.verbose = true
16
+ t.test_files = FileList["test/**/*_test.rb"]
17
+ end
18
+
19
+ desc "Run code quality checks"
20
+ task code_quality: %i[bundle:audit rubocop]
21
+
22
+ task default: %i[code_quality test]
data/bin/console ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "zabon"
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ require "irb"
11
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Zabon
4
+ class Analyzer
5
+ class << self
6
+ def split(text)
7
+ text.split(KEYWORDS)
8
+ .flat_map { |segment| segment.split(JOSHI) }
9
+ .flat_map { |segment| segment.split(BRACKETS_BEGIN) }
10
+ .flat_map { |segment| segment.split(BRACKETS_END) }
11
+ .flatten.reject(&:empty?)
12
+ end
13
+
14
+ def segments(text)
15
+ result = [""] # we do this, so we can += to append to the last result item, without checking for nil
16
+ previous_segment = nil
17
+
18
+ split(text).each do |segment|
19
+ current_segment = Segment.new(segment)
20
+
21
+ # if the current segment is a beginning bracket => we look further
22
+ if current_segment.bracket_begin?
23
+ previous_segment = current_segment
24
+ next
25
+ end
26
+
27
+ # if the current segment is an ending bracket =>
28
+ # we append to the last entry of the result set and don't look back anymore,
29
+ # we've reached the end of a segment and start a new one with the next iteration
30
+ if current_segment.bracket_end?
31
+ result[-1] += current_segment
32
+ previous_segment = nil
33
+ next
34
+ end
35
+
36
+ # if the previous segment is a beginning bracket =>
37
+ # we stitch together previous segment & current segment to become a new segment
38
+ # we don't look back anymore
39
+ if previous_segment&.bracket_begin?
40
+ current_segment = Segment.new(previous_segment + current_segment)
41
+ previous_segment = nil
42
+ end
43
+
44
+ # if we are not at the start, the current segment is a particle or a period and
45
+ # we are not looking back, we append to the last entry of the result set
46
+ if result.size > 1 && current_segment.joshi_or_period? && previous_segment.nil?
47
+ result[-1] += current_segment
48
+ previous_segment = current_segment
49
+ next
50
+ end
51
+
52
+ # if we are not at the start, the current segment is a particle or a period or
53
+ # the previous segment is not a bracket or period or a conjunctive particle and the current segment is hiragana
54
+ # we append to the last entry of the result set
55
+ if (result.size > 2 && current_segment.joshi_or_period?) || (previous_segment&.keyword? && current_segment.hiragana? && !/^[とのに]$/.match?(previous_segment))
56
+ result[-1] += current_segment
57
+ # if the current segment is not a particle, we are no looking back anymore, we start a new segment
58
+ previous_segment = current_segment.joshi? ? current_segment : nil
59
+ next
60
+ end
61
+
62
+ # no stitching left, append the current segment to the result set
63
+ result << current_segment
64
+ previous_segment = current_segment
65
+ end
66
+
67
+ result.reject(&:empty?) # we clear out any possible blank strings in the result set
68
+ end
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Zabon
4
+ Configuration = Struct.new(:tag, :tag_options, :strip_tags, keyword_init: true) do
5
+ def initialize(tag: :span, tag_options: { class: "zabon", style: "display: inline-block" }, strip_tags: true)
6
+ super
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Zabon
4
+ # Joshi (助詞), Japanese particles written in Hiragana, are suffixes or short words that follow a modified noun, verb, adjective, or sentence.
5
+ # Some particles can appear in two types. They give pretty reliable cues depending on the following character, whether a line break is allowed or not.
6
+ JOSHI = /
7
+ (でなければ|について|かしら|くらい|けれど|なのか|ばかり|ながら|ことよ|こそ|こと|さえ|しか|した|たり|だけ|だに|だの|つつ|ても|てよ|でも|
8
+ とも|から|など|なりので|ので|のに|ほど|まで|もの|やら|より|って|で|と|な|に|ね|の|も|は|ば|へ|や|わ|を|か|が|さ|し|ぞ|て)
9
+ /x
10
+
11
+ # A simple way to find word segmentations in Japanese is
12
+ # to tokenise by grouping characters continuously by script (Hiragana, Katakana, Kanji, Romaji)
13
+ #
14
+ # The following regular expression matches in this order:
15
+ # * non breaking space
16
+ # * domains
17
+ # * any Japanese Kanji or Chinese character
18
+ # * Hiragana (+ chisai kana)
19
+ # * Katakana (+ chisai kana)
20
+ # * Latin
21
+ # * Latin (double width)
22
+ KEYWORDS = /
23
+ (\&nbsp;|
24
+ [a-zA-Z0-9]+\.[a-z]{2,}|
25
+ [一-龠々〆ヵヶゝ]+|
26
+ [ぁ-んゝ]+|
27
+ [ァ-ヴー]+|
28
+ [a-zA-Z0-9]+|
29
+ [a-zA-Z0-9]+)
30
+ /x
31
+
32
+ # Brackets & Quotations
33
+ BRACKETS_BEGIN = /([〈《「『「((\[【〔〚〖〘❮❬❪❨(<{❲❰{❴])/
34
+ BRACKETS_END = /([〉》」』」))\]】〕〗〙〛}>\)❩❫❭❯❱❳❵}])/
35
+
36
+ PERIODS = /([\.\,。、!\!?\?]+)$/
37
+
38
+ HIRAGANA = /[ぁ-んゝ]+/
39
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "uri"
4
+ require "action_view"
5
+
6
+ module Zabon
7
+ module Helper
8
+ include ActionView::Helpers::TagHelper
9
+
10
+ # can be used as a replacement for ActionView::Helpers::TranslationHelper.translate
11
+ # will use original translate method if locale is not :ja or translation is missing
12
+ # if not will split translation into semantic chunks wrap them into a configurable HTML tag
13
+ # and join again
14
+ def zabon_translate(key, **options)
15
+ orig_translate = options[:orig_translate] || :translate
16
+ translate_options = options.except(:orig_translate)
17
+
18
+ locale = (options[:locale] || I18n.locale || :en).to_sym
19
+
20
+ return public_send(orig_translate, key, **translate_options) if locale != :ja # if locale is not Japanese we use original method
21
+
22
+ return key.map { |k| zabon_translate(k, **options) } if key.is_a?(Array)
23
+
24
+ return public_send(orig_translate, key, **translate_options) unless I18n.exists?(key, locale: :ja)
25
+
26
+ orig_translation = public_send(orig_translate, key, **translate_options)
27
+
28
+ orig_translation = ActionView::Base.full_sanitizer.sanitize(orig_translation, tags: []) if Zabon.config.strip_tags
29
+
30
+ Sentry.set_context("zabon", { key: key, locale: locale }) if defined?(Sentry)
31
+
32
+ translation = Zabon.split(orig_translation).map do |segment|
33
+ content_tag(Zabon.config.tag, segment, Zabon.config.tag_options)
34
+ end.join.html_safe
35
+
36
+ block_given? ? yield(translation, key) : translation
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rails/railtie"
4
+ require "action_view"
5
+
6
+ module Zabon
7
+ class Railtie < Rails::Railtie
8
+ initializer "zabon.helper" do
9
+ ActionView::Base.include Zabon::Helper
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Zabon
4
+ class Segment < String
5
+ def hiragana? = @hiragana ||= HIRAGANA.match?(self)
6
+ def keyword? = @keyword ||= KEYWORDS.match?(self)
7
+ def bracket_begin? = @bracket_begin ||= BRACKETS_BEGIN.match?(self)
8
+ def bracket_end? = @bracket_end ||= BRACKETS_END.match?(self)
9
+ def joshi? = @joshi ||= JOSHI.match?(self)
10
+ def period? = @period ||= PERIODS.match?(self)
11
+ def joshi_or_period? = @joshi_or_period ||= joshi? || period?
12
+ end
13
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Zabon
4
+ VERSION = "0.2.0"
5
+ end
data/lib/zabon.rb ADDED
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "zabon/analyzer"
4
+ require "zabon/configuration"
5
+ require "zabon/constants"
6
+ require "zabon/helper"
7
+ require "zabon/segment"
8
+ require "zabon/version"
9
+ require "zabon/railtie" if defined?(Rails::Railtie)
10
+
11
+ module Zabon
12
+ class << self
13
+ def split(text)
14
+ Analyzer.segments(text)
15
+ end
16
+
17
+ def config
18
+ @config ||= Configuration.new
19
+ end
20
+
21
+ def configure
22
+ yield config
23
+ end
24
+
25
+ def reset_config!
26
+ @config = Configuration.new
27
+ end
28
+ end
29
+ end
data/zabon.gemspec ADDED
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/zabon/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "zabon"
7
+ spec.version = Zabon::VERSION
8
+ spec.authors = ["Johannes D."]
9
+ spec.email = ["dingsdax@fastmail.fm"]
10
+
11
+ spec.summary = "Japanese line breaking algorithm: Ruby port of mikan.js"
12
+ spec.description = "Splits up a (Japanese) string into semantic segment; wrap result in a HTML tag"
13
+ spec.license = "MIT"
14
+ spec.required_ruby_version = ">= 3.1"
15
+
16
+ spec.metadata["source_code_uri"] = "https://github.com/dingsdax/zabon.git"
17
+
18
+ # Specify which files should be added to the gem when it is released.
19
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
20
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
21
+ `git ls-files -z`.split("\x0").reject do |f|
22
+ (f == __FILE__) || f.match(%r{\A(?:(?:examples|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
23
+ end
24
+ end
25
+ spec.require_paths = ["lib"]
26
+
27
+ spec.add_dependency "actionview"
28
+ spec.add_dependency "railties"
29
+ end
metadata ADDED
@@ -0,0 +1,90 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: zabon
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Johannes D.
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: actionview
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '0'
26
+ - !ruby/object:Gem::Dependency
27
+ name: railties
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ description: Splits up a (Japanese) string into semantic segment; wrap result in a
41
+ HTML tag
42
+ email:
43
+ - dingsdax@fastmail.fm
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".bundler-audit.yml"
49
+ - ".rubocop.yml"
50
+ - ".ruby-gemset"
51
+ - ".ruby-version"
52
+ - AGENTS.md
53
+ - CHANGELOG.md
54
+ - Gemfile
55
+ - LICENSE.txt
56
+ - README.md
57
+ - Rakefile
58
+ - bin/console
59
+ - bin/setup
60
+ - lib/zabon.rb
61
+ - lib/zabon/analyzer.rb
62
+ - lib/zabon/configuration.rb
63
+ - lib/zabon/constants.rb
64
+ - lib/zabon/helper.rb
65
+ - lib/zabon/railtie.rb
66
+ - lib/zabon/segment.rb
67
+ - lib/zabon/version.rb
68
+ - zabon.gemspec
69
+ licenses:
70
+ - MIT
71
+ metadata:
72
+ source_code_uri: https://github.com/dingsdax/zabon.git
73
+ rdoc_options: []
74
+ require_paths:
75
+ - lib
76
+ required_ruby_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: '3.1'
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ requirements: []
87
+ rubygems_version: 3.7.1
88
+ specification_version: 4
89
+ summary: 'Japanese line breaking algorithm: Ruby port of mikan.js'
90
+ test_files: []