neologdish-normalizer 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/README.md +3 -5
- data/Rakefile +5 -0
- data/lib/neologdish/normalizer/version.rb +1 -1
- data/lib/neologdish/normalizer.rb +6 -3
- data/renovate.json +6 -0
- data/sig/generated/neologdish/normalizer.rbs +5 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2aa7f75228455a5870d56e6f8b74dd74dcb9f8c0619bb86eda89e02518478afd
|
4
|
+
data.tar.gz: b92a3042b6f7c3ad7b2653bb7f04643e2769733104ee78e1c689170776729b53
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4479aa95d7ba3d2aca68feecf5a4674dc5af9f71c795ac748bcb6e38558b7d9e73c4d068b071441c020b96a4765b58224ad0328fe4856aafcf934bf6eb4d6e99
|
7
|
+
data.tar.gz: fb794e40955c4460bf592fac2f9baf415bb9c486ff07a288fc9b3a083bb8256a98d949291b497fd36b2e846ea083dee3925afd8001ae6a7ce90e4536badf6836
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
3.3
|
1
|
+
3.3.5
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Neologdish::Normalizer for Ruby
|
1
|
+
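# Neologdish::Normalizer for Ruby [![Check](https://github.com/moznion/neologdish-normalizer-ruby/actions/workflows/check.yml/badge.svg)](https://github.com/moznion/neologdish-normalizer-ruby/actions/workflows/check.yml) [![Gem Version](https://badge.fury.io/rb/neologdish-normalizer.svg)](https://badge.fury.io/rb/neologdish-normalizer)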
|
2
2
|
|
3
3
|
A Japanese text normalization library for Ruby follows the conventions of [neologd/mecab-ipadic-neologd](https://github.com/neologd/mecab-ipadic-neologd), with some performance optimizations, without external dependencies. It is designed to preprocess Japanese text before applying NLP techniques.
|
4
4
|
|
@@ -27,18 +27,16 @@ The benchmark script is here: [./scripts/benchmark.rb](./scripts/benchmark.rb)
|
|
27
27
|
|
28
28
|
## Installation
|
29
29
|
|
30
|
-
TODO: Replace `UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG` with your gem name right after releasing it to RubyGems.org. Please do not do it earlier due to security reasons. Alternatively, replace this section with instructions to install your gem from git if you don't plan to release to RubyGems.org.
|
31
|
-
|
32
30
|
Install the gem and add to the application's Gemfile by executing:
|
33
31
|
|
34
32
|
```bash
|
35
|
-
bundle add UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG
|
33
|
+
bundle add 'neologdish-normalizer'
|
36
34
|
```
|
37
35
|
|
38
36
|
If bundler is not being used to manage dependencies, install the gem by executing:
|
39
37
|
|
40
38
|
```bash
|
41
|
-
gem install UPDATE_WITH_YOUR_GEM_NAME_IMMEDIATELY_AFTER_RELEASE_TO_RUBYGEMS_ORG
|
39
|
+
gem install 'neologdish-normalizer'
|
42
40
|
```
|
43
41
|
|
44
42
|
## Development
|
data/lib/neologdish/normalizer.rb
CHANGED
@@ -69,15 +69,18 @@ module Neologdish
|
|
69
69
|
# Normalize the given text.
|
70
70
|
#
|
71
71
|
# @rbs str: String
|
72
|
+
# @rbs override_conversion_map: Hash[String, String]
|
72
73
|
# @rbs return: String
|
73
|
-
def normalize(str)
|
74
|
+
def normalize(str, override_conversion_map = {})
|
75
|
+
conversion_map = CONVERSION_MAP.merge(override_conversion_map)
|
76
|
+
|
74
77
|
squeezee = ''
|
75
78
|
prev_latin = false
|
76
79
|
whitespace_encountered = false
|
77
80
|
encountered_half_width_kana = nil
|
78
81
|
normalized = str.chars.map do |c|
|
79
82
|
prefix = ''
|
80
|
-
c = CONVERSION_MAP[c] || c
|
83
|
+
c = conversion_map[c] || c
|
81
84
|
|
82
85
|
# normalize the Half-width kana to full-width
|
83
86
|
if encountered_half_width_kana
|
@@ -112,7 +115,7 @@ module Neologdish
|
|
112
115
|
c = ''
|
113
116
|
else
|
114
117
|
prefix = ' ' if is_latin && whitespace_encountered
|
115
|
-
whitespace_encountered
|
118
|
+
whitespace_encountered &&= c == '' # take care for consecutive spaces on the right side
|
116
119
|
end
|
117
120
|
prev_latin = is_latin
|
118
121
|
|
data/sig/generated/neologdish/normalizer.rbs
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# Generated from lib/neologdish/normalizer.rb with RBS::Inline
|
2
2
|
|
3
3
|
module Neologdish
|
4
|
+
# A Japanese text normalizer module according to the neologd convention.
|
4
5
|
module Normalizer
|
5
6
|
CONVERSION_MAP: Hash[String, String]
|
6
7
|
|
@@ -12,8 +13,11 @@ module Neologdish
|
|
12
13
|
|
13
14
|
HANDAKUON_KANA_MAP: Hash[String, String]
|
14
15
|
|
16
|
+
# Normalize the given text.
|
17
|
+
#
|
15
18
|
# @rbs str: String
|
19
|
+
# @rbs override_conversion_map: Hash[String, String]
|
16
20
|
# @rbs return: String
|
17
|
-
def normalize: (String str) -> String
|
21
|
+
def normalize: (String str, ?Hash[String, String] override_conversion_map) -> String
|
18
22
|
end
|
19
23
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: neologdish-normalizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- moznion
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-29 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A Japanese text normalization library follows the conventions of neologd
|
14
14
|
with some performance optimizations. It is designed to preprocess Japanese text
|
@@ -26,6 +26,7 @@ files:
|
|
26
26
|
- Rakefile
|
27
27
|
- lib/neologdish/normalizer.rb
|
28
28
|
- lib/neologdish/normalizer/version.rb
|
29
|
+
- renovate.json
|
29
30
|
- sig/generated/neologdish/normalizer.rbs
|
30
31
|
- sig/generated/neologdish/normalizer/version.rbs
|
31
32
|
homepage: https://github.com/moznion/neologdish-normalizer
|
@@ -50,7 +51,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
50
51
|
- !ruby/object:Gem::Version
|
51
52
|
version: '0'
|
52
53
|
requirements: []
|
53
|
-
rubygems_version: 3.5.
|
54
|
+
rubygems_version: 3.5.16
|
54
55
|
signing_key:
|
55
56
|
specification_version: 4
|
56
57
|
summary: A Japanese text normalization library follows the conventions of neologd
|