script_detector_2 0.1.0 → 0.4.0
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +4 -0
- data/.rubocop_todo.yml +17 -2
- data/.ruby-version +1 -0
- data/CHANGELOG.md +21 -0
- data/Gemfile.lock +12 -12
- data/README.md +11 -1
- data/lib/script_detector_2/patterns.gen.rb +4 -4
- data/lib/script_detector_2/version.rb +1 -1
- data/lib/script_detector_2.rb +82 -14
- data/script_detector_2.gemspec +1 -0
- data/tasks/gen_src.rake +1 -1
- data/tasks/unihan.rb +12 -5
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37ab1716845b98ca15a072e67cd900f5854dadd3c3641667fb5887c1626c4a85
|
4
|
+
data.tar.gz: 5142f341e40601f3d1fff211ff1fa566047f6353d0d3f6bc5062653827e759db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 35b1771a4d8898d6ca02f38b4a016adc927600e7456b0a41ae515f88e85c846d8a41951ce78fa12c7a2851fc596a13ab48a3537dd97ff4579e8cd45d19b50ccf
|
7
|
+
data.tar.gz: ea6da32d1c4b9a10a4c04208fda4bee7bc6bad7d8ffa635f163c608e256160b4115fe3c1a8f4e7c9ec8b65f435ddca1fcb7af9f857a82042edd11457685d7424
|
data/.rubocop.yml
CHANGED
data/.rubocop_todo.yml
CHANGED
@@ -1,12 +1,27 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on 2021-
|
3
|
+
# on 2021-11-24 02:35:34 UTC using RuboCop version 1.23.0.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
8
8
|
|
9
|
-
# Offense count:
|
9
|
+
# Offense count: 2
|
10
|
+
# Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
|
11
|
+
Metrics/AbcSize:
|
12
|
+
Max: 20
|
13
|
+
|
14
|
+
# Offense count: 2
|
15
|
+
# Configuration parameters: IgnoredMethods.
|
16
|
+
Metrics/CyclomaticComplexity:
|
17
|
+
Max: 13
|
18
|
+
|
19
|
+
# Offense count: 2
|
10
20
|
# Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
|
11
21
|
Metrics/MethodLength:
|
22
|
+
Max: 15
|
23
|
+
|
24
|
+
# Offense count: 1
|
25
|
+
# Configuration parameters: IgnoredMethods.
|
26
|
+
Metrics/PerceivedComplexity:
|
12
27
|
Max: 14
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.7.4
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,26 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
|
3
|
+
## [0.4.0] - 2021-11-24
|
4
|
+
|
5
|
+
- Add `identify_scripts` method
|
6
|
+
- Improve accuracy of `identify_script` method
|
7
|
+
|
8
|
+
## [0.3.0] - 2021-10-13
|
9
|
+
|
10
|
+
- Add `kana?` and `hangul?` methods
|
11
|
+
- Improve accuracy of `identify_script` method
|
12
|
+
- `chinese?` method now returns actual boolean instead of merely something
|
13
|
+
truthy
|
14
|
+
|
15
|
+
## [0.2.0] - 2021-08-23
|
16
|
+
|
17
|
+
- Slight optimization of script-matching regexps
|
18
|
+
- Script-matching regexps now match against entire string
|
19
|
+
|
20
|
+
## [0.1.1] - 2021-08-21
|
21
|
+
|
22
|
+
- Improve identification of ambiguous Chinese
|
23
|
+
|
3
24
|
## [0.1.0] - 2021-08-21
|
4
25
|
|
5
26
|
- Initial release
|
data/Gemfile.lock
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
script_detector_2 (0.1.0)
|
4
|
+
script_detector_2 (0.4.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
ast (2.4.2)
|
10
10
|
backport (1.2.0)
|
11
|
-
benchmark (0.
|
11
|
+
benchmark (0.2.0)
|
12
12
|
byebug (11.1.3)
|
13
13
|
diff-lcs (1.4.4)
|
14
14
|
e2mmap (0.1.0)
|
@@ -18,32 +18,32 @@ GEM
|
|
18
18
|
kramdown-parser-gfm (1.1.0)
|
19
19
|
kramdown (~> 2.0)
|
20
20
|
minitest (5.14.4)
|
21
|
-
nokogiri (1.12.
|
21
|
+
nokogiri (1.12.5-x86_64-darwin)
|
22
22
|
racc (~> 1.4)
|
23
|
-
parallel (1.
|
23
|
+
parallel (1.21.0)
|
24
24
|
parser (3.0.2.0)
|
25
25
|
ast (~> 2.4.1)
|
26
|
-
racc (1.
|
26
|
+
racc (1.6.0)
|
27
27
|
rainbow (3.0.0)
|
28
28
|
rake (13.0.6)
|
29
29
|
regexp_parser (2.1.1)
|
30
|
-
reverse_markdown (2.
|
30
|
+
reverse_markdown (2.1.1)
|
31
31
|
nokogiri
|
32
32
|
rexml (3.2.5)
|
33
|
-
rubocop (1.
|
33
|
+
rubocop (1.23.0)
|
34
34
|
parallel (~> 1.10)
|
35
35
|
parser (>= 3.0.0.0)
|
36
36
|
rainbow (>= 2.2.2, < 4.0)
|
37
37
|
regexp_parser (>= 1.8, < 3.0)
|
38
38
|
rexml
|
39
|
-
rubocop-ast (>= 1.
|
39
|
+
rubocop-ast (>= 1.12.0, < 2.0)
|
40
40
|
ruby-progressbar (~> 1.7)
|
41
41
|
unicode-display_width (>= 1.4.0, < 3.0)
|
42
|
-
rubocop-ast (1.
|
42
|
+
rubocop-ast (1.13.0)
|
43
43
|
parser (>= 3.0.1.1)
|
44
44
|
ruby-progressbar (1.11.0)
|
45
45
|
rubyzip (2.3.2)
|
46
|
-
solargraph (0.
|
46
|
+
solargraph (0.44.2)
|
47
47
|
backport (~> 1.2)
|
48
48
|
benchmark
|
49
49
|
bundler (>= 1.17.2)
|
@@ -60,7 +60,7 @@ GEM
|
|
60
60
|
yard (~> 0.9, >= 0.9.24)
|
61
61
|
thor (1.1.0)
|
62
62
|
tilt (2.0.10)
|
63
|
-
unicode-display_width (2.
|
63
|
+
unicode-display_width (2.1.0)
|
64
64
|
yard (0.9.26)
|
65
65
|
|
66
66
|
PLATFORMS
|
@@ -76,4 +76,4 @@ DEPENDENCIES
|
|
76
76
|
solargraph
|
77
77
|
|
78
78
|
BUNDLED WITH
|
79
|
-
2.2.
|
79
|
+
2.2.32
|
data/README.md
CHANGED
@@ -12,7 +12,7 @@ Unlike the original script_detector, this gem:
|
|
12
12
|
- Uses the
|
13
13
|
[kUnihanCore2020](https://www.unicode.org/reports/tr38/#kUnihanCore2020)
|
14
14
|
property of the Unicode Unihan database to determine which characters belong
|
15
|
-
to which script (Unicode
|
15
|
+
to which script (Unicode 14)
|
16
16
|
([details](http://www.unicode.org/L2/L2019/19388-unihan-core-2020.pdf))
|
17
17
|
- Uses [ISO 15924 script names](https://en.wikipedia.org/wiki/ISO_15924) in
|
18
18
|
symbol form as return values (instead of English strings)
|
@@ -42,6 +42,7 @@ The main detection methods are:
|
|
42
42
|
- `ScriptDetector2.simplified_chinese?`
|
43
43
|
- `ScriptDetector2.traditional_chinese?`
|
44
44
|
- `ScriptDetector2.identify_script`
|
45
|
+
- `ScriptDetector2.identify_scripts`
|
45
46
|
|
46
47
|
Regexp patterns are used to identify the script to which Han characters belong.
|
47
48
|
These can be used directly as well:
|
@@ -55,6 +56,15 @@ These can be used directly as well:
|
|
55
56
|
- `ScriptDetector2::KOREAN_PATTERN`: matches all Han characters in the
|
56
57
|
kUnihanCore2020 set marked as ROK (K) or DPRK (P)
|
57
58
|
|
59
|
+
Each of the above patterns matches an entire string containing only Han
|
60
|
+
characters of the indicated script, i.e.
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
ScriptDetector2::JAPANESE_PATTERN.match?('日本語') # => true
|
64
|
+
ScriptDetector2::JAPANESE_PATTERN.match?('你好') # => false
|
65
|
+
ScriptDetector2::JAPANESE_PATTERN.match?('Hello 日本語') # => false
|
66
|
+
```
|
67
|
+
|
58
68
|
To recreate the script_detector gem's extension of the String class, use the
|
59
69
|
supplied refinement like so:
|
60
70
|
|