script_detector_2 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 76df1da987c76f07c1116e360f2686bcb177d5be42d5faf8e8898561d9975f7a
4
- data.tar.gz: 81e79124e3d9e1e9283891c12970a84ab09dd5b2f6aaa4c3e626e07f852d2696
3
+ metadata.gz: 9709f6c83f82a3bf7073bb8b9cfc95a1edea6885e0f596ef6e93f01a62c26dbb
4
+ data.tar.gz: 3ed583c8487617e9687b3c776116f0cc22140993ef9debf4f8542057e9f9f232
5
5
  SHA512:
6
- metadata.gz: 34d104036ce6fd3aa8140cfb659d745e1d786996b7139d489cd352c1a4cb835991a4e45a8634c28d6ba9cdc65fe39d3c3a3da3bddd19dd3ae799f741059eb484
7
- data.tar.gz: d7ddc49d2a0a835e1549632f7b0f7a07b6983b345798e491ed819f93a30c72ef6b90a0b1063105def6ee22f8defc3d92ebd7d19511d5b0cbc1ba29d55fc2fa29
6
+ metadata.gz: 5d17c19eee4868b540af844e08c95133415de8ccf0293dd208c11a29c14315b716c1563c45fd22081fbbf604147c837fa14bc98772a4103b2df080e4d09f89fa
7
+ data.tar.gz: 4a7894f23f49494f9debcafc967217ccb00549234014166f9fe4f1fe3d13b12b15fbe63eac426b164dfa763acbc9b4e15dfa8cd616de84720f1b2ea3249abfde
data/.rubocop.yml CHANGED
@@ -1,5 +1,9 @@
1
1
  inherit_from: .rubocop_todo.yml
2
2
 
3
+ inherit_mode:
4
+ merge:
5
+ - Exclude
6
+
3
7
  AllCops:
4
8
  TargetRubyVersion: 2.5
5
9
  SuggestExtensions: false
data/.rubocop_todo.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2021-08-22 13:16:05 UTC using RuboCop version 1.19.1.
3
+ # on 2021-10-11 07:56:31 UTC using RuboCop version 1.21.0.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
@@ -9,9 +9,9 @@
9
9
  # Offense count: 1
10
10
  # Configuration parameters: IgnoredMethods.
11
11
  Metrics/CyclomaticComplexity:
12
- Max: 8
12
+ Max: 10
13
13
 
14
- # Offense count: 1
14
+ # Offense count: 2
15
15
  # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
16
16
  Metrics/MethodLength:
17
17
  Max: 14
@@ -19,4 +19,4 @@ Metrics/MethodLength:
19
19
  # Offense count: 1
20
20
  # Configuration parameters: IgnoredMethods.
21
21
  Metrics/PerceivedComplexity:
22
- Max: 10
22
+ Max: 11
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.7.4
data/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.3.0] - 2021-10-13
4
+
5
+ - Add `kana?` and `hangul?` methods
6
+ - Improve accuracy of `identify_script` method
7
+ - `chinese?` method now returns actual boolean instead of merely something
8
+ truthy
9
+
3
10
  ## [0.2.0] - 2021-08-23
4
11
 
5
12
  - Slight optimization of script-matching regexps
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- script_detector_2 (0.2.0)
4
+ script_detector_2 (0.3.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -18,9 +18,9 @@ GEM
18
18
  kramdown-parser-gfm (1.1.0)
19
19
  kramdown (~> 2.0)
20
20
  minitest (5.14.4)
21
- nokogiri (1.12.3-x86_64-darwin)
21
+ nokogiri (1.12.5-x86_64-darwin)
22
22
  racc (~> 1.4)
23
- parallel (1.20.1)
23
+ parallel (1.21.0)
24
24
  parser (3.0.2.0)
25
25
  ast (~> 2.4.1)
26
26
  racc (1.5.2)
@@ -30,20 +30,20 @@ GEM
30
30
  reverse_markdown (2.0.0)
31
31
  nokogiri
32
32
  rexml (3.2.5)
33
- rubocop (1.19.1)
33
+ rubocop (1.22.1)
34
34
  parallel (~> 1.10)
35
35
  parser (>= 3.0.0.0)
36
36
  rainbow (>= 2.2.2, < 4.0)
37
37
  regexp_parser (>= 1.8, < 3.0)
38
38
  rexml
39
- rubocop-ast (>= 1.9.1, < 2.0)
39
+ rubocop-ast (>= 1.12.0, < 2.0)
40
40
  ruby-progressbar (~> 1.7)
41
41
  unicode-display_width (>= 1.4.0, < 3.0)
42
- rubocop-ast (1.10.0)
42
+ rubocop-ast (1.12.0)
43
43
  parser (>= 3.0.1.1)
44
44
  ruby-progressbar (1.11.0)
45
45
  rubyzip (2.3.2)
46
- solargraph (0.43.0)
46
+ solargraph (0.44.0)
47
47
  backport (~> 1.2)
48
48
  benchmark
49
49
  bundler (>= 1.17.2)
@@ -60,7 +60,7 @@ GEM
60
60
  yard (~> 0.9, >= 0.9.24)
61
61
  thor (1.1.0)
62
62
  tilt (2.0.10)
63
- unicode-display_width (2.0.0)
63
+ unicode-display_width (2.1.0)
64
64
  yard (0.9.26)
65
65
 
66
66
  PLATFORMS
@@ -76,4 +76,4 @@ DEPENDENCIES
76
76
  solargraph
77
77
 
78
78
  BUNDLED WITH
79
- 2.2.26
79
+ 2.2.29
data/README.md CHANGED
@@ -12,7 +12,7 @@ Unlike the original script_detector, this gem:
12
12
  - Uses the
13
13
  [kUnihanCore2020](https://www.unicode.org/reports/tr38/#kUnihanCore2020)
14
14
  property of the Unicode Unihan database to determine which characters belong
15
- to which script (Unicode 13)
15
+ to which script (Unicode 14)
16
16
  ([details](http://www.unicode.org/L2/L2019/19388-unihan-core-2020.pdf))
17
17
  - Uses [ISO 15924 script names](https://en.wikipedia.org/wiki/ISO_15924) in
18
18
  symbol form as return values (instead of English strings)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScriptDetector2
4
- VERSION = '0.2.0'
4
+ VERSION = '0.3.0'
5
5
  end
@@ -10,8 +10,8 @@ module ScriptDetector2
10
10
  # @param string [String]
11
11
  # @return [Boolean]
12
12
  def japanese?(string)
13
- return true if string =~ /[\p{Hiragana}\p{Katakana}]/
14
- return false if string =~ /\p{Hangul}/
13
+ return true if kana?(string)
14
+ return false if hangul?(string)
15
15
 
16
16
  kanji = string.scan(/\p{Han}/)
17
17
  return false unless kanji.any?
@@ -19,12 +19,18 @@ module ScriptDetector2
19
19
  kanji.all?(JAPANESE_PATTERN)
20
20
  end
21
21
 
22
+ # @param string [String]
23
+ # @return [Boolean]
24
+ def kana?(string)
25
+ /[\p{Hiragana}\p{Katakana}]/.match?(string)
26
+ end
27
+
22
28
  # @param string [String]
23
29
  # @return [Boolean]
24
30
  def chinese?(string)
25
31
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
26
32
 
27
- string =~ /\p{Han}/
33
+ /\p{Han}/.match?(string)
28
34
  end
29
35
 
30
36
  # @param string [String]
@@ -52,8 +58,8 @@ module ScriptDetector2
52
58
  # @param string [String]
53
59
  # @return [Boolean]
54
60
  def korean?(string)
55
- return true if string =~ /\p{Hangul}/
56
- return false if string =~ /[\p{Hiragana}\p{Katakana}]/
61
+ return true if hangul?(string)
62
+ return false if kana?(string)
57
63
 
58
64
  hanja = string.scan(/\p{Han}/)
59
65
  return false unless hanja.any?
@@ -61,19 +67,28 @@ module ScriptDetector2
61
67
  hanja.all?(KOREAN_PATTERN)
62
68
  end
63
69
 
70
+ # @param string [String]
71
+ # @return [Boolean]
72
+ def hangul?(string)
73
+ /\p{Hangul}/.match?(string)
74
+ end
75
+
64
76
  # @param string [String]
65
77
  # @return [Symbol]
66
78
  def identify_script(string)
67
- return :Jpan if japanese?(string)
68
- return :Kore if korean?(string)
79
+ return :Jpan if kana?(string)
80
+ return :Kore if hangul?(string)
69
81
 
70
82
  is_hant = traditional_chinese?(string)
71
83
  is_hans = simplified_chinese?(string)
72
84
  if is_hant && is_hans then :Hani
73
85
  elsif is_hans then :Hans
74
86
  elsif is_hant then :Hant
87
+ elsif japanese?(string) then :Jpan
88
+ elsif korean?(string) then :Kore
75
89
  elsif chinese?(string) then :Hani # rubocop:disable Lint/DuplicateBranch
76
- else :Zyyy
90
+ else
91
+ :Zyyy
77
92
  end
78
93
  end
79
94
  end
data/tasks/gen_src.rake CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  TMP_DIR = 'tmp'
4
4
  UNIHAN_ZIP = File.join(TMP_DIR, 'Unihan.zip')
5
- UNIHAN_URL = 'https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip'
5
+ UNIHAN_URL = 'https://www.unicode.org/Public/14.0.0/ucd/Unihan.zip'
6
6
 
7
7
  directory TMP_DIR
8
8
 
data/tasks/unihan.rb CHANGED
@@ -5,15 +5,20 @@ module Unihan
5
5
  CODEPOINT_PATTERN = /U\+(?<hex>[A-F0-9]+)/.freeze
6
6
 
7
7
  class << self
8
- # @param readings_data [Hash<Integer,Hash{String => String}>]
8
+ # @param dict_data [Hash<Integer,Hash{String => String}>]
9
9
  # @param tags [Array<String>]
10
10
  # @return [Regexp]
11
11
  def gen_unihan_core_pattern(dict_data, *tags)
12
- codepoints = dict_data.select do |_, data|
12
+ gen_pattern(codepoints_for_tags(dict_data, tags))
13
+ end
14
+
15
+ # @param dict_data [Hash<Integer,Hash{String => String}>]
16
+ # @param tags [Array<String>]
17
+ # @return [Array<Integer>]
18
+ def codepoints_for_tags(dict_data, tags)
19
+ dict_data.select do |_, data|
13
20
  tags.all? { |t| data['kUnihanCore2020']&.include?(t) }
14
21
  end.keys
15
-
16
- gen_pattern(codepoints)
17
22
  end
18
23
 
19
24
  # @param codepoints [Array<Integer>]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: script_detector_2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Madlon-Kay
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-08-23 00:00:00.000000000 Z
11
+ date: 2021-10-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: byebug
@@ -76,6 +76,7 @@ files:
76
76
  - ".dir-locals.el"
77
77
  - ".rubocop.yml"
78
78
  - ".rubocop_todo.yml"
79
+ - ".ruby-version"
79
80
  - ".solargraph.yml"
80
81
  - CHANGELOG.md
81
82
  - Gemfile