script_detector_2 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9709f6c83f82a3bf7073bb8b9cfc95a1edea6885e0f596ef6e93f01a62c26dbb
4
- data.tar.gz: 3ed583c8487617e9687b3c776116f0cc22140993ef9debf4f8542057e9f9f232
3
+ metadata.gz: 37ab1716845b98ca15a072e67cd900f5854dadd3c3641667fb5887c1626c4a85
4
+ data.tar.gz: 5142f341e40601f3d1fff211ff1fa566047f6353d0d3f6bc5062653827e759db
5
5
  SHA512:
6
- metadata.gz: 5d17c19eee4868b540af844e08c95133415de8ccf0293dd208c11a29c14315b716c1563c45fd22081fbbf604147c837fa14bc98772a4103b2df080e4d09f89fa
7
- data.tar.gz: 4a7894f23f49494f9debcafc967217ccb00549234014166f9fe4f1fe3d13b12b15fbe63eac426b164dfa763acbc9b4e15dfa8cd616de84720f1b2ea3249abfde
6
+ metadata.gz: 35b1771a4d8898d6ca02f38b4a016adc927600e7456b0a41ae515f88e85c846d8a41951ce78fa12c7a2851fc596a13ab48a3537dd97ff4579e8cd45d19b50ccf
7
+ data.tar.gz: ea6da32d1c4b9a10a4c04208fda4bee7bc6bad7d8ffa635f163c608e256160b4115fe3c1a8f4e7c9ec8b65f435ddca1fcb7af9f857a82042edd11457685d7424
data/.rubocop_todo.yml CHANGED
@@ -1,22 +1,27 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2021-10-11 07:56:31 UTC using RuboCop version 1.21.0.
3
+ # on 2021-11-24 02:35:34 UTC using RuboCop version 1.23.0.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
- # Offense count: 1
9
+ # Offense count: 2
10
+ # Configuration parameters: IgnoredMethods, CountRepeatedAttributes.
11
+ Metrics/AbcSize:
12
+ Max: 20
13
+
14
+ # Offense count: 2
10
15
  # Configuration parameters: IgnoredMethods.
11
16
  Metrics/CyclomaticComplexity:
12
- Max: 10
17
+ Max: 13
13
18
 
14
19
  # Offense count: 2
15
20
  # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, IgnoredMethods.
16
21
  Metrics/MethodLength:
17
- Max: 14
22
+ Max: 15
18
23
 
19
24
  # Offense count: 1
20
25
  # Configuration parameters: IgnoredMethods.
21
26
  Metrics/PerceivedComplexity:
22
- Max: 11
27
+ Max: 14
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.4.0] - 2021-11-24
4
+
5
+ - Add `identify_scripts` method
6
+ - Improve accuracy of `identify_script` method
7
+
3
8
  ## [0.3.0] - 2021-10-13
4
9
 
5
10
  - Add `kana?` and `hangul?` methods
data/Gemfile.lock CHANGED
@@ -1,14 +1,14 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- script_detector_2 (0.3.0)
4
+ script_detector_2 (0.4.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  ast (2.4.2)
10
10
  backport (1.2.0)
11
- benchmark (0.1.1)
11
+ benchmark (0.2.0)
12
12
  byebug (11.1.3)
13
13
  diff-lcs (1.4.4)
14
14
  e2mmap (0.1.0)
@@ -23,14 +23,14 @@ GEM
23
23
  parallel (1.21.0)
24
24
  parser (3.0.2.0)
25
25
  ast (~> 2.4.1)
26
- racc (1.5.2)
26
+ racc (1.6.0)
27
27
  rainbow (3.0.0)
28
28
  rake (13.0.6)
29
29
  regexp_parser (2.1.1)
30
- reverse_markdown (2.0.0)
30
+ reverse_markdown (2.1.1)
31
31
  nokogiri
32
32
  rexml (3.2.5)
33
- rubocop (1.22.1)
33
+ rubocop (1.23.0)
34
34
  parallel (~> 1.10)
35
35
  parser (>= 3.0.0.0)
36
36
  rainbow (>= 2.2.2, < 4.0)
@@ -39,11 +39,11 @@ GEM
39
39
  rubocop-ast (>= 1.12.0, < 2.0)
40
40
  ruby-progressbar (~> 1.7)
41
41
  unicode-display_width (>= 1.4.0, < 3.0)
42
- rubocop-ast (1.12.0)
42
+ rubocop-ast (1.13.0)
43
43
  parser (>= 3.0.1.1)
44
44
  ruby-progressbar (1.11.0)
45
45
  rubyzip (2.3.2)
46
- solargraph (0.44.0)
46
+ solargraph (0.44.2)
47
47
  backport (~> 1.2)
48
48
  benchmark
49
49
  bundler (>= 1.17.2)
@@ -76,4 +76,4 @@ DEPENDENCIES
76
76
  solargraph
77
77
 
78
78
  BUNDLED WITH
79
- 2.2.29
79
+ 2.2.32
data/README.md CHANGED
@@ -42,6 +42,7 @@ The main detection methods are:
42
42
  - `ScriptDetector2.simplified_chinese?`
43
43
  - `ScriptDetector2.traditional_chinese?`
44
44
  - `ScriptDetector2.identify_script`
45
+ - `ScriptDetector2.identify_scripts`
45
46
 
46
47
  Regexp patterns are used to identify the script to which Han characters belong.
47
48
  These can be used directly as well:
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScriptDetector2
4
- VERSION = '0.3.0'
4
+ VERSION = '0.4.0'
5
5
  end
@@ -8,7 +8,7 @@ require_relative 'script_detector_2/string'
8
8
  module ScriptDetector2
9
9
  class << self
10
10
  # @param string [String]
11
- # @return [Boolean]
11
+ # @return [Boolean] true if +string+ appears to be Japanese
12
12
  def japanese?(string)
13
13
  return true if kana?(string)
14
14
  return false if hangul?(string)
@@ -20,13 +20,14 @@ module ScriptDetector2
20
20
  end
21
21
 
22
22
  # @param string [String]
23
- # @return [Boolean]
23
+ # @return [Boolean] true if +string+ contains Hiragana or Katakana
24
24
  def kana?(string)
25
25
  /[\p{Hiragana}\p{Katakana}]/.match?(string)
26
26
  end
27
27
 
28
28
  # @param string [String]
29
- # @return [Boolean]
29
+ # @return [Boolean] true if +string+ appears to be Chinese (either
30
+ # Simplified or Traditional)
30
31
  def chinese?(string)
31
32
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
32
33
 
@@ -34,7 +35,7 @@ module ScriptDetector2
34
35
  end
35
36
 
36
37
  # @param string [String]
37
- # @return [Boolean]
38
+ # @return [Boolean] true if +string+ appears to be Simplified Chinese
38
39
  def simplified_chinese?(string)
39
40
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
40
41
 
@@ -45,7 +46,7 @@ module ScriptDetector2
45
46
  end
46
47
 
47
48
  # @param string [String]
48
- # @return [Boolean]
49
+ # @return [Boolean] true if +string+ appears to be Traditional Chinese
49
50
  def traditional_chinese?(string)
50
51
  return false if string =~ /[\p{Hiragana}\p{Katakana}\p{Hangul}]/
51
52
 
@@ -56,7 +57,7 @@ module ScriptDetector2
56
57
  end
57
58
 
58
59
  # @param string [String]
59
- # @return [Boolean]
60
+ # @return [Boolean] true if +string+ appears to be Korean
60
61
  def korean?(string)
61
62
  return true if hangul?(string)
62
63
  return false if kana?(string)
@@ -68,11 +69,24 @@ module ScriptDetector2
68
69
  end
69
70
 
70
71
  # @param string [String]
71
- # @return [Boolean]
72
+ # @return [Boolean] true if +string+ contains Hangul
72
73
  def hangul?(string)
73
74
  /\p{Hangul}/.match?(string)
74
75
  end
75
76
 
77
+ # Make a best-effort attempt to guess the single script of +string+.
78
+ # Result is a symbol representing one of the scripts defined by ISO 15924,
79
+ # namely one of:
80
+ # - Hans (Simplified Chinese)
81
+ # - Hant (Traditional Chinese)
82
+ # - Hani (Unspecified Han)
83
+ # - Jpan (Japanese: Han, Hiragana, Katakana)
84
+ # - Kore (Korean: Hangul, Han)
85
+ # - Zyyy (Undetermined)
86
+ #
87
+ # Note that this is likely to give poor results for very short strings,
88
+ # which are often inherently ambiguous.
89
+ #
76
90
  # @param string [String]
77
91
  # @return [Symbol]
78
92
  def identify_script(string)
@@ -81,15 +95,50 @@ module ScriptDetector2
81
95
 
82
96
  is_hant = traditional_chinese?(string)
83
97
  is_hans = simplified_chinese?(string)
84
- if is_hant && is_hans then :Hani
85
- elsif is_hans then :Hans
98
+ return :Hani if is_hant && is_hans
99
+
100
+ is_japanese = japanese?(string)
101
+ return :Hani if is_japanese && (is_hant || is_hans)
102
+
103
+ # At this point we have determined that the string does not contain
104
+ # Hangul; for such a string to be Korean would be unusual. Allowing Korean
105
+ # to dilute the result to Hani is going to be a loss on average, so we
106
+ # don't handle it like Japanese above.
107
+
108
+ if is_hans then :Hans
86
109
  elsif is_hant then :Hant
87
- elsif japanese?(string) then :Jpan
110
+ elsif is_japanese then :Jpan
88
111
  elsif korean?(string) then :Kore
89
- elsif chinese?(string) then :Hani # rubocop:disable Lint/DuplicateBranch
112
+ elsif chinese?(string) then :Hani
90
113
  else
91
114
  :Zyyy
92
115
  end
93
116
  end
117
+
118
+ # Identify all CJK scripts represented in +string+. Result is a list of symbols
119
+ # representing scripts defined by ISO 15924, namely one or more of:
120
+ # - Hans (Simplified Chinese)
121
+ # - Hant (Traditional Chinese)
122
+ # - Hani (Unspecified Chinese)
123
+ # - Jpan (Japanese: Han, Hiragana, Katakana)
124
+ # - Kore (Korean: Hangul, Han)
125
+ # - Zyyy (Undetermined)
126
+ #
127
+ # This method does not attempt to identify other scripts such as Latn.
128
+ #
129
+ # @param string [String]
130
+ # @return [Array<Symbol>]
131
+ def identify_scripts(string)
132
+ result = []
133
+
134
+ result << :Hans if simplified_chinese?(string)
135
+ result << :Hant if traditional_chinese?(string)
136
+ result << :Jpan if japanese?(string)
137
+ result << :Kore if korean?(string)
138
+ result << :Hani if chinese?(string) && result.empty?
139
+ result << :Zyyy if result.empty?
140
+
141
+ result
142
+ end
94
143
  end
95
144
  end
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
17
17
 
18
18
  spec.metadata['homepage_uri'] = spec.homepage
19
19
  spec.metadata['source_code_uri'] = 'https://github.com/amake/script_detector_2.git'
20
+ spec.metadata['rubygems_mfa_required'] = 'true'
20
21
 
21
22
  # Specify which files should be added to the gem when it is released.
22
23
  # `git ls-files -z` lists the files that have been added to git; those are the files loaded into the RubyGem.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: script_detector_2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Madlon-Kay
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-10-13 00:00:00.000000000 Z
11
+ date: 2021-11-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: byebug
@@ -100,6 +100,7 @@ licenses:
100
100
  metadata:
101
101
  homepage_uri: https://github.com/amake/script_detector_2
102
102
  source_code_uri: https://github.com/amake/script_detector_2.git
103
+ rubygems_mfa_required: 'true'
103
104
  post_install_message:
104
105
  rdoc_options: []
105
106
  require_paths: