unicode-data 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/Gemfile +5 -0
- data/Gemfile.lock +24 -0
- data/LICENSE +21 -0
- data/README.md +50 -0
- data/Rakefile +35 -0
- data/bin/console +7 -0
- data/bin/setup +6 -0
- data/ext/unicode/data/Rakefile +9 -0
- data/lib/unicode/data.rb +35 -0
- data/lib/unicode/data/generate.rb +248 -0
- data/lib/unicode/data/validate.rb +114 -0
- data/lib/unicode/data/version.rb +7 -0
- data/unicode-data.gemspec +30 -0
- metadata +114 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 84da723c97e50af6ce0b53902b81511f5333949d847d047046e070c9f640fe28
|
4
|
+
data.tar.gz: 477cb2a6ddf74b0b8ec8af07f3db680a205ef272b439da8978bc0b548fe20946
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fb06a0321db3ffe4fbbeb459c9554bd65ef21bcf525a088778d7171234944fef797e64ee1ad9aefd6d5f3e31ca956e87c1d28ada4b730cb7f018ff72b6390f36
|
7
|
+
data.tar.gz: 5e10c86ca002f5240a3b80b026d32545c7a49b570270ae3b7875edb489907e060eedd9fdbc4ac90a5b65146db0a893e1b2534ce7d0f35781c8e23cc03e32f461
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
unicode-data (0.1.0)
|
5
|
+
rubyzip
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: https://rubygems.org/
|
9
|
+
specs:
|
10
|
+
minitest (5.14.4)
|
11
|
+
rake (13.0.6)
|
12
|
+
rubyzip (2.3.2)
|
13
|
+
|
14
|
+
PLATFORMS
|
15
|
+
x86_64-darwin-19
|
16
|
+
|
17
|
+
DEPENDENCIES
|
18
|
+
bundler
|
19
|
+
minitest
|
20
|
+
rake
|
21
|
+
unicode-data!
|
22
|
+
|
23
|
+
BUNDLED WITH
|
24
|
+
2.2.24
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2021-present Kevin Newton
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Unicode::Data
|
2
|
+
|
3
|
+
A Ruby wrapper for the Unicode character data set.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
gem "unicode-data"
|
11
|
+
```
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install unicode-data
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
When this gem is installed, it will automatically download the unicode character data set to a temporary zip file and generate a list of properties from that zip file. You can use this information (under `lib/unicode/data/derived.txt`) to implement, for example, a regular expression engine that can respect the unicode semantics defined by the [unicode technical standard](https://unicode.org/reports/tr18/). At the moment the list of properties generated includes:
|
24
|
+
|
25
|
+
* General Categories
|
26
|
+
* Blocks
|
27
|
+
* Ages
|
28
|
+
* Scripts
|
29
|
+
* Script Extensions
|
30
|
+
* Core Properties (Math, Alphabetic, Lowercase, Case_Ignorable, etc.)
|
31
|
+
* Prop List Properties (White_Space, Bidi_Control, Terminal_Punctuation, etc.)
|
32
|
+
|
33
|
+
This covers almost all of the [Onigmo](https://github.com/k-takata/Onigmo/blob/master/doc/UnicodeProps.txt) Unicode support (and a lot more), with the exception of:
|
34
|
+
|
35
|
+
* POSIX brackets
|
36
|
+
* Emoji
|
37
|
+
|
38
|
+
## Development
|
39
|
+
|
40
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
41
|
+
|
42
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
43
|
+
|
44
|
+
## Contributing
|
45
|
+
|
46
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/kddnewton/unicode-data.
|
47
|
+
|
48
|
+
## License
|
49
|
+
|
50
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# The gem's lib directory sits next to this Rakefile. Resolving it relative to
# __dir__ keeps both the load path and the CLEAN target inside the project
# (the previous "../../../lib" climbed three directories above the gem root).
lib = File.expand_path("lib", __dir__)
$LOAD_PATH.unshift(lib)

require "bundler/gem_tasks"
require "rake/clean"
require "rake/testtask"
require "unicode/data"

# `rake clean` removes the generated property table.
CLEAN.include(File.join(lib, "unicode/data/derived.txt"))

# Tasks defined by the extension Rakefile are exposed under the ext: prefix.
namespace :ext do
  load "ext/unicode/data/Rakefile"
end

Rake::TestTask.new(:test) do |t|
  t.libs << "test"
  t.libs << "lib"
  t.test_files = FileList["test/**/*_test.rb"]
end

task default: :test

namespace :"unicode:data" do
  desc "Generate all of the necessary derived files"
  task :generate do
    Unicode::Data.generate
  end

  desc "Validate all of the necessary derived files"
  task :validate do
    Unicode::Data.validate
  end
end
|
data/bin/console
ADDED
data/bin/setup
ADDED
data/lib/unicode/data.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "unicode/data/version"
|
4
|
+
|
5
|
+
module Unicode
  # Entry points for generating, validating and querying the derived Unicode
  # property table stored in data/derived.txt next to this file.
  module Data
    # Builds derived.txt. The generator is required lazily so that simply
    # loading this module does not load the generator and its dependencies.
    def self.generate
      require "unicode/data/generate"
      Generate.call
    end

    # Checks derived.txt against Ruby's own regular expression engine. The
    # validator is required lazily for the same reason as the generator.
    def self.validate
      require "unicode/data/validate"
      Validate.call
    end

    # Returns a memoized Hash that maps each property query (for example
    # "\\p{ASCII}") to its serialized codepoint list, a comma-separated string
    # of decimal codepoints and inclusive "first..last" ranges.
    def self.properties
      @properties ||=
        File.readlines(File.expand_path("data/derived.txt", __dir__), chomp: true).to_h do |line|
          line.split(/\s+/, 2)
        end
    end

    # Returns true when the codepoint of +value+ belongs to the property named
    # by +query+.
    #
    # query - the property key exactly as written in derived.txt
    # value - anything responding to #ord (a one-character String or an Integer)
    #
    # Raises KeyError when +query+ is not a known property.
    def self.property?(query, value)
      codepoint = value.ord

      properties.fetch(query).split(",").any? do |segment|
        case segment
        when /\A\d+\z/
          segment.to_i == codepoint
        when /\A(\d+)\.\.(\d+)\z/
          # The dots are escaped so that only a literal "a..b" is read as a range.
          codepoint.between?(Regexp.last_match(1).to_i, Regexp.last_match(2).to_i)
        end
      end
    end
  end
end
|
@@ -0,0 +1,248 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "logger"
|
4
|
+
require "open-uri"
|
5
|
+
require "zip"
|
6
|
+
|
7
|
+
module Unicode
|
8
|
+
module Data
|
9
|
+
class Generate
|
10
|
+
# Wraps a Hash of property name => list of alias sets (each alias set being an
# Array of interchangeable value names) and answers lookups against it.
class PropertyValueAliases
  attr_reader :aliases

  def initialize(aliases)
    @aliases = aliases
  end

  # The property names that have alias sets.
  def keys
    aliases.keys
  end

  # Returns the alias set of +property+ that contains +value+, or nil when no
  # set matches. Hyphens and spaces in +value+ are treated as underscores and
  # the comparison ignores case.
  def find(property, value)
    wanted = value.tr(" -", "_")

    aliases[property].find do |candidates|
      candidates.any? { |candidate| candidate.casecmp(wanted) == 0 }
    end
  end
end
|
29
|
+
|
30
|
+
attr_reader :zipfile, :outfile, :logger
|
31
|
+
|
32
|
+
# zipfile - the archive the source data files are read from
# outfile - the IO the generated property lines are written to
# logger  - receives one message per generated query (defaults to STDOUT)
def initialize(zipfile, outfile, logger: Logger.new(STDOUT))
  @zipfile, @outfile, @logger = zipfile, outfile, logger
end
|
37
|
+
|
38
|
+
# Reads both alias tables once and then runs every generator in a fixed
# order, which is also the order of the lines in the output file.
def generate
  names = read_property_aliases
  values = PropertyValueAliases.new(read_property_value_aliases)

  generate_general_categories

  # These generators only need the property value aliases.
  %i[generate_blocks generate_ages generate_scripts generate_script_extensions].each do |step|
    send(step, values)
  end

  generate_core_properties(names, values)
  generate_prop_list_properties(names, values)
end
|
50
|
+
|
51
|
+
# Downloads UCD.zip for the Unicode version reported by RbConfig and writes
# the generated property table to derived.txt next to this file.
def self.call
  url = "https://www.unicode.org/Public/#{RbConfig::CONFIG["UNICODE_VERSION"]}/ucd/UCD.zip"
  target = File.join(__dir__, "derived.txt")

  URI.open(url) do |download|
    Zip::File.open_buffer(download) do |archive|
      File.open(target, "w") { |output| new(archive, output).generate }
    end
  end
end
|
62
|
+
|
63
|
+
private
|
64
|
+
|
65
|
+
# Yields every meaningful line of the archive entry at +filepath+: the line
# terminator and any trailing "# ..." comment are removed, and lines that end
# up empty or whitespace-only are skipped.
#
# The entry stream is opened in block form so that it is closed once the
# entry has been read instead of being left open.
def each_line(filepath)
  zipfile.get_input_stream(filepath) do |stream|
    stream.each_line do |raw|
      line = raw.chomp.sub(/\s*#.*$/, "")
      yield line unless line.strip.empty?
    end
  end
end
|
71
|
+
|
72
|
+
# Returns one Array per line of PropertyAliases.txt holding the distinct
# ";"-separated names found on that line.
def read_property_aliases
  alias_sets = []

  each_line("PropertyAliases.txt") do |line|
    alias_sets << line.split(/\s*;\s*/).uniq
  end

  alias_sets
end
|
79
|
+
|
80
|
+
# Returns a Hash keyed by the first field of each PropertyValueAliases.txt
# line. Each value is the list of alias sets seen for that key, one set (the
# distinct remaining fields) per line.
def read_property_value_aliases
  grouped = {}

  each_line("PropertyValueAliases.txt") do |line|
    property, *value_names = line.split(/\s*;\s*/)
    grouped[property] = grouped.fetch(property, []) << value_names.uniq
  end

  grouped
end
|
88
|
+
|
89
|
+
# One General_Category value as declared on a "gc ; ..." line of
# PropertyValueAliases.txt. +subsets+ is nil unless the line carries a
# "# A | B | C" comment listing the categories it is made of.
GeneralCategory = Struct.new(:name, :abbrev, :aliased, :subsets, keyword_init: true)

# https://www.unicode.org/reports/tr44/#General_Category_Values
#
# Writes one query per spelling (abbreviation, long name, extra alias) of
# every general category, followed by the special Any, Assigned and ASCII
# queries.
#
# Categories that declare subsets are written as the union of those subsets.
# The whole "A | B | C" list is captured, and the loop walks the declared
# categories, so that grouped categories are emitted as well.
def generate_general_categories
  line_pattern =
    /^gc ; (?<abbrev>\S+)\s+; (?<name>\S+)\s+(?:; (?<aliased>\S+)\s+)?(?:\# (?<subsets>\S+(?: \| \S+)*))?/
  properties = {}

  # The raw entry is read here (not through each_line) because the subset
  # list lives in the trailing comment that each_line strips.
  zipfile.get_input_stream("PropertyValueAliases.txt") do |stream|
    stream.each_line do |line|
      match = line_pattern.match(line)
      next if match.nil?

      properties[match[:abbrev]] =
        GeneralCategory.new(
          name: match[:name],
          abbrev: match[:abbrev],
          aliased: match[:aliased],
          subsets: match[:subsets]&.split(" | ")
        )
    end
  end

  general_categories = read_property_codepoints("extracted/DerivedGeneralCategory.txt")

  properties.each_value do |general_category|
    codepoints =
      if general_category.subsets
        general_category.subsets.flat_map { |subset| general_categories.fetch(subset, []) }
      else
        general_categories.fetch(general_category.abbrev, [])
      end

    # Nothing to write for a category without any codepoints in the data.
    next if codepoints.empty?

    spellings = [general_category.abbrev, general_category.name, general_category.aliased].compact
    write_queries(spellings.map { |value| "\\p{General_Category=#{value}}" }, codepoints)
  end

  # https://unicode.org/reports/tr18/#General_Category_Property
  # There are a couple of special categories that are defined that we will
  # handle here.
  write_queries(["\\p{Any}"], [0..0x10FFFF])
  write_queries(["\\p{Assigned}"], complement_codepoints(general_categories.fetch("Cn", [])))
  write_queries(["\\p{ASCII}"], [0..0x7F])
end

# Returns the inclusive ranges of 0..0x10FFFF that are not covered by
# +codepoints+ (a list of Integers and/or inclusive Ranges).
def complement_codepoints(codepoints)
  cursor = 0
  gaps = []

  codepoints
    .map { |entry| entry.is_a?(Range) ? entry : (entry..entry) }
    .sort_by(&:begin)
    .each do |range|
      gaps << (cursor..(range.begin - 1)) if range.begin > cursor
      cursor = [cursor, range.end + 1].max
    end

  gaps << (cursor..0x10FFFF) if cursor <= 0x10FFFF
  gaps
end
|
138
|
+
|
139
|
+
# Writes a Block query for every alias of every block listed in Blocks.txt.
def generate_blocks(property_value_aliases)
  blocks = read_property_codepoints("Blocks.txt")

  blocks.each do |block, codepoints|
    names = property_value_aliases.find("blk", block)
    queries = names.map { |name| "\\p{Block=#{name}}" }
    write_queries(queries, codepoints)
  end
end
|
147
|
+
|
148
|
+
# https://www.unicode.org/reports/tr44/#Character_Age
|
149
|
+
# Writes an Age query for every alias of every version in DerivedAge.txt.
def generate_ages(property_value_aliases)
  ages = read_property_codepoints("DerivedAge.txt").to_a

  ages.each_with_index do |(version, _values), index|
    names = property_value_aliases.find("age", version)

    # An age query also matches everything introduced by the versions read
    # before it (something added in 1.1 matches \p{age=2.0} too), so each
    # version is written with the codepoints of all preceding entries.
    cumulative = ages.take(index + 1).flat_map(&:last)

    write_queries(names.map { |name| "\\p{Age=#{name}}" }, cumulative)
  end
end
|
161
|
+
|
162
|
+
# https://www.unicode.org/reports/tr24/
|
163
|
+
# Writes a Script query for every alias of every script listed in Scripts.txt.
def generate_scripts(property_value_aliases)
  scripts = read_property_codepoints("Scripts.txt")

  scripts.each do |script, codepoints|
    names = property_value_aliases.find("sc", script)
    queries = names.map { |name| "\\p{Script=#{name}}" }
    write_queries(queries, codepoints)
  end
end
|
171
|
+
|
172
|
+
# Writes a Script_Extensions query for every alias of every script named in
# ScriptExtensions.txt. Each entry in that file names a space-separated set
# of scripts, so its codepoints are credited to every script in the set.
#
# NOTE(review): only codepoints listed in ScriptExtensions.txt are included;
# confirm whether codepoints that merely have the script as their Script
# value are expected to match as well.
def generate_script_extensions(property_value_aliases)
  by_script =
    read_property_codepoints("ScriptExtensions.txt").each_with_object({}) do |(script_set, codepoints), grouped|
      script_set.split(" ").each do |script|
        grouped[script] = grouped.fetch(script, []) + codepoints
      end
    end

  by_script.each do |script, codepoints|
    names = property_value_aliases.find("sc", script)
    write_queries(names.map { |name| "\\p{Script_Extensions=#{name}}" }, codepoints)
  end
end
|
190
|
+
|
191
|
+
# Writes a "<key>=True" query for every property in
# DerivedCoreProperties.txt, where <key> is the first of the property's
# aliases that also appears as a key of the property value aliases.
def generate_core_properties(property_aliases, property_value_aliases)
  known_keys = property_value_aliases.keys

  read_property_codepoints("DerivedCoreProperties.txt").each do |property, codepoints|
    names = property_aliases.find { |alias_set| alias_set.include?(property) }
    key = names.find { |name| known_keys.include?(name) }

    write_queries(["\\p{#{key}=True}"], codepoints)
  end
end
|
202
|
+
|
203
|
+
# Writes a "<key>=True" query for every property in PropList.txt, where <key>
# is the first of the property's aliases that also appears as a key of the
# property value aliases.
def generate_prop_list_properties(property_aliases, property_value_aliases)
  known_keys = property_value_aliases.keys

  read_property_codepoints("PropList.txt").each do |property, codepoints|
    names = property_aliases.find { |alias_set| alias_set.include?(property) }
    key = names.find { |name| known_keys.include?(name) }

    write_queries(["\\p{#{key}=True}"], codepoints)
  end
end
|
214
|
+
|
215
|
+
# Parses a "<codepoints> ; <property>" data file into a Hash of property =>
# Array of entries, in file order. A hexadecimal "XXXX..YYYY" field becomes an
# inclusive Range of Integers; a single hexadecimal field becomes an Integer.
def read_property_codepoints(filepath)
  grouped = {}

  each_line(filepath) do |line|
    raw, property = line.split(/\s*;\s*/)

    entry =
      if raw.include?("..")
        low, high = raw.split("..").map { |hex| hex.to_i(16) }
        Range.new(low, high)
      else
        raw.to_i(16)
      end

    grouped[property] = grouped.fetch(property, []) << entry
  end

  grouped
end
|
231
|
+
|
232
|
+
# Writes one output line per query, each carrying the same serialized list of
# codepoints.
#
# queries    - the property query strings to emit
# codepoints - Integers and/or inclusive Ranges, in any order
#
# The codepoints are expanded, de-duplicated and sorted, then runs of
# consecutive values are collapsed to "first..last" and joined with commas.
# De-duplicating first keeps repeated or overlapping input from splitting a
# run and being written twice.
def write_queries(queries, codepoints)
  serialized =
    codepoints
      .flat_map { |codepoint| [*codepoint] }
      .uniq
      .sort
      .chunk_while { |previous, current| current == previous + 1 }
      .map { |run| run.length > 1 ? "#{run.first}..#{run.last}" : run.first.to_s }
      .join(",")

  queries.each do |query|
    logger.info("Generating #{query}")
    outfile.puts(format("%-80s %s", query, serialized))
  end
end
|
246
|
+
end
|
247
|
+
end
|
248
|
+
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "logger"
|
4
|
+
|
5
|
+
module Unicode
|
6
|
+
module Data
|
7
|
+
class Validate
|
8
|
+
# Strategies that decide which values of a range of codepoints are passed to
# the given block.
module Mode
  # Passes only the first value of the range.
  class First
    def apply(values, &block)
      block.(values.first)
    end
  end

  # Passes a random sample of up to 50 values from the range.
  class Sample
    def apply(values, &block)
      picked = values.to_a.sample(50)
      picked.each(&block)
    end
  end

  # Passes every value of the range.
  class Full
    def apply(values, &block)
      values.each(&block)
    end
  end
end
|
30
|
+
|
31
|
+
attr_reader :logger, :mode, :surrogates
|
32
|
+
|
33
|
+
# logger - receives progress and skip messages (defaults to STDOUT)
# mode   - "first", "sample" or "full"; defaults to the MODE environment
#          variable, or "first" when it is unset
#
# Raises ArgumentError for any other mode.
def initialize(logger: Logger.new(STDOUT), mode: ENV.fetch("MODE", "first"))
  @logger = logger
  @mode =
    case mode
    when "first" then Mode::First.new
    when "sample" then Mode::Sample.new
    when "full" then Mode::Full.new
    else
      raise ArgumentError, "invalid mode: #{mode}"
    end

  # Start from an empty list so that #surrogates is always an Array, even
  # when derived.txt has no Surrogate entry.
  @surrogates = []

  # This is a list of all of the surrogate characters that exist so that
  # we can skip them when validating since they're not valid in UTF-8.
  File.foreach(File.join(__dir__, "derived.txt"), chomp: true) do |line|
    property, values = line.split(" ", 2)

    # A blank line yields no property; skip it instead of failing on nil.
    next unless property&.start_with?("\\p{General_Category=Surrogate}")

    @surrogates = each_value(values, Mode::Full.new).to_a
    break
  end
end
|
55
|
+
|
56
|
+
# Checks every property in derived.txt against Ruby's own regular expression
# engine: each selected codepoint of a property must match the equivalent
# native \p{...} pattern.
#
# Raises RuntimeError naming the property and codepoint on the first mismatch.
def validate
  File.foreach(File.join(__dir__, "derived.txt"), chomp: true) do |line|
    property, values = line.split(/\s+/, 2)

    # For general categories and scripts, we don't actually want the
    # prefix in the property name, so here leave it out.
    property.gsub!(/(General_Category|Script)=/, "")

    # Ruby doesn't support Block= syntax, it expects you to instead have
    # no property name and have the block name begin with In_.
    property.gsub!(/Block=/, "In_")

    # Ruby doesn't support boolean property querying with values, it only
    # supports the plain property name.
    property.gsub!(/=(Yes|Y|True|T)/, "")

    pattern =
      begin
        /#{property}/
      rescue RegexpError
        # There are a fair amount of properties that we have in this gem
        # that Ruby doesn't support natively. Things like aliases for the
        # various blocks, script extensions, aliases for the ages, etc.
        # In this case just rescue the error and move on since we can't
        # validate against native.
        logger.warn("Skipping #{property}")
        next
      end

    logger.info("Validating #{property}")

    each_value(values, mode) do |value|
      next if surrogates.include?(value)
      next if pattern.match?([value].pack("U"))

      # Say what failed; a bare raise here only reported "unhandled exception".
      raise format("%s does not match U+%04X", property, value)
    end
  end
end
|
93
|
+
|
94
|
+
# Builds a validator with the default logger and mode and runs it.
def self.call
  validator = new
  validator.validate
end
|
97
|
+
|
98
|
+
private
|
99
|
+
|
100
|
+
# Walks a serialized codepoint list such as "65,97..122".
#
# values - comma-separated decimal codepoints and inclusive "first..last" ranges
# mode   - object whose #apply(range, &block) chooses which values of a range
#          reach the block
#
# A single codepoint is always passed to the block; a range is handed to
# +mode+. Segments that are neither are ignored. Returns an Enumerator when
# no block is given.
def each_value(values, mode, &block)
  return enum_for(__method__, values, mode) unless block_given?

  values.split(",").each do |value|
    case value
    when /\A\d+\z/
      block.call(value.to_i)
    when /\A(\d+)\.\.(\d+)\z/
      # The dots are escaped so that only a literal "a..b" is read as a range.
      mode.apply(Regexp.last_match(1).to_i..Regexp.last_match(2).to_i, &block)
    end
  end
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path("../lib", __FILE__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require "unicode/data/version"
|
6
|
+
|
7
|
+
Gem::Specification.new do |spec|
|
8
|
+
spec.name = "unicode-data"
|
9
|
+
spec.version = Unicode::Data::VERSION
|
10
|
+
spec.authors = ["Kevin Newton"]
|
11
|
+
spec.email = ["kddnewton@gmail.com"]
|
12
|
+
|
13
|
+
spec.summary = "A Ruby port of the unicode character data"
|
14
|
+
spec.homepage = "https://github.com/kddnewton/unicode-data"
|
15
|
+
spec.license = "MIT"
|
16
|
+
|
17
|
+
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
18
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
19
|
+
end
|
20
|
+
spec.bindir = "exe"
|
21
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
22
|
+
spec.require_paths = ["lib"]
|
23
|
+
spec.extensions = ["ext/unicode/data/Rakefile"]
|
24
|
+
|
25
|
+
spec.add_dependency "rubyzip"
|
26
|
+
|
27
|
+
spec.add_development_dependency "bundler"
|
28
|
+
spec.add_development_dependency "rake"
|
29
|
+
spec.add_development_dependency "minitest"
|
30
|
+
end
|
metadata
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: unicode-data
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kevin Newton
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-08-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rubyzip
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description:
|
70
|
+
email:
|
71
|
+
- kddnewton@gmail.com
|
72
|
+
executables: []
|
73
|
+
extensions:
|
74
|
+
- ext/unicode/data/Rakefile
|
75
|
+
extra_rdoc_files: []
|
76
|
+
files:
|
77
|
+
- ".gitignore"
|
78
|
+
- Gemfile
|
79
|
+
- Gemfile.lock
|
80
|
+
- LICENSE
|
81
|
+
- README.md
|
82
|
+
- Rakefile
|
83
|
+
- bin/console
|
84
|
+
- bin/setup
|
85
|
+
- ext/unicode/data/Rakefile
|
86
|
+
- lib/unicode/data.rb
|
87
|
+
- lib/unicode/data/generate.rb
|
88
|
+
- lib/unicode/data/validate.rb
|
89
|
+
- lib/unicode/data/version.rb
|
90
|
+
- unicode-data.gemspec
|
91
|
+
homepage: https://github.com/kddnewton/unicode-data
|
92
|
+
licenses:
|
93
|
+
- MIT
|
94
|
+
metadata: {}
|
95
|
+
post_install_message:
|
96
|
+
rdoc_options: []
|
97
|
+
require_paths:
|
98
|
+
- lib
|
99
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
105
|
+
requirements:
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: '0'
|
109
|
+
requirements: []
|
110
|
+
rubygems_version: 3.2.3
|
111
|
+
signing_key:
|
112
|
+
specification_version: 4
|
113
|
+
summary: A Ruby port of the unicode character data
|
114
|
+
test_files: []
|