ruby_unicode_prop 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +51 -0
- data/ChangeLog +5 -0
- data/LICENSE.txt +20 -0
- data/Makefile +23 -0
- data/README.en.rdoc +153 -0
- data/Rakefile +9 -0
- data/bin/ruby_unicode_prop +186 -0
- data/ruby_unicode_prop.gemspec +50 -0
- data/test/test_ruby_unicode_prop.rb +140 -0
- metadata +60 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 995fd4cfd7ebc81e4621b58517e94c827cb76e06b40d99189537558813c060c4
|
4
|
+
data.tar.gz: 402b13940827cd2b4538c2f8ac83bda464c728eee6f6fbd774593c802c55d857
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: bf2a92ce22fb744822bd6a687c9c5874a8ee22c7dd72dcbadfd830d133f289615b8573107139c6f8de9a659500deda0d85454578d90608d284eef65f1ae38dce
|
7
|
+
data.tar.gz: 6dafbbabc31fe0133fd08815fd71b6d086697cfbab3180b3678ed5997c834de4a531f8f083053c5da8541ec031e33f1f8d01069540ee088f43af5319f63c5090
|
data/.gitignore
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# See https://help.github.com/articles/ignoring-files for more about ignoring files.
|
2
|
+
#
|
3
|
+
# If you find yourself ignoring temporary files generated by your text editor
|
4
|
+
# or operating system, you probably want to add a global ignore instead:
|
5
|
+
# git config --global core.excludesfile '~/.gitignore_global'
|
6
|
+
|
7
|
+
# Ignore bundler config.
|
8
|
+
/.bundle
|
9
|
+
/vendor/bundle
|
10
|
+
|
11
|
+
# Ignore all logfiles and tempfiles.
|
12
|
+
/log/*
|
13
|
+
/tmp/*
|
14
|
+
!/log/.keep
|
15
|
+
!/tmp/.keep
|
16
|
+
|
17
|
+
.rbenv-version
|
18
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
19
|
+
.rvmrc
|
20
|
+
|
21
|
+
/node_modules
|
22
|
+
/yarn-error.log
|
23
|
+
|
24
|
+
.byebug_history
|
25
|
+
|
26
|
+
*.[oa]
|
27
|
+
*.so
|
28
|
+
*~
|
29
|
+
*.nogem
|
30
|
+
*nogem.*
|
31
|
+
*.bak
|
32
|
+
*.BAK
|
33
|
+
*.backup
|
34
|
+
*.org
|
35
|
+
*.orig
|
36
|
+
*.elc
|
37
|
+
*.pyc
|
38
|
+
\#*\#
|
39
|
+
|
40
|
+
# Elastic Beanstalk Files
|
41
|
+
.elasticbeanstalk/*
|
42
|
+
!.elasticbeanstalk/*.cfg.yml
|
43
|
+
!.elasticbeanstalk/*.global.yml
|
44
|
+
|
45
|
+
# yard
|
46
|
+
*.yardoc
|
47
|
+
|
48
|
+
# Ruby Gem doc
|
49
|
+
*.gem
|
50
|
+
doc/*
|
51
|
+
|
data/ChangeLog
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2012-2018 Scott Chacon and others
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Makefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
ALL =
|
2
|
+
|
3
|
+
objs =
|
4
|
+
|
5
|
+
.SUFFIXES: .so .o .c .f
|
6
|
+
|
7
|
+
#.o.so:
|
8
|
+
# ${LD} ${LFLAGS} -o $@ $< ${LINK_LIB}
|
9
|
+
|
10
|
+
all: ${ALL}
|
11
|
+
|
12
|
+
|
13
|
+
.PHONY: clean test doc
|
14
|
+
clean:
|
15
|
+
$(RM) bin/*~
|
16
|
+
|
17
|
+
## You may need RUBYLIB=`pwd`/lib:$RUBYLIB
|
18
|
+
test:
|
19
|
+
rake test
|
20
|
+
|
21
|
+
doc:
|
22
|
+
yard doc
|
23
|
+
|
data/README.en.rdoc
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
|
2
|
+
= Command to print out Unicode characters that satisfy expressions for Ruby Regexp
|
3
|
+
|
4
|
+
== Summary
|
5
|
+
|
6
|
+
This package contains a standalone Ruby (executable) source file
|
7
|
+
+bin/ruby_unicode_prop+ - a command to be used from terminals etc, and it outputs (to STDOUT)
|
8
|
+
Unicode characters and/or their hexadecimal codepoints that satisfy one or more given expressions
|
9
|
+
defined in Regexp in Ruby. Specifically, they are +\p{XXX}+ -type expressions (e.g., +\p{Katakana}+)
|
10
|
+
for Unicode, as well as +[[:blank:]]+ -type expressions for POSIX representation.
|
11
|
+
|
12
|
+
Some supplementary files are found in the top and +test+ directories,
|
13
|
+
none of which is essential to run the command.
|
14
|
+
|
15
|
+
== Description
|
16
|
+
|
17
|
+
The help doc, which covers all the basics, is viewable with the +-h+ (or +--help+) option:
|
18
|
+
|
19
|
+
% /YOUR/INSTALLED/PATH/ruby_unicode_prop -h
|
20
|
+
USAGE: ruby_unicode_prop [options] Property1 [Property2, ...]
|
21
|
+
Print all the characters and/or their hex-codepoints that have
|
22
|
+
the given "Unicode property" used in Ruby Regexp like \p{Currency_Symbol}
|
23
|
+
(or POSIX expression like [[:blank:]] if -p option is given).
|
24
|
+
|
25
|
+
Options:
|
26
|
+
-c, --[no-]without-codepoint Print characters only? (Def: false)
|
27
|
+
-n, --[no-]without-char Print codepoints only? (Def: false)
|
28
|
+
-d, --delimiter=CHAR Delimeter in output.
|
29
|
+
-l, --[no-]lowercase Lower cases alphabets are used for Hex in codepoints (Def: false)
|
30
|
+
-p, --[no-]posix Use POSIX expression instead of Unicode (Def: false)
|
31
|
+
--[no-]list-property Print all the Ruby Unicode properties and exit.
|
32
|
+
|
33
|
+
Note1: Delimeter means one
|
34
|
+
(1) between multiple characters and codepoints if either of -n or -c is specified
|
35
|
+
(Default: Null for -c (characters only) and a new line for -n.
|
36
|
+
(2) between the number and character of each pair if both are specified
|
37
|
+
(Def: a whitespace), whereas the delimeter between pairs is always a newline.
|
38
|
+
To specify a newline as a delimiter, give 'NL'
|
39
|
+
Note2: Properties differ for '-p', 'ascii' in POSIX and 'ASCII' in Unicode.
|
40
|
+
|
41
|
+
The reference file (used in the +--list-property+ option) is dynamically retrieved
|
42
|
+
from https://github.com/k-takata/Onigmo/blob/master/doc/UnicodeProps.txt
|
43
|
+
The definition file in the Ruby source tree is at +/enc/unicode/name2ctype.h+
|
44
|
+
|
45
|
+
=== Limitations
|
46
|
+
|
47
|
+
The output of this command is generated by the Ruby it runs, and hence
|
48
|
+
is fully consistent with Regexp matching results with the same
|
49
|
+
property names in any applications when you run the same Ruby. That
|
50
|
+
also means the output can depend on the version of the Ruby you run,
|
51
|
+
because the unicode table has expanded over the years (such as emojis)
|
52
|
+
and it will keep doing so.
|
53
|
+
|
54
|
+
Currently, the search by this command is limited up to the second
|
55
|
+
Supplementary Plane
|
56
|
+
({Supplementary Ideographic Plane}[https://en.wikipedia.org/wiki/Plane_(Unicode)#Supplementary_Ideographic_Plane]), which should be enough in practice
|
57
|
+
in most cases now in 2019 and perhaps will be so for some time.
|
58
|
+
|
59
|
+
In fact, in many practical cases, searching over
|
60
|
+
only the {Basic Multilingual Plane}[https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane] (up to 0xFFFF) is probably
|
61
|
+
sufficient, though it seems the second Supplementary Plane does
|
62
|
+
include groups of CJK characters some of which are still in use occasionally in
|
63
|
+
modern days. The maximum codepoint to search for is defined in the constant
|
64
|
+
+MAX_UNICODE_HEX+ near the beginning of the source code. If you set
|
65
|
+
it to a lower value, that can speed up the processing considerably,
|
66
|
+
potentially noticeably.
|
67
|
+
|
68
|
+
== Examples
|
69
|
+
|
70
|
+
A typical example is as follows:
|
71
|
+
|
72
|
+
% bin/ruby_unicode_prop Greek
|
73
|
+
0370 Ͱ
|
74
|
+
0371 ͱ
|
75
|
+
0372 Ͳ
|
76
|
+
……(snipped)……
|
77
|
+
0391 Α
|
78
|
+
0392 Β
|
79
|
+
0393 Γ
|
80
|
+
0394 Δ
|
81
|
+
……(snipped)
|
82
|
+
|
83
|
+
For some, POSIX (bracket) expressions are supported:
|
84
|
+
|
85
|
+
% bin/ruby_unicode_prop -p -d '___ ' punct
|
86
|
+
0021___ !
|
87
|
+
0022___ "
|
88
|
+
0023___ #
|
89
|
+
0024___ $
|
90
|
+
0025___ %
|
91
|
+
0026___ &
|
92
|
+
0027___ '
|
93
|
+
0028___ (
|
94
|
+
0029___ )
|
95
|
+
002A___ *
|
96
|
+
002B___ +
|
97
|
+
……(snipped)
|
98
|
+
|
99
|
+
Note the corresponding property name for Unicode +\p{}+ (a backslash
|
100
|
+
followed by +p+ and curly brackets) is +Punct+ — it is capitalized,
|
101
|
+
compared with the POSIX expression name.
|
102
|
+
|
103
|
+
Or, you can specify multiple properties. The order of the argument
|
104
|
+
does not matter and the result is always in the order of the
|
105
|
+
codepoints. No duplication is produced, even if some of the
|
106
|
+
specified properties have overlapped ranges of characters. An example is,
|
107
|
+
|
108
|
+
% bin/ruby_unicode_prop -c Number Terminal_Punctuation
|
109
|
+
!,.0123456789:;?²³¹¼½¾;……(snipped)
|
110
|
+
% bin/ruby_unicode_prop -c Number Terminal_Punctuation Close_Punctuation
|
111
|
+
!),.0123456789:;?]}²³¹¼½¾;……(snipped)
|
112
|
+
|
113
|
+
== Install
|
114
|
+
|
115
|
+
This script requires {Ruby}[http://www.ruby-lang.org] Version 2.0
|
116
|
+
or above.
|
117
|
+
|
118
|
+
If you install it as the standard Ruby Gem package, the executable +bin/ruby_unicode_prop+
|
119
|
+
should be located automatically in your command-line search path.
|
120
|
+
|
121
|
+
If not, place (copy) it in any of your command-line search paths. It
|
122
|
+
is a self-contained single file and does not need any external
|
123
|
+
optional library except the standard library that comes by default with Ruby 2.0.
|
124
|
+
|
125
|
+
You may need to modify the first line (Shebang line) of the script to suit your
|
126
|
+
environment (it should be unnecessary for Linux and macOS), or run it
|
127
|
+
explicitly with your Ruby command as
|
128
|
+
|
129
|
+
Prompt% /YOUR/ENV/ruby /YOUR/INSTALLED/ruby_unicode_prop
|
130
|
+
|
131
|
+
== Developer's note
|
132
|
+
|
133
|
+
The master of this README file is found in
|
134
|
+
{RubyGems/ruby_unicode_prop}[https://rubygems.org/gems/ruby_unicode_prop]
|
135
|
+
|
136
|
+
=== Tests
|
137
|
+
|
138
|
+
The Ruby files under the directory <tt>test/</tt> are the test scripts.
|
139
|
+
You can run them from the top directory as <tt>ruby test/test_****.rb</tt>
|
140
|
+
or simply run <tt>make test</tt>.
|
141
|
+
|
142
|
+
|
143
|
+
== Known bugs and Todo items
|
144
|
+
|
145
|
+
None.
|
146
|
+
|
147
|
+
|
148
|
+
== Copyright
|
149
|
+
|
150
|
+
Author:: Masa Sakano < info a_t wisebabel dot com >
|
151
|
+
Versions:: The versions of this package follow Semantic Versioning (2.0.0) http://semver.org/
|
152
|
+
License:: MIT
|
153
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,186 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
# Script to print a list of either/both characters or hexadecimal codepoints of
|
5
|
+
# the specified Unicode properties.
|
6
|
+
#
|
7
|
+
# @author: M. Sakano (Wise Babel Ltd)
|
8
|
+
|
9
|
+
require 'optparse'
|
10
|
+
require 'net/http'
|
11
|
+
require 'uri'
|
12
|
+
|
13
|
+
# URL of the plain-text reference list of Unicode property names.
URL_RUBY_UNICODE_PROPS = 'https://raw.githubusercontent.com/k-takata/Onigmo/master/doc/UnicodeProps.txt'

# Largest codepoint that is examined:
# Plane 0-2 (up to Supplementary Ideographic Plane) of Unicode
MAX_UNICODE_HEX = 0x2FFFF

# Banner text placed at the top of the help message.
BANNER = <<"__USAGE__"
USAGE: #{File.basename($0)} [options] Property1 [Property2, ...]
Print all the characters and/or their hex-codepoints that have
the given "Unicode property" used in Ruby Regexp like \\p{Currency_Symbol}
(or POSIX expression like [[:blank:]] if -p option is given).
__USAGE__

# Default values of the command-line options.  This Hash is updated
# in place while the command-line arguments are parsed.
OPTS = {
  :with_char      => true,
  :with_codepoint => true,
  :delimiter      => nil,
  :posix          => false,
  :lowercase      => false,
  :list_property  => false,
  # :chatter => 3, # Default
  :debug          => false,
}
|
34
|
+
|
35
|
+
# Parses the command-line arguments and stores the result in OPTS.
#
# ARGV is modified in place: the recognised options are removed and only
# the positional arguments (property names) are left.  The constant Hash
# OPTS is updated in place with the parsed values, and the default
# delimiter is filled in when none was given.
#
# This method terminates the process (via +exit+) in three cases:
# * --list-property was given: the property list is printed, status 0;
# * both -n and -c were given (nothing would be printed), status 1;
# * no property name was given, status 1.
#
# @return [Hash] Optional-argument hash (the very OPTS object).
# @raise [OptionParser::ParseError] if the options cannot be parsed.
#
def handle_argv
  opt = OptionParser.new(BANNER)
  opt.separator ""
  opt.separator "Options:"
  # NOTE: -c and -n are "without" switches: FalseClass makes the yielded
  # value false when the switch is given, which turns the "with" flag off.
  opt.on('-c', '--[no-]without-codepoint', sprintf("Print characters only? (Def: %s)", (!OPTS[:with_codepoint]).inspect), FalseClass){|v| OPTS[:with_codepoint] = v} # memo: "-c" for "Characters only"
  opt.on('-n', '--[no-]without-char', sprintf("Print codepoints only? (Def: %s)", (!OPTS[:with_char]).inspect), FalseClass){|v| OPTS[:with_char] = v} # memo: "-n" for "codepoints Number only"
  # The description has no format directive, so it is passed as a plain String.
  opt.on('-d', '--delimiter=CHAR', "Delimeter in output.") {|v| OPTS[:delimiter] = v}
  opt.on('-l', '--[no-]lowercase', sprintf("Lower cases alphabets are used for Hex in codepoints (Def: %s)", OPTS[:lowercase].inspect)) {|v| OPTS[:lowercase] = v} # memo: "-l" for "Lower case"
  opt.on('-p', '--[no-]posix', sprintf("Use POSIX expression instead of Unicode (Def: %s)", OPTS[:posix].inspect)) {|v| OPTS[:posix] = v} # memo: "-p" for "POSIX"
  opt.on( '--[no-]list-property', 'Print all the Ruby Unicode properties and exit.') {|v| OPTS[:list_property] = v}
  # opt.on( '--version', "Display the version and exits.", TrueClass) {|v| OPTS[:version] = v} # Consider opts.on_tail
  # opt.on( '--[no-]debug', "Debug (Def: false)") {|v| OPTS[:debug] = v}
  opt.separator ""
  opt.separator "Note1: Delimeter means one"
  opt.separator " (1) between multiple characters and codepoints if either of -n or -c is specified (Default: Null for -c (characters only) and a new line for -n."
  opt.separator " (2) between the number and character of each pair if both are specified (Def: a whitespace), whereas the delimeter between pairs is always a newline."
  opt.separator " To specify a newline as a delimiter, give 'NL'"
  opt.separator "Note2: Properties differ for '-p', 'ascii' in POSIX and 'ASCII' in Unicode."
  opt.separator "Reference: https://github.com/k-takata/Onigmo/blob/master/doc/UnicodeProps.txt"
  # opt.separator " Ruby-Source: /enc/unicode/name2ctype.h"

  opt.parse!(ARGV)

  # --list-property: print the list and finish.  Printing the list is a
  # successful run, hence the exit status is 0 (the fetching routine
  # exits with 1 by itself when the download fails).
  if OPTS[:list_property]
    puts get_file_unicode_properties
    exit
  end

  # -n and -c together would leave nothing to print.
  if !OPTS[:with_char] && !OPTS[:with_codepoint]
    warn "Specify (or do not specify at all) what to print (do not specify -n and -c simultaneously)."
    exit 1
  end

  if ARGV.size == 0
    warn "ERROR: Specify Unicode property(ies). To see help, run #{File.basename($0)} --help"
    exit 1
  end

  # Adjustments
  OPTS[:delimiter] = "\n" if 'NL' == OPTS[:delimiter] # Special case; 'NL' means a newline.
  OPTS[:delimiter] ||=
    if OPTS[:with_char] && OPTS[:with_codepoint]
      " "  # between the codepoint and the character of a pair
    elsif OPTS[:with_char]
      ""   # characters only: no separator
    else
      $/   # codepoints only: one per line
    end

  OPTS
end
|
90
|
+
|
91
|
+
# Returns the array of output entries for the characters that satisfy
# any of the given properties.
#
# Every codepoint from 0 to MAX_UNICODE_HEX is tested, in ascending
# order, against a single Regexp built from all the given property
# names, so the result is in codepoint order without duplication.
#
# If the property names do not form a valid Regexp, a message is
# written to STDERR and the process exits with status 1.
#
# @param argv [Array<String>] Property names (usually ARGV after option parsing).
# @param opts [Hash] Hash of command-line options.  The keys read here are
#   :posix, :lowercase, :with_char, :with_codepoint and :delimiter.
# @return [Array<String>] One entry per matching character: the character,
#   its hex codepoint of at least 4 digits, or both joined with opts[:delimiter].
def get_ary_chars(argv, opts)
  if opts[:posix]
    # for 'ascii' and 'blank', joined as [[:ascii:][:blank:]]
    fmt1 = '[%s]'
    fmt2 = '[:%s:]'
    fmt_deli = ""
  else
    # for 'ASCII' and 'Digit', joined as (?:\p{ASCII}|\p{Digit})
    fmt1 = '(?:%s)'
    fmt2 = '\\p{%s}'
    fmt_deli = '|'
  end

  prop_q = fmt1 % argv.map{|c| fmt2 % Regexp.quote(c)}.join(fmt_deli)

  fmt0 = (opts[:lowercase] ? "%04x" : "%04X")

  # The pattern does not depend on the codepoint; compile it only once
  # rather than once per iteration of the loop below.
  begin
    regex = Regexp.new(prop_q)
  rescue RegexpError => er
    warn '(RegexpError) '+er.message
    exit 1
  end

  arret = []
  (0..MAX_UNICODE_HEX).each do |i|
    begin
      s = i.chr(Encoding::UTF_8)
    rescue RangeError
      next  # invalid codepoint in UTF-8 (e.g., surrogate 0xD800)
    end
    next if regex !~ s

    arret <<
      if opts[:with_char] && opts[:with_codepoint]
        (fmt0+"%s%s") % [i, opts[:delimiter], s]
      elsif opts[:with_char]
        s
      else
        fmt0 % i
      end
  end

  arret
end
|
134
|
+
|
135
|
+
|
136
|
+
# Fetches the reference list of Ruby Unicode properties over the network.
#
# The file is downloaded from URL_RUBY_UNICODE_PROPS.
#
# If encountering ERROR, this directly exits!  That is, when the host
# cannot be reached (name resolution failure, refused connection,
# timeout, broken connection) or when the HTTP status is not 200, a
# message is written to STDERR and the process exits with status 1.
#
# @return [String] A line "Fetched from: URL" followed by the body of the fetched file.
def get_file_unicode_properties
  url = URL_RUBY_UNICODE_PROPS
  begin
    resp = Net::HTTP.get_response( URI.parse url )
  rescue SocketError, SystemCallError, Timeout::Error, IOError => er
    # SocketError: name resolution; SystemCallError: Errno::* such as a
    # refused connection; Timeout::Error: open/read timeouts.
    warn(er.message + "\nERROR: Host not reachable: " + url)
    exit 1
  end

  return "Fetched from: %s\n%s" % [url, resp.body] if resp.code.to_i == 200

  warn "ERROR in HTTP response (#{resp.code}) - File not found or something: "+url
  exit 1
end
|
157
|
+
|
158
|
+
|
159
|
+
################################################
|
160
|
+
# MAIN
|
161
|
+
################################################
|
162
|
+
|
163
|
+
# Unbuffered output so that STDOUT and STDERR stay in order.
$stdout.sync=true
$stderr.sync=true

# Handle the command-line options => OPTS
#
# OptionParser::ParseError is the common parent of InvalidOption,
# AmbiguousOption, MissingArgument (e.g., "-d" without a value), etc.
# Any of them ends the run with a one-line message and status 1.
begin
  handle_argv()
rescue OptionParser::ParseError => er
  warn er
  exit 1
end

arret = get_ary_chars(ARGV, OPTS)

# Codepoint-character pairs are always separated by a newline (the
# delimiter is used inside each pair); otherwise the delimiter
# separates the entries themselves.
entry_separator = (OPTS[:with_char] && OPTS[:with_codepoint]) ? $/ : OPTS[:delimiter]
puts arret.join(entry_separator)

exit
|
183
|
+
|
184
|
+
__END__
|
185
|
+
|
186
|
+
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'rake'
|
4
|
+
|
5
|
+
# Gem specification of ruby_unicode_prop.
#
# This must be evaluated in the top directory of the package: the
# directory name is checked against the gem name, the executable is
# looked up under bin/, and .gitignore is read to filter the file list.
Gem::Specification.new do |s|
  # Sanity check: the gem name must agree with the name of the current directory.
  s.name = %q{ruby_unicode_prop}.sub(/.*/){|c| (c == File.basename(Dir.pwd)) ? c : raise("ERROR: s.name=(#{c}) in gemspec seems wrong!")}
  s.version = "1.0"
  # s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.bindir = 'bin'
  %w(ruby_unicode_prop).each do |f|
    path = s.bindir+'/'+f
    File.executable?(path) ? s.executables << f : raise("ERROR: Executable (#{path}) is not executable!")
  end
  s.authors = ["Masa Sakano"]
  s.date = %q{2019-10-26}
  s.summary = %q{Command to print the characters and hex-codepoints with the given Unicode properties}
  s.description = %q{This gem provides a standalone command ruby_unicode_prop, which prints the Unicode characters and/or their hexadecimal codepoints that satisfy one or more given Unicode properties used in Ruby Regexp (e.g., Katakana or Currency_Symbol) or POSIX bracket expressions (e.g., blank).}
  # s.email = %q{abc@example.com}
  s.extra_rdoc_files = [
    #"LICENSE.txt",
    "README.en.rdoc",
  ]
  s.license = 'MIT'

  # Patterns in .gitignore, read only once.  Each one is matched against
  # the basename of a candidate file; matching files are not packaged.
  ignore_patterns = IO.readlines('.gitignore').map{|i| i.chomp}
  s.files = FileList['.gitignore','lib/**/*.rb','[A-Z]*','test/**/*.rb', '*.gemspec', 'bin'].to_a.delete_if{ |f|
    ignore_patterns.any?{|pattern| File.fnmatch(pattern, File.basename(f))}
  }
  s.files.reject! { |fn| File.symlink? fn }
  # s.add_runtime_dependency 'rails'
  # s.add_development_dependency "bourne", [">= 0"]
  s.homepage = %q{https://www.wisebabel.com}
  s.rdoc_options = ["--charset=UTF-8"]

  # s.require_paths = ["lib"] # Default "lib"
  s.required_ruby_version = '>= 2.0'
  s.test_files = Dir['test/**/*.rb']
  s.test_files.reject! { |fn| File.symlink? fn }
  # s.requirements << 'libmagick, v6.0' # Simply, info to users.
  # s.rubygems_version = %q{1.3.5} # This is always set automatically!!

  s.metadata["yard.run"] = "yri" # use "yard" to build full HTML docs.
end
|
50
|
+
|
@@ -0,0 +1,140 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# @author: M. Sakano (Wise Babel Ltd)
|
4
|
+
|
5
|
+
require 'plain_text'
|
6
|
+
require 'open3'
|
7
|
+
|
8
|
+
$stdout.sync=true
|
9
|
+
$stderr.sync=true
|
10
|
+
# print '$LOAD_PATH=';p $LOAD_PATH
|
11
|
+
|
12
|
+
#################################################
|
13
|
+
# Unit Test
|
14
|
+
#################################################
|
15
|
+
|
16
|
+
gem "minitest"
|
17
|
+
# require 'minitest/unit'
|
18
|
+
require 'minitest/autorun'
|
19
|
+
|
20
|
+
# Tests of the command-line executable bin/ruby_unicode_prop.
#
# Every test runs the executable as a child process and examines its
# STDOUT, STDERR and exit status.
#
# The parent class is spelled Minitest::Test; the old alias "MiniTest"
# is no longer defined by default in recent versions of minitest.
class TestUnitRubyUnicodeProp < Minitest::Test
  T = true
  F = false
  SCFNAME = File.basename(__FILE__)
  # Path of the executable, derived from the name of this file (test_XXX.rb => ../bin/XXX)
  EXE = "%s/../bin/%s" % [File.dirname(__FILE__), File.basename(__FILE__).sub(/^test_(.+)\.rb/, '\1')]

  def setup
  end

  def teardown
  end

  # Normal cases: the output formats and the combinations of options.
  def test_ruby_unicode_prop01
    o, e, s = Open3.capture3 EXE+" ASCII"
    assert_equal 0, s.exitstatus
    assert_match(/\A0000 .\n0001 .\n.*\n004A J/m, o)
    assert_operator 70, '<', o.count($/)
    assert_empty e
    size_ascii = o.size

    # -l => lower-case hex digits
    o, e, s = Open3.capture3 EXE+" -l ASCII"
    assert_equal 0, s.exitstatus
    assert_match(/\A0000 .\n0001 .\n.*\n004a J/m, o)

    o, e, s = Open3.capture3 EXE+" ascii"
    assert_equal 0, s.exitstatus
    assert_match(/\A0000 .\n0001 .\n.*\n004A J/m, o)
    assert_operator 70, '<', o.count($/)
    assert_empty e

    o, e, s = Open3.capture3 EXE+" -p ASCII" # => Error (b/c "ascii" in the POSIX form)
    assert_equal 1, s.exitstatus
    assert_match(/invalid POSIX/i, e)

    # Delimiter between the codepoint and character of each pair
    o, e, s = Open3.capture3 EXE+" -d H ASCII"
    assert_equal 0, s.exitstatus
    assert_match(/\A0000H.\n0001H.\n.*\n004AHJ/m, o)
    o, e, s = Open3.capture3 EXE+" --delimiter=H ASCII"
    assert_match(/\A0000H.\n0001H.\n.*\n004AHJ/m, o)

    o, e, s = Open3.capture3 EXE+" --without-codepoint ASCII" # characters only (-c)
    assert_equal 0, s.exitstatus
    assert_equal 0, o[40..-1].chomp.count($/) # "\n" is included in ASCII itself.
    assert_equal 1, o.count(" ")
    assert_match(/XYZ/m, o)
    assert_empty e

    o, e, s = Open3.capture3 EXE+" -c -d H ASCII" # characters only (-c)
    assert_equal 0, o[40..-1].chomp.count($/) # "\n" is included in ASCII itself.
    assert_equal 1, o.count(" ")
    assert_match(/XHYHZ/m, o)
    assert_empty e

    o, e, s = Open3.capture3 EXE+" -c -d NL ASCII" # characters only (-c)
    assert_operator 70, '<', o.count($/), "Special case of 'NL' is not handled correctly."
    assert_equal 1, o.count(" ")
    assert_match(/X\nY\nZ/m, o)
    assert_empty e

    o, e, s = Open3.capture3 EXE+" --without-char ASCII" # codepoints only (-n)
    assert_equal 0, o.count(" ")
    assert_equal 0, o.count("X")
    assert_operator 70, '<', o.count($/)
    assert_equal "0000\n0001\n", o[0..9]
    assert_empty e

    o, e, s = Open3.capture3 EXE+" -n -d H ASCII" # codepoints only (-n)
    assert_equal 0, o[60..-1].chomp.count($/) # "\n" is included in ASCII itself.
    assert_operator 70, '<', o.count(?H)
    assert_equal "0000H0001H", o[0..9]
    assert_match(/\A0000H0001H.*H004AH/m, o)
    assert_empty e

    o, e, s = Open3.capture3 EXE+" -l -n -d H ASCII" # codepoints only (-n)
    assert_match(/\A0000H0001H.*H004aH/m, o)

    # Multiple arguments
    o, e, s = Open3.capture3 EXE+" Currency_Symbol ASCII"
    assert_equal 0, s.exitstatus
    assert_match(/0023 \#\n0024 \$\n.*00A3 £/m, o)
    assert_operator size_ascii, '<', o.size # Increased size (because Currency Symbols are added!)

    o, e, s = Open3.capture3 EXE+" ASCII Digit"
    ou_ad = o
    si_ad = o.size
    o, e, s = Open3.capture3 EXE+" Digit ASCII"
    ou_da = o
    o, e, s = Open3.capture3 EXE+" Digit"
    size_digit = o.size

    # The order of the arguments must not affect the output.
    assert_equal ou_ad, ou_da, 'Should be unordered for multiple arguments.'
    assert_operator si_ad, '<', size_ascii + size_digit, 'Duplication should not appear.'
  end


  ## tests of Errors ##
  def test_ruby_unicode_prop_error02
    o, e, s = Open3.capture3 EXE+" ASCII -d" # => Error (-d without a parameter)
    assert_equal 1, s.exitstatus
    assert_empty o

    o, e, s = Open3.capture3 EXE+" -n -c ASCII" # => Error
    assert_equal 1, s.exitstatus
    assert_empty o
    assert_match(/specify/m, e)

    o, e, s = Open3.capture3 EXE+" -Z ASCII" # => Error
    assert_equal 1, s.exitstatus
    assert_match(/(invalid|ambiguous) option/i, e)

    o, e, s = Open3.capture3 EXE+" naiyo" # => Error
    assert_equal 1, s.exitstatus
    assert_empty o
    assert_match(/RegexpError/m, e)

    o, e, s = Open3.capture3 EXE # => Error (No arguments specified)
    assert_equal 1, s.exitstatus
    assert_empty o
  end
end # class TestUnitRubyUnicodeProp < Minitest::Test
|
140
|
+
|
metadata
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ruby_unicode_prop
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '1.0'
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Masa Sakano
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-10-26 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: This module provides utility functions and methods to handle plain text,
|
14
|
+
classes Part/Paragraph/Boundary to represent the logical structure of a document
|
15
|
+
and ParseRule to describe the rules to parse plain text to produce a Part-type Ruby
|
16
|
+
instance.
|
17
|
+
email:
|
18
|
+
executables:
|
19
|
+
- ruby_unicode_prop
|
20
|
+
extensions: []
|
21
|
+
extra_rdoc_files:
|
22
|
+
- README.en.rdoc
|
23
|
+
files:
|
24
|
+
- ".gitignore"
|
25
|
+
- ChangeLog
|
26
|
+
- LICENSE.txt
|
27
|
+
- Makefile
|
28
|
+
- README.en.rdoc
|
29
|
+
- Rakefile
|
30
|
+
- bin/ruby_unicode_prop
|
31
|
+
- ruby_unicode_prop.gemspec
|
32
|
+
- test/test_ruby_unicode_prop.rb
|
33
|
+
homepage: https://www.wisebabel.com
|
34
|
+
licenses:
|
35
|
+
- MIT
|
36
|
+
metadata:
|
37
|
+
yard.run: yri
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options:
|
40
|
+
- "--charset=UTF-8"
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.0'
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
requirements: []
|
54
|
+
rubygems_version: 3.0.3
|
55
|
+
signing_key:
|
56
|
+
specification_version: 4
|
57
|
+
summary: Command to print the characters and hex-codepoints with the given Unicode
|
58
|
+
properties
|
59
|
+
test_files:
|
60
|
+
- test/test_ruby_unicode_prop.rb
|