unf_ext 0.0.8.2.beta-x64-mingw-ucrt
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.github/workflows/unf_ext.yml +43 -0
- data/.gitignore +19 -0
- data/CHANGELOG.md +62 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +57 -0
- data/Rakefile +37 -0
- data/ext/unf_ext/extconf.rb +50 -0
- data/ext/unf_ext/unf/normalizer.hh +139 -0
- data/ext/unf_ext/unf/table.hh +13617 -0
- data/ext/unf_ext/unf/trie/char_stream.hh +150 -0
- data/ext/unf_ext/unf/trie/node.hh +25 -0
- data/ext/unf_ext/unf/trie/searcher.hh +194 -0
- data/ext/unf_ext/unf/util.hh +24 -0
- data/ext/unf_ext/unf.cc +75 -0
- data/lib/3.1/unf_ext.so +0 -0
- data/lib/unf_ext/version.rb +5 -0
- data/lib/unf_ext.rb +5 -0
- data/test/helper.rb +18 -0
- data/test/normalization-test.txt +112332 -0
- data/test/test_unf_ext.rb +40 -0
- data/unf_ext.gemspec +34 -0
- metadata +157 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 0df2b2042b01091aaf309d44de895e1085657b92f1fe7f37c87beb41c0242f59
|
4
|
+
data.tar.gz: a9b406b2790051a65f2fedb2942bfecb6fa46836d25f096e1e5519380a4d57e8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 607c0bf6f8c11b205c7b755525ead44b5e90b21dcebfb90e0932b1f5ae09c79e0b92d8e804f8fec28c92296198bd7514097c98cb2b904bfac6a0886ccd6973ce
|
7
|
+
data.tar.gz: 12fc5c1d7f14828b9e07c4a986cd0f60084211685e6339ff63cdede0bab2bc41c9a1c1f774b207f7c3fbcffc774ed7a20ec88dd2d6635f3d77aebbdc132e720a
|
data/.document
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
name: >-
|
8
|
+
${{ matrix.os }} ${{ matrix.ruby }}
|
9
|
+
|
10
|
+
runs-on: ${{ matrix.os }}
|
11
|
+
strategy:
|
12
|
+
fail-fast: false
|
13
|
+
matrix:
|
14
|
+
os: [ ubuntu-20.04, macos-11.0, windows-2019 ]
|
15
|
+
ruby: [ 2.6, 2.7, "3.0", 3.1, head ]
|
16
|
+
include:
|
17
|
+
- { os: windows-2019, ruby: mingw }
|
18
|
+
exclude:
|
19
|
+
- { os: windows-2019, ruby: head }
|
20
|
+
|
21
|
+
steps:
|
22
|
+
- name: repo checkout
|
23
|
+
uses: actions/checkout@v2
|
24
|
+
|
25
|
+
- name: load ruby cross-compilation toolkit
|
26
|
+
uses: MSP-Greg/setup-ruby-pkgs@v1
|
27
|
+
with:
|
28
|
+
ruby-version: ${{ matrix.ruby }}
|
29
|
+
mingw: _upgrade_
|
30
|
+
|
31
|
+
- name: bundle install
|
32
|
+
shell: pwsh
|
33
|
+
run: |
|
34
|
+
bundle config set --local path .bundle/vendor
|
35
|
+
bundle install --jobs 4 --retry 3
|
36
|
+
|
37
|
+
- name: compile
|
38
|
+
timeout-minutes: 5
|
39
|
+
run: bundle exec rake compile
|
40
|
+
|
41
|
+
- name: test
|
42
|
+
timeout-minutes: 5
|
43
|
+
run: bundle exec rake test
|
data/.gitignore
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
## 0.0.8.2 (2022-04-13)
|
2
|
+
|
3
|
+
- Add x64-mingw-ucrt native gem support for RubyInstaller 3.1.
|
4
|
+
|
5
|
+
## 0.0.8.1 (2022-03-13)
|
6
|
+
|
7
|
+
- Include Windows binaries for Ruby 3.1. (FAIL)
|
8
|
+
|
9
|
+
## 0.0.8 (2021-09-14)
|
10
|
+
|
11
|
+
- No functional change in the library code.
|
12
|
+
- Include Windows binaries for Ruby 3.0.
|
13
|
+
- Drop support for Ruby 2.1 and earlier.
|
14
|
+
- Replace Travis CI with GitHub Actions.
|
15
|
+
- Fix cross-build after upgrading rake-compiler/rake-compiler-dock to 1.1.1/1.1.0.
|
16
|
+
|
17
|
+
## 0.0.7.7 (2020-03-30)
|
18
|
+
|
19
|
+
- Include Windows binaries for Ruby 2.7.
|
20
|
+
|
21
|
+
## 0.0.7.6 (2019-03-19)
|
22
|
+
|
23
|
+
- Include Windows binaries for Ruby 2.6.
|
24
|
+
|
25
|
+
## 0.0.7.5 (2018-02-06)
|
26
|
+
|
27
|
+
- Include Windows binaries for Ruby 2.5.
|
28
|
+
|
29
|
+
## 0.0.7.4 (2017-04-19)
|
30
|
+
|
31
|
+
- Fix build on ARM and GCC 6 again.
|
32
|
+
|
33
|
+
## 0.0.7.3 (2017-04-11)
|
34
|
+
|
35
|
+
- Update the base Unicode version to 9.
|
36
|
+
|
37
|
+
- Fix compile issues on ARM and GCC 6.
|
38
|
+
|
39
|
+
## 0.0.7.2 (2016-02-01)
|
40
|
+
|
41
|
+
- Include Windows binaries for Ruby 2.3.
|
42
|
+
|
43
|
+
## 0.0.7.1 (2015-04-18)
|
44
|
+
|
45
|
+
- Windows fat binary gems no longer require libstdc++ to
|
46
|
+
run, which is now statically linked.
|
47
|
+
|
48
|
+
- Add a fat binary gem for x64-mingw32 (64bit Windows).
|
49
|
+
|
50
|
+
- Windows fat binary gems now include binaries for Ruby up to 2.2.
|
51
|
+
|
52
|
+
## 0.0.6 (2013-02-16)
|
53
|
+
|
54
|
+
- Migrate from Jeweler to Bundler.
|
55
|
+
|
56
|
+
## 0.0.5 (2012-05-30)
|
57
|
+
|
58
|
+
- Fix a type error for strict compilers.
|
59
|
+
|
60
|
+
## 0.0.4 (2011-12-08)
|
61
|
+
|
62
|
+
- Release under the current name of `unf_ext`.
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2010 Takeru Ohta <phjgt308@gmail.com>
|
4
|
+
Copyright (c) 2011-2018 Akinori MUSHA <knu@idaemons.org> (extended Ruby support)
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
8
|
+
in the Software without restriction, including without limitation the rights
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
11
|
+
furnished to do so, subject to the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
14
|
+
all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
22
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
ruby-unf_ext
|
2
|
+
============
|
3
|
+
|
4
|
+
Synopsis
|
5
|
+
--------
|
6
|
+
|
7
|
+
* Unicode Normalization Form support library for CRuby
|
8
|
+
|
9
|
+
Description
|
10
|
+
-----------
|
11
|
+
|
12
|
+
* Normalizes UTF-8 strings into and from NFC, NFD, NFKC or NFKD
|
13
|
+
|
14
|
+
# For bulk conversion
|
15
|
+
normalizer = UNF::Normalizer.new
|
16
|
+
a_bunch_of_strings.map! { |string|
|
17
|
+
normalizer.normalize(string, :nfc) #=> string in NFC
|
18
|
+
}
|
19
|
+
|
20
|
+
* Compliant with Unicode 9.0
|
21
|
+
|
22
|
+
Requirement
|
23
|
+
-----------
|
24
|
+
|
25
|
+
* Ruby 2.2 or later (support for Ruby 2.1 and earlier was dropped in 0.0.8)
|
26
|
+
|
27
|
+
* C++ compiler and libstdc++
|
28
|
+
|
29
|
+
Installation
|
30
|
+
------------
|
31
|
+
|
32
|
+
gem install unf_ext
|
33
|
+
|
34
|
+
Or:
|
35
|
+
|
36
|
+
ruby extconf.rb && make && make install
|
37
|
+
|
38
|
+
Development Resources
|
39
|
+
---------------------
|
40
|
+
|
41
|
+
* https://github.com/sile/unf
|
42
|
+
|
43
|
+
For issues regarding files under the directory `unf`, please
|
44
|
+
contact this upstream.
|
45
|
+
|
46
|
+
* https://github.com/knu/ruby-unf_ext
|
47
|
+
|
48
|
+
The development site and the repository.
|
49
|
+
|
50
|
+
License
|
51
|
+
-------
|
52
|
+
|
53
|
+
Copyright (c) 2010-2017 Takeru Ohta
|
54
|
+
Copyright (c) 2011-2018 Akinori MUSHA
|
55
|
+
|
56
|
+
Licensed under the MIT license.
|
57
|
+
See `LICENSE.txt` for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# Rake tasks for building, cross-compiling and testing the unf_ext gem.
require 'bundler/gem_tasks'

gemspec = Bundler::GemHelper.gemspec

# Windows platforms for which precompiled (native) gems are produced.
native_platforms = %w[x86-mingw32 x64-mingw32 x64-mingw-ucrt]

require 'rake/extensiontask'
Rake::ExtensionTask.new('unf_ext', gemspec) do |ext|
  ext.cross_compile = true
  ext.cross_platform = native_platforms
  # Passed through to extconf.rb when cross-compiling: link libgcc and
  # libstdc++ statically into the extension.
  ext.cross_config_options.push(
    '--with-ldflags="-static-libgcc"',
    '--with-static-libstdc++'
  )
end

namespace :gem do
  # Builds one native gem per platform via rake-compiler-dock.
  task :native do
    require 'rake_compiler_dock'
    sh 'bundle package --all'
    native_platforms.each { |plat|
      RakeCompilerDock.sh "bundle --local && rake native:#{plat} gem", platform: plat
    }
  end
end

# Builds the source gem followed by all native gems.
task gems: %i[build gem:native]

require 'rake/testtask'
Rake::TestTask.new(:test) do |t|
  t.libs << 'test'
  t.test_files = gemspec.test_files
  t.verbose = true
end

task default: :test
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# mkmf script that generates the Makefile for the unf_ext C++ extension.
require 'mkmf'

if with_config('static-libstdc++')
  # Ask the C compiler where its static libstdc++ archive lives and link it
  # directly.
  static_libstdcxx = `#{CONFIG['CC']} -print-file-name=libstdc++.a`.chomp
  $LDFLAGS << ' ' << static_libstdcxx
else
  have_library('stdc++')

  case RbConfig::CONFIG['host_os']
  # NOTE(review): `(!?2.11)` is an optional literal `!` followed by `2`, any
  # character, `11` -- it is not a lookahead. Confirm that this is the intent.
  when /solaris(!?2.11)/
    # Do a little trickery here to enable C++ standard on Solaris 11 if found.
    # This also forces 64bit compilation mode.
    $CXX = CONFIG['CXX']
    $CXX << ' -m64' # appends in place, so CONFIG['CXX'] sees the flag too
    cflags_without_c99 = CONFIG['CFLAGS'].gsub(/-std=c99/, '')
    $CFLAGS = cflags_without_c99 + ' -m64 -std=c++11'
    $CPPFLAGS = cflags_without_c99 + ' -m64 -std=c++11'
    $CXXFLAGS = cflags_without_c99 + ' -m64 -std=c++11'
  when /aix/
    # Compiler flags necessary on AIX.
    # rubocop:disable Style/GlobalVars
    [$CFLAGS, $CPPFLAGS, $CXXFLAGS].each { |flags| flags << ' -D_ALL_SOURCE=1' }
  end
end

create_makefile 'unf_ext'

# When this Ruby was built without a C++ compiler setting, derive one from the
# C compiler name and prepend it to the generated Makefile.
unless CONFIG['CXX']
  cxx =
    case CONFIG['CC']
    when %r{((?:.*[-/])?)gcc([-0-9.]*)$}
      "#{$1}g++#{$2}"
    when %r{((?:.*[-/])?)clang([-0-9.]*)$}
      "#{$1}clang++#{$2}"
    else
      CONFIG['CC']
    end

  warn "CXX is automatically set to #{cxx}"

  patched_makefile = "CXX=#{cxx}\n" + File.read('Makefile')
  File.open('Makefile', 'w') { |mf| mf.print patched_makefile }
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
#ifndef UNF_NORMALIZER_HH
|
2
|
+
#define UNF_NORMALIZER_HH
|
3
|
+
|
4
|
+
#include <vector>
|
5
|
+
#include <string>
|
6
|
+
#include <algorithm>
|
7
|
+
#include <cstring>
|
8
|
+
#include "trie/searcher.hh"
|
9
|
+
#include "trie/char_stream.hh"
|
10
|
+
#include "table.hh"
|
11
|
+
#include "util.hh"
|
12
|
+
|
13
|
+
namespace UNF {
|
14
|
+
class Normalizer {
|
15
|
+
public:
|
16
|
+
enum Form { FORM_NFD, FORM_NFC, FORM_NFKD, FORM_NFKC };
|
17
|
+
|
18
|
+
public:
|
19
|
+
Normalizer()
|
20
|
+
: nf_d(TABLE::NODES, TABLE::CANONICAL_DECOM_ROOT, (const char *)TABLE::STRINGS),
|
21
|
+
nf_kd(TABLE::NODES, TABLE::COMPATIBILITY_DECOM_ROOT, (const char *)TABLE::STRINGS),
|
22
|
+
nf_c(TABLE::NODES, TABLE::CANONICAL_COM_ROOT, (const char *)TABLE::STRINGS),
|
23
|
+
nf_c_qc(TABLE::NODES, TABLE::NFC_ILLEGAL_ROOT),
|
24
|
+
nf_kc_qc(TABLE::NODES, TABLE::NFKC_ILLEGAL_ROOT),
|
25
|
+
ccc(TABLE::NODES, TABLE::CANONICAL_CLASS_ROOT)
|
26
|
+
{}
|
27
|
+
|
28
|
+
const char* normalize(const char* src, Form form) {
|
29
|
+
switch(form) {
|
30
|
+
case FORM_NFD: return nfd(src);
|
31
|
+
case FORM_NFC: return nfc(src);
|
32
|
+
case FORM_NFKD: return nfkd(src);
|
33
|
+
case FORM_NFKC: return nfkc(src);
|
34
|
+
default: return src;
|
35
|
+
}
|
36
|
+
}
|
37
|
+
const char* nfd(const char* src) { return decompose(src, nf_d); }
|
38
|
+
const char* nfkd(const char* src) { return decompose(src, nf_kd); }
|
39
|
+
const char* nfc(const char* src) { return compose(src, nf_c_qc, nf_d); }
|
40
|
+
const char* nfkc(const char* src) { return compose(src, nf_kc_qc, nf_kd); }
|
41
|
+
|
42
|
+
private:
|
43
|
+
const char* decompose(const char* src, const Trie::NormalizationForm& nf) {
|
44
|
+
const char* beg = next_invalid_char(src, nf);
|
45
|
+
if(*beg=='\0')
|
46
|
+
return src;
|
47
|
+
|
48
|
+
buffer.assign(src, beg);
|
49
|
+
do {
|
50
|
+
const char* end = next_valid_starter(beg, nf);
|
51
|
+
decompose_one(beg, end, nf, buffer);
|
52
|
+
beg = next_invalid_char(end, nf);
|
53
|
+
buffer.append(end, beg);
|
54
|
+
} while(*beg!='\0');
|
55
|
+
|
56
|
+
return buffer.c_str();
|
57
|
+
}
|
58
|
+
|
59
|
+
void decompose_one(const char* beg, const char* end, const Trie::NormalizationForm& nf, std::string& buf) {
|
60
|
+
unsigned last = buf.size();
|
61
|
+
nf.decompose(Trie::RangeCharStream(beg,end), buf);
|
62
|
+
char* bufbeg = const_cast<char*>(buf.data());
|
63
|
+
canonical_combining_class_ordering(bufbeg+last, bufbeg+buf.size());
|
64
|
+
}
|
65
|
+
|
66
|
+
const char* compose(const char* src, const Trie::NormalizationForm& nf, const Trie::NormalizationForm& nf_decomp) {
|
67
|
+
const char* beg = next_invalid_char(src, nf);
|
68
|
+
if(*beg=='\0')
|
69
|
+
return src;
|
70
|
+
|
71
|
+
buffer.assign(src, beg);
|
72
|
+
while(*beg!='\0') {
|
73
|
+
const char* end = next_valid_starter(beg, nf);
|
74
|
+
buffer2.clear();
|
75
|
+
decompose_one(beg, end, nf_decomp, buffer2);
|
76
|
+
end = compose_one(buffer2.c_str(), end, buffer);
|
77
|
+
beg = next_invalid_char(end, nf);
|
78
|
+
buffer.append(end, beg);
|
79
|
+
}
|
80
|
+
|
81
|
+
return buffer.c_str();
|
82
|
+
}
|
83
|
+
|
84
|
+
const char* compose_one(const char* starter, const char* rest_starter, std::string& buf) {
|
85
|
+
Trie::CharStreamForComposition in(starter, rest_starter, canonical_classes, buffer3);
|
86
|
+
while(in.within_first())
|
87
|
+
nf_c.compose(in, buf);
|
88
|
+
return in.cur();
|
89
|
+
}
|
90
|
+
|
91
|
+
void canonical_combining_class_ordering(char* beg, const char* end) {
|
92
|
+
canonical_classes.assign(end-beg+1, 0); // +1 is for sentinel value
|
93
|
+
ccc.sort(beg, canonical_classes);
|
94
|
+
}
|
95
|
+
|
96
|
+
const char* next_invalid_char(const char* src, const Trie::NormalizationForm& nf) const {
|
97
|
+
int last_canonical_class = 0;
|
98
|
+
const char* cur = Util::nearest_utf8_char_start_point(src);
|
99
|
+
const char* starter = cur;
|
100
|
+
|
101
|
+
for(; *cur != '\0'; cur = Util::nearest_utf8_char_start_point(cur+1)) {
|
102
|
+
int canonical_class = ccc.get_class(cur);
|
103
|
+
if(last_canonical_class > canonical_class && canonical_class != 0)
|
104
|
+
return starter;
|
105
|
+
|
106
|
+
if(nf.quick_check(cur)==false)
|
107
|
+
return starter;
|
108
|
+
|
109
|
+
if(canonical_class==0)
|
110
|
+
starter=cur;
|
111
|
+
|
112
|
+
last_canonical_class = canonical_class;
|
113
|
+
}
|
114
|
+
return cur;
|
115
|
+
}
|
116
|
+
|
117
|
+
const char* next_valid_starter(const char* src, const Trie::NormalizationForm& nf) const {
|
118
|
+
const char* cur = Util::nearest_utf8_char_start_point(src+1);
|
119
|
+
while(ccc.get_class(cur)!=0 || nf.quick_check(cur)==false)
|
120
|
+
cur = Util::nearest_utf8_char_start_point(cur+1);
|
121
|
+
return cur;
|
122
|
+
}
|
123
|
+
|
124
|
+
private:
|
125
|
+
const Trie::NormalizationForm nf_d;
|
126
|
+
const Trie::NormalizationForm nf_kd;
|
127
|
+
const Trie::NormalizationForm nf_c;
|
128
|
+
const Trie::NormalizationForm nf_c_qc;
|
129
|
+
const Trie::NormalizationForm nf_kc_qc;
|
130
|
+
const Trie::CanonicalCombiningClass ccc;
|
131
|
+
|
132
|
+
std::string buffer;
|
133
|
+
std::string buffer2;
|
134
|
+
std::string buffer3;
|
135
|
+
std::vector<unsigned char> canonical_classes;
|
136
|
+
};
|
137
|
+
}
|
138
|
+
|
139
|
+
#endif
|