forkforge 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ # encoding: utf-8
2
+
3
module Forkforge
  # Proxy around an instance of a handler class that is looked up by name.
  #
  # Calls the wrapped handler responds to are forwarded to it; the names of
  # all other calls are collected (most recent first) in an internal list
  # that is emptied again after the next successfully forwarded call.
  class Handler
    # handler_class - String naming a constant, optionally namespaced with
    #                 '::'; it is resolved step by step starting at Object.
    # args          - arguments handed to the resolved class' +new+.
    def initialize handler_class, *args
      klass = handler_class.split('::').reduce(Object) { |scope, const| scope.const_get(const) }
      @handler = klass.new(*args)
      @delayed = []
    end

    # Forwards +method+ to the wrapped handler when it responds to it and
    # then clears the list of remembered names; otherwise the method name is
    # put at the front of that list. In both cases the list is returned.
    def method_missing method, *args, &block
      return @delayed.unshift(method.to_s) unless @handler.respond_to? method

      @handler.send(method, *args, &block)
      @delayed.clear
    end

    # Reads the file +input+ and evaluates its content in the context of this
    # proxy, so bare identifiers in the file end up in +method_missing+.
    #
    # NOTE(review): the file content is executed as Ruby code; only feed
    # trusted files to this method.
    def parse input
      instance_eval %Q{
        #{File.read input}
      }
    end
  end
end
@@ -0,0 +1,66 @@
1
+ # encoding: utf-8
2
+
3
+ require 'forkforge/internal/unicode_data'
4
+
5
module Forkforge

  # Canonical combining classes, keyed by the class number as it appears
  # (as a string) in the data files:
  #
  #     0: Spacing, split, enclosing, reordrant, and Tibetan subjoined
  #     1: Overlays and interior
  #     7: Nuktas
  #     8: Hiragana/Katakana voicing marks
  #     9: Viramas
  #    10: Start of fixed position classes
  #   199: End of fixed position classes
  #   200: Below left attached
  #   202: Below attached
  #   204: Below right attached
  #   208: Left attached (reordrant around single base character)
  #   210: Right attached
  #   212: Above left attached
  #   214: Above attached
  #   216: Above right attached
  #   218: Below left
  #   220: Below
  #   222: Below right
  #   224: Left (reordrant around single base character)
  #   226: Right
  #   228: Above left
  #   230: Above
  #   232: Above right
  #   233: Double below
  #   234: Double above
  #   240: Below (iota subscript)
  module CanonicalCombiningClasses
    # Class number (String) => { name: human readable description }.
    # Every entry of the list above is present, including 233.
    VARIANTS = {
      '0' => { name: 'Spacing, split, enclosing, reordrant, and Tibetan subjoined' },
      '1' => { name: 'Overlays and interior' },
      '7' => { name: 'Nuktas' },
      '8' => { name: 'Hiragana/Katakana voicing marks' },
      '9' => { name: 'Viramas' },
      '10' => { name: 'Start of fixed position classes' },
      '199' => { name: 'End of fixed position classes' },
      '200' => { name: 'Below left attached' },
      '202' => { name: 'Below attached' },
      '204' => { name: 'Below right attached' },
      '208' => { name: 'Left attached (reordrant around single base character)' },
      '210' => { name: 'Right attached' },
      '212' => { name: 'Above left attached' },
      '214' => { name: 'Above attached' },
      '216' => { name: 'Above right attached' },
      '218' => { name: 'Below left' },
      '220' => { name: 'Below' },
      '222' => { name: 'Below right' },
      '224' => { name: 'Left (reordrant around single base character)' },
      '226' => { name: 'Right' },
      '228' => { name: 'Above left' },
      '230' => { name: 'Above' },
      '232' => { name: 'Above right' },
      '233' => { name: 'Double below' },
      '234' => { name: 'Double above' },
      '240' => { name: 'Below (iota subscript)' }
    }

    extend self
  end
end
@@ -0,0 +1,65 @@
1
+ # encoding: utf-8
2
+
3
+ require 'forkforge/internal/unicode_data'
4
+
5
module Forkforge

  # Compatibility formatting tags that may prefix a character decomposition
  # mapping:
  #
  #   <font>      A font variant (e.g. a blackletter form).
  #   <noBreak>   A no-break version of a space or hyphen.
  #   <initial>   An initial presentation form (Arabic).
  #   <medial>    A medial presentation form (Arabic).
  #   <final>     A final presentation form (Arabic).
  #   <isolated>  An isolated presentation form (Arabic).
  #   <circle>    An encircled form.
  #   <super>     A superscript form.
  #   <sub>       A subscript form.
  #   <vertical>  A vertical layout presentation form.
  #   <wide>      A wide (or zenkaku) compatibility character.
  #   <narrow>    A narrow (or hankaku) compatibility character.
  #   <small>     A small variant form (CNS compatibility).
  #   <square>    A CJK squared font variant.
  #   <fraction>  A vulgar fraction form.
  #   <compat>    Otherwise unspecified compatibility character.
  module CharacterDecompositionMapping
    # Tag name (Symbol, without angle brackets) => { name: description }.
    VARIANTS = {
      font: { name: 'A font variant (e.g. a blackletter form)' },
      noBreak: { name: 'A no-break version of a space or hyphen' },
      initial: { name: 'An initial presentation form (Arabic)' },
      medial: { name: 'A medial presentation form (Arabic)' },
      final: { name: 'A final presentation form (Arabic)' },
      isolated: { name: 'An isolated presentation form (Arabic)' },
      circle: { name: 'An encircled form' },
      super: { name: 'A superscript form' },
      sub: { name: 'A subscript form' },
      vertical: { name: 'A vertical layout presentation form' },
      wide: { name: 'A wide (or zenkaku) compatibility character' },
      narrow: { name: 'A narrow (or hankaku) compatibility character' },
      small: { name: 'A small variant form (CNS compatibility)' },
      square: { name: 'A CJK squared font variant' },
      fraction: { name: 'A vulgar fraction form' },
      compat: { name: 'Otherwise unspecified compatibility character' }
    }
    # Same table keyed by the bracketed form, e.g. "<font>".
    VARIANTS_UC = VARIANTS.map { |k, v| [ "<#{k}>", v ] }.to_h

    # A decomposition tag, available both as "<name>" string and as :name symbol.
    class Tag
      # Matches one known tag name with optional angle brackets. Anchored with
      # \A/\z so that the WHOLE input has to be the tag; ^/$ would accept any
      # single matching line of a multi-line string. Built once instead of on
      # every instantiation.
      PATTERN = /\A<?(#{VARIANTS.keys.join('|')})>?\z/

      attr_reader :tag, :sym

      # str - anything whose string form is a tag name, with or without the
      #       angle brackets (:font, 'font', '<font>'). For anything else
      #       both +tag+ and +sym+ stay nil.
      def initialize str
        m = PATTERN.match(str.to_s)
        @tag, @sym = "<#{m[1]}>", m[1].to_sym if m
      end

      # true when the constructor input was a known tag.
      def valid?
        !@tag.nil? && !@sym.nil?
      end

      # Shortcut for Tag.new.
      def self.tag s
        new(s)
      end

      # true when +s+ denotes a known tag.
      def self.tag? s
        tag(s).valid?
      end
    end

    extend self
  end
end
@@ -0,0 +1,109 @@
1
+ # encoding: utf-8
2
+
3
module Forkforge
  # One record of the Unicode character database, built from a hash that is
  # keyed by the symbols listed in UNICODE_FIELDS.
  class CodePoint
    UNICODE_FIELDS = [
      :code_point,
      :character_name,
      :general_category,
      :canonical_combining_classes,
      :bidirectional_category,
      :character_decomposition_mapping,
      :decimal_digit_value,
      :digit_value,
      :numeric_value,
      :mirrored,
      :unicode_1_0_name,
      :_10646_comment_field,
      :uppercase_mapping,
      :lowercase_mapping,
      :titlecase_mapping
    ]

    # One reader per field (no string evaluation needed).
    attr_reader(*UNICODE_FIELDS)

    # hash - Hash with UNICODE_FIELDS symbols as keys; missing keys give nil.
    def initialize hash
      UNICODE_FIELDS.each { |f| instance_variable_set("@#{f}", hash[f]) }
    end

    # The character itself; +code_point+ is read as a hexadecimal string.
    def to_s
      [@code_point.to_i(16)].pack('U')
    end

    def inspect
      "'#{to_s}' ⇒ [#{@character_name}]"
    end
  end

  # A set of code point records (key => field hash) that can be narrowed
  # down, either explicitly with +select+ or through dynamic methods:
  # every '_'-separated part of an unknown method name is used as a
  # case-insensitive filter on :character_name (e.g. +latin_capital+).
  class CodePoints
    def initialize hash
      @hash = hash
    end

    # Entries whose +field+ matches +pattern+ (Regexp, or anything
    # Regexp.new accepts; nil means "field is not empty").
    def filter field, pattern = nil
      pattern = case pattern
                when NilClass then /\A.+/ # not empty
                when Regexp then pattern
                else Regexp.new(pattern)
                end
      @hash.select { |k, v|
        v[field.to_sym] =~ pattern
      }
    end
    private :filter

    # Like the private +filter+, but wrapped into a new CodePoints.
    def select field, pattern = nil
      CodePoints.new filter field, pattern
    end

    def inspect
      @hash.inspect
    end

    def to_a
      @hash.values
    end

    # Shallow copy: Hash#dup copies the outer hash only, the field hashes
    # inside are shared with this object.
    def to_h
      @hash.dup
    end

    # All characters of the set, concatenated.
    def to_s
      @hash.values.map { |v|
        CodePoint.new(v).to_s
      }.join
    end

    # Dynamic methods are reported when the first '_'-separated part of the
    # name occurs in at least one character name; regular methods are
    # answered by the default implementation. (Implemented as
    # respond_to_missing? so that respond_to? keeps its two-argument
    # signature and real methods such as to_s are reported correctly.)
    def respond_to_missing? method, include_private = false
      m, = method.to_s.split '_', 2
      !filter(:character_name, /#{Regexp.escape(m.to_s)}/i).empty? || super
    end

    # Dynamic filters, see the class description. An optional single
    # argument additionally restricts :character_decomposition_mapping:
    #
    #   String  - mapping ends with the code point of one of its characters
    #   Integer - mapping contains that code point (four-digit upper-case hex)
    #   Regexp  - mapping matches it
    #   other   - mapping matches the regexp built from its string form
    #
    # Returns a CodePoints for the last name part; raises NoMethodError
    # (through super) when a name part matches no character name.
    def method_missing method, *args, &block
      m, rest = method.to_s.split '_', 2
      # Escaped, so operator-like method names (+, [], ...) cannot raise
      # RegexpError but simply match no character name.
      name_pattern = /#{Regexp.escape(m.to_s)}/i
      if args.count <= 1 && !(result = filter :character_name, name_pattern).empty?
        unless args.empty?
          arg = args.first
          mapping_pattern = case arg
                            when String then /#{arg.codepoints.map { |cp| '%04X' % cp }.join('|')}\Z/
                            when Integer then /#{'%04X' % arg}/
                            when Regexp then arg
                            else /#{arg}/
                            end
          result = result.select { |_, v| v[:character_decomposition_mapping] =~ mapping_pattern }
        end
        result.each do |k, v|
          yield CodePoint.new v
        end if block_given? && !rest.nil?
        result = CodePoints.new(result)
        rest.nil? ? result : result.send(rest.to_sym)
      else
        super
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ # encoding: utf-8
2
+
3
# Makes nil tolerate the two String calls the parsing code performs on
# possibly missing fields: nil.strip is nil and nil.match(...) is false.
#
# The guard looks at NilClass' instance methods: NilClass.respond_to?(:strip)
# would ask whether the class object itself has a +strip+ method and could
# therefore never detect an existing nil.strip.
unless NilClass.method_defined? :strip
  class NilClass
    def strip
      nil
    end

    def match *args
      false
    end
  end
end
13
+
14
class Object
  # true for nil, for anything that strips down to something empty, for
  # anything that reports itself as zero and for arrays holding nothing
  # but nils; false otherwise.
  def vacant?
    return true if nil?
    return true if respond_to?(:strip) && strip.empty?
    return true if respond_to?(:zero?) && zero?

    Array === self && compact.empty?
  end
end
22
+
23
# Hash-returning +take+ with an optional start offset.
#
# NOTE: this replaces the +take+ that Hash inherits from Enumerable (which
# returns an array of pairs and takes no offset) for all hashes.
#
# The guard checks whether Hash ITSELF already defines +take+;
# Hash.respond_to?(:take) would ask about a class-level method instead.
unless Hash.instance_methods(false).include? :take
  class Hash
    # count - number of pairs to return.
    # from  - index (in iteration order) of the first pair, 0 by default.
    #
    # Returns a new Hash with at most +count+ pairs; empty when +from+ lies
    # beyond the last pair.
    def take count, from = 0
      # start/length slice: exactly +count+ pairs (an inclusive
      # from..from+count range would yield one pair too many); the slice is
      # nil when +from+ is past the end.
      Hash[to_a[from, count] || []]
    end
  end
end
30
+
31
# Array#to_h for rubies that do not ship it.
#
# The guard asks whether Array INSTANCES already have +to_h+;
# Array.respond_to?(:to_h) is about the class object, is always false and
# would make this fallback overwrite the built-in implementation.
unless Array.method_defined? :to_h
  class Array
    # Converts an array of [key, value] pairs into a Hash.
    #
    # Raises TypeError when an element is not an array and ArgumentError
    # when an element does not have exactly two items.
    def to_h
      each_with_index.inject({}) do |memo, (e, i)|
        raise TypeError, "wrong element type #{e.class} at #{i} (expected array)" unless Array === e
        raise ArgumentError, "wrong array length at #{i} (expected 2, was #{e.count})" unless e.count == 2

        memo[e.first] = e.last
        memo
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # encoding: utf-8
2
+
3
+ require 'forkforge/internal/monkeypatches'
4
+
5
module Forkforge
  # Access to the records of SpecialCasing.txt, loaded through
  # UnicodeOrgFileFormat#i_hash with its default grouping (an Array of
  # records per code point).
  module SpecialCasing
    include UnicodeOrgFileFormat

    LOCAL = 'data'
    REMOTE = 'Public/UNIDATA'
    FILE = 'SpecialCasing.txt'

    SPECIAL_CASING_FIELDS = [
      :code_point,
      :lowercase_mapping,
      :titlecase_mapping,
      :uppercase_mapping,
      :condition_list,
      :comment
    ]

    # code point => [records]; note that this shadows Object#hash for
    # whatever object the module is mixed into.
    def hash
      i_hash REMOTE, LOCAL, FILE, SPECIAL_CASING_FIELDS
    end

    # filter_code_point '00A0' | filter_uppercase_mapping 0xA0 | ...
    SPECIAL_CASING_FIELDS.each { |field|
      # Records of code point +cp+ whose +field+ matches every one of
      # +filters+ (all records when no filters are given); nil when the code
      # point has no records at all.
      #
      # NOTE(review): the original expression ended in
      # `|| [*hash[ncp]].select { |h| h[field].vacant? }`. Array#select
      # always returns an (truthy) Array, so that fallback could never run;
      # it is left out here and the observable behaviour is unchanged.
      define_method("filter_#{field}") { |cp, filters = []|
        records = hash[__to_code_point(cp)]
        if records.nil?
          nil
        else
          [*records].select { |h|
            filters.inject(true) { |memo, f| memo && h[field].match(f) }
          }
        end
      }

      # All code points with their records reduced to those where +field+
      # matches +pattern+ (Regexp or regexp source), or, without a pattern,
      # where +field+ is not vacant.
      define_method("all_#{field}") { |pattern = nil|
        pattern = Regexp.new(pattern) unless pattern.nil? || Regexp === pattern
        hash.map { |k, v|
          [
            k,
            v.reject { |vv|
              pattern.nil? ? vv[field].vacant? : pattern.match(vv[field]).nil?
            }
          ]
        }.to_h
      }
    }

    [:uppercase, :lowercase, :titlecase].each { |kind|
      mapping = :"#{kind}_mapping"
      cp_method = :"cp_#{kind}"

      # Resolves +cp+ to its special mapping: the input itself when there is
      # not exactly one applicable record (or that record maps the code
      # point to nothing/itself), otherwise an Array with the recursively
      # resolved code points of the mapping.
      #
      # lang    - optional language id the condition list has to start with.
      # context - optional context name the condition list has to end with.
      define_method(cp_method) { |cp, lang = nil, context = nil|
        filters = []
        filters << Regexp.new('^' + lang + '(?=\Z|\s)') unless lang.nil?
        filters << Regexp.new('(?<=\A|\s)' + context + '$') unless context.nil?
        conditions = filter_condition_list cp, filters

        unchanged = conditions.vacant? ||
                    conditions.count != 1 ||
                    conditions.first[mapping].vacant? ||
                    conditions.first[mapping] == __to_code_point(cp)
        if unchanged
          cp
        else
          conditions.first[mapping].split(' ').map { |cpn| send(cp_method, cpn.to_i(16), lang, context) }
        end
      }
      private cp_method

      # The special-cased string for +cp+ (see the cp_* helper above).
      define_method(kind) { |cp, lang = nil, context = nil|
        cpm = send(cp_method, cp, lang, context)
        # A mapping may consist of several code points (e.g. "0053 0053") and
        # the recursion may nest them, so flatten and pack ALL of them;
        # a single 'U' directive would emit the first code point only.
        cpm.nil? ? nil : [*cpm].flatten.pack('U*')
      }
    }

    extend self
  end
end
@@ -0,0 +1,91 @@
1
+ # encoding: utf-8
2
+
3
+ require 'forkforge/internal/monkeypatches'
4
+ require 'forkforge/internal/unicode_org_file'
5
+ require 'forkforge/internal/code_point'
6
+ require 'forkforge/internal/character_decomposition_mapping'
7
+
8
module Forkforge
  # Access to the records of UnicodeData.txt, loaded through
  # UnicodeOrgFileFormat#i_hash with one record (a field hash) per code point.
  module UnicodeData
    include UnicodeOrgFileFormat

    LOCAL = 'data'
    REMOTE = 'Public/UCD/latest/ucd'
    FILE = 'UnicodeData.txt'

    # Cache used by compose_cp: tag argument => records carrying that tag.
    @cdm = {}

    # code point => record; note that this shadows Object#hash for whatever
    # object the module is mixed into.
    def hash
      i_hash(REMOTE, LOCAL, FILE, CodePoint::UNICODE_FIELDS, false)
    end

    # All records wrapped into a (memoized) CodePoints.
    def code_points
      @codepoints ||= CodePoints.new hash
    end

    # Record for +cp+. A one-character String is taken as the character
    # itself; everything else goes through __to_code_point.
    def info cp
      cp = cp.codepoints.first if String === cp && cp.length == 1
      hash[__to_code_point(cp)]
    end

    # Records for every character of +string+, in order.
    def infos string
      string.codepoints.map { |point| hash[__to_code_point(point)] }
    end

    # TODO return true/false whether the normalization was done?
    #
    # Character for the +action+ field of the record of +cp+ (e.g.
    # :uppercase_mapping); the record's own code point is used when that
    # field is vacant.
    def to_char cp, action = :code_point
      record = hash[__to_code_point(cp)]
      source = record[action].vacant? ? record[:code_point] : record[action]
      __to_char(source)
    end

    # Record for +cp+ as a Forkforge::CodePoint.
    def to_codepoint cp
      Forkforge::CodePoint.new(info(cp))
    end

    # get_code_point '00A0' | get_character_decomposition_mapping 0xA0 | ...
    # all_code_point /00[A-C]\d/ | get_character_decomposition_mapping /00A*/ | ...
    CodePoint::UNICODE_FIELDS.each { |field|
      # Value of +field+ for +cp+, nil when the code point is unknown.
      define_method("get_#{field}") { |cp|
        record = hash[__to_code_point(cp)]
        record ? record[field] : nil
      }

      # Records whose +field+ matches +pattern+ (Regexp or regexp source),
      # or, without a pattern, whose +field+ is not vacant.
      define_method("all_#{field}") { |pattern = nil|
        pattern = Regexp.new(pattern) unless pattern.nil? || Regexp === pattern
        hash.select { |_, record|
          if pattern.nil?
            !record[field].vacant?
          else
            !pattern.match(record[field]).nil?
          end
        }
      }
    }

    # Looks for characters whose decomposition mapping is "<tag> ... cp".
    #
    # With an invalid +tag+ the CodePoint of +cp+ itself is returned.
    # Otherwise, when +thorough+ is truthy, an Array with the CodePoints of
    # all matches; else the CodePoint of the first match, falling back to
    # +cp+ itself.
    def compose_cp cp, tag = :font, thorough = true
      cp = __to_code_point cp
      t = CharacterDecompositionMapping::Tag.tag(tag)
      return Forkforge::CodePoint.new(hash[cp]) unless t.valid?

      @cdm[tag] ||= all_character_decomposition_mapping(/#{t.tag}/).values
      # FIXME Could we distinguish “<wide> 0ABC” and “0A00 0ABC” in more elegant way?
      ends_with_cp = ->(record) { record[:character_decomposition_mapping] =~ /[^\dA-Fa-f]\s+#{cp}\Z/ }

      if thorough
        @cdm[tag].select(&ends_with_cp).map { |record| Forkforge::CodePoint.new(record) }
      else
        Forkforge::CodePoint.new(@cdm[tag].find(&ends_with_cp) || hash[cp])
      end
    end

    # Recursively decomposes +cp+.
    #
    # Returns the normalized code point string when +cp+ has no
    # decomposition mapping, or when +tags+ is given and the mapping carries
    # a tag that is not listed in it; otherwise a (possibly nested) Array
    # with the decompositions of the mapped code points, tags dropped.
    def decompose_cp cp, tags = []
      normalized = __to_code_point cp
      mapping = get_character_decomposition_mapping cp
      return normalized if mapping.vacant?

      parts = mapping.split ' '
      allowed = [*tags]
      tag_class = CharacterDecompositionMapping::Tag

      unless allowed.vacant?
        foreign = parts.any? { |part| tag_class.tag?(part) && !allowed.include?(tag_class.tag(part).sym) }
        return normalized if foreign
      end

      parts.reject { |part| tag_class.tag?(part) }.map { |part| decompose_cp part, tags }
    end

    extend self
  end
end
@@ -0,0 +1,65 @@
1
+ # encoding: utf-8
2
+
3
module Forkforge
  # Loader for the semicolon-separated data files published on unicode.org,
  # plus small conversions between the representations of a code point.
  module UnicodeOrgFileFormat
    HOST = 'www.unicode.org'

    # Parsed files, keyed by the +name+ of the module that asked for them.
    @@hashmap = {}

    # Downloads /remote_folder/file from HOST into local_folder/file, with
    # comment-only lines and blank lines removed.
    #
    # Raises whatever Net::HTTP raises on connection problems and, via
    # Net::HTTPResponse#value, on every non-2xx answer, so that a redirect
    # or error page can never end up in the local cache.
    def i_grab remote_folder, local_folder, file
      require 'net/http'
      require 'fileutils'

      body = Net::HTTP.start(HOST, 443, :use_ssl => true) do |http|
        resp = http.get "/#{remote_folder}/#{file}"
        resp.value # raises unless the request succeeded
        resp.body
      end

      FileUtils.mkpath local_folder unless File.exist? local_folder
      File.open("#{local_folder}/#{file}", "wb") do |out|
        out.write(body.gsub(/^\s*#.*?$/, '').gsub(/\R+/, "\n").gsub(/\A\R+/, ''))
      end
    end
    private :i_grab

    # Content of local_folder/file; the file is downloaded first when it
    # does not exist yet.
    def i_load remote_folder, local_folder, file
      i_grab(remote_folder, local_folder, file) unless File.exist? "#{local_folder}/#{file}"
      File.read "#{local_folder}/#{file}"
    end
    private :i_load

    # Parses the file once per including module and returns the cached
    # result afterwards.
    #
    # fields   - Array of Symbols naming the ';'-separated columns in order;
    #            columns missing in a line give nil.
    # arrayize - true:  key => Array of records (one per line with that key)
    #            false: key => record of the last line with that key
    #
    # The key is the stripped first column. Every record also gets a
    # :comment entry holding the stripped text after the first '#' of the
    # line (nil when there is none). Blank lines are skipped.
    def i_hash remote_folder, local_folder, file, fields, arrayize = true
      if @@hashmap[self.name].nil?
        @@hashmap[self.name] = {}
        i_load(remote_folder, local_folder, file).split(/\R/).each do |line|
          next if line.strip.empty?

          # comment is always last, while the amount of fields is subject to change
          comment = line[/(?<=#).*?$/]
          unless comment.nil?
            comment = comment.strip
            line = line.gsub(/;\s*#.*$/, '')
          end
          values = line.split ';'
          key = values.first.to_s.strip
          # nil-safe without relying on a patched NilClass#strip
          pairs = fields.map { |f|
            raw = values.shift
            [ f, raw && raw.strip ]
          }
          value = Hash[pairs + [[ :comment, comment ]]]
          if arrayize
            (@@hashmap[self.name][key] ||= []) << value
          else
            @@hashmap[self.name][key] = value
          end
        end
      end
      @@hashmap[self.name]
    end
    private :i_hash

    # Normalizes an Integer, a hexadecimal String or a Forkforge::CodePoint
    # to the upper-case hex notation with at least four digits ('00A0').
    def __to_code_point cp
      hex = case cp
            when Integer then cp.to_s(16)
            when Forkforge::CodePoint then cp.code_point
            else cp
            end
      '%04X' % hex.to_i(16)
    end

    # The character for an Integer or hexadecimal String code point.
    def __to_char cp
      cp = cp.to_s(16) if Integer === cp
      [cp.to_i(16)].pack('U')
    end

  end
end
@@ -0,0 +1,35 @@
1
+ # encoding: utf-8
2
+
3
+ require 'forkforge/unicode'
4
+
5
# Convenience shortcuts on String that delegate to Forkforge::Unicode.
#
# NOTE: +upcase+ and +downcase+ replace the built-in String methods.
class String

  # Forkforge::Unicode.decompose for this string.
  def decompose tags = []
    Forkforge::Unicode.decompose(self, tags)
  end

  # compose_circle, compose_super, compose_sub, compose_wide
  %w[circle super sub wide].each do |form|
    define_method("compose_#{form}") { Forkforge::Unicode.public_send(form, self) }
  end

  # uppercase / lowercase with optional language and context
  %w[uppercase lowercase].each do |kind|
    define_method(kind) do |lang = nil, context = nil|
      Forkforge::Unicode.public_send(kind, self, lang, context)
    end
  end

  # Same as +uppercase+ without language and context.
  def upcase
    uppercase
  end

  # Same as +lowercase+ without language and context.
  def downcase
    lowercase
  end

end
@@ -0,0 +1,49 @@
1
+ # encoding: utf-8
2
+
3
+ require 'forkforge/unicode'
4
+
5
module Forkforge
  # Mixin that derives lookup constants for one UnicodeData field from the
  # NAME of the including module (camel case converted to an underscored
  # field name, so BidirectionalCategory works on :bidirectional_category).
  #
  # The including module may define FILTER = { 'Prefix' => regexp }
  # beforehand: the prefix is removed from the module name before the field
  # name is derived and the regexp is passed on to the all_<field> lookup.
  #
  # Constants set on the including module:
  #   HASH       - result of UnicodeData.all_<field>(regexp)
  #   ALL        - the distinct values of the field within HASH
  #   CHARACTERS - field value => Array of the records having that value
  #
  # Finally the module is extended with itself.
  module Selector
    def self.included base
      pattern = nil
      short_name = base.name.gsub(/.*::/, '')
      # FILTER = { 'Tagged' => /^<.*?>$/ }
      if base.const_defined?(:FILTER)
        filter = base.const_get(:FILTER).to_a.flatten
        pattern = filter.last
        short_name = short_name.gsub(/^#{filter.first}/, '')
      end
      field = Forkforge::Unicode.camel_to_underscore(short_name).to_sym

      # HASH = Forkforge::UnicodeData::all_character_name /^<.*?>$/
      base.const_set :HASH, UnicodeData.send("all_#{field}", pattern)

      # ALL = Forkforge::UnicodeData::all_bidirectional_category.uniq
      base.const_set :ALL, base::HASH.map { |_, record| record[field] }.uniq

      grouped = base::HASH.each_with_object({}) do |(_, record), memo|
        (memo[record[field]] ||= []) << record
      end
      base.const_set :CHARACTERS, grouped

      base.extend base
    end
  end

  # Records whose character name is a tag such as "<control>".
  module TaggedCharacterName
    FILTER = { 'Tagged' => /^<.*?>$/ }

    include Selector

    # E. g. def control ⇒ [ ALL_ITEMS_WITH_CONTROL_NAME ]
    CHARACTERS.each do |name, records|
      method_name = name.downcase.gsub(/^<|>$/, '').gsub(/\W/, '_')
      define_method(method_name) { records }
    end
  end

  module BidirectionalCategory
    include Selector
  end

end