ucode 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ucode/code_chart/extractor.rb +1 -9
- data/lib/ucode/code_chart/writer.rb +1 -1
- data/lib/ucode/commands/canonical_build.rb +4 -4
- data/lib/ucode/commands/universal_set.rb +5 -3
- data/lib/ucode/coordinator/enrichment/bidi.rb +35 -0
- data/lib/ucode/coordinator/enrichment/binary.rb +38 -0
- data/lib/ucode/coordinator/enrichment/casing.rb +55 -0
- data/lib/ucode/coordinator/enrichment/cjk.rb +49 -0
- data/lib/ucode/coordinator/enrichment/display.rb +36 -0
- data/lib/ucode/coordinator/enrichment/emoji.rb +36 -0
- data/lib/ucode/coordinator/enrichment/identity.rb +42 -0
- data/lib/ucode/coordinator/enrichment/indic.rb +32 -0
- data/lib/ucode/coordinator/enrichment/names.rb +63 -0
- data/lib/ucode/coordinator/enrichment/segmentation.rb +34 -0
- data/lib/ucode/coordinator/enrichment.rb +51 -0
- data/lib/ucode/coordinator/range_lookup.rb +65 -0
- data/lib/ucode/coordinator.rb +4 -276
- data/lib/ucode/glyphs/embedded_fonts/catalog.rb +32 -376
- data/lib/ucode/glyphs/embedded_fonts/codepoint_mapper.rb +130 -0
- data/lib/ucode/glyphs/embedded_fonts/content_stream_correlator.rb +25 -124
- data/lib/ucode/glyphs/embedded_fonts/font_entry.rb +0 -1
- data/lib/ucode/glyphs/embedded_fonts/pdf_indexer.rb +236 -0
- data/lib/ucode/glyphs/embedded_fonts/{source.rb → pdf_location.rb} +5 -5
- data/lib/ucode/glyphs/embedded_fonts/positional_matcher.rb +162 -0
- data/lib/ucode/glyphs/embedded_fonts/raw_font_descriptor.rb +24 -0
- data/lib/ucode/glyphs/embedded_fonts/renderer.rb +0 -2
- data/lib/ucode/glyphs/embedded_fonts/trace_correlator.rb +54 -168
- data/lib/ucode/glyphs/embedded_fonts/writer.rb +0 -4
- data/lib/ucode/glyphs/embedded_fonts.rb +5 -1
- data/lib/ucode/glyphs/resolver_factory.rb +45 -0
- data/lib/ucode/glyphs/sources/pillar1_embedded_tounicode.rb +1 -1
- data/lib/ucode/glyphs.rb +1 -0
- data/lib/ucode/version.rb +1 -1
- metadata +20 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a6674c0e1a8cab0ef2ea7782878be4571e68603843742de78a43da6a6499220c
|
|
4
|
+
data.tar.gz: 416b0965b6fa4e1e22ef9c431a7c27a16093ba90b708224b7e9c9e1f15ac2c49
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 361d385dd8b5bb04d3ce528ddda0bed64b8c5db46a1ef7e2f77c79ba5c4cf41f67235e8e55d5a38ec2957e1650f7fa4610b50722a299f2dfd67f900bd25e66c0
|
|
7
|
+
data.tar.gz: 1049d2b20c925da363339af6bcf1b98fd1d7fcf93a50adf9bde2240314eb272f89af6981a6604d8e7b5fa8b075b6c36deb68d42e691425f18fa70fca94a4286e
|
|
@@ -2,14 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
require "pathname"
|
|
4
4
|
|
|
5
|
-
require "ucode/error"
|
|
6
|
-
require "ucode/glyphs/embedded_fonts/catalog"
|
|
7
|
-
require "ucode/glyphs/embedded_fonts/renderer"
|
|
8
|
-
require "ucode/glyphs/embedded_fonts/source"
|
|
9
|
-
require "ucode/glyphs/resolver"
|
|
10
|
-
require "ucode/glyphs/sources/pillar1_embedded_tounicode"
|
|
11
|
-
require "ucode/glyphs/sources/tier1_real_font"
|
|
12
|
-
|
|
13
5
|
module Ucode
|
|
14
6
|
module CodeChart
|
|
15
7
|
# Walks every assigned codepoint in a block and returns one
|
|
@@ -108,7 +100,7 @@ module Ucode
|
|
|
108
100
|
end
|
|
109
101
|
|
|
110
102
|
def embedded_pillar_sources
|
|
111
|
-
embedded_source = Glyphs::EmbeddedFonts::
|
|
103
|
+
embedded_source = Glyphs::EmbeddedFonts::PdfLocation.new(
|
|
112
104
|
pdf: @pdf_path, cache_dir: @cache_dir,
|
|
113
105
|
)
|
|
114
106
|
catalog = Glyphs::EmbeddedFonts::Catalog.new(embedded_source)
|
|
@@ -47,7 +47,7 @@ module Ucode
|
|
|
47
47
|
# @param ucd_version [String, nil] UCD version to stamp on
|
|
48
48
|
# provenance. nil = resolved via {VersionResolver.resolve(nil)}.
|
|
49
49
|
# @param cache_dir [Pathname, String, nil] font-stream cache
|
|
50
|
-
# directory for the EmbeddedFonts::
|
|
50
|
+
# directory for the EmbeddedFonts::PdfLocation.
|
|
51
51
|
# @param now [Time, nil] timestamp override (for tests).
|
|
52
52
|
# @param pillar3_source, tier1_sources: forwarded to the Extractor.
|
|
53
53
|
def initialize(output_root:, pdf_path:, ucd_version: nil,
|
|
@@ -118,10 +118,10 @@ module Ucode
|
|
|
118
118
|
end
|
|
119
119
|
|
|
120
120
|
def build_resolver(version, source_config_path)
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
121
|
+
Glyphs::ResolverFactory.build(
|
|
122
|
+
version: version,
|
|
123
|
+
source_config_path: source_config_path,
|
|
124
|
+
)
|
|
125
125
|
end
|
|
126
126
|
|
|
127
127
|
def source_config_path_or_default(path)
|
|
@@ -126,9 +126,11 @@ module Ucode
|
|
|
126
126
|
end
|
|
127
127
|
|
|
128
128
|
def build_resolver(_version, config_path, database)
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
129
|
+
Glyphs::ResolverFactory.build(
|
|
130
|
+
version: _version,
|
|
131
|
+
source_config_path: config_path,
|
|
132
|
+
database: database,
|
|
133
|
+
)
|
|
132
134
|
end
|
|
133
135
|
|
|
134
136
|
def codepoint_enum(version)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  class Coordinator
    module Enrichment
      # Bidirectional-text enrichment: copies a codepoint's mirroring glyph
      # and paired-bracket data from the indices onto its Bidi sub-model.
      module Bidi
        # Populates `cp.bidi` from `indices.bidi_mirroring` and
        # `indices.bidi_brackets`, both looked up by `cp.cp`.
        #
        # Does nothing when neither index has an entry. Otherwise the Bidi
        # sub-model is created on demand and only the pieces that were
        # found are written.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def self.enrich(cp, indices)
          mirror_record = indices.bidi_mirroring[cp.cp]
          bracket_record = indices.bidi_brackets[cp.cp]
          return if !mirror_record && !bracket_record

          cp.bidi ||= Ucode::Models::CodePoint::Bidi.new
          cp.bidi.mirroring_glyph_id = mirror_record.mirrored_id if mirror_record
          return unless bracket_record

          cp.bidi.paired_bracket_type = bracket_record.type
          cp.bidi.paired_bracket_id = bracket_record.paired_id
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  class Coordinator
    module Enrichment
      # Binary-property enrichment. Two indices feed the same
      # `cp.binary_properties` list: the per-codepoint records of
      # `indices.binary_properties` (DerivedCoreProperties) and the range
      # tuples of `indices.extra_binary_properties` (PropList).
      module Binary
        # Assigns the core properties first, then merges the extra ones.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def self.enrich(cp, indices)
          assign_core(cp, indices)
          assign_extra(cp, indices)
        end

        # Replaces `cp.binary_properties` with the short names of the core
        # records found for `cp.cp`; leaves it alone when there are none.
        def self.assign_core(cp, indices)
          core_records = indices.binary_properties[cp.cp]
          return if !core_records || core_records.empty?

          cp.binary_properties = core_records.map { |record| record.property_short }
        end

        # PropList contributes binary properties that DerivedCoreProperties
        # does not (White_Space, Hyphen, Variation_Selector, etc.). They are
        # appended in place to the existing list, which is then deduplicated.
        def self.assign_extra(cp, indices)
          extra_names = RangeLookup.all_range_values(cp.cp, indices.extra_binary_properties)
          return if extra_names.empty?

          # NOTE(review): assumes cp.binary_properties is already an Array
          # even when assign_core assigned nothing — confirm the model default.
          cp.binary_properties.concat(extra_names).uniq!
        end

        private_class_method :assign_core, :assign_extra
      end
    end
  end
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  class Coordinator
    module Enrichment
      # Case-mapping enrichment: context-sensitive special casing plus the
      # case-folding mappings used for comparison.
      module Casing
        class << self
          # Applies special casing first, then case folding.
          #
          # @param cp [Ucode::Models::CodePoint]
          # @param indices [Ucode::Coordinator::Indices]
          def enrich(cp, indices)
            assign_special_casing(cp, indices)
            assign_case_folding(cp, indices)
          end

          private

          # Creates `cp.casing` on demand and fills it from every special
          # casing rule registered for `cp.cp`. No-op without rules.
          def assign_special_casing(cp, indices)
            casing_rules = indices.special_casing[cp.cp]
            return if !casing_rules || casing_rules.empty?

            cp.casing ||= Ucode::Models::CodePoint::Casing.new
            apply_casing_rules(cp.casing, casing_rules)
          end

          # The *_ids lists are deliberately NOT deduplicated: a mapping
          # such as U+00DF → "SS" legitimately holds U+0053 twice and the
          # order must survive. Conditions are categorical tags
          # (Final_Sigma, tr, After_I), so collapsing repeats is correct.
          def apply_casing_rules(casing, rules)
            casing.full_upper_ids = rules.flat_map { |rule| rule.upper_ids }
            casing.full_lower_ids = rules.flat_map { |rule| rule.lower_ids }
            casing.full_title_ids = rules.flat_map { |rule| rule.title_ids }
            casing.conditions = rules.flat_map { |rule| rule.conditions }.uniq
          end

          # Creates `cp.case_folding` on demand and applies each folding
          # rule registered for `cp.cp`. No-op without rules.
          def assign_case_folding(cp, indices)
            folding_rules = indices.case_folding[cp.cp]
            return if !folding_rules || folding_rules.empty?

            cp.case_folding ||= Ucode::Models::CodePoint::CaseFolding.new
            folding_rules.each { |folding_rule| apply_folding_rule(cp, folding_rule) }
          end

          # Routes one rule to the slot selected by its status letter:
          # "C", "S" and "T" store only the first mapped id, "F" stores the
          # whole list. Any other status is ignored.
          def apply_folding_rule(cp, rule)
            case rule.status
            when "F"
              cp.case_folding.full_ids = rule.mapping_ids
            when "C"
              cp.case_folding.common_id = rule.mapping_ids.first
            when "S"
              cp.case_folding.simple_id = rule.mapping_ids.first
            when "T"
              cp.case_folding.turkic_id = rule.mapping_ids.first
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  class Coordinator
    module Enrichment
      # CJK-specific enrichment: the Unihan entry, KangXi radical
      # cross-references and the Hangul syllable type.
      module CJK
        class << self
          # Runs the three CJK concerns in a fixed order.
          #
          # @param cp [Ucode::Models::CodePoint]
          # @param indices [Ucode::Coordinator::Indices]
          def enrich(cp, indices)
            assign_unihan(cp, indices)
            assign_cjk_radical(cp, indices)
            assign_hangul(cp, indices)
          end

          private

          # Attaches the Unihan entry indexed under `cp.cp`, if any.
          def assign_unihan(cp, indices)
            unihan_entry = indices.unihan[cp.cp]
            cp.unihan = unihan_entry if unihan_entry
          end

          # Appends one CrossReference relationship per radical record.
          # Unlike the other lookups here, this index is keyed by `cp.id`.
          def assign_cjk_radical(cp, indices)
            radical_records = indices.cjk_radicals[cp.id]
            return if !radical_records || radical_records.empty?

            radical_records.each_with_object(cp.relationships) do |radical, relationships|
              relationships << Ucode::Models::Relationship::CrossReference.new(
                target_ids: [radical.cjk_radical_id],
                description: "KangXi radical ##{radical.radical_number}",
                source: "cjk_radicals",
              )
            end
          end

          # Records the Hangul syllable type from the range containing
          # `cp.cp`, creating the HangulSyllable sub-model on demand.
          def assign_hangul(cp, indices)
            syllable_range = RangeLookup.find_in_range(cp.cp, indices.hangul_syllable_type)
            return if syllable_range.nil? || !syllable_range

            cp.hangul ||= Ucode::Models::CodePoint::HangulSyllable.new
            cp.hangul.type = syllable_range.value
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  class Coordinator
    module Enrichment
      # Display-layout enrichment: Line Break class, East Asian Width and
      # Vertical Orientation. Each comes from a range+value index that is
      # queried through RangeLookup.find_in_range.
      module Display
        class << self
          # Looks up the three layout values for `cp.cp`. When all are
          # absent the codepoint is left alone; otherwise `cp.display` is
          # created on demand and only the values that were found are set.
          #
          # @param cp [Ucode::Models::CodePoint]
          # @param indices [Ucode::Coordinator::Indices]
          def enrich(cp, indices)
            line_break = range_value(cp, indices.line_break)
            width = range_value(cp, indices.east_asian_width)
            orientation = range_value(cp, indices.vertical_orientation)
            return if [line_break, width, orientation].all?(&:nil?)

            cp.display ||= Ucode::Models::CodePoint::Display.new
            layout = cp.display
            layout.line_break_class = line_break if line_break
            layout.east_asian_width = width if width
            layout.vertical_orientation = orientation if orientation
          end

          private

          # Value of the range record containing `cp.cp`, or nil when no
          # range matches.
          def range_value(cp, ranges)
            hit = RangeLookup.find_in_range(cp.cp, ranges)
            hit.nil? ? nil : hit.value
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  class Coordinator
    module Enrichment
      # Emoji property bundle. Each Emoji_* property from emoji-data.txt
      # flips the matching boolean on the Emoji sub-model.
      module Emoji
        class << self
          # Collects every emoji property whose range contains `cp.cp` and
          # sets the corresponding flags, creating `cp.emoji` on demand.
          # Codepoints with no matching range are left untouched.
          #
          # The lookup deliberately goes straight to
          # RangeLookup.all_range_values. A codepoint can carry several
          # emoji properties, so this index may hold overlapping ranges;
          # a binary-search pre-check (find_in_range) is only reliable on
          # non-overlapping ranges and could report "no match" for a
          # codepoint that all_range_values does find, silently dropping
          # its properties. The `props.empty?` guard already covers the
          # no-match case.
          #
          # @param cp [Ucode::Models::CodePoint]
          # @param indices [Ucode::Coordinator::Indices]
          def enrich(cp, indices)
            props = RangeLookup.all_range_values(cp.cp, indices.emoji_properties)
            return if props.empty?

            cp.emoji ||= Ucode::Models::CodePoint::Emoji.new
            props.each { |prop| apply_property(cp, prop) }
          end

          private

          # Sets the flag that corresponds to one property name.
          # Unrecognised names are ignored.
          def apply_property(cp, prop)
            case prop
            when "Emoji" then cp.emoji.is_emoji = true
            when "Emoji_Presentation" then cp.emoji.is_presentation_default = true
            when "Emoji_Modifier" then cp.emoji.is_modifier = true
            when "Emoji_Modifier_Base" then cp.emoji.is_base = true
            when "Emoji_Component" then cp.emoji.is_component = true
            when "Extended_Pictographic" then cp.emoji.is_extended_pictographic = true
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  class Coordinator
    module Enrichment
      # Identity enrichment: primary script, script extensions and the
      # Unicode version in which the codepoint was introduced.
      module Identity
        class << self
          # Runs the three identity concerns in a fixed order.
          #
          # @param cp [Ucode::Models::CodePoint]
          # @param indices [Ucode::Coordinator::Indices]
          def enrich(cp, indices)
            assign_script(cp, indices)
            assign_script_extensions(cp, indices)
            assign_age(cp, indices)
          end

          private

          # Sets `cp.script_code` from the script range containing `cp.cp`,
          # preferring the record's code and falling back to its name.
          def assign_script(cp, indices)
            script_range = RangeLookup.find_in_range(cp.cp, indices.scripts)
            cp.script_code = (script_range.code || script_range.name) if script_range
          end

          # Appends the script code of every extension tuple indexed under
          # `cp.cp` to `cp.script_extensions`.
          def assign_script_extensions(cp, indices)
            extension_tuples = indices.script_extensions[cp.cp]
            return if !extension_tuples || extension_tuples.empty?

            extension_tuples.each_with_object(cp.script_extensions) do |tuple, codes|
              codes << tuple.script_code
            end
          end

          # Copies the age from the derived-age record, when one exists.
          def assign_age(cp, indices)
            age_record = indices.derived_age[cp.cp]
            cp.age = age_record.age if age_record
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  class Coordinator
    module Enrichment
      # Indic shaping enrichment: positional and syllabic categories.
      module Indic
        class << self
          # Looks up both categories for `cp.cp`. When neither is found the
          # codepoint is left alone; otherwise `cp.indic` is created on
          # demand and only the categories that were found are written.
          #
          # @param cp [Ucode::Models::CodePoint]
          # @param indices [Ucode::Coordinator::Indices]
          def enrich(cp, indices)
            positional, syllabic = [indices.indic_positional, indices.indic_syllabic].map do |ranges|
              category_for(cp, ranges)
            end
            return unless !positional.nil? || !syllabic.nil?

            cp.indic ||= Ucode::Models::CodePoint::Indic.new
            shaping = cp.indic
            shaping.positional_category = positional if positional
            shaping.syllabic_category = syllabic if syllabic
          end

          private

          # Value of the range record containing `cp.cp`, or nil when no
          # range matches.
          def category_for(cp, ranges)
            hit = RangeLookup.find_in_range(cp.cp, ranges)
            hit.nil? ? nil : hit.value
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  class Coordinator
    module Enrichment
      # Name-annotation enrichment: the NamesList.txt entry (cross
      # references, informal aliases, footnotes, ...), formal aliases from
      # NameAliases.txt and standardized variation sequences.
      module Names
        # NamesList entry collections copied into `cp.relationships`,
        # in the order they are appended.
        NAMES_LIST_RELATIONSHIP_FIELDS = %i[
          cross_references
          sample_sequences
          compatibility_equivalents
          informal_aliases
          footnotes
        ].freeze

        class << self
          # Runs the three name concerns in a fixed order.
          #
          # @param cp [Ucode::Models::CodePoint]
          # @param indices [Ucode::Coordinator::Indices]
          def enrich(cp, indices)
            assign_names_list(cp, indices)
            assign_name_aliases(cp, indices)
            assign_standardized_variants(cp, indices)
          end

          private

          # Stores the NamesList entry on the codepoint and appends each of
          # its relationship collections to `cp.relationships`.
          def assign_names_list(cp, indices)
            names_entry = indices.names_list[cp.cp]
            return unless names_entry

            cp.names_list = names_entry
            NAMES_LIST_RELATIONSHIP_FIELDS.each do |field|
              cp.relationships.concat(names_entry.public_send(field))
            end
          end

          # Adds one InformalAlias relationship per alias record found
          # under `cp.cp`, tagged with the "name_aliases" source.
          def assign_name_aliases(cp, indices)
            alias_records = indices.name_aliases[cp.cp]
            return if !alias_records || alias_records.empty?

            alias_records.each do |record|
              cp.relationships << Ucode::Models::Relationship::InformalAlias.new(
                description: record.text,
                source: "name_aliases",
              )
            end
          end

          # Stores the standardized variants (indexed by `cp.id`) and adds
          # a VariationSequence relationship for each of them.
          def assign_standardized_variants(cp, indices)
            variant_records = indices.standardized_variants[cp.id]
            return if !variant_records || variant_records.empty?

            cp.standardized_variants = variant_records
            variant_records.each { |variant| add_variant_relationship(cp, variant) }
          end

          # Appends the relationship describing one variation sequence:
          # base codepoint followed by its variation selector.
          def add_variant_relationship(cp, variant)
            sequence = Ucode::Models::Relationship::VariationSequence.new(
              target_ids: [variant.base_id, variant.variation_selector_id],
              description: variant.description,
              contexts: variant.contexts,
              source: "standardized_variants",
            )
            cp.relationships << sequence
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  class Coordinator
    module Enrichment
      # UAX #29 text segmentation enrichment: Grapheme, Word and Sentence
      # break classes.
      module Segmentation
        class << self
          # Looks up the three break classes for `cp.cp`. When none is
          # found the codepoint is left alone; otherwise
          # `cp.break_segmentation` is created on demand and only the
          # classes that were found are written.
          #
          # @param cp [Ucode::Models::CodePoint]
          # @param indices [Ucode::Coordinator::Indices]
          def enrich(cp, indices)
            grapheme_class = break_class(cp, indices.grapheme_break)
            word_class = break_class(cp, indices.word_break)
            sentence_class = break_class(cp, indices.sentence_break)
            return if [grapheme_class, word_class, sentence_class].compact.empty?

            cp.break_segmentation ||= Ucode::Models::CodePoint::BreakSegmentation.new
            segmentation = cp.break_segmentation
            segmentation.grapheme = grapheme_class if grapheme_class
            segmentation.word = word_class if word_class
            segmentation.sentence = sentence_class if sentence_class
          end

          private

          # Value of the range record containing `cp.cp`, or nil when no
          # range matches.
          def break_class(cp, ranges)
            hit = RangeLookup.find_in_range(cp.cp, ranges)
            hit.nil? ? nil : hit.value
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  class Coordinator
    # Registry of per-codepoint enrichment concerns. Every module under
    # {Enrichment} owns one slice of the UCD/Unihan merge: Identity, Bidi,
    # Casing, Binary, Names, CJK, Display, Segmentation, Indic, Emoji.
    #
    # {REGISTRY} is an ordered, frozen Array of those modules and
    # {Enrichment.apply} walks it, calling `enrich(cp, indices)` on each.
    # Supporting a new UCD property means adding one module, one autoload
    # line and one {REGISTRY} entry.
    #
    # Each concern reads from {Indices} and mutates the CodePoint model;
    # range lookups go through {RangeLookup}.
    module Enrichment
      autoload :Identity, "ucode/coordinator/enrichment/identity"
      autoload :Bidi, "ucode/coordinator/enrichment/bidi"
      autoload :Casing, "ucode/coordinator/enrichment/casing"
      autoload :Binary, "ucode/coordinator/enrichment/binary"
      autoload :Names, "ucode/coordinator/enrichment/names"
      autoload :CJK, "ucode/coordinator/enrichment/cjk"
      autoload :Display, "ucode/coordinator/enrichment/display"
      autoload :Segmentation, "ucode/coordinator/enrichment/segmentation"
      autoload :Indic, "ucode/coordinator/enrichment/indic"
      autoload :Emoji, "ucode/coordinator/enrichment/emoji"

      # The order is kept from the original flat dispatch so that output
      # stays stable for diff comparisons. Most concerns write their own
      # fields, but Names and CJK both append to `cp.relationships`, so
      # their relative order also fixes the order of that list.
      REGISTRY = [
        Identity, Bidi, Casing, Binary, Names,
        CJK, Display, Segmentation, Indic, Emoji,
      ].freeze

      class << self
        # Applies every enrichment concern to `cp`, in registry order.
        #
        # @param cp [Ucode::Models::CodePoint]
        # @param indices [Ucode::Coordinator::Indices]
        def apply(cp, indices)
          REGISTRY.each do |concern|
            concern.enrich(cp, indices)
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ucode
  class Coordinator
    # Stateless range lookups shared by the enrichment pipeline.
    #
    # They live outside Coordinator so that {Enrichment} modules can call
    # them without needing a Coordinator instance. Nothing here mutates
    # its arguments.
    module RangeLookup
      module_function

      # Binary-searches a sorted array for the record whose range contains
      # `cp`. Records must respond to `range_first` and `range_last`.
      #
      # @param cp [Integer]
      # @param sorted_ranges [Array, nil] sorted by range_first
      # @return [Object, nil] the record whose range contains cp, or nil
      #   when there is none (or the array is nil/empty)
      def find_in_range(cp, sorted_ranges)
        return if sorted_ranges.nil? || sorted_ranges.empty?

        sorted_ranges.bsearch { |candidate| compare_cp(cp, candidate) }
      end

      # Comparator for bsearch's find-any mode: negative steers the search
      # towards lower indices, positive towards higher indices, zero is a
      # hit. A `cp` below `range_first` means the wanted record sits
      # earlier in the array (-1); a `cp` above `range_last` means it sits
      # later (+1).
      #
      # @param cp [Integer]
      # @param record [#range_first, #range_last]
      # @return [Integer] -1, 0 or 1
      def compare_cp(cp, record)
        if cp < record.range_first
          -1
        elsif cp > record.range_last
          1
        else
          0
        end
      end

      # Collects the value of every record whose range contains `cp`.
      # Most codepoint/property pairs match at most one range, but a
      # codepoint can carry several binary properties from PropList or
      # emoji-data, so all matches are returned, in array order.
      #
      # Because the array is sorted by `range_first`, the scan stops at the
      # first record that starts after `cp`; records that end before `cp`
      # are skipped.
      #
      # @param cp [Integer]
      # @param sorted_ranges [Array, nil] sorted by range_first
      # @return [Array] values of every range containing cp
      def all_range_values(cp, sorted_ranges)
        return [] if sorted_ranges.nil? || sorted_ranges.empty?

        sorted_ranges.each_with_object([]) do |record, hits|
          break hits if record.range_first > cp

          hits << record.value unless record.range_last < cp
        end
      end
    end
  end
end
|