glaemscribe 1.3.0 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/api/charset.rb CHANGED
@@ -1,22 +1,22 @@
1
1
  # encoding: UTF-8
2
2
  #
3
3
  # Glǽmscribe (also written Glaemscribe) is a software dedicated to
4
- # the transcription of texts between writing systems, and more
5
- # specifically dedicated to the transcription of J.R.R. Tolkien's
4
+ # the transcription of texts between writing systems, and more
5
+ # specifically dedicated to the transcription of J.R.R. Tolkien's
6
6
  # invented languages to some of his devised writing systems.
7
- #
7
+ #
8
8
  # Copyright (C) 2015 Benjamin Babut (Talagan).
9
- #
9
+ #
10
10
  # This program is free software: you can redistribute it and/or modify
11
11
  # it under the terms of the GNU Affero General Public License as published by
12
12
  # the Free Software Foundation, either version 3 of the License, or
13
13
  # any later version.
14
- #
14
+ #
15
15
  # This program is distributed in the hope that it will be useful,
16
16
  # but WITHOUT ANY WARRANTY; without even the implied warranty of
17
17
  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
18
  # GNU Affero General Public License for more details.
19
- #
19
+ #
20
20
  # You should have received a copy of the GNU Affero General Public License
21
21
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
22
22
 
@@ -24,53 +24,94 @@ module Glaemscribe
24
24
  module API
25
25
  class Charset
26
26
  attr_reader :name
27
-
27
+
28
28
  attr_accessor :errors
29
29
  attr_reader :chars
30
30
  attr_reader :virtual_chars
31
-
31
+ attr_reader :swaps
32
+
33
+ class Swap
34
+ attr_accessor :line
35
+ attr_accessor :trigger
36
+ attr_accessor :targets
37
+
38
+ def initialize(trigger, target_list)
39
+ @trigger = trigger
40
+ @targets = {}
41
+
42
+ @target_list = target_list
43
+ end
44
+
45
+ def finalize(charset)
46
+ @lookup_table = {}
47
+
48
+ trig = charset.n2c(@trigger)
49
+
50
+ if !trig
51
+ charset.errors << Glaeml::Error.new(@line, "Swap operator triggers #{@trigger} which does not exist in charset.")
52
+ end
53
+
54
+ @target_list.each{ |target_id|
55
+ c = charset.n2c(target_id)
56
+ if !c
57
+ charset.errors << Glaeml::Error.new(@line, "Swap operator targets #{target_id} which does not exist in charset.")
58
+ else
59
+ c.names.each{ |n|
60
+ @targets[n] = c
61
+ }
62
+ end
63
+ }
64
+
65
+ trig
66
+ end
67
+
68
+ def has_target?(tname)
69
+ (@targets[tname] != nil)
70
+ end
71
+ end
72
+
32
73
  class Char
33
74
  attr_accessor :line # Line num in the sourcecode
34
75
  attr_accessor :code # Position in unicode
35
76
  attr_accessor :names # Names
36
77
  attr_accessor :str # How does this char resolve as a string
37
78
  attr_accessor :charset # Pointer to parent charset
38
-
79
+
39
80
  def initialize
40
81
  @names = {}
41
82
  end
42
-
83
+
43
84
  def virtual?
44
85
  false
45
86
  end
46
-
87
+
47
88
  def sequence?
48
89
  false
49
90
  end
50
91
  end
51
-
52
- class VirtualChar # Could have had inheritance here ...
92
+
93
+ class VirtualChar # Could have had inheritance here ...
53
94
  attr_accessor :line
54
95
  attr_accessor :names
55
96
  attr_accessor :classes
56
97
  attr_accessor :charset
57
98
  attr_accessor :reversed
58
99
  attr_accessor :default
59
-
100
+
60
101
  class VirtualClass
61
102
  attr_accessor :target
62
103
  attr_accessor :triggers
63
104
  end
64
-
105
+
65
106
  def initialize
66
107
  @classes = {} # result_char_1 => [trigger_char_1, trigger_char_2 ...] , result_char_1 => ...
67
108
  @lookup_table = {}
68
109
  @reversed = false
69
110
  @default = nil
70
111
  end
71
-
112
+
72
113
  def str
73
-
114
+
74
115
  # Will be called if the virtual char could not be replaced and still exists at the end of the transcription chain
75
116
  if @default
76
117
  @charset[@default].str
@@ -78,14 +119,14 @@ module Glaemscribe
78
119
  VIRTUAL_CHAR_OUTPUT
79
120
  end
80
121
  end
81
-
122
+
82
123
  def finalize
83
124
  @lookup_table = {}
84
125
  @classes.each{ |vc|
85
-
126
+
86
127
  result_char = vc.target
87
128
  trigger_chars = vc.triggers
88
-
129
+
89
130
  trigger_chars.each{ |trigger_char|
90
131
  found = @lookup_table[trigger_char]
91
132
  if found
@@ -93,90 +134,91 @@ module Glaemscribe
93
134
  else
94
135
  rc = @charset[result_char]
95
136
  tc = @charset[trigger_char]
96
-
137
+
97
138
  if rc.nil?
98
139
  @charset.errors << Glaeml::Error.new(@line, "Trigger char #{trigger_char} points to unknown result char #{result_char}.")
99
140
  elsif tc.nil?
100
- @charset.errors << Glaeml::Error.new(@line, "Unknown trigger char #{trigger_char}.")
141
+ @charset.errors << Glaeml::Error.new(@line, "Unknown trigger char #{trigger_char}.")
101
142
  elsif rc.class == VirtualChar
102
143
  @charset.errors << Glaeml::Error.new(@line, "Trigger char #{trigger_char} points to another virtual char #{result_char}. This is not supported!")
103
144
  else
104
145
  tc.names.each{|trigger_char_name| # Don't forget to match all name variants for that trigger char!
105
146
  @lookup_table[trigger_char_name] = rc
106
- }
107
- end
108
- end
147
+ }
148
+ end
149
+ end
109
150
  }
110
151
  }
111
152
  if @default
112
153
  c = @charset[@default]
113
154
  if !c
114
- @charset.errors << Glaeml::Error.new(@line, "Default char #{@default} does not match any real character in the charset.")
155
+ @charset.errors << Glaeml::Error.new(@line, "Default char #{@default} does not match any real character in the charset.")
115
156
  elsif c.virtual?
116
157
  @charset.errors << Glaeml::Error.new(@line, "Default char #{@default} is virtual, it should be real only.")
117
158
  end
118
159
  end
119
160
  end
120
-
161
+
121
162
  def [](trigger_char_name)
122
163
  @lookup_table[trigger_char_name]
123
164
  end
124
-
165
+
125
166
  def virtual?
126
167
  true
127
168
  end
128
-
169
+
129
170
  def sequence?
130
171
  false
131
172
  end
132
173
  end
133
-
174
+
134
175
  class SequenceChar
135
176
  attr_accessor :line # Line of code
136
177
  attr_accessor :names # Names
137
178
  attr_accessor :sequence # The sequence of chars
138
179
  attr_accessor :charset # Pointer to parent charset
139
-
180
+
140
181
  def virtual?
141
182
  false
142
183
  end
143
-
184
+
144
185
  def sequence?
145
186
  true
146
- end
147
-
187
+ end
188
+
148
189
  def str
149
190
  # A sequence char should never arrive unreplaced
150
191
  VIRTUAL_CHAR_OUTPUT
151
192
  end
152
-
153
- def finalize
193
+
194
+ def finalize
154
195
  if @sequence.count == 0
155
- @charset.errors << Glaeml::Error.new(@line, "Sequence for sequence char is empty.")
196
+ @charset.errors << Glaeml::Error.new(@line, "Sequence for sequence char is empty.")
156
197
  end
157
-
198
+
158
199
  @sequence.each{ |symbol|
159
200
  # Check that the sequence is correct
160
201
  found = @charset[symbol]
161
202
  if !found
162
203
  @charset.errors << Glaeml::Error.new(@line, "Sequence char #{symbol} cannot be found in the charset.")
163
204
  end
164
- }
205
+ }
165
206
  end
166
-
207
+
167
208
  end
168
-
209
+
169
210
  def initialize(name)
170
211
  @name = name
171
212
  @chars = []
172
213
  @errors = []
173
214
  @virtual_chars = []
215
+ @swaps = []
174
216
  end
175
-
217
+
176
218
  # Pass integer (utf8 num) and array (of strings)
177
219
  def add_char(line, code, names)
178
220
  return if names.empty? || names.include?("?") # Ignore characters with '?'
179
-
221
+
180
222
  c = Char.new
181
223
  c.line = line
182
224
  c.code = code
@@ -185,10 +227,10 @@ module Glaemscribe
185
227
  c.charset = self
186
228
  @chars << c
187
229
  end
188
-
230
+
189
231
  def add_virtual_char(line, classes, names, reversed = false, default = nil)
190
232
  return if names.empty? || names.include?("?") # Ignore characters with '?'
191
-
233
+
192
234
  c = VirtualChar.new
193
235
  c.line = line
194
236
  c.names = names
@@ -196,25 +238,34 @@ module Glaemscribe
196
238
  c.charset = self
197
239
  c.reversed = reversed
198
240
  c.default = default
199
- @chars << c
241
+ @chars << c
200
242
  end
201
-
243
+
202
244
  def add_sequence_char(line, names, seq)
203
245
  return if names.empty? || names.include?("?") # Ignore characters with '?'
204
-
246
+
205
247
  c = SequenceChar.new
206
248
  c.line = line
207
249
  c.names = names
208
- c.sequence = seq.split.reject{|token| token.empty? }
250
+ c.sequence = seq.split.reject{|token| token.empty? }
209
251
  c.charset = self
210
252
  @chars << c
211
253
  end
212
-
254
+
255
+ def add_swap(line, target, triggers)
256
+ return if target.empty? || triggers.empty?
257
+
258
+ s = Swap.new(target, triggers)
259
+ s.line = line
260
+ @swaps << s
261
+ end
262
+
213
263
  def finalize
214
264
  @errors = []
215
265
  @lookup_table = {}
216
266
  @virtual_chars = [] # A convenient filtered array
217
-
267
+ @swap_lookup = {}
268
+
218
269
  @chars.each { |c|
219
270
  c.names.each { |cname|
220
271
  found = @lookup_table[cname]
@@ -225,27 +276,43 @@ module Glaemscribe
225
276
  end
226
277
  }
227
278
  }
228
-
279
+
229
280
  @chars.each{ |c|
230
281
  if c.class == VirtualChar
231
282
  c.finalize
232
283
  @virtual_chars << c
233
284
  end
234
285
  }
235
-
286
+
236
287
  @chars.each{|c|
237
288
  if c.class == SequenceChar
238
289
  c.finalize
239
290
  end
240
291
  }
241
-
292
+
293
+ @swaps.each{ |s|
294
+ trig = s.finalize(self)
295
+ if trig
296
+ trig.names.each{ |n|
297
+ @swap_lookup[n] = s
298
+ }
299
+ end
300
+ }
242
301
  API::Debug::log("Finalized charset '#{@name}', #{@lookup_table.count} symbols loaded.")
243
302
  end
244
-
303
+
245
304
  def [](symbol)
246
305
  @lookup_table[symbol]
247
306
  end
248
-
307
+
308
+ def n2c(symbol)
309
+ self[symbol]
310
+ end
311
+
312
+ def swap_for_trigger(trigger_name)
313
+ @swap_lookup[trigger_name]
314
+ end
315
+
249
316
  end
250
317
  end
251
- end
318
+ end
@@ -1,37 +1,37 @@
1
1
  # encoding: UTF-8
2
2
  #
3
3
  # Glǽmscribe (also written Glaemscribe) is a software dedicated to
4
- # the transcription of texts between writing systems, and more
5
- # specifically dedicated to the transcription of J.R.R. Tolkien's
4
+ # the transcription of texts between writing systems, and more
5
+ # specifically dedicated to the transcription of J.R.R. Tolkien's
6
6
  # invented languages to some of his devised writing systems.
7
- #
7
+ #
8
8
  # Copyright (C) 2015 Benjamin Babut (Talagan).
9
- #
9
+ #
10
10
  # This program is free software: you can redistribute it and/or modify
11
11
  # it under the terms of the GNU Affero General Public License as published by
12
12
  # the Free Software Foundation, either version 3 of the License, or
13
13
  # any later version.
14
- #
14
+ #
15
15
  # This program is distributed in the hope that it will be useful,
16
16
  # but WITHOUT ANY WARRANTY; without even the implied warranty of
17
17
  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
18
  # GNU Affero General Public License for more details.
19
- #
19
+ #
20
20
  # You should have received a copy of the GNU Affero General Public License
21
21
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
22
22
 
23
23
  module Glaemscribe
24
24
  module API
25
-
25
+
26
26
  class CharsetParser
27
-
27
+
28
28
  def initialize()
29
29
  @charset = nil
30
30
  end
31
-
31
+
32
32
  def parse(file_path)
33
- @charset = Charset.new(ResourceManager::charset_name_from_file_path(file_path))
34
-
33
+ @charset = Charset.new(ResourceManager::charset_name_from_file_path(file_path))
34
+
35
35
  raw = File.open(file_path,"rb:utf-8").read
36
36
  doc = Glaeml::Parser.new.parse(raw)
37
37
 
@@ -39,33 +39,40 @@ module Glaemscribe
39
39
  @charset.errors = doc.errors
40
40
  return @charset
41
41
  end
42
-
42
+
43
43
  # TODO : verify charset glaeml like we do with modes
44
-
44
+
45
45
  doc.root_node.gpath("char").each { |char_element|
46
46
  code = char_element.args[0].hex
47
47
  names = char_element.args[1..-1].map{|cname| cname.strip }.reject{ |cname| cname.empty? }
48
48
  @charset.add_char(char_element.line,code,names)
49
49
  }
50
-
51
- doc.root_node.gpath("seq").each{ |seq_elemnt|
50
+
51
+ doc.root_node.gpath("seq").each{ |seq_elemnt|
52
52
  names = seq_elemnt.args
53
53
  child_node = seq_elemnt.children.first
54
54
  seq = (child_node && child_node.text?)?(child_node.args.first):("")
55
55
  @charset.add_sequence_char(seq_elemnt.line,names,seq)
56
56
  }
57
-
57
+
58
+ doc.root_node.gpath("swap").each{ |element|
59
+ trigger_one = element.args.first
60
+ text_lines = element.children.select{ |c| c.text? }.map{ |c| c.args.first }
61
+ second_triggers = text_lines.join(" ").split(/\s/).select{ |e| e != '' }
62
+ @charset.add_swap(element.line, trigger_one, second_triggers)
63
+ }
64
+
58
65
  doc.root_node.gpath("virtual").each { |virtual_element|
59
66
  names = virtual_element.args
60
- reversed = false
67
+ reversed = false
61
68
  default = nil
62
69
  classes = []
63
-
70
+
64
71
  virtual_element.gpath("class").each { |class_element|
65
72
  vc = Charset::VirtualChar::VirtualClass.new
66
73
  vc.target = class_element.args[0]
67
74
  vc.triggers = class_element.args[1..-1].map{|cname| cname.strip }.reject{ |cname| cname.empty? }
68
-
75
+
69
76
  # Allow triggers to be defined inside the body of the class element
70
77
  text_lines = class_element.children.select { |c| c.text? }.map{ |c| c.args.first}
71
78
  inner_triggers = text_lines.join(" ").split(/\s/).select{ |e| e != '' }
@@ -73,21 +80,21 @@ module Glaemscribe
73
80
 
74
81
  classes << vc
75
82
  }
76
- virtual_element.gpath("reversed").each { |reversed_element|
83
+ virtual_element.gpath("reversed").each { |reversed_element|
77
84
  reversed = true
78
85
  }
79
- virtual_element.gpath("default").each { |default_element|
86
+ virtual_element.gpath("default").each { |default_element|
80
87
  default = default_element.args[0]
81
88
  }
82
-
89
+
83
90
  @charset.add_virtual_char(virtual_element.line,classes,names,reversed,default)
84
91
  }
85
-
92
+
86
93
  @charset.finalize
87
-
88
- @charset
94
+
95
+ @charset
89
96
  end
90
-
97
+
91
98
  end
92
99
  end
93
100
  end
@@ -1,22 +1,22 @@
1
1
  # encoding: UTF-8
2
2
  #
3
3
  # Glǽmscribe (also written Glaemscribe) is a software dedicated to
4
- # the transcription of texts between writing systems, and more
5
- # specifically dedicated to the transcription of J.R.R. Tolkien's
4
+ # the transcription of texts between writing systems, and more
5
+ # specifically dedicated to the transcription of J.R.R. Tolkien's
6
6
  # invented languages to some of his devised writing systems.
7
- #
7
+ #
8
8
  # Copyright (C) 2015 Benjamin Babut (Talagan).
9
- #
9
+ #
10
10
  # This program is free software: you can redistribute it and/or modify
11
11
  # it under the terms of the GNU Affero General Public License as published by
12
12
  # the Free Software Foundation, either version 3 of the License, or
13
13
  # any later version.
14
- #
14
+ #
15
15
  # This program is distributed in the hope that it will be useful,
16
16
  # but WITHOUT ANY WARRANTY; without even the implied warranty of
17
17
  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
18
  # GNU Affero General Public License for more details.
19
- #
19
+ #
20
20
  # You should have received a copy of the GNU Affero General Public License
21
21
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
22
22
 
@@ -24,25 +24,25 @@ module Glaemscribe
24
24
  module API
25
25
 
26
26
  class ResolveVirtualsPostProcessorOperator < PostProcessorOperator
27
-
27
+
28
28
  def finalize(trans_options)
29
29
  super(trans_options)
30
30
  @last_triggers = {} # Allocate the lookup here to optimize
31
31
  end
32
-
32
+
33
33
  def reset_trigger_states(charset)
34
34
  # For each virtual char in charset, maintain a state.
35
35
  charset.virtual_chars.each{ |vc|
36
36
  @last_triggers[vc] = nil # Clear the state
37
37
  }
38
38
  end
39
-
39
+
40
40
  def apply_loop(charset, tokens, new_tokens, reversed, token, idx)
41
41
  if token == '*SPACE' || token =='*LF'
42
42
  reset_trigger_states(charset)
43
43
  return
44
44
  end
45
-
45
+
46
46
  # Check if token is a virtual char
47
47
  c = charset[token]
48
48
  return if c.nil? # May happen for empty tokens
@@ -54,14 +54,14 @@ module Glaemscribe
54
54
  token = new_tokens[idx] # Consider the token replaced, being itself a potential trigger for further virtuals (cascading virtuals)
55
55
  end
56
56
  end
57
-
57
+
58
58
  # Update states of virtual classes
59
59
  charset.virtual_chars.each{|vc|
60
60
  rc = vc[token]
61
- @last_triggers[vc] = rc if rc != nil
61
+ @last_triggers[vc] = rc if rc != nil
62
62
  }
63
63
  end
64
-
64
+
65
65
  def apply_sequences(charset,tokens)
66
66
  ret = []
67
67
  tokens.each { |token|
@@ -74,21 +74,43 @@ module Glaemscribe
74
74
  }
75
75
  ret
76
76
  end
77
-
77
+
78
+ def apply_swaps(charset, tokens)
79
+
80
+ idx = 0
81
+ while idx < tokens.length - 1
82
+ tok = tokens[idx]
83
+ tgt = tokens[idx+1]
84
+
85
+ trig = charset.swap_for_trigger(tok)
86
+
87
+ if trig && trig.has_target?(tgt)
88
+ tokens[idx+1] = tok
89
+ tokens[idx] = tgt
90
+ end
91
+
92
+ idx += 1
93
+ end
94
+
95
+ tokens
96
+ end
97
+
78
98
  def apply(tokens,charset)
79
99
  # Apply sequence chars
80
100
  tokens = apply_sequences(charset,tokens)
81
-
101
+
102
+ tokens = apply_swaps(charset, tokens)
103
+
82
104
  # Clone the tokens so that we can perform ligatures AND diacritics without interferences
83
105
  new_tokens = tokens.clone
84
-
106
+
85
107
  # Handle l to r virtuals (diacritics ?)
86
- reset_trigger_states(charset)
108
+ reset_trigger_states(charset)
87
109
  tokens.each_with_index{ |token,idx|
88
110
  apply_loop(charset,tokens,new_tokens,false,token,idx)
89
111
  }
90
112
  # Handle r to l virtuals (ligatures ?)
91
- reset_trigger_states(charset)
113
+ reset_trigger_states(charset)
92
114
  tokens.reverse_each.with_index{ |token,idx|
93
115
  apply_loop(charset,tokens,new_tokens,true,token,tokens.count - 1 - idx)
94
116
  }
@@ -96,7 +118,7 @@ module Glaemscribe
96
118
  end
97
119
  end
98
120
 
99
- ResourceManager::register_post_processor_class("resolve_virtuals", ResolveVirtualsPostProcessorOperator)
121
+ ResourceManager::register_post_processor_class("resolve_virtuals", ResolveVirtualsPostProcessorOperator)
100
122
 
101
123
  end
102
124
  end