treebank-transform 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,295 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
3
+ <xsl:import href="beta-uni-util.xsl"/>
4
+
5
+ <!--
6
+ Copyright 2008-2009 Cantus Foundation
7
+ http://alpheios.net
8
+
9
+ This file is part of Alpheios.
10
+
11
+ Alpheios is free software: you can redistribute it and/or modify
12
+ it under the terms of the GNU General Public License as published by
13
+ the Free Software Foundation, either version 3 of the License, or
14
+ (at your option) any later version.
15
+
16
+ Alpheios is distributed in the hope that it will be useful,
17
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ GNU General Public License for more details.
20
+
21
+ You should have received a copy of the GNU General Public License
22
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
23
+ -->
24
+
25
+ <!--
26
+ Test whether text is in betacode
27
+ Parameters:
28
+ $a_in string/node to be tested
29
+ Output:
30
+ 1 if encoded in betacode, else 0
31
+ (Note: Boolean return value does not seem to work
32
+ reliably, perhaps because of recursion.)
33
+ -->
34
+ <xsl:template name="is-beta"> <!-- recursive scan of $a_in: emits the text 1 or 0 as soon as one character (or the language tag) is conclusive -->
35
+ <xsl:param name="a_in"/>
36
+
37
+ <xsl:choose>
38
+ <!-- if xml:lang says betacode, so be it; lang() inspects the current context node, not $a_in, and is tested before the empty-input case, so it wins even for empty input -->
39
+ <xsl:when test="lang('grc-x-beta')">
40
+ <xsl:value-of select="1"/>
41
+ </xsl:when>
42
+
43
+ <!-- if no input, can't be betacode; this is also where the recursion ends when no character was conclusive -->
44
+ <xsl:when test="string-length($a_in) = 0">
45
+ <xsl:value-of select="0"/>
46
+ </xsl:when>
47
+
48
+ <!-- otherwise, check the characters in input -->
49
+ <xsl:otherwise>
50
+ <xsl:variable name="head" select="substring($a_in, 1, 1)"/> <!-- first character of the remaining input; non-empty here because the empty case was handled above -->
51
+
52
+ <xsl:choose>
53
+ <!-- if betacode base letter, assume it's betacode; $s_betaUppers and $s_betaLowers are not defined in this stylesheet (presumably supplied by the imported beta-uni-util.xsl, to be confirmed) -->
54
+ <xsl:when
55
+ test="contains($s_betaUppers, $head) or
56
+ contains($s_betaLowers, $head)">
57
+ <xsl:value-of select="1"/>
58
+ </xsl:when>
59
+
60
+ <xsl:otherwise>
61
+ <!-- look up unicode in table; the lookup output is captured in $beta so that only its length needs to be tested -->
62
+ <xsl:variable name="beta">
63
+ <xsl:apply-templates select="$s_betaUniTable" mode="u2b">
64
+ <xsl:with-param name="a_key" select="$head"/>
65
+ </xsl:apply-templates>
66
+ </xsl:variable>
67
+
68
+ <xsl:choose>
69
+ <!-- if found in unicode table (lookup produced any text), it's not betacode -->
70
+ <xsl:when test="string-length($beta) > 0">
71
+ <xsl:value-of select="0"/>
72
+ </xsl:when>
73
+
74
+ <!-- otherwise the character is inconclusive: skip it and check remainder of string -->
75
+ <xsl:otherwise>
76
+ <xsl:call-template name="is-beta">
77
+ <xsl:with-param name="a_in" select="substring($a_in, 2)"/> <!-- everything after the first character -->
78
+ </xsl:call-template>
79
+ </xsl:otherwise>
80
+ </xsl:choose>
81
+ </xsl:otherwise>
82
+ </xsl:choose>
83
+
84
+ </xsl:otherwise>
85
+ </xsl:choose>
86
+ </xsl:template>
87
+
88
+ <!--
89
+ Convert Greek betacode to Unicode
90
+ Parameters:
91
+ $a_in betacode input string to be converted
92
+ $a_pending character waiting to be output
93
+ $a_state diacritics associated with pending character
94
+ $a_precomposed whether to put out precomposed or decomposed Unicode
95
+ $a_partial whether this is a partial word
96
+ (If true, do not use final sigma for last letter)
97
+
98
+ Output:
99
+ $a_in transformed to equivalent Unicode
100
+
101
+ The characters in the state string are maintained in a canonical order,
102
+ which allows the lookup table to contain a single entry for each
103
+ combination of base character and diacritics. The diacritics may appear
104
+ in any order in the input.
105
+
106
+ Diacritics associated with (either preceding or following) a base
107
+ character are accumulated until either a non-diacritic character or end
108
+ of input are encountered, at which point the pending character is output.
109
+ -->
110
+ <xsl:template name="beta-to-uni"> <!-- recursive state machine: each call consumes one character of $a_in and holds back one base character in $a_pending until the next base character or end of input -->
111
+ <xsl:param name="a_in"/>
112
+ <xsl:param name="a_pending" select="''"/> <!-- empty by default: nothing is held back on the initial call -->
113
+ <xsl:param name="a_state" select="''"/> <!-- empty by default: no diacritics or capital marker collected yet -->
114
+ <xsl:param name="a_precomposed" select="true()"/>
115
+ <xsl:param name="a_partial" select="false()"/>
116
+
117
+ <xsl:variable name="head" select="substring($a_in, 1, 1)"/> <!-- next input character; empty string once the input is exhausted -->
118
+
119
+ <xsl:choose>
120
+ <!-- if no more input: flush whatever is pending and stop recursing -->
121
+ <xsl:when test="string-length($a_in) = 0">
122
+ <!-- output last pending char -->
123
+ <xsl:choose>
124
+ <!-- final sigma: S with no state, and only when the caller did not mark the input as a partial word -->
125
+ <xsl:when
126
+ test="(($a_pending = 's') or ($a_pending = 'S')) and
127
+ not($a_partial) and (string-length($a_state) = 0)">
128
+ <xsl:call-template name="output-uni-char">
129
+ <xsl:with-param name="a_char" select="$a_pending"/>
130
+ <xsl:with-param name="a_state" select="'2'"/> <!-- '2' replaces the empty state so the lookup key is the letter followed by 2, used here for final sigma -->
131
+ <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
132
+ </xsl:call-template>
133
+ </xsl:when>
134
+
135
+ <xsl:otherwise>
136
+ <xsl:call-template name="output-uni-char">
137
+ <xsl:with-param name="a_char" select="$a_pending"/>
138
+ <xsl:with-param name="a_state" select="$a_state"/>
139
+ <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
140
+ </xsl:call-template>
141
+ </xsl:otherwise>
142
+ </xsl:choose>
143
+ </xsl:when>
144
+
145
+ <!-- if input starts with "*" -->
146
+ <xsl:when test="$head = '*'">
147
+ <!-- output pending char as is; no final-sigma substitution is attempted in this branch -->
148
+ <xsl:call-template name="output-uni-char">
149
+ <xsl:with-param name="a_char" select="$a_pending"/>
150
+ <xsl:with-param name="a_state" select="$a_state"/>
151
+ <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
152
+ </xsl:call-template>
153
+
154
+ <!-- recurse, capitalizing next char, erasing any saved state -->
155
+ <xsl:call-template name="beta-to-uni">
156
+ <xsl:with-param name="a_in" select="substring($a_in, 2)"/>
157
+ <xsl:with-param name="a_state" select="'*'"/> <!-- the capital marker travels in the state string, not in the letter itself -->
158
+ <xsl:with-param name="a_pending" select="''"/> <!-- the previous pending character was flushed just above -->
159
+ <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
160
+ <xsl:with-param name="a_partial" select="$a_partial"/>
161
+ </xsl:call-template>
162
+ </xsl:when>
163
+
164
+ <!-- if input starts with diacritic -->
165
+ <xsl:when test="contains($s_betaDiacritics, $head)">
166
+ <!-- update state with new character; insert-diacritic is not defined in this stylesheet (presumably imported from beta-uni-util.xsl, to be confirmed) -->
167
+ <xsl:variable name="newstate">
168
+ <xsl:call-template name="insert-diacritic">
169
+ <xsl:with-param name="a_string" select="$a_state"/>
170
+ <xsl:with-param name="a_char" select="$head"/>
171
+ </xsl:call-template>
172
+ </xsl:variable>
173
+
174
+ <!-- recurse with updated state; nothing is output and the pending character is carried along unchanged -->
175
+ <xsl:call-template name="beta-to-uni">
176
+ <xsl:with-param name="a_in" select="substring($a_in, 2)"/>
177
+ <xsl:with-param name="a_state" select="$newstate"/>
178
+ <xsl:with-param name="a_pending" select="$a_pending"/>
179
+ <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
180
+ <xsl:with-param name="a_partial" select="$a_partial"/>
181
+ </xsl:call-template>
182
+ </xsl:when>
183
+
184
+ <!-- if not special char: head is a base character (letter, separator or anything else) -->
185
+ <xsl:otherwise>
186
+ <!-- output pending char -->
187
+ <xsl:choose>
188
+ <!-- final sigma: S with no state followed by word break; $a_partial is not consulted here, only at end of input -->
189
+ <xsl:when
190
+ test="(($a_pending = 's') or ($a_pending = 'S')) and
191
+ (string-length($a_state) = 0) and
192
+ (contains($s_betaSeparators, $head) or
193
+ contains($s_betaSeparators2, $head))">
194
+ <xsl:call-template name="output-uni-char">
195
+ <xsl:with-param name="a_char" select="$a_pending"/>
196
+ <xsl:with-param name="a_state" select="'2'"/> <!-- same final-sigma key as in the end-of-input branch -->
197
+ <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
198
+ </xsl:call-template>
199
+ </xsl:when>
200
+
201
+ <xsl:otherwise>
202
+ <xsl:call-template name="output-uni-char">
203
+ <xsl:with-param name="a_char" select="$a_pending"/>
204
+ <xsl:with-param name="a_state" select="$a_state"/>
205
+ <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
206
+ </xsl:call-template>
207
+ </xsl:otherwise>
208
+ </xsl:choose>
209
+
210
+ <!-- reset state if there was a pending character; otherwise keep it, since it then belongs to the character about to become pending -->
211
+ <xsl:variable name="newstate">
212
+ <xsl:choose>
213
+ <xsl:when test="$a_pending"/> <!-- deliberately empty: a non-empty pending string was just flushed together with its state -->
214
+ <xsl:otherwise>
215
+ <xsl:value-of select="$a_state"/>
216
+ </xsl:otherwise>
217
+ </xsl:choose>
218
+ </xsl:variable>
219
+
220
+ <!-- recurse with head as pending char -->
221
+ <xsl:call-template name="beta-to-uni">
222
+ <xsl:with-param name="a_in" select="substring($a_in, 2)"/>
223
+ <xsl:with-param name="a_state" select="$newstate"/>
224
+ <xsl:with-param name="a_pending" select="$head"/>
225
+ <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
226
+ <xsl:with-param name="a_partial" select="$a_partial"/>
227
+ </xsl:call-template>
228
+ </xsl:otherwise>
229
+ </xsl:choose>
230
+ </xsl:template>
231
+
232
+ <!--
233
+ Output a single character with diacritics
234
+ Parameters:
235
+ $a_char character to be output
236
+ $a_state diacritics associated with character
237
+ $a_precomposed whether to put out precomposed or decomposed Unicode
238
+ -->
239
+ <xsl:template name="output-uni-char"> <!-- emits one base character plus its collected state by looking a key up in $s_betaUniTable (mode b2u) -->
240
+ <xsl:param name="a_char"/>
241
+ <xsl:param name="a_state"/>
242
+ <xsl:param name="a_precomposed"/>
243
+
244
+ <xsl:choose>
245
+ <!-- if no character pending -->
246
+ <xsl:when test="string-length($a_char) = 0">
247
+ <!-- if we have state and we're not processing a capital; a state starting with '*' yields no output here because it belongs to a letter that has not arrived yet -->
248
+ <xsl:if
249
+ test="(string-length($a_state) > 0) and
250
+ (substring($a_state, 1, 1) != '*')">
251
+ <!-- output just the state -->
252
+ <!-- here precomposed=true means don't make it combining; the caller's $a_precomposed is intentionally not forwarded -->
253
+ <xsl:apply-templates select="$s_betaUniTable" mode="b2u">
254
+ <xsl:with-param name="a_key" select="$a_state"/>
255
+ <xsl:with-param name="a_precomposed" select="true()"/>
256
+ </xsl:apply-templates>
257
+ </xsl:if>
258
+ </xsl:when>
259
+
260
+ <!-- if character is pending -->
261
+ <xsl:otherwise>
262
+ <!-- fold the character to lower case and, separately, to upper case; a character found in neither letter list is left unchanged by both -->
263
+ <xsl:variable name="lowerchar"
264
+ select="translate($a_char, $s_betaUppers, $s_betaLowers)"/>
265
+ <xsl:variable name="upperchar"
266
+ select="translate($a_char, $s_betaLowers, $s_betaUppers)"/>
267
+ <xsl:choose>
268
+ <!-- if upper != lower, we have a letter -->
269
+ <xsl:when test="$lowerchar != $upperchar">
270
+ <!-- use letter+state as key into table; the key always uses the lower-case letter, so the case of the input letter itself does not affect the lookup -->
271
+ <xsl:apply-templates select="$s_betaUniTable" mode="b2u">
272
+ <xsl:with-param name="a_key"
273
+ select="concat($lowerchar, $a_state)"/>
274
+ <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
275
+ </xsl:apply-templates>
276
+ </xsl:when>
277
+
278
+ <!-- if upper = lower, we have a non-letter -->
279
+ <xsl:otherwise>
280
+ <!-- output character, if any, then use state as key into table -->
281
+ <!-- this handles the case of isolated diacritics -->
282
+ <xsl:value-of select="$a_char"/> <!-- the non-letter is copied to the output unchanged -->
283
+ <xsl:if test="string-length($a_state) > 0">
284
+ <xsl:apply-templates select="$s_betaUniTable" mode="b2u">
285
+ <xsl:with-param name="a_key" select="$a_state"/>
286
+ <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
287
+ </xsl:apply-templates>
288
+ </xsl:if>
289
+ </xsl:otherwise>
290
+ </xsl:choose>
291
+ </xsl:otherwise>
292
+ </xsl:choose>
293
+ </xsl:template>
294
+
295
+ </xsl:stylesheet>
@@ -0,0 +1,31 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
3
+ xmlns:xs="http://www.w3.org/2001/XMLSchema"
4
+ exclude-result-prefixes="xs"
5
+ version="1.0">
6
+ <xsl:output media-type="text/xml" omit-xml-declaration="no" method="xml" indent="yes"/>
7
+ <xsl:preserve-space elements="*"/>
8
+ <xsl:include href="beta2unicode.xsl"/>
9
+
10
+ <xsl:template match="@*|node()"> <!-- identity rule: copies every node that no more specific template claims -->
11
+ <xsl:copy>
12
+ <!-- attributes precede children in document order, so a single union select visits them in the same sequence -->
13
+ <xsl:apply-templates select="@*|node()"/>
14
+ </xsl:copy>
15
+ </xsl:template>
16
+
17
+ <xsl:template match="@span|@lemma|@form"> <!-- rewrites these three attributes through beta-to-uni, but only inside a treebank tagged grc or greek -->
18
+ <xsl:choose>
19
+ <!-- no enclosing treebank with xml:lang grc or greek: pass the attribute through untouched -->
20
+ <xsl:when test="not(ancestor::treebank/@xml:lang[. = 'grc' or . = 'greek'])"><xsl:copy/></xsl:when>
21
+ <xsl:otherwise> <!-- Greek treebank: re-create the attribute under the same name with a converted value -->
22
+ <xsl:attribute name="{local-name()}">
23
+ <xsl:call-template name="beta-to-uni">
24
+ <xsl:with-param name="a_in" select="."/>
25
+ </xsl:call-template>
26
+ </xsl:attribute>
27
+ </xsl:otherwise>
28
+ </xsl:choose>
29
+ </xsl:template>
30
+
31
+ </xsl:stylesheet>
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: treebank-transform
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - LFDM
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-26 00:00:00.000000000 Z
11
+ date: 2014-12-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -111,6 +111,7 @@ files:
111
111
  - Rakefile
112
112
  - bin/treebank-transform
113
113
  - examples/cicero_catilina_sample.xml
114
+ - examples/hesiod_shield_of_heracles_sample.xml
114
115
  - lib/treebank/alphabet.rb
115
116
  - lib/treebank/elliptic_word.rb
116
117
  - lib/treebank/sentence.rb
@@ -119,6 +120,9 @@ files:
119
120
  - lib/treebank/transform/version.rb
120
121
  - spec/spec_helper.rb
121
122
  - spec/treebank/transform_spec.rb
123
+ - stylesheets/beta-uni-util.xsl
124
+ - stylesheets/beta2unicode.xsl
125
+ - stylesheets/treebank-beta-uni.xsl
122
126
  - treebank-transform.gemspec
123
127
  homepage: ''
124
128
  licenses: