treebank-transform 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -1
- data/examples/hesiod_shield_of_heracles_sample.xml +5137 -0
- data/lib/treebank/sentence.rb +1 -1
- data/lib/treebank/transform/version.rb +1 -1
- data/lib/treebank/transform.rb +40 -7
- data/spec/treebank/transform_spec.rb +43 -17
- data/stylesheets/beta-uni-util.xsl +2015 -0
- data/stylesheets/beta2unicode.xsl +295 -0
- data/stylesheets/treebank-beta-uni.xsl +31 -0
- metadata +6 -2
@@ -0,0 +1,295 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
|
3
|
+
<xsl:import href="beta-uni-util.xsl"/>
|
4
|
+
|
5
|
+
<!--
|
6
|
+
Copyright 2008-2009 Cantus Foundation
|
7
|
+
http://alpheios.net
|
8
|
+
|
9
|
+
This file is part of Alpheios.
|
10
|
+
|
11
|
+
Alpheios is free software: you can redistribute it and/or modify
|
12
|
+
it under the terms of the GNU General Public License as published by
|
13
|
+
the Free Software Foundation, either version 3 of the License, or
|
14
|
+
(at your option) any later version.
|
15
|
+
|
16
|
+
Alpheios is distributed in the hope that it will be useful,
|
17
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
18
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
19
|
+
GNU General Public License for more details.
|
20
|
+
|
21
|
+
You should have received a copy of the GNU General Public License
|
22
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
23
|
+
-->
|
24
|
+
|
25
|
+
<!--
  Decide whether a piece of text is written in betacode.

  Parameters:
    $a_in     string/node to be examined

  Output:
    the text 1 if the input is judged to be betacode, else 0
    (a number rather than a boolean is produced; the original authors
    noted that a boolean result did not seem to work reliably)

  The language of the context node is consulted first; only then is the
  input scanned, one character per recursive call.
-->
<xsl:template name="is-beta">
  <xsl:param name="a_in"/>

  <!-- first character of the input (empty string when there is no input) -->
  <xsl:variable name="first" select="substring($a_in, 1, 1)"/>

  <xsl:choose>
    <!-- context node's language is declared as betacode: accept at once -->
    <xsl:when test="lang('grc-x-beta')">
      <xsl:text>1</xsl:text>
    </xsl:when>

    <!-- nothing (left) to inspect: not betacode.
         This test must precede the letter test below, because
         contains() is true for an empty search string. -->
    <xsl:when test="not(string($a_in))">
      <xsl:text>0</xsl:text>
    </xsl:when>

    <!-- a betacode base letter settles the question -->
    <xsl:when test="contains($s_betaLowers, $first) or
                    contains($s_betaUppers, $first)">
      <xsl:text>1</xsl:text>
    </xsl:when>

    <xsl:otherwise>
      <!-- try to map the character from Unicode back to betacode -->
      <xsl:variable name="mapped">
        <xsl:apply-templates select="$s_betaUniTable" mode="u2b">
          <xsl:with-param name="a_key" select="$first"/>
        </xsl:apply-templates>
      </xsl:variable>

      <xsl:choose>
        <!-- lookup produced nothing: character is inconclusive,
             so examine the remainder of the string -->
        <xsl:when test="string($mapped) = ''">
          <xsl:call-template name="is-beta">
            <xsl:with-param name="a_in" select="substring($a_in, 2)"/>
          </xsl:call-template>
        </xsl:when>

        <!-- lookup succeeded: the text is Unicode, not betacode -->
        <xsl:otherwise>
          <xsl:text>0</xsl:text>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
87
|
+
|
88
|
+
<!--
  Convert Greek betacode to Unicode.

  Parameters:
    $a_in           betacode input string still to be converted
    $a_pending      character waiting to be output (default: none)
    $a_state        diacritics collected for the pending character
                    (default: none)
    $a_precomposed  whether to put out precomposed or decomposed Unicode
                    (default: true)
    $a_partial      whether this is a partial word; if true, a trailing
                    sigma at end of input is not turned into final sigma
                    (default: false)

  Output:
    $a_in transformed to equivalent Unicode

  The template consumes one input character per recursive call:
    - end of input:  flush the pending character and stop
    - "*":           flush the pending character, then continue with the
                     state set to "*" (capitalize the next character)
    - diacritic:     merge it into the state via insert-diacritic and
                     keep the pending character waiting
    - anything else: flush the pending character and make the new
                     character the pending one

  The state string is kept in a canonical order (by insert-diacritic), so
  the lookup table needs only one entry per combination of base character
  and diacritics, whatever order the diacritics had in the input.

  A pending "s"/"S" without diacritics is flushed with state "2" (final
  sigma) when it is followed by a separator, or when it ends the input and
  $a_partial is false.

  NOTE(review): when diacritics have been collected while no character is
  pending and the state does not start with "*", the flush in the last
  branch hands that state to output-uni-char and the same state is also
  carried forward to the next character. Confirm that this is intended.
-->
<xsl:template name="beta-to-uni">
  <xsl:param name="a_in"/>
  <xsl:param name="a_pending" select="''"/>
  <xsl:param name="a_state" select="''"/>
  <xsl:param name="a_precomposed" select="true()"/>
  <xsl:param name="a_partial" select="false()"/>

  <!-- current input character and what follows it -->
  <xsl:variable name="next" select="substring($a_in, 1, 1)"/>
  <xsl:variable name="rest" select="substring($a_in, 2)"/>

  <!-- true when the pending character is a sigma carrying no diacritics -->
  <xsl:variable name="bareSigma"
                select="(string-length($a_state) = 0) and
                        (($a_pending = 's') or ($a_pending = 'S'))"/>

  <xsl:choose>
    <!-- input exhausted: flush whatever is pending.
         Must be tested first, since contains() below is true for an
         empty $next. -->
    <xsl:when test="string-length($a_in) = 0">
      <xsl:choose>
        <!-- word-final sigma -->
        <xsl:when test="$bareSigma and not($a_partial)">
          <xsl:call-template name="output-uni-char">
            <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
            <xsl:with-param name="a_state" select="'2'"/>
            <xsl:with-param name="a_char" select="$a_pending"/>
          </xsl:call-template>
        </xsl:when>

        <xsl:otherwise>
          <xsl:call-template name="output-uni-char">
            <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
            <xsl:with-param name="a_state" select="$a_state"/>
            <xsl:with-param name="a_char" select="$a_pending"/>
          </xsl:call-template>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:when>

    <!-- capitalization marker -->
    <xsl:when test="$next = '*'">
      <!-- flush the pending character -->
      <xsl:call-template name="output-uni-char">
        <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
        <xsl:with-param name="a_state" select="$a_state"/>
        <xsl:with-param name="a_char" select="$a_pending"/>
      </xsl:call-template>

      <!-- continue with nothing pending and a fresh state of "*" -->
      <xsl:call-template name="beta-to-uni">
        <xsl:with-param name="a_in" select="$rest"/>
        <xsl:with-param name="a_pending" select="''"/>
        <xsl:with-param name="a_state" select="'*'"/>
        <xsl:with-param name="a_partial" select="$a_partial"/>
        <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
      </xsl:call-template>
    </xsl:when>

    <!-- diacritic: fold it into the state, leave the pending character -->
    <xsl:when test="contains($s_betaDiacritics, $next)">
      <xsl:variable name="merged">
        <xsl:call-template name="insert-diacritic">
          <xsl:with-param name="a_char" select="$next"/>
          <xsl:with-param name="a_string" select="$a_state"/>
        </xsl:call-template>
      </xsl:variable>

      <xsl:call-template name="beta-to-uni">
        <xsl:with-param name="a_in" select="$rest"/>
        <xsl:with-param name="a_pending" select="$a_pending"/>
        <xsl:with-param name="a_state" select="$merged"/>
        <xsl:with-param name="a_partial" select="$a_partial"/>
        <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
      </xsl:call-template>
    </xsl:when>

    <!-- ordinary character -->
    <xsl:otherwise>
      <!-- flush the pending character -->
      <xsl:choose>
        <!-- sigma directly before a separator becomes final sigma -->
        <xsl:when test="$bareSigma and
                        (contains($s_betaSeparators, $next) or
                         contains($s_betaSeparators2, $next))">
          <xsl:call-template name="output-uni-char">
            <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
            <xsl:with-param name="a_state" select="'2'"/>
            <xsl:with-param name="a_char" select="$a_pending"/>
          </xsl:call-template>
        </xsl:when>

        <xsl:otherwise>
          <xsl:call-template name="output-uni-char">
            <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
            <xsl:with-param name="a_state" select="$a_state"/>
            <xsl:with-param name="a_char" select="$a_pending"/>
          </xsl:call-template>
        </xsl:otherwise>
      </xsl:choose>

      <!-- the state belongs to the character just flushed, so it is
           dropped; it is kept only when nothing was pending -->
      <xsl:variable name="carried">
        <xsl:if test="not($a_pending)">
          <xsl:value-of select="$a_state"/>
        </xsl:if>
      </xsl:variable>

      <!-- the character just read becomes the pending one -->
      <xsl:call-template name="beta-to-uni">
        <xsl:with-param name="a_in" select="$rest"/>
        <xsl:with-param name="a_pending" select="$next"/>
        <xsl:with-param name="a_state" select="$carried"/>
        <xsl:with-param name="a_partial" select="$a_partial"/>
        <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
      </xsl:call-template>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
231
|
+
|
232
|
+
<!--
  Output a single character together with its diacritics.

  Parameters:
    $a_char         character to be output (may be empty)
    $a_state        diacritics associated with the character
    $a_precomposed  whether to put out precomposed or decomposed Unicode

  Output:
    - no character:  the state alone, looked up in the table, unless the
                     state is empty or starts with "*"
    - a letter:      the table entry for lower-case letter + state
    - a non-letter:  the character itself, followed by the table entry
                     for the state when there is one
-->
<xsl:template name="output-uni-char">
  <xsl:param name="a_char"/>
  <xsl:param name="a_state"/>
  <xsl:param name="a_precomposed"/>

  <!-- case-fold in both directions; the results differ only for letters -->
  <xsl:variable name="lowered"
                select="translate($a_char, $s_betaUppers, $s_betaLowers)"/>
  <xsl:variable name="raised"
                select="translate($a_char, $s_betaLowers, $s_betaUppers)"/>

  <xsl:choose>
    <!-- nothing pending -->
    <xsl:when test="not(string($a_char))">
      <!-- emit the bare state, but not while a capital is being built -->
      <xsl:if test="string($a_state) and not(starts-with($a_state, '*'))">
        <!-- precomposed=true here means: do not make it combining -->
        <xsl:apply-templates select="$s_betaUniTable" mode="b2u">
          <xsl:with-param name="a_precomposed" select="true()"/>
          <xsl:with-param name="a_key" select="$a_state"/>
        </xsl:apply-templates>
      </xsl:if>
    </xsl:when>

    <!-- a letter: the table key is lower-case letter followed by state -->
    <xsl:when test="$lowered != $raised">
      <xsl:apply-templates select="$s_betaUniTable" mode="b2u">
        <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
        <xsl:with-param name="a_key" select="concat($lowered, $a_state)"/>
      </xsl:apply-templates>
    </xsl:when>

    <!-- a non-letter: pass it through, then emit any state on its own
         (this is how isolated diacritics get handled) -->
    <xsl:otherwise>
      <xsl:value-of select="$a_char"/>
      <xsl:if test="string($a_state)">
        <xsl:apply-templates select="$s_betaUniTable" mode="b2u">
          <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
          <xsl:with-param name="a_key" select="$a_state"/>
        </xsl:apply-templates>
      </xsl:if>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
294
|
+
|
295
|
+
</xsl:stylesheet>
|
@@ -0,0 +1,31 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
3
|
+
xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
4
|
+
exclude-result-prefixes="xs"
|
5
|
+
version="1.0">
|
6
|
+
<xsl:output media-type="text/xml" omit-xml-declaration="no" method="xml" indent="yes"/>
|
7
|
+
<xsl:preserve-space elements="*"/>
|
8
|
+
<xsl:include href="beta2unicode.xsl"/>
|
9
|
+
|
10
|
+
<!-- Identity rule: copy every attribute and node unchanged, letting more
     specific templates take over for the nodes they match. -->
<xsl:template match="@*|node()">
  <xsl:copy>
    <!-- the union is processed in document order, so attributes are
         handled before child nodes -->
    <xsl:apply-templates select="@*|node()"/>
  </xsl:copy>
</xsl:template>
|
16
|
+
|
17
|
+
<!-- Rewrite the span, lemma and form attributes through beta-to-uni when
     they sit inside a treebank element whose xml:lang is 'grc' or 'greek';
     in any other treebank they are copied as they are. -->
<xsl:template match="@span|@lemma|@form">
  <!-- enclosing treebank element(s) marked as Greek, if any -->
  <xsl:variable name="greekTreebank"
                select="ancestor::treebank[@xml:lang = 'grc' or @xml:lang = 'greek']"/>

  <xsl:choose>
    <!-- not Greek: keep the attribute untouched -->
    <xsl:when test="not($greekTreebank)">
      <xsl:copy/>
    </xsl:when>

    <!-- Greek: same attribute name, converted value -->
    <xsl:otherwise>
      <xsl:attribute name="{local-name()}">
        <xsl:call-template name="beta-to-uni">
          <xsl:with-param name="a_in" select="."/>
        </xsl:call-template>
      </xsl:attribute>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
30
|
+
|
31
|
+
</xsl:stylesheet>
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treebank-transform
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- LFDM
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-12-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -111,6 +111,7 @@ files:
|
|
111
111
|
- Rakefile
|
112
112
|
- bin/treebank-transform
|
113
113
|
- examples/cicero_catilina_sample.xml
|
114
|
+
- examples/hesiod_shield_of_heracles_sample.xml
|
114
115
|
- lib/treebank/alphabet.rb
|
115
116
|
- lib/treebank/elliptic_word.rb
|
116
117
|
- lib/treebank/sentence.rb
|
@@ -119,6 +120,9 @@ files:
|
|
119
120
|
- lib/treebank/transform/version.rb
|
120
121
|
- spec/spec_helper.rb
|
121
122
|
- spec/treebank/transform_spec.rb
|
123
|
+
- stylesheets/beta-uni-util.xsl
|
124
|
+
- stylesheets/beta2unicode.xsl
|
125
|
+
- stylesheets/treebank-beta-uni.xsl
|
122
126
|
- treebank-transform.gemspec
|
123
127
|
homepage: ''
|
124
128
|
licenses:
|