treebank-transform 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -1
- data/examples/hesiod_shield_of_heracles_sample.xml +5137 -0
- data/lib/treebank/sentence.rb +1 -1
- data/lib/treebank/transform/version.rb +1 -1
- data/lib/treebank/transform.rb +40 -7
- data/spec/treebank/transform_spec.rb +43 -17
- data/stylesheets/beta-uni-util.xsl +2015 -0
- data/stylesheets/beta2unicode.xsl +295 -0
- data/stylesheets/treebank-beta-uni.xsl +31 -0
- metadata +6 -2
@@ -0,0 +1,295 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
|
3
|
+
<xsl:import href="beta-uni-util.xsl"/>
|
4
|
+
|
5
|
+
<!--
|
6
|
+
Copyright 2008-2009 Cantus Foundation
|
7
|
+
http://alpheios.net
|
8
|
+
|
9
|
+
This file is part of Alpheios.
|
10
|
+
|
11
|
+
Alpheios is free software: you can redistribute it and/or modify
|
12
|
+
it under the terms of the GNU General Public License as published by
|
13
|
+
the Free Software Foundation, either version 3 of the License, or
|
14
|
+
(at your option) any later version.
|
15
|
+
|
16
|
+
Alpheios is distributed in the hope that it will be useful,
|
17
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
18
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
19
|
+
GNU General Public License for more details.
|
20
|
+
|
21
|
+
You should have received a copy of the GNU General Public License
|
22
|
+
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
23
|
+
-->
|
24
|
+
|
25
|
+
<!--
|
26
|
+
Test whether text is in betacode
|
27
|
+
Parameters:
|
28
|
+
$a_in string/node to be tested
|
29
|
+
Output:
|
30
|
+
1 if encoded in betacode, else 0
|
31
|
+
(Note: Boolean return value does not seem to work
|
32
|
+
reliably, perhaps because of recursion.)
|
33
|
+
-->
|
34
|
+
<xsl:template name="is-beta">
  <xsl:param name="a_in"/>

  <!-- Leading character of the input (empty string when the input is empty).
       It is computed once here so that every case below can share it. -->
  <xsl:variable name="first" select="substring($a_in, 1, 1)"/>

  <xsl:choose>
    <!-- The context node is explicitly tagged as betacode: answer 1 -->
    <xsl:when test="lang('grc-x-beta')">
      <xsl:text>1</xsl:text>
    </xsl:when>

    <!-- Nothing (left) to inspect: answer 0.
         This case must precede the letter test, because contains()
         is true for an empty search string. -->
    <xsl:when test="not(string($a_in))">
      <xsl:text>0</xsl:text>
    </xsl:when>

    <!-- Leading character is a betacode base letter: answer 1 -->
    <xsl:when test="contains($s_betaLowers, $first) or
                    contains($s_betaUppers, $first)">
      <xsl:text>1</xsl:text>
    </xsl:when>

    <xsl:otherwise>
      <!-- Look the character up in the table through the u2b mode;
           a non-empty result means the table knows it as Unicode. -->
      <xsl:variable name="found">
        <xsl:apply-templates select="$s_betaUniTable" mode="u2b">
          <xsl:with-param name="a_key" select="$first"/>
        </xsl:apply-templates>
      </xsl:variable>

      <xsl:choose>
        <!-- Character is undecided: drop it and test the rest of the input -->
        <xsl:when test="string-length($found) = 0">
          <xsl:call-template name="is-beta">
            <xsl:with-param name="a_in" select="substring($a_in, 2)"/>
          </xsl:call-template>
        </xsl:when>

        <!-- Character was found in the table: answer 0 -->
        <xsl:otherwise>
          <xsl:text>0</xsl:text>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
87
|
+
|
88
|
+
<!--
|
89
|
+
Convert Greek betacode to Unicode
|
90
|
+
Parameters:
|
91
|
+
$a_in betacode input string to be converted
|
92
|
+
$a_pending character waiting to be output
|
93
|
+
$a_state diacritics associated with pending character
|
94
|
+
$a_precomposed whether to put out precomposed or decomposed Unicode
|
95
|
+
$a_partial whether this is a partial word
|
96
|
+
(If true, do not use final sigma for last letter)
|
97
|
+
|
98
|
+
Output:
|
99
|
+
$a_in transformed to equivalent Unicode
|
100
|
+
|
101
|
+
The characters in the state string are maintained in a canonical order,
|
102
|
+
which allows the lookup table to contain a single entry for each
|
103
|
+
combination of base character and diacritics. The diacritics may appear
|
104
|
+
in any order in the input.
|
105
|
+
|
106
|
+
Diacritics associated with (either preceding or following) a base
|
107
|
+
character are accumulated until either a non-diacritic character or end
|
108
|
+
of input is encountered, at which point the pending character is output.
|
109
|
+
-->
|
110
|
+
<xsl:template name="beta-to-uni">
  <xsl:param name="a_in"/>
  <xsl:param name="a_pending" select="''"/>
  <xsl:param name="a_state" select="''"/>
  <xsl:param name="a_precomposed" select="true()"/>
  <xsl:param name="a_partial" select="false()"/>

  <!-- Split the input once: the character examined in this step,
       and everything after it (handed to the recursive call). -->
  <xsl:variable name="first" select="substring($a_in, 1, 1)"/>
  <xsl:variable name="rest" select="substring($a_in, 2)"/>

  <xsl:choose>
    <!-- Input exhausted: flush the pending character and stop recursing -->
    <xsl:when test="not(string($a_in))">
      <xsl:choose>
        <!-- A sigma with no diacritics that ends a complete (non-partial)
             word is emitted with state '2', i.e. as final sigma. -->
        <xsl:when
          test="(string-length($a_state) = 0) and not($a_partial) and
                (($a_pending = 'S') or ($a_pending = 's'))">
          <xsl:call-template name="output-uni-char">
            <xsl:with-param name="a_char" select="$a_pending"/>
            <xsl:with-param name="a_state" select="'2'"/>
            <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
          </xsl:call-template>
        </xsl:when>

        <!-- Anything else goes out with the state accumulated so far -->
        <xsl:otherwise>
          <xsl:call-template name="output-uni-char">
            <xsl:with-param name="a_char" select="$a_pending"/>
            <xsl:with-param name="a_state" select="$a_state"/>
            <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
          </xsl:call-template>
        </xsl:otherwise>
      </xsl:choose>
    </xsl:when>

    <!-- "*" marks the next letter as a capital -->
    <xsl:when test="$first = '*'">
      <!-- Flush the character that was waiting -->
      <xsl:call-template name="output-uni-char">
        <xsl:with-param name="a_char" select="$a_pending"/>
        <xsl:with-param name="a_state" select="$a_state"/>
        <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
      </xsl:call-template>

      <!-- Continue with nothing pending; the state is replaced by "*",
           discarding whatever had been accumulated before. -->
      <xsl:call-template name="beta-to-uni">
        <xsl:with-param name="a_in" select="$rest"/>
        <xsl:with-param name="a_pending" select="''"/>
        <xsl:with-param name="a_state" select="'*'"/>
        <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
        <xsl:with-param name="a_partial" select="$a_partial"/>
      </xsl:call-template>
    </xsl:when>

    <!-- A diacritic: fold it into the state, emit nothing yet -->
    <xsl:when test="contains($s_betaDiacritics, $first)">
      <xsl:variable name="merged">
        <xsl:call-template name="insert-diacritic">
          <xsl:with-param name="a_string" select="$a_state"/>
          <xsl:with-param name="a_char" select="$first"/>
        </xsl:call-template>
      </xsl:variable>

      <!-- The pending character is carried along unchanged -->
      <xsl:call-template name="beta-to-uni">
        <xsl:with-param name="a_in" select="$rest"/>
        <xsl:with-param name="a_pending" select="$a_pending"/>
        <xsl:with-param name="a_state" select="$merged"/>
        <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
        <xsl:with-param name="a_partial" select="$a_partial"/>
      </xsl:call-template>
    </xsl:when>

    <!-- Any other character ends the pending one -->
    <xsl:otherwise>
      <xsl:choose>
        <!-- A sigma with no diacritics followed by a separator is emitted
             with state '2', i.e. as final sigma. -->
        <xsl:when
          test="(string-length($a_state) = 0) and
                (($a_pending = 'S') or ($a_pending = 's')) and
                (contains($s_betaSeparators2, $first) or
                 contains($s_betaSeparators, $first))">
          <xsl:call-template name="output-uni-char">
            <xsl:with-param name="a_char" select="$a_pending"/>
            <xsl:with-param name="a_state" select="'2'"/>
            <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
          </xsl:call-template>
        </xsl:when>

        <xsl:otherwise>
          <xsl:call-template name="output-uni-char">
            <xsl:with-param name="a_char" select="$a_pending"/>
            <xsl:with-param name="a_state" select="$a_state"/>
            <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
          </xsl:call-template>
        </xsl:otherwise>
      </xsl:choose>

      <!-- The state is cleared when a pending character has just been
           flushed; with nothing pending it is kept for the new character
           (this is how a "*" prefix reaches the letter it capitalizes). -->
      <xsl:variable name="carried">
        <xsl:if test="not($a_pending)">
          <xsl:value-of select="$a_state"/>
        </xsl:if>
      </xsl:variable>

      <!-- The current character becomes the new pending character -->
      <xsl:call-template name="beta-to-uni">
        <xsl:with-param name="a_in" select="$rest"/>
        <xsl:with-param name="a_pending" select="$first"/>
        <xsl:with-param name="a_state" select="$carried"/>
        <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
        <xsl:with-param name="a_partial" select="$a_partial"/>
      </xsl:call-template>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
231
|
+
|
232
|
+
<!--
|
233
|
+
Output a single character with diacritics
|
234
|
+
Parameters:
|
235
|
+
$a_char character to be output
|
236
|
+
$a_state diacritics associated with character
|
237
|
+
$a_precomposed whether to put out precomposed or decomposed Unicode
|
238
|
+
-->
|
239
|
+
<xsl:template name="output-uni-char">
  <xsl:param name="a_char"/>
  <xsl:param name="a_state"/>
  <xsl:param name="a_precomposed"/>

  <!-- Fold the character both ways through the betacode letter sets.
       The two results differ only when the character is a letter;
       for an empty character both are the empty string. -->
  <xsl:variable name="lower"
                select="translate($a_char, $s_betaUppers, $s_betaLowers)"/>
  <xsl:variable name="upper"
                select="translate($a_char, $s_betaLowers, $s_betaUppers)"/>

  <xsl:choose>
    <!-- No character pending: at most the bare state is written -->
    <xsl:when test="string-length($a_char) = 0">
      <!-- Skip an empty state, and skip a state that begins with "*"
           (a capital is being built, so there is nothing to emit yet). -->
      <xsl:if test="string($a_state) and not(starts-with($a_state, '*'))">
        <!-- a_precomposed is forced to true here, which for a bare state
             means "do not make it combining" -->
        <xsl:apply-templates select="$s_betaUniTable" mode="b2u">
          <xsl:with-param name="a_key" select="$a_state"/>
          <xsl:with-param name="a_precomposed" select="true()"/>
        </xsl:apply-templates>
      </xsl:if>
    </xsl:when>

    <!-- A letter: the table key is the lower-case letter followed by the state -->
    <xsl:when test="$lower != $upper">
      <xsl:apply-templates select="$s_betaUniTable" mode="b2u">
        <xsl:with-param name="a_key" select="concat($lower, $a_state)"/>
        <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
      </xsl:apply-templates>
    </xsl:when>

    <!-- A non-letter: write it unchanged, then look up any state on its own
         (this covers diacritics that are not attached to a letter) -->
    <xsl:otherwise>
      <xsl:value-of select="$a_char"/>
      <xsl:if test="string($a_state)">
        <xsl:apply-templates select="$s_betaUniTable" mode="b2u">
          <xsl:with-param name="a_key" select="$a_state"/>
          <xsl:with-param name="a_precomposed" select="$a_precomposed"/>
        </xsl:apply-templates>
      </xsl:if>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
294
|
+
|
295
|
+
</xsl:stylesheet>
|
@@ -0,0 +1,31 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
3
|
+
xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
4
|
+
exclude-result-prefixes="xs"
|
5
|
+
version="1.0">
|
6
|
+
<xsl:output media-type="text/xml" omit-xml-declaration="no" method="xml" indent="yes"/>
|
7
|
+
<xsl:preserve-space elements="*"/>
|
8
|
+
<xsl:include href="beta2unicode.xsl"/>
|
9
|
+
|
10
|
+
<!-- Identity transform: copy each node as it is and process its attributes
     and children in turn, so more specific templates can override
     individual nodes. -->
<xsl:template match="@*|node()">
  <xsl:copy>
    <!-- Attributes precede children in document order, so a single
         union select visits them in the same sequence as two
         separate apply-templates calls would. -->
    <xsl:apply-templates select="@*|node()"/>
  </xsl:copy>
</xsl:template>
|
16
|
+
|
17
|
+
<!-- span, lemma and form attributes: inside a treebank whose xml:lang is
     'grc' or 'greek' the value is passed through beta-to-uni; everywhere
     else the attribute is copied untouched. -->
<xsl:template match="@span|@lemma|@form">
  <xsl:choose>
    <!-- No enclosing Greek treebank: keep the attribute as it is -->
    <xsl:when test="not(ancestor::treebank[@xml:lang = 'greek' or @xml:lang = 'grc'])">
      <xsl:copy/>
    </xsl:when>

    <!-- Greek treebank: rebuild the attribute under the same local name,
         with its value run through the beta-to-uni conversion -->
    <xsl:otherwise>
      <xsl:attribute name="{local-name()}">
        <xsl:call-template name="beta-to-uni">
          <xsl:with-param name="a_in" select="."/>
        </xsl:call-template>
      </xsl:attribute>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
|
30
|
+
|
31
|
+
</xsl:stylesheet>
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treebank-transform
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- LFDM
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-12-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -111,6 +111,7 @@ files:
|
|
111
111
|
- Rakefile
|
112
112
|
- bin/treebank-transform
|
113
113
|
- examples/cicero_catilina_sample.xml
|
114
|
+
- examples/hesiod_shield_of_heracles_sample.xml
|
114
115
|
- lib/treebank/alphabet.rb
|
115
116
|
- lib/treebank/elliptic_word.rb
|
116
117
|
- lib/treebank/sentence.rb
|
@@ -119,6 +120,9 @@ files:
|
|
119
120
|
- lib/treebank/transform/version.rb
|
120
121
|
- spec/spec_helper.rb
|
121
122
|
- spec/treebank/transform_spec.rb
|
123
|
+
- stylesheets/beta-uni-util.xsl
|
124
|
+
- stylesheets/beta2unicode.xsl
|
125
|
+
- stylesheets/treebank-beta-uni.xsl
|
122
126
|
- treebank-transform.gemspec
|
123
127
|
homepage: ''
|
124
128
|
licenses:
|