proiel-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +23 -0
- data/README.md +34 -0
- data/bin/proiel +27 -0
- data/bin/setup +7 -0
- data/contrib/proiel-giza-train +6 -0
- data/contrib/proiel-lexc-compile +18 -0
- data/contrib/proiel-maltparser-parse +2 -0
- data/contrib/proiel-maltparser-train +6 -0
- data/contrib/proiel-tnt-train +15 -0
- data/examples/decision-tree.rb +41 -0
- data/examples/dep-pos-cooccurrences.rb +84 -0
- data/examples/lint-rules.rb +174 -0
- data/examples/relation-as-disambiguator.rb +134 -0
- data/examples/word-occurrences.rb +30 -0
- data/lib/proiel/cli.rb +2 -0
- data/lib/proiel/cli/commands.rb +28 -0
- data/lib/proiel/cli/commands/convert.rb +94 -0
- data/lib/proiel/cli/commands/grep.rb +136 -0
- data/lib/proiel/cli/commands/info.rb +126 -0
- data/lib/proiel/cli/commands/tokenize.rb +165 -0
- data/lib/proiel/cli/commands/validate.rb +42 -0
- data/lib/proiel/cli/converters/conll-u.rb +589 -0
- data/lib/proiel/cli/converters/conll-u/morphology.rb +235 -0
- data/lib/proiel/cli/converters/conll-u/syntax.rb +81 -0
- data/lib/proiel/cli/converters/conll-x.rb +66 -0
- data/lib/proiel/cli/converters/lexc.rb +36 -0
- data/lib/proiel/cli/converters/proielxml.rb +152 -0
- data/lib/proiel/cli/converters/text.rb +99 -0
- data/lib/proiel/cli/converters/tiger.rb +157 -0
- data/lib/proiel/cli/converters/tiger2.rb +193 -0
- data/lib/proiel/cli/converters/tnt.rb +30 -0
- data/lib/proiel/cli/version.rb +5 -0
- metadata +248 -0
@@ -0,0 +1,235 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module PROIEL
  module Converter
    # Lookup tables for the CoNLL-U converter: lemma lists plus the mappings
    # from PROIEL part-of-speech and morphology tags to CoNLL-U style tags and
    # feature strings.
    #
    # Lemma entries are strings of the shape "lemma,part-of-speech,language",
    # optionally with a "#n" variant number after the lemma (presumably the
    # key the converter builds for each token -- confirm against the
    # consumer). Language codes seen here: lat, grc, chu, got.
    #
    # Every table is shallowly frozen so that the shared constants cannot be
    # extended or emptied by accident; nested arrays/hashes are left as is.
    class CoNLLU
      # Lemmata of copular verbs.
      COPULAR_LEMMATA = ['sum,V-,lat', 'εἰμί#1,V-,grc'].freeze

      # Part-of-speech tags grouped as determiners.
      DETERMINERS = ['S-', 'Pd', 'Px', 'Ps', 'Pt'].freeze

      # Lemmata of negation adverbs.
      NEGATION_LEMMATA = [
        # lat
        'non,Df,lat',
        'ne,Df,lat',
        # grc
        'μή,Df,grc',
        'μήγε,Df,grc',
        'μηδαμῶς,Df,grc',
        'μηδέποτε,Df,grc',
        'μηδέπω,Df,grc',
        'μηκέτι,Df,grc',
        'μήπω,Df,grc',
        'μήτε,Df,grc',
        'μήτι,Df,grc',
        'μήτιγε,Df,grc',
        'οὐ,Df,grc',
        'οὐδαμῇ,Df,grc',
        'οὐδαμῶς,Df,grc',
        'οὐδέ,Df,grc',
        'οὐδέποτε,Df,grc',
        'οὐδέπω,Df,grc',
        'οὐκέτι,Df,grc',
        'οὐκοῦν,Df,grc',
        'οὔπω,Df,grc',
        'οὔτε,Df,grc',
        'οὔτι,Df,grc',
        'οὐχί,Df,grc',
        # chu
        'не,Df,chu',
        'ни,Df,chu',
        'нѣ,Df,chu',
        # got
        'nei,Df,got',
        'ni,Df,got',
        'nibai#2,Df,got',
        'nih,Df,got',
      ].freeze

      # Lemmata of particles.
      PARTICLE_LEMMATA = [
        # lat
        'at,Df,lat',
        'atque,Df,lat',
        'autem,Df,lat',
        'certe,Df,lat',
        'ergo,Df,lat',
        'et,Df,lat',
        'enim,Df,lat',
        'etiam,Df,lat',
        'igitur,Df,lat',
        'immo,Df,lat',
        'itaque,Df,lat',
        'nam,Df,lat',
        'nonne,Df,lat',
        'nonne,Du,lat',
        'quidem,Df,lat',
        'quoque,Df,lat',
        'sic,Df,lat',
        'tamen,Df,lat',
        'tum,Df,lat',
        'tunc,Df,lat',
        'vero,Df,lat',
        # grc
        'ἅμα,Df,grc',
        'ἄν,Df,grc',
        'ἀνά,Df,grc',
        'ἆρα,Df,grc',
        'ἄραγε,Df,grc',
        'ἀτάρ,Df,grc',
        'ἅτε,Df,grc',
        'αὗ,Df,grc',
        'αὖθις,Df,grc',
        'γάρ,Df,grc',
        'γε,Df,grc',
        'γοῦν,Df,grc',
        'δέ,Df,grc',
        'δή,Df,grc',
        'δῆθεν,Df,grc',
        'δηλαδή,Df,grc',
        'δηλονότι,Df,grc',
        'δῆτα,Df,grc',
        'εἶτα,Df,grc',
        'ἔτι,Df,grc',
        'ἦ#2,Df,grc',
        'ἤγουν,Df,grc',
        'ἤδη,Df,grc',
        'ἤτοι,Df,grc',
        'καίτοι,Df,grc',
        'καίτοιγε,Df,grc',
        'μέν,Df,grc',
        'μενοῦνγε,Df,grc',
        'μέντοι,Df,grc',
        'μήν,Df,grc',
        'νά,Df,grc',
        'νῦν#1,Df,grc',
        'νυν#2,Df,grc',
        'νυνί,Df,grc',
        'οὖν,Df,grc',
        'πέρ,Df,grc',
        'πῃ,Df,grc',
        'ποτε,Df,grc',
        'πού,Df,grc',
        'πω,Df,grc',
        'πως,Df,grc',
        'τάχα,Df,grc',
        'τε,Df,grc',
        'τοι,Df,grc',
        'τοιγαροῦν,Df,grc',
        'τοίνυν,Df,grc',
        # chu
        'бо,Df,chu',
        'же,Df,chu',
        'занѥ,Df,chu',
        'ибо,Df,chu',
        'иде,Df,chu',
        'ижде,Df,chu',
        'ли,Df,chu',
        'обаче,Df,chu',
        'оубо,Df,chu',
        'ти,Df,chu',
        'тѣ,Df,chu',
        'ꙗко#2,Df,chu',
        # got
        'an,Df,got',
        'auk,Df,got',
        'aufto,Df,got',
        'nu,Df,got',
        'ussindo,Df,got',
        'waitei,Df,got',
        'þan,Df,got',
        'nuh,Df,got',
        'nunu,Df,got',
        'raihtis,Df,got',
        'sunsaiw,Df,got',
        'unte,Df,got',
        'þande,Df,got',
        'þannu,Df,got',
        'þanuh,Df,got',
        'þaruh,Df,got',
      ].freeze

      # PROIEL part-of-speech tag => [POS tag] or [POS tag, feature string].
      POS_MAP = {
        'A-' => ['ADJ'],
        'Df' => ['ADV'],
        'S-' => ['DET', 'Definite=Def|PronType=Dem'], # (we only have definite articles)
        'Ma' => ['NUM'],
        'Nb' => ['NOUN'],
        'C-' => ['CONJ'],
        'Pd' => ['DET'],
        'F-' => ['X'],
        'Px' => ['PRON'],
        'N-' => ['SCONJ'], # irrelevant for our purposes
        'I-' => ['INTJ'],
        'Du' => ['ADV', 'PronType=Int'],
        'Pi' => ['PRON', 'PronType=Int'],
        'Mo' => ['ADJ'],
        'Pp' => ['PRON', 'PronType=Prs'],
        'Pk' => ['PRON', 'PronType=Prs|Reflex=Yes'],
        'Ps' => ['PRON', 'PronType=Prs|Poss=Yes'], # layered gender?
        'Pt' => ['PRON', 'PronType=Prs|Poss=Yes|Reflex=Yes'], # layered gender?
        'R-' => ['ADP'],
        'Ne' => ['PROPN'],
        'Py' => ['DET'],
        'Pc' => ['PRON', 'PronType=Rcp'],
        'Dq' => ['ADV', 'PronType=Rel'],
        'Pr' => ['PRON', 'PronType=Rel'],
        'G-' => ['SCONJ'],
        'V-' => ['VERB'],
        'X-' => ['X'],
      }.freeze

      # Morphological field => { PROIEL tag value => feature string }.
      # Comma-separated values (e.g. 'Gender=Fem,Masc') encode ambiguous tags.
      MORPHOLOGY_MAP = {
        :person => {
          '1' => 'Person=1',
          '2' => 'Person=2',
          '3' => 'Person=3',
        },
        :number => {
          's' => 'Number=Sing',
          'd' => 'Number=Dual',
          'p' => 'Number=Plur',
        },
        :tense => {
          'p' => 'Tense=Pres',
          'i' => 'Tense=Past|Aspect=Imp',
          'r' => 'Tense=Past|Aspect=Perf', # rather than 'Tense=Perfect'
          's' => 'Aspect=Res',
          # tag Perf is not universal
          'a' => 'Tense=Past|Aspect=Perf',
          'u' => 'Tense=Past',
          'l' => 'Tense=Pqp',
          'f' => 'Tense=Fut',
          # tag FutPerfect is not universal
          't' => 'Tense=Fut|Aspect=Perf', # rather than 'FutPerfect'
        },
        :mood => {
          'i' => 'VerbForm=Fin|Mood=Ind',
          's' => 'VerbForm=Fin|Mood=Sub',
          'm' => 'VerbForm=Fin|Mood=Imp',
          'o' => 'VerbForm=Fin|Mood=Opt',
          'n' => 'VerbForm=Inf',
          'p' => 'VerbForm=Part',
          'd' => 'VerbForm=Ger',
          # Gdv (gerundive) is not universal
          'g' => 'VerbForm=Gdv',
          'u' => 'VerbForm=Sup',
          'e' => 'VerbForm=Fin|Mood=Ind,Sub',
          'f' => 'VerbForm=Fin|Mood=Imp,Ind',
          'h' => 'VerbForm=Fin|Mood=Imp,Sub',
          't' => 'VerbForm=Fin',
        },
        :voice => {
          'a' => 'Voice=Act',
          # Mid (middle) is not universal
          'm' => 'Voice=Mid',
          'p' => 'Voice=Pass',
          'e' => 'Voice=Mid,Pass',
        },
        :gender => {
          'm' => 'Gender=Masc',
          'f' => 'Gender=Fem',
          'n' => 'Gender=Neut',
          'p' => 'Gender=Fem,Masc',
          'o' => 'Gender=Masc,Neut',
          'r' => 'Gender=Fem,Neut',
        },
        :case => {
          'n' => 'Case=Nom',
          'a' => 'Case=Acc',
          # Obl(ique) is not universal
          'o' => 'Case=Obl',
          'g' => 'Case=Gen',
          'c' => 'Case=Dat,Gen',
          'e' => 'Case=Acc,Dat',
          'd' => 'Case=Dat',
          'b' => 'Case=Abl',
          'i' => 'Case=Ins',
          'l' => 'Case=Loc',
          'v' => 'Case=Voc',
        },
        :degree => {
          'p' => 'Degree=Pos',
          'c' => 'Degree=Cmp',
          's' => 'Degree=Sup',
        },
        # The whole strength category is not universal
        :strength => {
          'w' => 'Strength=Weak',
          's' => 'Strength=Strong',
        },
        # No feature strings are defined for the inflection field.
        :inflection => {},
      }.freeze
    end
  end
end
|
235
|
+
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module PROIEL
  module Converter
    class CoNLLU
      # Maps PROIEL relation tags to dependency relation labels.
      #
      # A value is either a label (String) or an ordered list of
      # [label, predicate] pairs, where the predicate is a lambda taking the
      # token. Most lists end in an always-true predicate, so presumably the
      # first pair whose predicate holds is the one used -- confirm against
      # the consumer of this table.
      #
      # The "ERROR" pairs raise instead of answering: they mark relations
      # that must not occur at this point of the conversion.
      RELATION_MAPPING = {
        'adnom' => 'dep',
        'adv' => [
          ['advcl', lambda(&:clausal?)],
          ['advmod', lambda { |x| x.adverb? || x.preposition? }],
          ['advmod', lambda(&:adjectival?)], # adjective for adverb
          ['nmod', lambda(&:nominal?)],
          ['advmod', lambda { |_x| true }],
        ],
        'ag' => 'nmod', # add ":agent" once defined
        'apos' => [
          ['name', lambda { |x| x.proper_noun? && x.head && x.head.proper_noun? }],
          ['appos', lambda { |x| (x.nominal? || x.adjectival?) && x.head && x.head.nominal? }],
          ['acl', lambda { |x| x.clausal? && x.head && x.head.nominal? }], # add :relcl ?
          # what to do about sentential appositions?
          ['advcl', lambda(&:clausal?)],
          ['appos', lambda { |_x| true }],
        ],
        'arg' => 'dep',
        'atr' => [
          ['nummod', lambda(&:cardinal?)],
          ['nmod', lambda(&:nominal?)],
          ['acl', lambda { |x| x.clausal? }], # add :relcl?
          ['advmod', lambda { |x| x.head && x.head.clausal? }],
          ['det', lambda(&:determiner?)],
          ['amod', lambda { |_x| true }], # default
        ],
        'aux' => [
          ['det', lambda(&:determiner?)],
          # Guard against a missing head, like the other head-based predicates.
          ['auxpass', lambda { |x| x.clausal? && x.head && x.head.passive? }],
          ['aux', lambda(&:clausal?)],
          ['neg', lambda(&:negation?)],
          ['discourse', lambda { |x| x.particle? || x.interjection? }],
          ['advmod', lambda { |x| x.adjectival? || x.adverb? }], # or subjunction (? why did I write this?)
          ['cc', lambda(&:conjunction?)],
          ['foreign', lambda(&:foreign?)],
          # We need some more distinctions to get Gothic and Armenian.
          # Introduce language in the treebank? (Read from xml)
          # Reflexive as valency reducer, 'R-' as infinitive marker in Gothic
          ['mark', lambda { |x| ['Pk', 'R-'].include? x.part_of_speech }],
          ['amod', lambda { |x| x.preposition? }], # Armenian DOM
          # NB there are a lot of bogus annotations with 'Px'
          ['mwe', lambda { |x| ['Px', 'Pr'].include? x.part_of_speech }],

          # MISANNOTATION IF A NOUN or a 'Pi' or a 'Pp' or a 'Ps'
        ],
        'comp' => [
          ['csubjpass', lambda { |x| x.head && x.head.passive? }],
          ['csubj', lambda { |x| x.head && x.head.copula? }],
          ['ccomp', lambda { |_x| true }],
        ],
        'expl' => 'expl',
        'narg' => [
          ['acl', lambda(&:clausal?)],
          ['nmod', lambda(&:nominal?)],
          ['nmod', lambda(&:adjectival?)], # nominalized in this function
          ['nmod', lambda { |_x| true }],
        ],
        'nonsub' => 'dep',
        'obj' => 'dobj',
        'obl' => [
          # though normally a preposition will be subordinate to its noun
          ['advmod', lambda { |x| x.adverb? || x.preposition? }],
          ['iobj', lambda(&:nominal?)], # if nominal (NB check for presence of article!)
          ['iobj', lambda(&:adjectival?)], # OBL adjectives are nominalized
          ['advcl', lambda(&:clausal?)], # this seems to happen with ad libros legendos etc. but check closer!
          ['iobj', lambda { |_x| true }],
        ],
        'parpred' => 'parataxis',
        'part' => 'nmod',
        'per' => 'dep',
        # Wrapped in a list like every other conditional entry so that the
        # predicate is reachable and its error message is the one raised.
        'pid' => [
          ['ERROR', lambda { |_x| raise "Remaining pid edge!" }],
        ],
        'pred' => [
          ['root', lambda(&:root?)],
          ['ERROR', lambda { |x| raise "#{x.to_n} (head_id #{x.head_id}) is not a root!" }],
        ],
        'rel' => 'acl', # add :relcl?
        'sub' => [
          ['nsubjpass', lambda { |x| x.head && x.head.passive? }],
          ['nsubj', lambda { |_x| true }],
        ],
        'voc' => 'vocative',
        'xadv' => [
          ['advcl', lambda(&:clausal?)], # add :contr ?
          ['advmod', lambda { |_x| true }], # add :contr ?
        ],
        'xobj' => 'xcomp', # copula cases have already been taken care of
        'xsub' => 'xsub',
      }.freeze
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module PROIEL
  module Converter
    # This converts to the CoNLL-X format as described on http://ilk.uvt.nl/conll/#dataformat.
    class CoNLLX
      class << self
        # Prints every sentence of the treebank to standard output, one
        # tab-separated line per non-empty token and a blank line after each
        # sentence.
        #
        # Columns: token number, form, lemma, major part of speech, full part
        # of speech, morphology, head number, relation, "_", "_".
        #
        # @param tb treebank responding to sources -> divs -> sentences -> tokens
        # @param options not used by this converter
        def process(tb, options)
          tb.sources.each do |source|
            source.divs.each do |div|
              div.sentences.each do |sentence|
                # Do not care about prodrop tokens
                tk = sentence.tokens.reject { |t| t.empty_token_sort == 'P' }

                # Renumber to make the sequence contiguous after prodrop
                # tokens were left out, and index the tokens by ID in the
                # same pass.
                id_to_number = {}
                id_to_token = {}
                tk.each_with_index do |t, i|
                  id_to_number[t.id] = i + 1
                  id_to_token[t.id] = t
                end

                tk.each do |token|
                  # Other empty tokens keep their number but get no line.
                  next if token.is_empty?

                  this_number = id_to_number[token.id]
                  head_number, relation = find_lexical_head_and_relation(id_to_number, id_to_token, token)
                  # Whitespace inside a column is replaced by a full stop.
                  form = token.form.gsub(/[[:space:]]/, '.')
                  lemma = token.lemma.gsub(/[[:space:]]/, '.')
                  pos_major = token.part_of_speech_hash[:major]
                  pos_full = token.part_of_speech
                  morphology = format_morphology(token)

                  puts [this_number, form, lemma, pos_major, pos_full,
                        morphology, head_number, relation, "_", "_"].join("\t")
                end

                puts
              end
            end
          end
        end

        # Builds the morphology column: one entry per pair in
        # token.morphology_hash, made of the first four letters of the
        # upcased field name followed by the value, joined with "|".
        #
        # @return [String] the feature list, or "_" when no feature remains
        #   (the CoNLL-X placeholder for an unavailable column)
        def format_morphology(token)
          features = token.morphology_hash.map do |k, v|
            # Leave the inflection tag out when it is set to 'i'; any other
            # inflection value is kept.
            next if k == :inflection && v == 'i'

            "#{k.upcase[0..3]}#{v}"
          end.compact

          features.empty? ? '_' : features.join('|')
        end

        # Finds the head number and relation label to print for a token.
        #
        # When the head is a token without content, the search continues from
        # that head and the relation label records the path as
        # "relation(head number)" segments followed by the final relation.
        #
        # @param id_to_number [Hash] token ID => number within the sentence
        # @param id_to_token [Hash] token ID => token
        # @param t the token to resolve
        # @param rel [String] label prefix accumulated so far
        # @return [Array] [head number, relation label]; head number is 0 for a root
        def find_lexical_head_and_relation(id_to_number, id_to_token, t, rel = '')
          return [0, rel + t.relation] if t.is_root? # FIXME: may be empty token anyway

          head_id = t.head_id
          head = id_to_token[head_id]
          return [id_to_number[head_id], rel + t.relation] if head.has_content?

          path = rel + "#{t.relation}(#{id_to_number[head_id]})"
          find_lexical_head_and_relation(id_to_number, id_to_token, head, path)
        end
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module PROIEL
  module Converter
    # This converts part of speech and morphology to a lexc file.
    class Lexc
      class << self
        # Collects, for every form of a non-empty token in the treebank, the
        # set of "lemma,tag" analyses it occurs with, then prints them to
        # standard output as a single "LEXICON Root" section sorted by form
        # and by analysis.
        #
        # @param tb treebank responding to sources -> divs -> sentences -> tokens
        # @param options [Hash] when options['morphology'] is set, the
        #   morphology is appended to the part of speech in each analysis
        def process(tb, options)
          analyses_by_form = Hash.new { |table, form| table[form] = [] }

          tb.sources.each do |source|
            source.divs.each do |div|
              div.sentences.each do |sentence|
                sentence.tokens.each do |token|
                  next if token.is_empty?

                  tag = token.part_of_speech
                  tag = [tag, token.morphology].join if options['morphology']

                  analyses_by_form[token.form] << [token.lemma, tag].join(',')
                end
              end
            end
          end

          puts "LEXICON Root"
          analyses_by_form.sort.each do |form, analyses|
            # Each distinct analysis is listed once per form.
            analyses.uniq.sort.each do |analysis|
              puts format(" %s:%s #;", analysis, form)
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
module PROIEL
  module Converter
    # Writes a treebank to STDOUT as an XML document with a <proiel> root
    # element (schema-version 2.0), optionally leaving out annotation layers,
    # sentences and divs according to the options hash.
    class PROIELXML
      class << self
        # Writes the whole document: XML declaration, the annotation schema
        # and then every source of the treebank.
        #
        # @param tb treebank responding to annotation_schema and sources
        # @param options [Hash] string-keyed switches read here and in the
        #   helpers: 'remove-status', 'remove-morphology', 'remove-syntax',
        #   'remove-information-structure', 'remove-not-reviewed',
        #   'remove-not-annotated', 'remove-empty-divs'
        def process(tb, options)
          builder = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
          builder.instruct! :xml, version: '1.0', encoding: 'UTF-8'
          builder.proiel('export-time' => DateTime.now.xmlschema, 'schema-version' => '2.0') do
            write_annotation_schema(builder, tb.annotation_schema)
            tb.sources.each { |source| write_source(builder, source, options) }
          end
        end

        # Tells whether a div is written. Every div is written unless
        # 'remove-empty-divs' is set, in which case the div needs at least
        # one sentence that passes include_sentence?.
        #
        # @return [Boolean]
        def include_div?(div, options)
          return true unless options['remove-empty-divs']

          div.sentences.any? { |sentence| include_sentence?(sentence, options) }
        end

        # Tells whether a sentence is written, based on its status:
        # :reviewed sentences always are, :annotated ones unless
        # 'remove-not-reviewed' is set, and any other status only when
        # neither 'remove-not-reviewed' nor 'remove-not-annotated' is set.
        #
        # @return [Boolean]
        def include_sentence?(sentence, options)
          case sentence.status
          when :reviewed
            true
          when :annotated
            !options['remove-not-reviewed']
          else
            !options['remove-not-reviewed'] && !options['remove-not-annotated']
          end
        end

        # Reads the named features from obj and returns them as an attribute
        # hash whose keys are the feature names with "_" replaced by "-".
        #
        # @param obj object responding to every listed feature
        # @param mandatory_features [Array<Symbol>] always included, even
        #   when the value is nil
        # @param optional_features [Array<Symbol>] included only when the
        #   value is truthy and its to_s is not empty
        # @return [Hash{String => Object}] mandatory features first, each
        #   group in the order given
        def grab_features(obj, mandatory_features, optional_features = [])
          attrs = {}

          mandatory_features.each do |feature|
            attrs[feature.to_s.tr('_', '-')] = obj.send(feature)
          end

          optional_features.each do |feature|
            value = obj.send(feature)
            attrs[feature.to_s.tr('_', '-')] = value if value && value.to_s != ''
          end

          attrs
        end

        private

        # Writes the <annotation> element listing the relation,
        # part-of-speech, morphology and information-status tags.
        def write_annotation_schema(builder, schema)
          builder.annotation do
            builder.relations do
              write_tag_values(builder, schema.relation_tags, %i(summary primary secondary))
            end

            builder.tag! 'parts-of-speech' do
              write_tag_values(builder, schema.part_of_speech_tags, %i(summary))
            end

            builder.morphology do
              schema.morphology_tags.each do |cat_tag, cat_values|
                builder.field(tag: cat_tag) do
                  write_tag_values(builder, cat_values, %i(summary))
                end
              end
            end

            builder.tag! 'information-statuses' do
              write_tag_values(builder, schema.information_status_tags, %i(summary))
            end
          end
        end

        # Writes one <value> element per tag, carrying the tag itself and the
        # listed features of its definition.
        def write_tag_values(builder, tags, features)
          tags.each do |tag, value|
            builder.value({ tag: tag }.merge(grab_features(value, features)))
          end
        end

        # Writes a <source> element: its metadata fields that have a value,
        # followed by the divs that pass include_div?.
        def write_source(builder, source, options)
          builder.source(id: source.id, language: source.language) do
            PROIEL::Treebank::METADATA_ELEMENTS.each do |field|
              value = source.send(field)
              builder.tag!(field.to_s.gsub('_', '-'), value) if value
            end

            source.divs.each do |div|
              write_div(builder, div, options) if include_div?(div, options)
            end
          end
        end

        # Writes a <div> element with its title (when present) and the
        # sentences that pass include_sentence?.
        def write_div(builder, div, options)
          builder.div(grab_features(div, %i(), %i(presentation_before presentation_after))) do
            builder.title div.title if div.title

            div.sentences.each do |sentence|
              write_sentence(builder, sentence, options) if include_sentence?(sentence, options)
            end
          end
        end

        # Writes a <sentence> element and its tokens.
        def write_sentence(builder, sentence, options)
          # we do it this way to preserve the order of status and
          # presentation_* so that diffing files is easier
          optional_features = []
          optional_features += %i(status) unless options['remove-status']
          optional_features += %i(presentation_before presentation_after)

          builder.sentence(grab_features(sentence, %i(id), optional_features)) do
            sentence.tokens.each { |token| write_token(builder, token, options) }
          end
        end

        # Writes a <token> element, with nested <slash> elements when the
        # token has slashes and syntax is kept. Writes nothing for empty
        # tokens whose layer is removed: sort 'P' with
        # 'remove-information-structure', and every empty token with
        # 'remove-syntax'.
        def write_token(builder, token, options)
          return if token.empty_token_sort == 'P' && options['remove-information-structure']
          return if token.empty_token_sort == 'C' && options['remove-syntax']
          return if token.empty_token_sort == 'V' && options['remove-syntax']

          mandatory_features = %i(id)

          optional_features = %i(citation_part)
          optional_features += %i(lemma part_of_speech morphology) unless options['remove-morphology']
          optional_features += %i(head_id relation) unless options['remove-syntax']
          optional_features += %i(antecedent_id information_status contrast_group) unless options['remove-information-structure']

          if token.is_empty?
            mandatory_features << :empty_token_sort
          else
            mandatory_features << :form
            optional_features += %i(presentation_before presentation_after foreign_ids)
          end

          attrs = grab_features(token, mandatory_features, optional_features)

          if !token.slashes.empty? && !options['remove-syntax']
            builder.token(attrs) do
              token.slashes.each do |relation, target_id|
                builder.slash("target-id": target_id, relation: relation)
              end
            end
          elsif !(options['remove-syntax'] && token.is_empty?)
            # Called without a block: this avoids <token></token> style XML
            builder.token(attrs)
          end
        end
      end
    end
  end
end
|