proiel-cli 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +23 -0
- data/README.md +34 -0
- data/bin/proiel +27 -0
- data/bin/setup +7 -0
- data/contrib/proiel-giza-train +6 -0
- data/contrib/proiel-lexc-compile +18 -0
- data/contrib/proiel-maltparser-parse +2 -0
- data/contrib/proiel-maltparser-train +6 -0
- data/contrib/proiel-tnt-train +15 -0
- data/examples/decision-tree.rb +41 -0
- data/examples/dep-pos-cooccurrences.rb +84 -0
- data/examples/lint-rules.rb +174 -0
- data/examples/relation-as-disambiguator.rb +134 -0
- data/examples/word-occurrences.rb +30 -0
- data/lib/proiel/cli.rb +2 -0
- data/lib/proiel/cli/commands.rb +28 -0
- data/lib/proiel/cli/commands/convert.rb +94 -0
- data/lib/proiel/cli/commands/grep.rb +136 -0
- data/lib/proiel/cli/commands/info.rb +126 -0
- data/lib/proiel/cli/commands/tokenize.rb +165 -0
- data/lib/proiel/cli/commands/validate.rb +42 -0
- data/lib/proiel/cli/converters/conll-u.rb +589 -0
- data/lib/proiel/cli/converters/conll-u/morphology.rb +235 -0
- data/lib/proiel/cli/converters/conll-u/syntax.rb +81 -0
- data/lib/proiel/cli/converters/conll-x.rb +66 -0
- data/lib/proiel/cli/converters/lexc.rb +36 -0
- data/lib/proiel/cli/converters/proielxml.rb +152 -0
- data/lib/proiel/cli/converters/text.rb +99 -0
- data/lib/proiel/cli/converters/tiger.rb +157 -0
- data/lib/proiel/cli/converters/tiger2.rb +193 -0
- data/lib/proiel/cli/converters/tnt.rb +30 -0
- data/lib/proiel/cli/version.rb +5 -0
- metadata +248 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cc539e4839fccb93166f5fb00309efafc88bdb25
|
4
|
+
data.tar.gz: 758707d18035ce10ecd59772031b6c01d703723e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cedb768615cbc1e6c2ecb1b09dc30f88f97d72df5a7545fd0a1087cf925432887c4838316609a6b5706f79f63cf9c35898e2ca64ce1d850e0ba264e99f6d1b3f
|
7
|
+
data.tar.gz: f105fcc4028036319efb2b4c7b3d9c5427a00c273166719f8256adc029f8ca1545514985ed814deea5c430682342cc10043ed94dc4c37de180db8e41717f957c
|
data/LICENSE
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Marius L. Jøhndal
|
4
|
+
Copyright (c) 2015 Dag Haug
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
8
|
+
in the Software without restriction, including without limitation the rights
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
11
|
+
furnished to do so, subject to the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
14
|
+
copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
22
|
+
SOFTWARE.
|
23
|
+
|
data/README.md
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# PROIEL command-line interface
|
2
|
+
|
3
|
+
This is a command-line interface for manipulating PROIEL treebanks.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Install as
|
8
|
+
|
9
|
+
```shell
|
10
|
+
gem install proiel-cli
|
11
|
+
```
|
12
|
+
|
13
|
+
## Using the command-line interface
|
14
|
+
|
15
|
+
The gem includes a command-line utility `proiel` for a number of routine tasks.
|
16
|
+
`proiel info`, for example, displays metadata and some brief statistics, and
|
17
|
+
`proiel convert conll` converts the treebank to CoNLL format. Use `proiel
|
18
|
+
--help` for further examples and usage instructions.
|
19
|
+
|
20
|
+
## Development
|
21
|
+
|
22
|
+
Check out the git repository from github and run `bin/setup` to install
|
23
|
+
development dependencies. Then run `rake` to run the tests.
|
24
|
+
|
25
|
+
To install a development version of this gem, run `bundle exec rake install`.
|
26
|
+
To release a new version, update the version number in `version.rb`, and then
|
27
|
+
run `bundle exec rake release`, which will create a git tag for the version,
|
28
|
+
push git commits and tags, and push the gem to
|
29
|
+
[rubygems.org](https://rubygems.org).
|
30
|
+
|
31
|
+
## Contributing
|
32
|
+
|
33
|
+
Bug reports and pull requests are welcome on GitHub at
|
34
|
+
https://github.com/proiel/proiel-cli.
|
data/bin/proiel
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Executable for the `proiel` command-line tool. It sets up a Mercenary
# program and lets every PROIEL::Command subclass register itself with it.

# Write output immediately instead of buffering it.
STDOUT.sync = true

# Put the sibling lib/ directory first on the load path.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))

require 'proiel/cli'

Mercenary.program(:proiel) do |program|
  program.version PROIEL::VERSION
  program.description 'proiel is a command-line interface for PROIEL treebanks'
  program.syntax 'proiel <subcommand> [options]'

  program.option 'verbose', '-V', '--verbose', 'Display verbose information'

  PROIEL::Command.subclasses.each do |command_class|
    command_class.init_with_program(program)
  end

  # Top-level action: print the program when no arguments were given,
  # and reject a first argument that is not a registered command.
  program.action do |args, _options|
    if args.empty?
      puts program
    elsif !program.has_command?(args.first)
      STDERR.puts 'Invalid command. Use --help for more information.'
      exit 1
    end
  end
end
|
data/bin/setup
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/bin/bash
#
# Builds a finite-state transducer (output.fst in the current directory) from
# the lexc lexicon that `proiel convert lexc` writes for the given arguments.
#
# All command-line arguments are passed through to `proiel convert lexc`.

# $0 is frequently a path (e.g. ./proiel-lexc-compile); `mktemp -t` does not
# accept a directory separator in its template, so use the base name only.
tmpdir=$(mktemp -dt "$(basename "$0").XXXXXXXXXX") || exit 1

# Remove the intermediate files whenever the script exits.
trap 'rm -rf "${tmpdir}"' EXIT

lexcfile=${tmpdir}/output.lexc
scriptfile=${tmpdir}/output.script
fstfile=output.fst

proiel=../bin/proiel
foma=foma
#foma=xfst

# "$@" (quoted) keeps arguments that contain whitespace intact.
"${proiel}" convert lexc -V "$@" > "${lexcfile}" || exit 1
echo "read lexc ${lexcfile}" > "${scriptfile}"
echo "save stack ${fstfile}" >> "${scriptfile}"

"${foma}" -f "${scriptfile}"

echo
echo "Generated transducer is ${fstfile}. Try 'echo wordform | flookup ${fstfile}' to test."
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/bin/bash
#
# Converts the given treebank arguments to TnT training format with
# `proiel convert tnt`, runs `tnt-para` on the result and moves the generated
# parameter files (output.lex, output.123) into the current directory.
#
# All command-line arguments are passed through to `proiel convert tnt`.

# $0 is frequently a path (e.g. ./proiel-tnt-train); `mktemp -t` does not
# accept a directory separator in its template, so use the base name only.
tmpdir=$(mktemp -dt "$(basename "$0").XXXXXXXXXX") || exit 1

# Remove the intermediate files whenever the script exits.
trap 'rm -rf "${tmpdir}"' EXIT

tntfile=${tmpdir}/output.t
#fstfile=output.fst

proiel=../bin/proiel
tnt=tnt
#tnt=hunpos

# "$@" (quoted) keeps arguments that contain whitespace intact.
"${proiel}" convert tnt -V "$@" > "${tntfile}" || exit 1

# Stop here on failure so the success message below is never printed for
# parameter files that were not produced.
tnt-para "${tntfile}" && mv -vi "${tmpdir}"/output.{lex,123} . || exit 1

echo
echo "Generated parameter files are output.lex and output.123. Try 'tnt output mycorpus.t' to test."
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Trains an ID3 decision tree on rows of
#   head_pos, head_lemma, head_relation, child_pos, child_lemma => child_relation
# built from every token that has a head, saves the tree to dr.marshal and
# prints the prediction for one hard-coded example.
#
require 'colorize'
require 'decisiontree'
require 'proiel'

if ARGV.length < 1
  STDERR.puts "Usage: #{$0} treebank-files(s)"
  exit 1
end

treebank = PROIEL::Treebank.new
treebank.load_from_xml(ARGV)

# Index: integer token id => [relation, part of speech, lemma, head id].
token_index = {}

treebank.sources.each do |source|
  source.tokens.each do |token|
    token_index[token.id.to_i] = [token.relation, token.part_of_speech, token.lemma, token.head_id]
  end
end

# One training row per token that has a head; the last column is the label.
training_data = token_index.each_value.each_with_object([]) do |(child_relation, child_pos, child_lemma, head_id), rows|
  next unless head_id

  # A head missing from the index leaves all three values nil.
  head_relation, head_pos, head_lemma = token_index[head_id.to_i]

  rows << [head_pos || '', head_lemma || '', head_relation, child_pos || '', child_lemma || '', child_relation]
end

attributes = %w(head_pos head_lemma head_relation child_pos child_lemma)
# NOTE(review): 'pred' is presumably the tree's default label; confirm against the decisiontree gem.
tree = DecisionTree::ID3Tree.new(attributes, training_data, 'pred', :discrete)
tree.train
tree.save_to_file("dr.marshal")

p tree.predict(["Ne", "Gallia", "sub", "Px", "omnis"])
|
@@ -0,0 +1,84 @@
|
|
1
|
+
#!/usr/bin/env ruby
require 'proiel'
require 'colorize'
require 'terminal-table'

# Counts tokens in a two-level hash. The block returns the [outer, inner] key
# pair for a token; tokens without a POS or without a relation are skipped.
def tally_tokens(treebank)
  counts = {}

  treebank.sources.each do |source|
    source.tokens.each do |token|
      next if token.pos.nil? || token.relation.nil?

      outer, inner = yield(token)
      counts[outer] ||= {}
      counts[outer][inner] ||= 0
      counts[outer][inner] += 1
    end
  end

  counts
end

# Renders one table cell: red below 0.1% of the row total, green above 99.9%,
# otherwise the bare count (nil when the combination never occurs).
def highlight_count(count, total)
  if count && count < total * 0.001
    count.to_s.red
  elsif count && count > total * 0.999
    count.to_s.green
  else
    count
  end
end

# Builds the table rows, sorted by row label, with one cell per column tag.
def build_rows(counts, columns)
  counts.sort_by(&:first).map do |label, by_column|
    total = by_column.values.inject(0) { |sum, value| sum + value }

    [label] + columns.map do |column|
      highlight_count(by_column[column ? column.to_s : nil], total)
    end
  end
end

if ARGV.length < 1
  STDERR.puts "Usage: #{$0} treebank-files(s)"
  exit 1
end

treebank = PROIEL::Treebank.new
treebank.load_from_xml(ARGV)

# Present by POS
relations = treebank.annotation.relation_tags.keys
by_pos = tally_tokens(treebank) { |token| [token.pos, token.relation] }

pos_table = Terminal::Table.new(headings: ['Part of speech'] + relations, rows: build_rows(by_pos, relations))
puts pos_table
puts "(red = relation occurs for less than 0.1% of tokens with this POS; green = relation occurs for more than 99.9% of tokens with this POS)"
puts

# Present by relation
poses = treebank.annotation.part_of_speech_tags.keys
by_relation = tally_tokens(treebank) { |token| [token.relation, token.pos] }

relation_table = Terminal::Table.new(headings: ['Relation'] + poses, rows: build_rows(by_relation, poses))
puts relation_table
puts "(red = POS occurs for less than 0.1% of tokens with this relation; green = POS occurs for more than 99.9% of tokens with this relation)"
|
@@ -0,0 +1,174 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Very simple testing of implicational feature rules. Example rules only
|
4
|
+
# apply to Latin.
|
5
|
+
#
|
6
|
+
require 'colorize'
|
7
|
+
require 'proiel'
|
8
|
+
|
9
|
+
# Collected violations: message => list of offending tokens.
VIOLATIONS = {}

# Files +token+ under +message+ in VIOLATIONS, creating the list on first
# use, and returns that list.
def report_violation(token, message)
  (VIOLATIONS[message] ||= []) << token
end
|
15
|
+
|
16
|
+
# Checks one token against the lint rules and records every broken rule with
# report_violation.
#
# token           - object responding to #features, #form and #children
#                   (each child must respond to #features).
# rules           - Hash mapping a list of match features to a list of
#                   alternatives; a matching token must itself have at least
#                   one of the alternatives among its features.
# dependent_rules - Hash of the same shape; for a matching token every child
#                   must have at least one of the alternatives.
#
# A rule matches when all of its match features occur among the token's
# features or equal the token's form wrapped in double quotes (e.g. "ut").
#
# Returns dependent_rules (the value of the final #each).
def test_token(token, rules, dependent_rules)
  features = token.features
  # Built once per token rather than once per rule: it does not depend on
  # the rule being tested.
  matchable = features + ["\"#{token.form}\""]

  rules.each do |match_features, test_alternatives|
    next unless (match_features - matchable).empty?
    next if test_alternatives.any? { |alternative| features.include?(alternative) }

    report_violation(token, match_features.join(' '))
  end

  dependent_rules.each do |match_features, test_alternatives|
    next unless (match_features - matchable).empty?

    satisfied = token.children.all? do |dependent|
      dependent_features = dependent.features
      test_alternatives.any? { |alternative| dependent_features.include?(alternative) }
    end

    report_violation(token, "#{match_features.join(' ')} → dependents()") unless satisfied
  end
end
|
41
|
+
|
42
|
+
# Parses the lint rules embedded after __END__ (read through DATA).
#
# Each rule line has the form
#
#   feature [feature ...] → test        # optional comment
#
# Comments and surrounding whitespace are removed, and blank lines are
# skipped. A test containing "dependents(x)" contributes x to the dependent
# rules; whatever remains of the test (if anything) is added to the plain
# rules. Rules with the same feature list accumulate their tests as
# alternatives.
#
# Returns a two-element Array [rules, dependent_rules]; both are Hashes
# mapping an Array of match features to an Array of alternatives.
#
# Raises ArgumentError for a non-blank line that has no "→ test" part.
def load_rules
  rules = {}
  dependent_rules = {}
  dependents_pattern = /\s*dependents\(([^)]*)\)\s*/

  DATA.each do |line|
    # Strip so that indented or whitespace-only lines are handled like
    # their unindented or empty counterparts.
    rule = line.sub(/#.*$/, '').strip
    next if rule.empty?

    match_features, test = rule.split(/\s*→\s*/)
    if test.nil? || test.empty?
      raise ArgumentError, "malformed lint rule (expected 'features → test'): #{line.chomp.inspect}"
    end

    match_features = match_features.split(/\s+/)

    if (dependents = dependents_pattern.match(test))
      (dependent_rules[match_features] ||= []) << dependents[1]
      test = test.sub(dependents_pattern, '')
    end

    (rules[match_features] ||= []) << test unless test.empty?
  end

  [rules, dependent_rules]
end
|
69
|
+
|
70
|
+
if ARGV.length < 1
  STDERR.puts "Usage: #{$0} treebank-files(s)"
  exit 1
end

treebank = PROIEL::Treebank.new
treebank.load_from_xml(ARGV)

rules, dependent_rules = load_rules

# Only tokens in sentences whose status is 'reviewed' are tested.
treebank.sources.each do |source|
  source.sentences.each do |sentence|
    next unless sentence.status == 'reviewed'

    sentence.tokens.each { |token| test_token(token, rules, dependent_rules) }
  end
end

base_url = 'http://foni.uio.no:3000'

# Emit the report as an HTML fragment: one heading and list per broken rule.
puts "<h1>PROIEL lint report</h1>"

VIOLATIONS.each do |rule, tokens|
  puts "<h2>#{rule}</h2><ul>"

  tokens.each do |token|
    token_link = "<a href='#{base_url}/tokens/#{token.id}'>#{token.id}</a>"
    sentence_link = "<a href='#{base_url}/sentences/#{token.sentence.id}'>#{token.sentence.id}</a>"
    puts "<li>Token #{token_link} in sentence #{sentence_link}</li>"
  end

  puts "</ul>"
end
|
101
|
+
|
102
|
+
__END__
|
103
|
+
|
104
|
+
# Gerundives
|
105
|
+
gdv nom → xobj # modal gerundive heading a main clause
|
106
|
+
|
107
|
+
gdv acc → comp # modal gerundive heading an AcI, or in the _curo faciendum_ type
|
108
|
+
gdv acc → xobj # modal gerundive heading an AcI with an overt auxiliary
|
109
|
+
gdv acc → obl # as argument of a preposition
|
110
|
+
gdv acc → xadv # in the _do librum legendum_ type
|
111
|
+
|
112
|
+
gdv gen → atr # in the _tempus dicendi_ type
|
113
|
+
gdv gen → narg # in the _facultas dicendi_ type
|
114
|
+
|
115
|
+
gdv abl → obl # as argument of a preposition
|
116
|
+
gdv abl → abl # in circumstantial adjuncts of various types
|
117
|
+
|
118
|
+
# Gerunds
|
119
|
+
ger nom → 0 # invalid case for a gerund
|
120
|
+
|
121
|
+
ger acc → obl # as argument of a preposition
|
122
|
+
|
123
|
+
ger gen → atr # in the _tempus dicendi_ type
|
124
|
+
ger gen → narg # in the _facultas dicendi_ type
|
125
|
+
|
126
|
+
ger abl → obl # as argument of a preposition
|
127
|
+
ger abl → abl # in circumstantial adjuncts of various types
|
128
|
+
|
129
|
+
# Reflexive pronouns
|
130
|
+
persrefl nom → 0
|
131
|
+
|
132
|
+
persrefl acc → sub
|
133
|
+
persrefl acc → obj
|
134
|
+
persrefl acc → obl
|
135
|
+
|
136
|
+
persrefl dat → obl
|
137
|
+
persrefl dat → adv
|
138
|
+
persrefl dat → ag
|
139
|
+
|
140
|
+
persrefl abl → obl
|
141
|
+
persrefl abl → sub
|
142
|
+
|
143
|
+
persrefl "se" → acc
|
144
|
+
persrefl "se" → abl
|
145
|
+
|
146
|
+
persrefl "sese" → acc
|
147
|
+
persrefl "sese" → abl
|
148
|
+
|
149
|
+
persrefl "sibi" → dat
|
150
|
+
|
151
|
+
# Personal pronouns
|
152
|
+
perspron nom → sub
|
153
|
+
|
154
|
+
perspron acc → sub
|
155
|
+
perspron acc → obj
|
156
|
+
perspron acc → obl
|
157
|
+
|
158
|
+
perspron dat → obl
|
159
|
+
perspron dat → adv
|
160
|
+
perspron dat → ag
|
161
|
+
|
162
|
+
perspron abl → obl
|
163
|
+
perspron abl → sub
|
164
|
+
|
165
|
+
# The dependent of the complementisers _ut_ and _ne_ should be a PRED or an AUX
|
166
|
+
subj "ut" → dependents(pred) # the standard case, a predicate heading a clause
|
167
|
+
subj "ut" → dependents(aux) # some particle-like material dependent on the complementiser
|
168
|
+
|
169
|
+
subj "ne" → dependents(pred)
|
170
|
+
subj "ne" → dependents(aux)
|
171
|
+
|
172
|
+
# Particles and adverbs
|
173
|
+
"iam" → adverb adv
|
174
|
+
"iam" → adverb aux # possibly
|