proiel-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +23 -0
- data/README.md +34 -0
- data/bin/proiel +27 -0
- data/bin/setup +7 -0
- data/contrib/proiel-giza-train +6 -0
- data/contrib/proiel-lexc-compile +18 -0
- data/contrib/proiel-maltparser-parse +2 -0
- data/contrib/proiel-maltparser-train +6 -0
- data/contrib/proiel-tnt-train +15 -0
- data/examples/decision-tree.rb +41 -0
- data/examples/dep-pos-cooccurrences.rb +84 -0
- data/examples/lint-rules.rb +174 -0
- data/examples/relation-as-disambiguator.rb +134 -0
- data/examples/word-occurrences.rb +30 -0
- data/lib/proiel/cli.rb +2 -0
- data/lib/proiel/cli/commands.rb +28 -0
- data/lib/proiel/cli/commands/convert.rb +94 -0
- data/lib/proiel/cli/commands/grep.rb +136 -0
- data/lib/proiel/cli/commands/info.rb +126 -0
- data/lib/proiel/cli/commands/tokenize.rb +165 -0
- data/lib/proiel/cli/commands/validate.rb +42 -0
- data/lib/proiel/cli/converters/conll-u.rb +589 -0
- data/lib/proiel/cli/converters/conll-u/morphology.rb +235 -0
- data/lib/proiel/cli/converters/conll-u/syntax.rb +81 -0
- data/lib/proiel/cli/converters/conll-x.rb +66 -0
- data/lib/proiel/cli/converters/lexc.rb +36 -0
- data/lib/proiel/cli/converters/proielxml.rb +152 -0
- data/lib/proiel/cli/converters/text.rb +99 -0
- data/lib/proiel/cli/converters/tiger.rb +157 -0
- data/lib/proiel/cli/converters/tiger2.rb +193 -0
- data/lib/proiel/cli/converters/tnt.rb +30 -0
- data/lib/proiel/cli/version.rb +5 -0
- metadata +248 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cc539e4839fccb93166f5fb00309efafc88bdb25
|
4
|
+
data.tar.gz: 758707d18035ce10ecd59772031b6c01d703723e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cedb768615cbc1e6c2ecb1b09dc30f88f97d72df5a7545fd0a1087cf925432887c4838316609a6b5706f79f63cf9c35898e2ca64ce1d850e0ba264e99f6d1b3f
|
7
|
+
data.tar.gz: f105fcc4028036319efb2b4c7b3d9c5427a00c273166719f8256adc029f8ca1545514985ed814deea5c430682342cc10043ed94dc4c37de180db8e41717f957c
|
data/LICENSE
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Marius L. Jøhndal
|
4
|
+
Copyright (c) 2015 Dag Haug
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
8
|
+
in the Software without restriction, including without limitation the rights
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
11
|
+
furnished to do so, subject to the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
14
|
+
copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
22
|
+
SOFTWARE.
|
23
|
+
|
data/README.md
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# PROIEL command-line interface
|
2
|
+
|
3
|
+
This is a command-line interface for manipulating PROIEL treebanks.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Install as
|
8
|
+
|
9
|
+
```shell
|
10
|
+
gem install proiel-cli
|
11
|
+
```
|
12
|
+
|
13
|
+
## Using the command-line interface
|
14
|
+
|
15
|
+
The gem includes a command-line utility `proiel` for a number of routine tasks.
|
16
|
+
`proiel info`, for example, displays metadata and some brief statistics, and
|
17
|
+
`proiel convert conll` converts the treebank to CoNLL format. Use `proiel
|
18
|
+
--help` for further examples and usage instructions.
|
19
|
+
|
20
|
+
## Development
|
21
|
+
|
22
|
+
Check out the git repository from github and run `bin/setup` to install
|
23
|
+
development dependencies. Then run `rake` to run the tests.
|
24
|
+
|
25
|
+
To install a development version of this gem, run `bundle exec rake install`.
|
26
|
+
To release a new version, update the version number in `version.rb`, and then
|
27
|
+
run `bundle exec rake release`, which will create a git tag for the version,
|
28
|
+
push git commits and tags, and push the gem to
|
29
|
+
[rubygems.org](https://rubygems.org).
|
30
|
+
|
31
|
+
## Contributing
|
32
|
+
|
33
|
+
Bug reports and pull requests are welcome on GitHub at
|
34
|
+
https://github.com/proiel/proiel-cli.
|
data/bin/proiel
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Entry point of the `proiel` command-line tool. It sets up a Mercenary
# program, lets every PROIEL::Command subclass attach itself to it, and
# reports an error for any subcommand the program does not know.
STDOUT.sync = true

# Make the bundled lib directory loadable when running from a checkout.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')

require 'proiel/cli'

Mercenary.program(:proiel) do |program|
  program.version PROIEL::VERSION
  program.description 'proiel is a command-line interface for PROIEL treebanks'
  program.syntax 'proiel <subcommand> [options]'

  program.option 'verbose', '-V', '--verbose', 'Display verbose information'

  PROIEL::Command.subclasses.each do |command|
    command.init_with_program(program)
  end

  program.action do |args, _options|
    if args.empty?
      # No subcommand given: print the program itself.
      puts program
    elsif !program.has_command?(args.first)
      STDERR.puts 'Invalid command. Use --help for more information.'
      exit 1
    end
  end
end
|
data/bin/setup
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/bin/bash
# Runs `proiel convert lexc` on the given arguments, then has foma read the
# resulting lexc file and save the stack as output.fst in the current
# directory. All command-line arguments are forwarded to `proiel convert lexc`.

# $0 may contain a path; `mktemp -t` rejects templates with directory
# separators (GNU mktemp), so only the script's base name is used.
tmpdir=$(mktemp -dt "$(basename "$0").XXXXXXXXXX") || exit 1
# The intermediate files are only needed while this script runs.
trap 'rm -rf "${tmpdir}"' EXIT
lexcfile=${tmpdir}/output.lexc
scriptfile=${tmpdir}/output.script
fstfile=output.fst

proiel=../bin/proiel
foma=foma
#foma=xfst

# "$@" is quoted so that file names containing whitespace survive intact.
"${proiel}" convert lexc -V "$@" > "${lexcfile}" || exit 1
echo "read lexc ${lexcfile}" > "${scriptfile}"
echo "save stack ${fstfile}" >> "${scriptfile}"

"${foma}" -f "${scriptfile}"

echo
echo "Generated transducer is ${fstfile}. Try 'echo wordform | flookup ${fstfile}' to test."
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/bin/bash
# Runs `proiel convert tnt` on the given arguments, feeds the result to
# tnt-para, and moves the generated output.lex and output.123 files into the
# current directory. All command-line arguments are forwarded to
# `proiel convert tnt`.

# $0 may contain a path; `mktemp -t` rejects templates with directory
# separators (GNU mktemp), so only the script's base name is used.
tmpdir=$(mktemp -dt "$(basename "$0").XXXXXXXXXX") || exit 1
tntfile=${tmpdir}/output.t
#fstfile=output.fst

proiel=../bin/proiel
tnt=tnt
#tnt=hunpos

# "$@" is quoted so that file names containing whitespace survive intact.
"${proiel}" convert tnt -V "$@" > "${tntfile}" || exit 1

tnt-para "${tntfile}" && mv -vi "${tmpdir}"/output.{lex,123} .

echo
echo "Generated parameter files are output.lex and output.123. Try 'tnt output mycorpus.t' to test."
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Trains an ID3 decision tree on rows of (head_pos, head_lemma, head_relation,
# child_pos, child_lemma) labelled with the child's relation, saves the tree
# to dr.marshal and prints the predicted relation for one sample child.
#
require 'colorize'
require 'decisiontree'
require 'proiel'

if ARGV.empty?
  STDERR.puts "Usage: #{$0} treebank-files(s)"
  exit 1
end

treebank = PROIEL::Treebank.new
treebank.load_from_xml(ARGV)

# Index every token by its numeric id so that heads can be looked up below.
token_index = {}

treebank.sources.each do |source|
  source.tokens.each do |token|
    token_index[token.id.to_i] = [token.relation, token.part_of_speech, token.lemma, token.head_id]
  end
end

# One training row per token that has a head; tokens without a head are skipped.
training_data = []

token_index.each_value do |(child_relation, child_pos, child_lemma, head_id)|
  next unless head_id

  # A head that is missing from the index yields nil for all three values.
  head_relation, head_pos, head_lemma = token_index[head_id.to_i]

  training_data << [head_pos || '', head_lemma || '', head_relation, child_pos || '', child_lemma || '', child_relation]
end

attributes = %w(head_pos head_lemma head_relation child_pos child_lemma)
tree = DecisionTree::ID3Tree.new(attributes, training_data, 'pred', :discrete)
tree.train
tree.save_to_file("dr.marshal")

p tree.predict(["Ne", "Gallia", "sub", "Px", "omnis"])
|
@@ -0,0 +1,84 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Prints two co-occurrence tables for the given treebank files: relation
# counts per part of speech, then part-of-speech counts per relation. A count
# below 0.1% of its row total is shown in red, one above 99.9% in green.
#
require 'proiel'
require 'colorize'
require 'terminal-table'

# Tallies tokens into a nested hash: counts[row_key][column_key] = count.
# The block receives a token and returns the [row_key, column_key] pair under
# which to count it. Tokens without a part of speech or relation are skipped.
def count_cooccurrences(treebank)
  counts = {}

  treebank.sources.each do |source|
    source.tokens.each do |token|
      next if token.pos.nil? || token.relation.nil?

      row_key, column_key = yield(token)
      row = (counts[row_key] ||= {})
      row[column_key] = (row[column_key] || 0) + 1
    end
  end

  counts
end

# Turns nested counts into table rows, one per row key in sorted order, with
# one cell per entry of +columns+. A cell is nil when the combination never
# occurred, and a coloured string when the count is extreme for its row.
def cooccurrence_rows(counts, columns)
  counts.sort_by(&:first).map do |row_key, distribution|
    total = distribution.values.inject(0, :+)

    cells = columns.map do |column|
      # Counts are keyed by strings, so the column tag is converted to match.
      n = distribution[column ? column.to_s : nil]

      if n && n < total * 0.001
        n.to_s.red
      elsif n && n > total * 0.999
        n.to_s.green
      else
        n
      end
    end

    [row_key] + cells
  end
end

if ARGV.length < 1
  STDERR.puts "Usage: #{$0} treebank-files(s)"
  exit 1
end

tb = PROIEL::Treebank.new
tb.load_from_xml(ARGV)

# Present by POS
relations = tb.annotation.relation_tags.keys
rows = cooccurrence_rows(count_cooccurrences(tb) { |t| [t.pos, t.relation] }, relations)

table = Terminal::Table.new headings: ['Part of speech'] + relations, rows: rows
puts table
puts "(red = relation occurs for less than 0.1% of tokens with this POS; green = relation occurs for more than 99.9% of tokens with this POS)"
puts

# Present by relation
poses = tb.annotation.part_of_speech_tags.keys
rows = cooccurrence_rows(count_cooccurrences(tb) { |t| [t.relation, t.pos] }, poses)

table = Terminal::Table.new headings: ['Relation'] + poses, rows: rows
puts table
puts "(red = POS occurs for less than 0.1% of tokens with this relation; green = POS occurs for more than 99.9% of tokens with this relation)"
|
@@ -0,0 +1,174 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Very simple testing of implicational feature rules. Example rules only
|
4
|
+
# apply to Latin.
|
5
|
+
#
|
6
|
+
require 'colorize'
|
7
|
+
require 'proiel'
|
8
|
+
|
9
|
+
VIOLATIONS = {}
|
10
|
+
|
11
|
+
# Records +token+ as an offender for the rule described by +message+.
# Offending tokens are grouped by message in the VIOLATIONS hash.
def report_violation(token, message)
  (VIOLATIONS[message] ||= []) << token
end
|
15
|
+
|
16
|
+
# Checks one token against the implicational rules and records every
# violation through report_violation.
#
# token           - object responding to #features, #form and #children.
# rules           - Hash mapping an Array of features to match onto an Array
#                   of alternatives; a matching token must have at least one
#                   of the alternatives among its own features.
# dependent_rules - Hash of the same shape; for a matching token, every child
#                   must have at least one of the alternatives.
def test_token(token, rules, dependent_rules)
  # The token's features plus its quoted form, so a rule can also match on a
  # literal word form such as "ut". This does not depend on the rule, so it is
  # built once instead of once per rule.
  matchable = token.features + ["\"#{token.form}\""]

  rules.each do |match_features, test_alternatives|
    next unless (match_features - matchable).empty?
    next if test_alternatives.any? { |alternative| token.features.include?(alternative) }

    report_violation(token, match_features.join(' '))
  end

  dependent_rules.each do |match_features, test_alternatives|
    next unless (match_features - matchable).empty?

    # A token without children trivially satisfies the rule.
    satisfied = token.children.all? do |dependent|
      test_alternatives.any? { |alternative| dependent.features.include?(alternative) }
    end

    report_violation(token, "#{match_features.join(' ')} → dependents()") unless satisfied
  end
end
|
41
|
+
|
42
|
+
# Parses the rule definitions read from DATA.
#
# Each non-blank line has the form "features → test", optionally followed by
# a "#" comment. A test of the form dependents(x) constrains the token's
# children instead of the token itself.
#
# Returns a two-element Array [rules, dependent_rules]. Both are Hashes that
# map an Array of match features to the Array of alternatives collected for
# that feature combination.
#
# Raises ArgumentError when a line has no "→" separator or nothing after it.
def load_rules
  dependents_pattern = /\s*dependents\(([^)]*)\)\s*/
  rules = {}
  dependent_rules = {}

  DATA.each do |line|
    # Drop the comment and surrounding whitespace; without the strip, an
    # indented rule would gain an empty-string feature and a rule with
    # trailing blanks would carry them into its test.
    rule = line.sub(/\s*#.*$/, '').strip

    next if rule.empty?

    match_features, test = rule.split(/\s*→\s*/)
    raise ArgumentError, "malformed rule (expected 'features → test'): #{rule}" if test.nil?

    match_features = match_features.split(/\s+/)

    if (dependents = test.match(dependents_pattern))
      (dependent_rules[match_features] ||= []) << dependents[1]
      test = test.sub(dependents_pattern, '')
    end

    # Whatever remains after removing dependents(...) is a test on the token itself.
    (rules[match_features] ||= []) << test unless test.empty?
  end

  [rules, dependent_rules]
end
|
69
|
+
|
70
|
+
# Driver: load the treebank files named on the command line, test every token
# of each sentence whose status is 'reviewed' against the rules, and print the
# collected violations as an HTML report on standard output.
if ARGV.empty?
  STDERR.puts "Usage: #{$0} treebank-files(s)"
  exit 1
end

treebank = PROIEL::Treebank.new
treebank.load_from_xml(ARGV)

rules, dependent_rules = load_rules

treebank.sources.each do |source|
  source.sentences.each do |sentence|
    next unless sentence.status == 'reviewed'

    sentence.tokens.each { |token| test_token(token, rules, dependent_rules) }
  end
end

# Prefix for the token and sentence links in the report.
base_url = 'http://foni.uio.no:3000'

puts "<h1>PROIEL lint report</h1>"

VIOLATIONS.each do |rule, offenders|
  puts "<h2>#{rule}</h2><ul>"
  offenders.each do |token|
    puts "<li>Token <a href='#{base_url}/tokens/#{token.id}'>#{token.id}</a> in sentence <a href='#{base_url}/sentences/#{token.sentence.id}'>#{token.sentence.id}</a></li>"
  end
  puts "</ul>"
end
|
101
|
+
|
102
|
+
__END__
|
103
|
+
|
104
|
+
# Gerundives
|
105
|
+
gdv nom → xobj # modal gerundive heading a main clause
|
106
|
+
|
107
|
+
gdv acc → comp # modal gerundive heading an AcI, or in the _curo faciendum_ type
|
108
|
+
gdv acc → xobj # modal gerundive heading an AcI with an overt auxiliary
|
109
|
+
gdv acc → obl # as argument of a preposition
|
110
|
+
gdv acc → xadv # in the _do librum legendum_ type
|
111
|
+
|
112
|
+
gdv gen → atr # in the _tempus dicendi_ type
|
113
|
+
gdv gen → narg # in the _facultas dicendi_ type
|
114
|
+
|
115
|
+
gdv abl → obl # as argument of a preposition
|
116
|
+
gdv abl → abl # in circumstantial adjuncts of various types
|
117
|
+
|
118
|
+
# Gerunds
|
119
|
+
ger nom → 0 # invalid case for a gerund
|
120
|
+
|
121
|
+
ger acc → obl # as argument of a preposition
|
122
|
+
|
123
|
+
ger gen → atr # in the _tempus dicendi_ type
|
124
|
+
ger gen → narg # in the _facultas dicendi_ type
|
125
|
+
|
126
|
+
ger abl → obl # as argument of a preposition
|
127
|
+
ger abl → abl # in circumstantial adjuncts of various types
|
128
|
+
|
129
|
+
# Reflexive pronouns
|
130
|
+
persrefl nom → 0
|
131
|
+
|
132
|
+
persrefl acc → sub
|
133
|
+
persrefl acc → obj
|
134
|
+
persrefl acc → obl
|
135
|
+
|
136
|
+
persrefl dat → obl
|
137
|
+
persrefl dat → adv
|
138
|
+
persrefl dat → ag
|
139
|
+
|
140
|
+
persrefl abl → obl
|
141
|
+
persrefl abl → sub
|
142
|
+
|
143
|
+
persrefl "se" → acc
|
144
|
+
persrefl "se" → abl
|
145
|
+
|
146
|
+
persrefl "sese" → acc
|
147
|
+
persrefl "sese" → abl
|
148
|
+
|
149
|
+
persrefl "sibi" → dat
|
150
|
+
|
151
|
+
# Personal pronouns
|
152
|
+
perspron nom → sub
|
153
|
+
|
154
|
+
perspron acc → sub
|
155
|
+
perspron acc → obj
|
156
|
+
perspron acc → obl
|
157
|
+
|
158
|
+
perspron dat → obl
|
159
|
+
perspron dat → adv
|
160
|
+
perspron dat → ag
|
161
|
+
|
162
|
+
perspron abl → obl
|
163
|
+
perspron abl → sub
|
164
|
+
|
165
|
+
# The dependent of the complementisers _ut_ and _ne_ should be a PRED or an AUX
|
166
|
+
subj "ut" → dependents(pred) # the standard case, a predicate heading a clause
|
167
|
+
subj "ut" → dependents(aux) # some particle-like material dependent on the complementiser
|
168
|
+
|
169
|
+
subj "ne" → dependents(pred)
|
170
|
+
subj "ne" → dependents(aux)
|
171
|
+
|
172
|
+
# Particles and adverbs
|
173
|
+
"iam" → adverb adv
|
174
|
+
"iam" → adverb aux # possibly
|