proiel-cli 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cc539e4839fccb93166f5fb00309efafc88bdb25
4
+ data.tar.gz: 758707d18035ce10ecd59772031b6c01d703723e
5
+ SHA512:
6
+ metadata.gz: cedb768615cbc1e6c2ecb1b09dc30f88f97d72df5a7545fd0a1087cf925432887c4838316609a6b5706f79f63cf9c35898e2ca64ce1d850e0ba264e99f6d1b3f
7
+ data.tar.gz: f105fcc4028036319efb2b4c7b3d9c5427a00c273166719f8256adc029f8ca1545514985ed814deea5c430682342cc10043ed94dc4c37de180db8e41717f957c
data/LICENSE ADDED
@@ -0,0 +1,23 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Marius L. Jøhndal
4
+ Copyright (c) 2015 Dag Haug
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+
data/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # PROIEL command-line interface
2
+
3
+ This is a command-line interface for manipulating PROIEL treebanks.
4
+
5
+ ## Installation
6
+
7
+ Install as
8
+
9
+ ```shell
10
+ gem install proiel-cli
11
+ ```
12
+
13
+ ## Using the command-line interface
14
+
15
+ The gem includes a command-line utility `proiel` for a number of routine tasks.
16
+ `proiel info`, for example, displays metadata and some brief statistics, and
17
+ `proiel convert conll` converts the treebank to CoNLL format. Use `proiel
18
+ --help` for further examples and usage instructions.
19
+
20
+ ## Development
21
+
22
+ Check out the git repository from GitHub and run `bin/setup` to install
23
+ development dependencies. Then run `rake` to run the tests.
24
+
25
+ To install a development version of this gem, run `bundle exec rake install`.
26
+ To release a new version, update the version number in `version.rb`, and then
27
+ run `bundle exec rake release`, which will create a git tag for the version,
28
+ push git commits and tags, and push the gem to
29
+ [rubygems.org](https://rubygems.org).
30
+
31
+ ## Contributing
32
+
33
+ Bug reports and pull requests are welcome on GitHub at
34
+ https://github.com/proiel/proiel-cli.
data/bin/proiel ADDED
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env ruby
2
+ STDOUT.sync = true
3
+
4
+ $:.unshift File.join(File.dirname(__FILE__), *%w{ .. lib })
5
+
6
+ require 'proiel/cli'
7
+
8
+ Mercenary.program(:proiel) do |p|
9
+ p.version PROIEL::VERSION
10
+ p.description 'proiel is a command-line interface for PROIEL treebanks'
11
+ p.syntax 'proiel <subcommand> [options]'
12
+
13
+ p.option 'verbose', '-V', '--verbose', 'Display verbose information'
14
+
15
+ PROIEL::Command.subclasses.each { |c| c.init_with_program(p) }
16
+
17
+ p.action do |args, options|
18
+ if args.empty?
19
+ puts p
20
+ else
21
+ unless p.has_command?(args.first)
22
+ STDERR.puts 'Invalid command. Use --help for more information.'
23
+ exit 1
24
+ end
25
+ end
26
+ end
27
+ end
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,6 @@
1
+ #!/bin/bash
2
+ ln -sf "$1" a
3
+ ln -sf "$2" b
4
+ plain2snt a b
5
+ snt2cooc a.vcb b.vcb a_b.snt > corp.cooc
6
+ GIZA++ -S a.vcb -T b.vcb -C a_b.snt -CoocurrenceFile corp.cooc -o alignment > alignment.log
@@ -0,0 +1,18 @@
1
+ #!/bin/bash
2
+ tmpdir=$(mktemp -dt "$0.XXXXXXXXXX")
3
+ lexcfile=${tmpdir}/output.lexc
4
+ scriptfile=${tmpdir}/output.script
5
+ fstfile=output.fst
6
+
7
+ proiel=../bin/proiel
8
+ foma=foma
9
+ #foma=xfst
10
+
11
+ ../bin/proiel convert lexc -V $@ > ${lexcfile} || exit 1
12
+ echo "read lexc ${lexcfile}" > ${scriptfile}
13
+ echo "save stack ${fstfile}" >> ${scriptfile}
14
+
15
+ ${foma} -f ${scriptfile}
16
+
17
+ echo
18
+ echo "Generated transducer is ${fstfile}. Try 'echo wordform | flookup ${fstfile}' to test."
@@ -0,0 +1,2 @@
1
+ #!/bin/sh
2
+ java -jar maltparser-1.8.jar -c test -i untagged.conll -o tagged.conll -m parse
@@ -0,0 +1,6 @@
1
+ #!/bin/sh
2
+ java \
3
+ -jar maltparser-1.8.jar \
4
+ -c test \
5
+ -m learn \
6
+ -i $1
@@ -0,0 +1,15 @@
1
+ #!/bin/bash
2
+ tmpdir=$(mktemp -dt "$0.XXXXXXXXXX")
3
+ tntfile=${tmpdir}/output.t
4
+ #fstfile=output.fst
5
+
6
+ proiel=../bin/proiel
7
+ tnt=tnt
8
+ #tnt=hunpos
9
+
10
+ ../bin/proiel convert tnt -V $@ > ${tntfile} || exit 1
11
+
12
+ tnt-para ${tntfile} && mv -vi ${tmpdir}/output.{lex,123} .
13
+
14
+ echo
15
+ echo "Generated parameter files are output.lex and output.123. Try 'tnt output mycorpus.t' to test."
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Train a decision tree on (head_relation, head_pos, head_lemma,
4
+ # child_relation, child_pos, child_lemma) tuples and then predict the
5
+ # child_relation of an unknown child.
6
+ #
7
+ require 'colorize'
8
+ require 'decisiontree'
9
+ require 'proiel'
10
+
11
+ if ARGV.length < 1
12
+ STDERR.puts "Usage: #{$0} treebank-files(s)"
13
+ exit 1
14
+ end
15
+
16
+ tb = PROIEL::Treebank.new
17
+ tb.load_from_xml(ARGV)
18
+
19
+ tokens = {}
20
+
21
+ tb.sources.each do |source|
22
+ source.tokens.each do |token|
23
+ tokens[token.id.to_i] = [token.relation, token.part_of_speech, token.lemma, token.head_id]
24
+ end
25
+ end
26
+
27
+ training_data = tokens.map do |_, (child_relation, child_pos, child_lemma, head_id)|
28
+ if head_id
29
+ head = tokens[head_id.to_i]
30
+ head_relation, head_pos, head_lemma, _ = *head
31
+
32
+ [head_pos || '', head_lemma || '', head_relation, child_pos || '', child_lemma || '', child_relation]
33
+ end
34
+ end.compact
35
+
36
+ attributes = %w(head_pos head_lemma head_relation child_pos child_lemma)
37
+ dr = DecisionTree::ID3Tree.new(attributes, training_data, 'pred', :discrete)
38
+ dr.train
39
+ dr.save_to_file("dr.marshal")
40
+
41
+ p dr.predict(["Ne", "Gallia", "sub", "Px", "omnis"])
@@ -0,0 +1,84 @@
1
+ #!/usr/bin/env ruby
2
+ require 'proiel'
3
+ require 'colorize'
4
+ require 'terminal-table'
5
+
6
+ if ARGV.length < 1
7
+ STDERR.puts "Usage: #{$0} treebank-files(s)"
8
+ exit 1
9
+ end
10
+
11
+ tb = PROIEL::Treebank.new
12
+ tb.load_from_xml(ARGV)
13
+
14
+ # Present by POS
15
+ relations = tb.annotation.relation_tags.keys
16
+
17
+ c = {}
18
+ tb.sources.each do |s|
19
+ s.tokens.each do |t|
20
+ next if t.pos.nil? or t.relation.nil?
21
+
22
+ c[t.pos] ||= {}
23
+ c[t.pos][t.relation] ||= 0
24
+ c[t.pos][t.relation] += 1
25
+ end
26
+ end
27
+
28
+ rows = []
29
+ c.sort_by(&:first).each do |pos, d|
30
+ total = d.inject(0) { |a, (k, v)| a + v }
31
+
32
+ rows << [pos] + relations.map do |r|
33
+ n = d[r ? r.to_s : nil]
34
+
35
+ if n and n < total * 0.001
36
+ n.to_s.red
37
+ elsif n and n > total * 0.999
38
+ n.to_s.green
39
+ else
40
+ n
41
+ end
42
+ end
43
+ end
44
+
45
+ table = Terminal::Table.new headings: ['Part of speech'] + relations, rows: rows
46
+ puts table
47
+ puts "(red = relation occurs for less than 0.1% of tokens with this POS; green = relation occurs for more than 99.9% of tokens with this POS)"
48
+ puts
49
+
50
+ # Present by relation
51
+ poses = tb.annotation.part_of_speech_tags.keys
52
+
53
+ c = {}
54
+
55
+ tb.sources.each do |s|
56
+ s.tokens.each do |t|
57
+ next if t.pos.nil? or t.relation.nil?
58
+
59
+ c[t.relation] ||= {}
60
+ c[t.relation][t.pos] ||= 0
61
+ c[t.relation][t.pos] += 1
62
+ end
63
+ end
64
+
65
+ rows = []
66
+ c.sort_by(&:first).each do |relation, d|
67
+ total = d.inject(0) { |a, (k, v)| a + v }
68
+
69
+ rows << [relation] + poses.map do |r|
70
+ n = d[r ? r.to_s : nil]
71
+
72
+ if n and n < total * 0.001
73
+ n.to_s.red
74
+ elsif n and n > total * 0.999
75
+ n.to_s.green
76
+ else
77
+ n
78
+ end
79
+ end
80
+ end
81
+
82
+ table = Terminal::Table.new headings: ['Relation'] + poses, rows: rows
83
+ puts table
84
+ puts "(red = POS occurs for less than 0.1% of tokens with this relation; green = POS occurs for more than 99.9% of tokens with this relation)"
@@ -0,0 +1,174 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Very simple testing of implicational feature rules. Example rules only
4
+ # apply to Latin.
5
+ #
6
+ require 'colorize'
7
+ require 'proiel'
8
+
9
+ VIOLATIONS = {}
10
+
11
+ def report_violation(token, message)
12
+ VIOLATIONS[message] ||= []
13
+ VIOLATIONS[message] << token
14
+ end
15
+
16
+ def test_token(token, rules, dependent_rules)
17
+ rules.each do |match_features, test_alternatives|
18
+ f = token.features + ["\"#{token.form}\""]
19
+
20
+ if (match_features - f).empty?
21
+ unless test_alternatives.any? { |test_alternative| token.features.include?(test_alternative) }
22
+ report_violation(token, "#{match_features.join(' ')}")
23
+ end
24
+ end
25
+ end
26
+
27
+ dependent_rules.each do |match_features, test_alternatives|
28
+ f = token.features + ["\"#{token.form}\""]
29
+
30
+ if (match_features - f).empty?
31
+ t = token.children.all? do |dependent|
32
+ test_alternatives.any? { |test_alternative| dependent.features.include?(test_alternative) }
33
+ end
34
+
35
+ unless t
36
+ report_violation(token, "#{match_features.join(' ')} → dependents()")
37
+ end
38
+ end
39
+ end
40
+ end
41
+
42
+ def load_rules
43
+ rules = {}
44
+ dependent_rules = {}
45
+
46
+ DATA.each do |rule|
47
+ rule.chomp!
48
+ rule.sub!(/\s*#.*$/, '')
49
+
50
+ next if rule.empty?
51
+
52
+ match_features, test = rule.split(/\s*→\s*/)
53
+ match_features = match_features.split(/\s+/)
54
+
55
+ if test[/\s*dependents\(([^)]*)\)\s*/]
56
+ dependent_rules[match_features] ||= []
57
+ dependent_rules[match_features] << $1
58
+ test.sub!(/\s*dependents\([^)]*\)\s*/, '')
59
+ end
60
+
61
+ if test != ''
62
+ rules[match_features] ||= []
63
+ rules[match_features] << test
64
+ end
65
+ end
66
+
67
+ [rules, dependent_rules]
68
+ end
69
+
70
+ if ARGV.length < 1
71
+ STDERR.puts "Usage: #{$0} treebank-files(s)"
72
+ exit 1
73
+ end
74
+
75
+ tb = PROIEL::Treebank.new
76
+ tb.load_from_xml(ARGV)
77
+
78
+ rules, dependent_rules = load_rules
79
+
80
+ tb.sources.each do |source|
81
+ source.sentences.each do |sentence|
82
+ if sentence.status == 'reviewed'
83
+ sentence.tokens.each do |token|
84
+ test_token(token, rules, dependent_rules)
85
+ end
86
+ end
87
+ end
88
+ end
89
+
90
+ base_url = 'http://foni.uio.no:3000'
91
+
92
+ puts "<h1>PROIEL lint report</h1>"
93
+
94
+ VIOLATIONS.each do |rule, tokens|
95
+ puts "<h2>#{rule}</h2><ul>"
96
+ tokens.each do |token|
97
+ puts "<li>Token <a href='#{base_url}/tokens/#{token.id}'>#{token.id}</a> in sentence <a href='#{base_url}/sentences/#{token.sentence.id}'>#{token.sentence.id}</a></li>"
98
+ end
99
+ puts "</ul>"
100
+ end
101
+
102
+ __END__
103
+
104
+ # Gerundives
105
+ gdv nom → xobj # modal gerundive heading a main clause
106
+
107
+ gdv acc → comp # modal gerundive heading an AcI, or in the _curo faciendum_ type
108
+ gdv acc → xobj # modal gerundive heading an AcI with an overt auxiliary
109
+ gdv acc → obl # as argument of a preposition
110
+ gdv acc → xadv # in the _do librum legendum_ type
111
+
112
+ gdv gen → atr # in the _tempus dicendi_ type
113
+ gdv gen → narg # in the _facultas dicendi_ type
114
+
115
+ gdv abl → obl # as argument of a preposition
116
+ gdv abl → abl # in circumstantial adjuncts of various types
117
+
118
+ # Gerunds
119
+ ger nom → 0 # invalid case for a gerund
120
+
121
+ ger acc → obl # as argument of a preposition
122
+
123
+ ger gen → atr # in the _tempus dicendi_ type
124
+ ger gen → narg # in the _facultas dicendi_ type
125
+
126
+ ger abl → obl # as argument of a preposition
127
+ ger abl → abl # in circumstantial adjuncts of various types
128
+
129
+ # Reflexive pronouns
130
+ persrefl nom → 0
131
+
132
+ persrefl acc → sub
133
+ persrefl acc → obj
134
+ persrefl acc → obl
135
+
136
+ persrefl dat → obl
137
+ persrefl dat → adv
138
+ persrefl dat → ag
139
+
140
+ persrefl abl → obl
141
+ persrefl abl → sub
142
+
143
+ persrefl "se" → acc
144
+ persrefl "se" → abl
145
+
146
+ persrefl "sese" → acc
147
+ persrefl "sese" → abl
148
+
149
+ persrefl "sibi" → dat
150
+
151
+ # Personal pronouns
152
+ perspron nom → sub
153
+
154
+ perspron acc → sub
155
+ perspron acc → obj
156
+ perspron acc → obl
157
+
158
+ perspron dat → obl
159
+ perspron dat → adv
160
+ perspron dat → ag
161
+
162
+ perspron abl → obl
163
+ perspron abl → sub
164
+
165
+ # The dependent of the complementisers _ut_ and _ne_ should be a PRED or an AUX
166
+ subj "ut" → dependents(pred) # the standard case, a predicate heading a clause
167
+ subj "ut" → dependents(aux) # some particle-like material dependent on the complementiser
168
+
169
+ subj "ne" → dependents(pred)
170
+ subj "ne" → dependents(aux)
171
+
172
+ # Particles and adverbs
173
+ "iam" → adverb adv
174
+ "iam" → adverb aux # possibly