proiel-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cc539e4839fccb93166f5fb00309efafc88bdb25
4
+ data.tar.gz: 758707d18035ce10ecd59772031b6c01d703723e
5
+ SHA512:
6
+ metadata.gz: cedb768615cbc1e6c2ecb1b09dc30f88f97d72df5a7545fd0a1087cf925432887c4838316609a6b5706f79f63cf9c35898e2ca64ce1d850e0ba264e99f6d1b3f
7
+ data.tar.gz: f105fcc4028036319efb2b4c7b3d9c5427a00c273166719f8256adc029f8ca1545514985ed814deea5c430682342cc10043ed94dc4c37de180db8e41717f957c
data/LICENSE ADDED
@@ -0,0 +1,23 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Marius L. Jøhndal
4
+ Copyright (c) 2015 Dag Haug
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+
data/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # PROIEL command-line interface
2
+
3
+ This is a command-line interface for manipulating PROIEL treebanks.
4
+
5
+ ## Installation
6
+
7
+ Install as
8
+
9
+ ```shell
10
+ gem install proiel-cli
11
+ ```
12
+
13
+ ## Using the command-line interface
14
+
15
+ The gem includes a command-line utility `proiel` for a number of routine tasks.
16
+ `proiel info`, for example, displays metadata and some brief statistics, and
17
+ `proiel convert conll` converts the treebank to CoNLL format. Use `proiel
18
+ --help` for further examples and usage instructions.
19
+
20
+ ## Development
21
+
22
+ Check out the Git repository from GitHub and run `bin/setup` to install
23
+ development dependencies. Then run `rake` to run the tests.
24
+
25
+ To install a development version of this gem, run `bundle exec rake install`.
26
+ To release a new version, update the version number in `version.rb`, and then
27
+ run `bundle exec rake release`, which will create a git tag for the version,
28
+ push git commits and tags, and push the gem to
29
+ [rubygems.org](https://rubygems.org).
30
+
31
+ ## Contributing
32
+
33
+ Bug reports and pull requests are welcome on GitHub at
34
+ https://github.com/proiel/proiel-cli.
data/bin/proiel ADDED
@@ -0,0 +1,27 @@
1
#!/usr/bin/env ruby
#
# Entry point of the `proiel` command-line tool. Builds the Mercenary program,
# lets every PROIEL::Command subclass register itself on it, and rejects
# unknown subcommands with exit status 1.
#

# Write output as soon as it is produced instead of buffering it.
STDOUT.sync = true

# Prepend ../lib (relative to this script) to the load path.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')

require 'proiel/cli'

Mercenary.program(:proiel) do |program|
  program.version PROIEL::VERSION
  program.description 'proiel is a command-line interface for PROIEL treebanks'
  program.syntax 'proiel <subcommand> [options]'

  program.option 'verbose', '-V', '--verbose', 'Display verbose information'

  PROIEL::Command.subclasses.each do |command|
    command.init_with_program(program)
  end

  program.action do |args, _options|
    if args.empty?
      # No arguments at all: print the program object itself.
      puts program
    elsif !program.has_command?(args.first)
      STDERR.puts 'Invalid command. Use --help for more information.'
      exit 1
    end
  end
end
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
#!/bin/bash
# Development bootstrap: installs the dependencies with Bundler.

# Strict mode: stop on the first failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail

# Split words on newlines and tabs only, never on spaces.
IFS=$'\n\t'

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,6 @@
1
#!/bin/bash
# Runs the GIZA++ tool chain (plain2snt, snt2cooc, GIZA++) on two text files.
#
# Usage: <this script> FIRST-TEXT SECOND-TEXT
#
# The two inputs are symlinked into the current directory as `a` and `b`.
# The later steps read a.vcb, b.vcb and a_b.snt, which plain2snt is expected
# to have produced from them. GIZA++ output uses the prefix `alignment`; its
# console output is captured in alignment.log.

# Stop at the first failing step instead of feeding missing or partial files
# to the next tool.
set -euo pipefail

if [ "$#" -lt 2 ]; then
  echo "Usage: $0 first-text second-text" >&2
  exit 1
fi

ln -sf "$1" a
ln -sf "$2" b
plain2snt a b
snt2cooc a.vcb b.vcb a_b.snt > corp.cooc
GIZA++ -S a.vcb -T b.vcb -C a_b.snt -CoocurrenceFile corp.cooc -o alignment > alignment.log
@@ -0,0 +1,18 @@
1
#!/bin/bash
# Builds a finite-state transducer, output.fst, in the current directory.
#
# All arguments are handed to `proiel convert lexc -V`. Its output is written
# to a temporary lexc file, which foma then reads and saves as a transducer.

proiel=../bin/proiel
foma=foma
#foma=xfst

# $0 may include a directory part (e.g. "./script"), which mktemp -t does not
# accept inside a template, so only the base name is used.
tmpdir=$(mktemp -dt "$(basename "$0").XXXXXXXXXX") || exit 1

# The temporary directory only holds intermediate files; remove it on exit.
trap 'rm -rf "${tmpdir}"' EXIT

lexcfile=${tmpdir}/output.lexc
scriptfile=${tmpdir}/output.script
fstfile=output.fst

# "$@" is quoted so that file names containing spaces stay single arguments.
"${proiel}" convert lexc -V "$@" > "${lexcfile}" || exit 1
echo "read lexc ${lexcfile}" > "${scriptfile}"
echo "save stack ${fstfile}" >> "${scriptfile}"

# Do not announce a transducer if the compiler reported a failure.
"${foma}" -f "${scriptfile}" || exit 1

echo
echo "Generated transducer is ${fstfile}. Try 'echo wordform | flookup ${fstfile}' to test."
@@ -0,0 +1,2 @@
1
#!/bin/sh
# Runs MaltParser 1.8 in "parse" mode with the configuration named "test",
# reading untagged.conll and writing tagged.conll.
java \
  -jar maltparser-1.8.jar \
  -c test \
  -i untagged.conll \
  -o tagged.conll \
  -m parse
@@ -0,0 +1,6 @@
1
#!/bin/sh
# Runs MaltParser 1.8 in "learn" mode with the configuration named "test".
#
# Usage: <this script> TRAINING-FILE
#
# TRAINING-FILE is passed to MaltParser as its input (-i).

if [ "$#" -lt 1 ]; then
  echo "Usage: $0 training-file" >&2
  exit 1
fi

# "$1" is quoted so that a path containing spaces stays a single argument.
java \
  -jar maltparser-1.8.jar \
  -c test \
  -m learn \
  -i "$1"
@@ -0,0 +1,15 @@
1
#!/bin/bash
# Generates TnT parameter files (output.lex and output.123) in the current
# directory.
#
# All arguments are handed to `proiel convert tnt -V`. Its output is written
# to a temporary file that tnt-para is then run on.

proiel=../bin/proiel

# $0 may include a directory part (e.g. "./script"), which mktemp -t does not
# accept inside a template, so only the base name is used.
tmpdir=$(mktemp -dt "$(basename "$0").XXXXXXXXXX") || exit 1

# The temporary directory only holds intermediate files; remove it on exit.
trap 'rm -rf "${tmpdir}"' EXIT

tntfile=${tmpdir}/output.t

# "$@" is quoted so that file names containing spaces stay single arguments.
"${proiel}" convert tnt -V "$@" > "${tntfile}" || exit 1

# Stop here when training fails, so the success message below is never
# printed for files that were not generated.
tnt-para "${tntfile}" || exit 1

# The parameter files are expected next to the input file; move them to the
# current directory, asking before overwriting (-i).
mv -vi "${tmpdir}/output.lex" "${tmpdir}/output.123" .

echo
echo "Generated parameter files are output.lex and output.123. Try 'tnt output mycorpus.t' to test."
@@ -0,0 +1,41 @@
1
#!/usr/bin/env ruby
#
# Trains an ID3 decision tree on one row per token that has a head. A row is
#
#   [head_pos, head_lemma, head_relation, child_pos, child_lemma, child_relation]
#
# where the first five columns are the named attributes and the last column,
# child_relation, is presumably the value the tree learns to predict. The
# trained tree is saved to dr.marshal and the prediction for one hard-coded
# example is printed.
#
require 'colorize'
require 'decisiontree'
require 'proiel'

if ARGV.empty?
  STDERR.puts "Usage: #{$0} treebank-files(s)"
  exit 1
end

treebank = PROIEL::Treebank.new
treebank.load_from_xml(ARGV)

# Integer token id => [relation, part of speech, lemma, head id]
token_index = {}

treebank.sources.each do |source|
  source.tokens.each do |token|
    token_index[token.id.to_i] = [token.relation, token.part_of_speech, token.lemma, token.head_id]
  end
end

training_data = []

token_index.each_value do |child_relation, child_pos, child_lemma, head_id|
  # Tokens without a head contribute no training row.
  next unless head_id

  # A head that is not in the index leaves all three head values nil.
  head_relation, head_pos, head_lemma = token_index[head_id.to_i]

  training_data << [
    head_pos || '', head_lemma || '', head_relation,
    child_pos || '', child_lemma || '', child_relation
  ]
end

attributes = %w(head_pos head_lemma head_relation child_pos child_lemma)
tree = DecisionTree::ID3Tree.new(attributes, training_data, 'pred', :discrete)
tree.train
tree.save_to_file("dr.marshal")

p tree.predict(["Ne", "Gallia", "sub", "Px", "omnis"])
@@ -0,0 +1,84 @@
1
#!/usr/bin/env ruby
#
# Prints two tables of token counts for the given treebank files: first one
# row per part of speech with one column per relation tag, then one row per
# relation with one column per part-of-speech tag. A count below 0.1% of its
# row total is shown in red, a count above 99.9% in green.
#
require 'proiel'
require 'colorize'
require 'terminal-table'

# Counts tokens per pair of keys. The block receives a token and returns
# [row_key, column_key]; the result maps row_key => { column_key => count }.
# Tokens without a part of speech or without a relation are not counted.
def count_pairs(treebank)
  counts = {}

  treebank.sources.each do |source|
    source.tokens.each do |token|
      next if token.pos.nil? || token.relation.nil?

      row_key, column_key = yield(token)
      row = (counts[row_key] ||= {})
      row[column_key] = (row[column_key] || 0) + 1
    end
  end

  counts
end

# Returns the cell for +count+: a red string when it is below 0.1% of +total+,
# a green string when it is above 99.9%, otherwise the count unchanged (nil
# when the combination never occurs).
def highlight(count, total)
  return count unless count
  return count.to_s.red if count < total * 0.001
  return count.to_s.green if count > total * 0.999

  count
end

# Turns the nested counts into table rows sorted by row key, with one cell for
# every entry of +columns+ in the given order.
def table_rows(counts, columns)
  counts.sort_by(&:first).map do |row_key, row|
    total = row.values.inject(0) { |sum, count| sum + count }
    cells = columns.map { |column| highlight(row[column ? column.to_s : nil], total) }

    [row_key] + cells
  end
end

if ARGV.empty?
  STDERR.puts "Usage: #{$0} treebank-files(s)"
  exit 1
end

treebank = PROIEL::Treebank.new
treebank.load_from_xml(ARGV)

# Present by POS
relations = treebank.annotation.relation_tags.keys
by_pos = count_pairs(treebank) { |token| [token.pos, token.relation] }

table = Terminal::Table.new headings: ['Part of speech'] + relations, rows: table_rows(by_pos, relations)
puts table
puts "(red = relation occurs for less than 0.1% of tokens with this POS; green = relation occurs for more than 99.9% of tokens with this POS)"
puts

# Present by relation
poses = treebank.annotation.part_of_speech_tags.keys
by_relation = count_pairs(treebank) { |token| [token.relation, token.pos] }

table = Terminal::Table.new headings: ['Relation'] + poses, rows: table_rows(by_relation, poses)
puts table
puts "(red = POS occurs for less than 0.1% of tokens with this relation; green = POS occurs for more than 99.9% of tokens with this relation)"
@@ -0,0 +1,174 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Very simple testing of implicational feature rules. Example rules only
4
+ # apply to Latin.
5
+ #
6
+ require 'colorize'
7
+ require 'proiel'
8
+
9
+ VIOLATIONS = {}
10
+
11
# Files +token+ under +message+ in the VIOLATIONS registry, creating the list
# for that message on first use. Returns the list the token was appended to.
def report_violation(token, message)
  (VIOLATIONS[message] ||= []) << token
end
15
+
16
# Checks +token+ against the implicational rules and records every violated
# rule through report_violation.
#
# token           - object responding to #features (Array of Strings), #form
#                   and #children (each child responding to #features).
# rules           - Hash mapping an Array of match features to an Array of
#                   alternatives; when the token has all match features, at
#                   least one alternative must hold for the token itself.
# dependent_rules - Hash of the same shape; when the token has all match
#                   features, every child must satisfy at least one
#                   alternative.
#
# A match feature may be the token's form in double quotes (e.g. "\"ut\"").
# An alternative may name several whitespace-separated features (e.g.
# "adverb adv"), in which case all of them must be present.
def test_token(token, rules, dependent_rules)
  # The token's features plus its quoted form; the same for every rule, so it
  # is built once.
  match_candidates = token.features + ["\"#{token.form}\""]

  rules.each do |match_features, test_alternatives|
    next unless (match_features - match_candidates).empty?
    next if test_alternatives.any? { |alternative| alternative_satisfied?(token.features, alternative) }

    report_violation(token, match_features.join(' '))
  end

  dependent_rules.each do |match_features, test_alternatives|
    next unless (match_features - match_candidates).empty?

    dependents_ok = token.children.all? do |dependent|
      test_alternatives.any? { |alternative| alternative_satisfied?(dependent.features, alternative) }
    end

    report_violation(token, "#{match_features.join(' ')} → dependents()") unless dependents_ok
  end
end

# Returns true when +features+ fulfils +alternative+: either the alternative
# is itself one of the features, or it consists of one or more
# whitespace-separated features that are all present. An empty alternative is
# never fulfilled.
def alternative_satisfied?(features, alternative)
  return true if features.include?(alternative)

  required = alternative.split
  !required.empty? && (required - features).empty?
end
41
+
42
# Parses the rule lines available through DATA.
#
# Each non-blank line has the form "features → test", optionally followed by
# a "#" comment. "features" is a whitespace-separated list; "test" is either
# plain text (stored as one alternative for the token itself) or contains
# "dependents(x)" (x is stored as one alternative for the token's children).
# Blank lines, whitespace-only lines and comment-only lines are skipped.
#
# Returns [rules, dependent_rules]; both are Hashes mapping the Array of
# match features to the Array of alternatives collected for it.
#
# Raises ArgumentError for a line that has no "→ test" part.
def load_rules
  rules = {}
  dependent_rules = {}
  dependents_pattern = /\s*dependents\(([^)]*)\)\s*/

  DATA.each do |raw_line|
    # Drop the trailing comment and surrounding whitespace without modifying
    # the string handed out by DATA.
    rule = raw_line.sub(/\s*#.*$/, '').strip

    next if rule.empty?

    match_part, test = rule.split(/\s*→\s*/)
    raise ArgumentError, "Malformed rule (expected 'features → test'): #{rule.inspect}" if test.nil?

    # Splitting on runs of whitespace, so extra spacing never yields empty features.
    match_features = match_part.split

    dependents = test.match(dependents_pattern)
    if dependents
      (dependent_rules[match_features] ||= []) << dependents[1]
      test = test.sub(dependents_pattern, '')
    end

    (rules[match_features] ||= []) << test unless test.empty?
  end

  [rules, dependent_rules]
end
69
+
70
# Main script: loads the treebank files named on the command line, checks
# every token of every sentence whose status is 'reviewed' against the rules,
# and prints the collected violations as an HTML report on standard output.
if ARGV.empty?
  STDERR.puts "Usage: #{$0} treebank-files(s)"
  exit 1
end

treebank = PROIEL::Treebank.new
treebank.load_from_xml(ARGV)

rules, dependent_rules = load_rules

treebank.sources.each do |source|
  source.sentences.each do |sentence|
    # Sentences with any other status are not checked.
    next unless sentence.status == 'reviewed'

    sentence.tokens.each { |token| test_token(token, rules, dependent_rules) }
  end
end

# Prefix of the token and sentence links in the report.
base_url = 'http://foni.uio.no:3000'

puts "<h1>PROIEL lint report</h1>"

# One heading per violated rule, followed by a list of the offending tokens.
VIOLATIONS.each do |rule, tokens|
  puts "<h2>#{rule}</h2><ul>"

  tokens.each do |token|
    token_link = "<a href='#{base_url}/tokens/#{token.id}'>#{token.id}</a>"
    sentence_link = "<a href='#{base_url}/sentences/#{token.sentence.id}'>#{token.sentence.id}</a>"

    puts "<li>Token #{token_link} in sentence #{sentence_link}</li>"
  end

  puts "</ul>"
end
101
+
102
+ __END__
103
+
104
+ # Gerundives
105
+ gdv nom → xobj # modal gerundive heading a main clause
106
+
107
+ gdv acc → comp # modal gerundive heading an AcI, or in the _curo faciendum_ type
108
+ gdv acc → xobj # modal gerundive heading an AcI with an overt auxiliary
109
+ gdv acc → obl # as argument of a preposition
110
+ gdv acc → xadv # in the _do librum legendum_ type
111
+
112
+ gdv gen → atr # in the _tempus dicendi_ type
113
+ gdv gen → narg # in the _facultas dicendi_ type
114
+
115
+ gdv abl → obl # as argument of a preposition
116
+ gdv abl → abl # in circumstantial adjuncts of various types
117
+
118
+ # Gerunds
119
+ ger nom → 0 # invalid case for a gerund
120
+
121
+ ger acc → obl # as argument of a preposition
122
+
123
+ ger gen → atr # in the _tempus dicendi_ type
124
+ ger gen → narg # in the _facultas dicendi_ type
125
+
126
+ ger abl → obl # as argument of a preposition
127
+ ger abl → abl # in circumstantial adjuncts of various types
128
+
129
+ # Reflexive pronouns
130
+ persrefl nom → 0
131
+
132
+ persrefl acc → sub
133
+ persrefl acc → obj
134
+ persrefl acc → obl
135
+
136
+ persrefl dat → obl
137
+ persrefl dat → adv
138
+ persrefl dat → ag
139
+
140
+ persrefl abl → obl
141
+ persrefl abl → sub
142
+
143
+ persrefl "se" → acc
144
+ persrefl "se" → abl
145
+
146
+ persrefl "sese" → acc
147
+ persrefl "sese" → abl
148
+
149
+ persrefl "sibi" → dat
150
+
151
+ # Personal pronouns
152
+ perspron nom → sub
153
+
154
+ perspron acc → sub
155
+ perspron acc → obj
156
+ perspron acc → obl
157
+
158
+ perspron dat → obl
159
+ perspron dat → adv
160
+ perspron dat → ag
161
+
162
+ perspron abl → obl
163
+ perspron abl → sub
164
+
165
+ # The dependent of the complementisers _ut_ and _ne_ should be a PRED or an AUX
166
+ subj "ut" → dependents(pred) # the standard case, a predicate heading a clause
167
+ subj "ut" → dependents(aux) # some particle-like material dependent on the complementiser
168
+
169
+ subj "ne" → dependents(pred)
170
+ subj "ne" → dependents(aux)
171
+
172
+ # Particles and adverbs
173
+ "iam" → adverb adv
174
+ "iam" → adverb aux # possibly