proiel-cli 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cc539e4839fccb93166f5fb00309efafc88bdb25
4
- data.tar.gz: 758707d18035ce10ecd59772031b6c01d703723e
3
+ metadata.gz: bfb4db333ae791d4171490c7b3d316364b205729
4
+ data.tar.gz: fd001265bdcc8e75e49fc1e2a188f7c63471bb62
5
5
  SHA512:
6
- metadata.gz: cedb768615cbc1e6c2ecb1b09dc30f88f97d72df5a7545fd0a1087cf925432887c4838316609a6b5706f79f63cf9c35898e2ca64ce1d850e0ba264e99f6d1b3f
7
- data.tar.gz: f105fcc4028036319efb2b4c7b3d9c5427a00c273166719f8256adc029f8ca1545514985ed814deea5c430682342cc10043ed94dc4c37de180db8e41717f957c
6
+ metadata.gz: 29cb26e28f22486db097b982d7e2cd4783f2b38d8d6c13fb06309fb25377791cd8248c88627b7d5b72c3adfe0e958c9696b2508fcb975e9a8fdf516a9825963b
7
+ data.tar.gz: b565292467456b10415039f0464bdfafcae5fecf55747dd3b9ec2473313ba08789d49431ecb06d649596bfd98031640481efccd9b5be58b6b565d13dcc3a1161
data/README.md CHANGED
@@ -1,10 +1,10 @@
1
1
  # PROIEL command-line interface
2
2
 
3
- This is a command-line interface for manpulating PROIEL treebanks.
3
+ This is a command-line interface for manipulating PROIEL treebanks.
4
4
 
5
5
  ## Installation
6
6
 
7
- Install as
7
+ This library requires Ruby >= 2.1. Install as
8
8
 
9
9
  ```shell
10
10
  gem install proiel-cli
@@ -88,7 +88,7 @@ module PROIEL
88
88
  unless token.slashes.empty? or options['remove-syntax'] # this extra test avoids <token></token> style XML
89
89
  builder.token(attrs) do
90
90
  token.slashes.each do |relation, target_id|
91
- builder.slash("target-id": target_id, relation: relation)
91
+ builder.slash(:"target-id" => target_id, relation: relation)
92
92
  end
93
93
  end
94
94
  else
@@ -1,5 +1,5 @@
1
1
  module PROIEL
2
2
  module CLI
3
- VERSION = '0.1.0'
3
+ VERSION = '0.1.1'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proiel-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Marius L. Jøhndal
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-11-07 00:00:00.000000000 Z
12
+ date: 2015-12-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: builder
@@ -197,11 +197,6 @@ files:
197
197
  - contrib/proiel-maltparser-parse
198
198
  - contrib/proiel-maltparser-train
199
199
  - contrib/proiel-tnt-train
200
- - examples/decision-tree.rb
201
- - examples/dep-pos-cooccurrences.rb
202
- - examples/lint-rules.rb
203
- - examples/relation-as-disambiguator.rb
204
- - examples/word-occurrences.rb
205
200
  - lib/proiel/cli.rb
206
201
  - lib/proiel/cli/commands.rb
207
202
  - lib/proiel/cli/commands/convert.rb
@@ -1,41 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # Train a decision tree on the (head_relation, head_pos, head_lemma,
4
- # child_relation, child_pos, child_lemma) and then predict the child_relation
5
- # of an unknown child.
6
- #
7
- require 'colorize'
8
- require 'decisiontree'
9
- require 'proiel'
10
-
11
- if ARGV.length < 1
12
- STDERR.puts "Usage: #{$0} treebank-files(s)"
13
- exit 1
14
- end
15
-
16
- tb = PROIEL::Treebank.new
17
- tb.load_from_xml(ARGV)
18
-
19
- tokens = {}
20
-
21
- tb.sources.each do |source|
22
- source.tokens.each do |token|
23
- tokens[token.id.to_i] = [token.relation, token.part_of_speech, token.lemma, token.head_id]
24
- end
25
- end
26
-
27
- training_data = tokens.map do |_, (child_relation, child_pos, child_lemma, head_id)|
28
- if head_id
29
- head = tokens[head_id.to_i]
30
- head_relation, head_pos, head_lemma, _ = *head
31
-
32
- [head_pos || '', head_lemma || '', head_relation, child_pos || '', child_lemma || '', child_relation]
33
- end
34
- end.compact
35
-
36
- attributes = %w(head_pos head_lemma head_relation child_pos child_lemma)
37
- dr = DecisionTree::ID3Tree.new(attributes, training_data, 'pred', :discrete)
38
- dr.train
39
- dr.save_to_file("dr.marshal")
40
-
41
- p dr.predict(["Ne", "Gallia", "sub", "Px", "omnis"])
@@ -1,84 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require 'proiel'
3
- require 'colorize'
4
- require 'terminal-table'
5
-
6
- if ARGV.length < 1
7
- STDERR.puts "Usage: #{$0} treebank-files(s)"
8
- exit 1
9
- end
10
-
11
- tb = PROIEL::Treebank.new
12
- tb.load_from_xml(ARGV)
13
-
14
- # Present by POS
15
- relations = tb.annotation.relation_tags.keys
16
-
17
- c = {}
18
- tb.sources.each do |s|
19
- s.tokens.each do |t|
20
- next if t.pos.nil? or t.relation.nil?
21
-
22
- c[t.pos] ||= {}
23
- c[t.pos][t.relation] ||= 0
24
- c[t.pos][t.relation] += 1
25
- end
26
- end
27
-
28
- rows = []
29
- c.sort_by(&:first).each do |pos, d|
30
- total = d.inject(0) { |a, (k, v)| a + v }
31
-
32
- rows << [pos] + relations.map do |r|
33
- n = d[r ? r.to_s : nil]
34
-
35
- if n and n < total * 0.001
36
- n.to_s.red
37
- elsif n and n > total * 0.999
38
- n.to_s.green
39
- else
40
- n
41
- end
42
- end
43
- end
44
-
45
- table = Terminal::Table.new headings: ['Part of speech'] + relations, rows: rows
46
- puts table
47
- puts "(red = relation occurs for less than 0.1% of tokens with this POS; green = relation occurs for more than 99.9% of tokens with this POS)"
48
- puts
49
-
50
- # Present by relation
51
- poses = tb.annotation.part_of_speech_tags.keys
52
-
53
- c = {}
54
-
55
- tb.sources.each do |s|
56
- s.tokens.each do |t|
57
- next if t.pos.nil? or t.relation.nil?
58
-
59
- c[t.relation] ||= {}
60
- c[t.relation][t.pos] ||= 0
61
- c[t.relation][t.pos] += 1
62
- end
63
- end
64
-
65
- rows = []
66
- c.sort_by(&:first).each do |relation, d|
67
- total = d.inject(0) { |a, (k, v)| a + v }
68
-
69
- rows << [relation] + poses.map do |r|
70
- n = d[r ? r.to_s : nil]
71
-
72
- if n and n < total * 0.001
73
- n.to_s.red
74
- elsif n and n > total * 0.999
75
- n.to_s.green
76
- else
77
- n
78
- end
79
- end
80
- end
81
-
82
- table = Terminal::Table.new headings: ['Relation'] + poses, rows: rows
83
- puts table
84
- puts "(red = POS occurs for less than 0.1% of tokens with this relation; green = POS occurs for more than 99.9% of tokens with this relation)"
@@ -1,174 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # Very simple testing of implicational feature rules. Example rules only
4
- # apply to Latin.
5
- #
6
- require 'colorize'
7
- require 'proiel'
8
-
9
- VIOLATIONS = {}
10
-
11
- def report_violation(token, message)
12
- VIOLATIONS[message] ||= []
13
- VIOLATIONS[message] << token
14
- end
15
-
16
- def test_token(token, rules, dependent_rules)
17
- rules.each do |match_features, test_alternatives|
18
- f = token.features + ["\"#{token.form}\""]
19
-
20
- if (match_features - f).empty?
21
- unless test_alternatives.any? { |test_alternative| token.features.include?(test_alternative) }
22
- report_violation(token, "#{match_features.join(' ')}")
23
- end
24
- end
25
- end
26
-
27
- dependent_rules.each do |match_features, test_alternatives|
28
- f = token.features + ["\"#{token.form}\""]
29
-
30
- if (match_features - f).empty?
31
- t = token.children.all? do |dependent|
32
- test_alternatives.any? { |test_alternative| dependent.features.include?(test_alternative) }
33
- end
34
-
35
- unless t
36
- report_violation(token, "#{match_features.join(' ')} → dependents()")
37
- end
38
- end
39
- end
40
- end
41
-
42
- def load_rules
43
- rules = {}
44
- dependent_rules = {}
45
-
46
- DATA.each do |rule|
47
- rule.chomp!
48
- rule.sub!(/\s*#.*$/, '')
49
-
50
- next if rule.empty?
51
-
52
- match_features, test = rule.split(/\s*→\s*/)
53
- match_features = match_features.split(/\s+/)
54
-
55
- if test[/\s*dependents\(([^)]*)\)\s*/]
56
- dependent_rules[match_features] ||= []
57
- dependent_rules[match_features] << $1
58
- test.sub!(/\s*dependents\([^)]*\)\s*/, '')
59
- end
60
-
61
- if test != ''
62
- rules[match_features] ||= []
63
- rules[match_features] << test
64
- end
65
- end
66
-
67
- [rules, dependent_rules]
68
- end
69
-
70
- if ARGV.length < 1
71
- STDERR.puts "Usage: #{$0} treebank-files(s)"
72
- exit 1
73
- end
74
-
75
- tb = PROIEL::Treebank.new
76
- tb.load_from_xml(ARGV)
77
-
78
- rules, dependent_rules = load_rules
79
-
80
- tb.sources.each do |source|
81
- source.sentences.each do |sentence|
82
- if sentence.status == 'reviewed'
83
- sentence.tokens.each do |token|
84
- test_token(token, rules, dependent_rules)
85
- end
86
- end
87
- end
88
- end
89
-
90
- base_url = 'http://foni.uio.no:3000'
91
-
92
- puts "<h1>PROIEL lint report</h1>"
93
-
94
- VIOLATIONS.each do |rule, tokens|
95
- puts "<h2>#{rule}</h2><ul>"
96
- tokens.each do |token|
97
- puts "<li>Token <a href='#{base_url}/tokens/#{token.id}'>#{token.id}</a> in sentence <a href='#{base_url}/sentences/#{token.sentence.id}'>#{token.sentence.id}</a></li>"
98
- end
99
- puts "</ul>"
100
- end
101
-
102
- __END__
103
-
104
- # Gerundives
105
- gdv nom → xobj # modal gerundive heading a main clause
106
-
107
- gdv acc → comp # modal gerundive heading an AcI, or in the _curo faciendum_ type
108
- gdv acc → xobj # modal gerundive heading an AcI with an overt auxiliary
109
- gdv acc → obl # as argument of a preposition
110
- gdv acc → xadv # in the _do librum legendum_ type
111
-
112
- gdv gen → atr # in the _tempus dicendi_ type
113
- gdv gen → narg # in the _facultas dicendi_ type
114
-
115
- gdv abl → obl # as argument of a preposition
116
- gdv abl → abl # in circumstantial adjuncts of various types
117
-
118
- # Gerunds
119
- ger nom → 0 # invalid case for a gerundive
120
-
121
- ger acc → obl # as argument of a preposition
122
-
123
- ger gen → atr # in the _tempus dicendi_ type
124
- ger gen → narg # in the _facultas dicendi_ type
125
-
126
- ger abl → obl # as argument of a preposition
127
- ger abl → abl # in circumstantial adjuncts of various types
128
-
129
- # Reflexive pronouns
130
- persrefl nom → 0
131
-
132
- persrefl acc → sub
133
- persrefl acc → obj
134
- persrefl acc → obl
135
-
136
- persrefl dat → obl
137
- persrefl dat → adv
138
- persrefl dat → ag
139
-
140
- persrefl abl → obl
141
- persrefl abl → sub
142
-
143
- persrefl "se" → acc
144
- persrefl "se" → abl
145
-
146
- persrefl "sese" → acc
147
- persrefl "sese" → abl
148
-
149
- persrefl "sibi" → dat
150
-
151
- # Personal pronouns
152
- perspron nom → sub
153
-
154
- perspron acc → sub
155
- perspron acc → obj
156
- perspron acc → obl
157
-
158
- perspron dat → obl
159
- perspron dat → adv
160
- perspron dat → ag
161
-
162
- perspron abl → obl
163
- perspron abl → sub
164
-
165
- # The dependent of the complementisers _ut_ and _ne_ should be a PRED or an AUX
166
- subj "ut" → dependents(pred) # the standard case, a predicate heading a clause
167
- subj "ut" → dependents(aux) # some particle-like material dependent on the complementiser
168
-
169
- subj "ne" → dependents(pred)
170
- subj "ne" → dependents(aux)
171
-
172
- # Particles and adverbs
173
- "iam" → adverb adv
174
- "iam" → adverb aux # possibly
@@ -1,134 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # Does the dependency relation suffice to disambiguate ambiguous morphology?
4
- #
5
- require 'colorize'
6
- require 'proiel'
7
-
8
- if ARGV.length < 1
9
- STDERR.puts "Usage: #{$0} treebank-files(s)"
10
-
11
- exit 1
12
- end
13
-
14
- tb = PROIEL::Treebank.new
15
- tb.load_from_xml(ARGV)
16
-
17
- # Harvest morphology
18
- form_hash = {}
19
-
20
- tb.sources.reject { |source| source.language != language_tag }.each do |source|
21
- source.tokens.each do |token|
22
- next unless token.form and token.pos and token.morphology and token.relation
23
-
24
- # TODO: problem with using token.form is that sentence-initial words are sometimes capitalised
25
- relation_hash = (form_hash[token.form] ||= {})
26
- pos_hash = (relation_hash[token.relation] ||= {})
27
- morphology_hash = (pos_hash[token.pos] ||= {})
28
- morphology_hash[token.morphology] ||= 0
29
- morphology_hash[token.morphology] += 1
30
- end
31
- end
32
-
33
- # Calculate by unique forms first
34
- unique_pos = 0
35
- relation_predicts_pos = 0
36
- relation_does_not_predict_pos = 0
37
-
38
- unique_morphology = 0
39
- relation_predicts_morphology = 0
40
- relation_does_not_predict_morphology = 0
41
-
42
- form_hash.each do |form, h|
43
- number_of_poses = 0
44
- number_of_morphologies = 0
45
-
46
- h.each do |_, i|
47
- i.each do |_, j|
48
- number_of_poses += 1
49
- j.each do |_, k|
50
- number_of_morphologies += 1
51
- end
52
- end
53
- end
54
-
55
- if number_of_poses == 1
56
- unique_pos += 1
57
- elsif h.all? { |_, i| i.keys.count == 1 }
58
- relation_predicts_pos += 1
59
- else
60
- relation_does_not_predict_pos += 1
61
- end
62
-
63
- if number_of_morphologies == 1
64
- unique_morphology += 1
65
- elsif h.all? { |_, i| i.all? { |_, j| j.keys.count == 1 } }
66
- relation_predicts_morphology += 1
67
- else
68
- relation_does_not_predict_morphology += 1
69
- end
70
- end
71
-
72
- puts "By unique forms (types)"
73
- puts "======================="
74
- puts "Forms with a unique POS: #{unique_pos}"
75
- puts "Forms whose relation predicts its POS: #{relation_predicts_pos}"
76
- puts "Forms whose relation does not predict its POS: #{relation_does_not_predict_pos}"
77
-
78
- puts "Forms with a unique morphology: #{unique_morphology}"
79
- puts "Forms whose relation predicts its morphology: #{relation_predicts_morphology}"
80
- puts "Forms whose relation does not predict its morphology: #{relation_does_not_predict_morphology}"
81
-
82
- # Calculate by actual number of occurrences
83
- unique_pos = 0
84
- relation_predicts_pos = 0
85
- relation_does_not_predict_pos = 0
86
-
87
- unique_morphology = 0
88
- relation_predicts_morphology = 0
89
- relation_does_not_predict_morphology = 0
90
-
91
- form_hash.each do |form, h|
92
- n = 0
93
- number_of_poses = 0
94
- number_of_morphologies = 0
95
-
96
- h.each do |_, i|
97
- i.each do |_, j|
98
- number_of_poses += 1
99
- j.each do |_, k|
100
- number_of_morphologies += 1
101
- n += k
102
- end
103
- end
104
- end
105
-
106
- if number_of_poses == 1
107
- unique_pos += n
108
- elsif h.all? { |_, i| i.keys.count == 1 }
109
- relation_predicts_pos += n
110
- else
111
- relation_does_not_predict_pos += n
112
- end
113
-
114
- if number_of_morphologies == 1
115
- unique_morphology += n
116
- elsif h.all? { |_, i| i.all? { |_, j| j.keys.count == 1 } }
117
- relation_predicts_morphology += n
118
- else
119
- relation_does_not_predict_morphology += n
120
- end
121
- end
122
-
123
- puts
124
- puts "By occurrences of forms (tokens)"
125
- puts "================================"
126
- puts "Forms with a unique POS: #{unique_pos}"
127
- puts "Forms whose relation predicts its POS: #{relation_predicts_pos}"
128
- puts "Forms whose relation does not predict its POS: #{relation_does_not_predict_pos}"
129
-
130
- puts "Forms with a unique morphology: #{unique_morphology}"
131
- puts "Forms whose relation predicts its morphology: #{relation_predicts_morphology}"
132
- puts "Forms whose relation does not predict its morphology: #{relation_does_not_predict_morphology}"
133
-
134
- exit 0
@@ -1,30 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # Word form occurrence extraction
4
- #
5
- require 'colorize'
6
- require 'proiel'
7
-
8
- if ARGV.length < 1
9
- STDERR.puts "Usage: #{$0} treebank-files(s)"
10
-
11
- exit 1
12
- end
13
-
14
- tb = PROIEL::Treebank.new
15
- tb.load_from_xml(ARGV)
16
-
17
- form_index = {}
18
-
19
- tb.sources.each do |source|
20
- source.tokens.each do |token|
21
- unless token.form.nil?
22
- form_index[token.form] ||= []
23
- form_index[token.form] << [source.id, token.id].join(':')
24
- end
25
- end
26
- end
27
-
28
- form_index.sort_by(&:first).each do |form, ids|
29
- puts "#{form}: #{ids.join(', ')}"
30
- end