proiel-cli 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cc539e4839fccb93166f5fb00309efafc88bdb25
4
- data.tar.gz: 758707d18035ce10ecd59772031b6c01d703723e
3
+ metadata.gz: bfb4db333ae791d4171490c7b3d316364b205729
4
+ data.tar.gz: fd001265bdcc8e75e49fc1e2a188f7c63471bb62
5
5
  SHA512:
6
- metadata.gz: cedb768615cbc1e6c2ecb1b09dc30f88f97d72df5a7545fd0a1087cf925432887c4838316609a6b5706f79f63cf9c35898e2ca64ce1d850e0ba264e99f6d1b3f
7
- data.tar.gz: f105fcc4028036319efb2b4c7b3d9c5427a00c273166719f8256adc029f8ca1545514985ed814deea5c430682342cc10043ed94dc4c37de180db8e41717f957c
6
+ metadata.gz: 29cb26e28f22486db097b982d7e2cd4783f2b38d8d6c13fb06309fb25377791cd8248c88627b7d5b72c3adfe0e958c9696b2508fcb975e9a8fdf516a9825963b
7
+ data.tar.gz: b565292467456b10415039f0464bdfafcae5fecf55747dd3b9ec2473313ba08789d49431ecb06d649596bfd98031640481efccd9b5be58b6b565d13dcc3a1161
data/README.md CHANGED
@@ -1,10 +1,10 @@
1
1
  # PROIEL command-line interface
2
2
 
3
- This is a command-line interface for manpulating PROIEL treebanks.
3
+ This is a command-line interface for manipulating PROIEL treebanks.
4
4
 
5
5
  ## Installation
6
6
 
7
- Install as
7
+ This library requires Ruby >= 2.1. Install as
8
8
 
9
9
  ```shell
10
10
  gem install proiel-cli
@@ -88,7 +88,7 @@ module PROIEL
88
88
  unless token.slashes.empty? or options['remove-syntax'] # this extra test avoids <token></token> style XML
89
89
  builder.token(attrs) do
90
90
  token.slashes.each do |relation, target_id|
91
- builder.slash("target-id": target_id, relation: relation)
91
+ builder.slash(:"target-id" => target_id, relation: relation)
92
92
  end
93
93
  end
94
94
  else
@@ -1,5 +1,5 @@
1
1
  module PROIEL
2
2
  module CLI
3
- VERSION = '0.1.0'
3
+ VERSION = '0.1.1'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proiel-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Marius L. Jøhndal
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-11-07 00:00:00.000000000 Z
12
+ date: 2015-12-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: builder
@@ -197,11 +197,6 @@ files:
197
197
  - contrib/proiel-maltparser-parse
198
198
  - contrib/proiel-maltparser-train
199
199
  - contrib/proiel-tnt-train
200
- - examples/decision-tree.rb
201
- - examples/dep-pos-cooccurrences.rb
202
- - examples/lint-rules.rb
203
- - examples/relation-as-disambiguator.rb
204
- - examples/word-occurrences.rb
205
200
  - lib/proiel/cli.rb
206
201
  - lib/proiel/cli/commands.rb
207
202
  - lib/proiel/cli/commands/convert.rb
@@ -1,41 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # Train a decision tree on the (head_relation, head_pos, head_lemma,
4
- # child_relation, child_pos, child_lemma) and then predict the child_relation
5
- # of an unknown child.
6
- #
7
- require 'colorize'
8
- require 'decisiontree'
9
- require 'proiel'
10
-
11
- if ARGV.length < 1
12
- STDERR.puts "Usage: #{$0} treebank-files(s)"
13
- exit 1
14
- end
15
-
16
- tb = PROIEL::Treebank.new
17
- tb.load_from_xml(ARGV)
18
-
19
- tokens = {}
20
-
21
- tb.sources.each do |source|
22
- source.tokens.each do |token|
23
- tokens[token.id.to_i] = [token.relation, token.part_of_speech, token.lemma, token.head_id]
24
- end
25
- end
26
-
27
- training_data = tokens.map do |_, (child_relation, child_pos, child_lemma, head_id)|
28
- if head_id
29
- head = tokens[head_id.to_i]
30
- head_relation, head_pos, head_lemma, _ = *head
31
-
32
- [head_pos || '', head_lemma || '', head_relation, child_pos || '', child_lemma || '', child_relation]
33
- end
34
- end.compact
35
-
36
- attributes = %w(head_pos head_lemma head_relation child_pos child_lemma)
37
- dr = DecisionTree::ID3Tree.new(attributes, training_data, 'pred', :discrete)
38
- dr.train
39
- dr.save_to_file("dr.marshal")
40
-
41
- p dr.predict(["Ne", "Gallia", "sub", "Px", "omnis"])
@@ -1,84 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require 'proiel'
3
- require 'colorize'
4
- require 'terminal-table'
5
-
6
- if ARGV.length < 1
7
- STDERR.puts "Usage: #{$0} treebank-files(s)"
8
- exit 1
9
- end
10
-
11
- tb = PROIEL::Treebank.new
12
- tb.load_from_xml(ARGV)
13
-
14
- # Present by POS
15
- relations = tb.annotation.relation_tags.keys
16
-
17
- c = {}
18
- tb.sources.each do |s|
19
- s.tokens.each do |t|
20
- next if t.pos.nil? or t.relation.nil?
21
-
22
- c[t.pos] ||= {}
23
- c[t.pos][t.relation] ||= 0
24
- c[t.pos][t.relation] += 1
25
- end
26
- end
27
-
28
- rows = []
29
- c.sort_by(&:first).each do |pos, d|
30
- total = d.inject(0) { |a, (k, v)| a + v }
31
-
32
- rows << [pos] + relations.map do |r|
33
- n = d[r ? r.to_s : nil]
34
-
35
- if n and n < total * 0.001
36
- n.to_s.red
37
- elsif n and n > total * 0.999
38
- n.to_s.green
39
- else
40
- n
41
- end
42
- end
43
- end
44
-
45
- table = Terminal::Table.new headings: ['Part of speech'] + relations, rows: rows
46
- puts table
47
- puts "(red = relation occurs for less than 0.1% of tokens with this POS; green = relation occurs for more than 99.9% of tokens with this POS)"
48
- puts
49
-
50
- # Present by relation
51
- poses = tb.annotation.part_of_speech_tags.keys
52
-
53
- c = {}
54
-
55
- tb.sources.each do |s|
56
- s.tokens.each do |t|
57
- next if t.pos.nil? or t.relation.nil?
58
-
59
- c[t.relation] ||= {}
60
- c[t.relation][t.pos] ||= 0
61
- c[t.relation][t.pos] += 1
62
- end
63
- end
64
-
65
- rows = []
66
- c.sort_by(&:first).each do |relation, d|
67
- total = d.inject(0) { |a, (k, v)| a + v }
68
-
69
- rows << [relation] + poses.map do |r|
70
- n = d[r ? r.to_s : nil]
71
-
72
- if n and n < total * 0.001
73
- n.to_s.red
74
- elsif n and n > total * 0.999
75
- n.to_s.green
76
- else
77
- n
78
- end
79
- end
80
- end
81
-
82
- table = Terminal::Table.new headings: ['Relation'] + poses, rows: rows
83
- puts table
84
- puts "(red = POS occurs for less than 0.1% of tokens with this relation; green = POS occurs for more than 99.9% of tokens with this relation)"
@@ -1,174 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # Very simple testing of implicational feature rules. Example rules only
4
- # apply to Latin.
5
- #
6
- require 'colorize'
7
- require 'proiel'
8
-
9
- VIOLATIONS = {}
10
-
11
- def report_violation(token, message)
12
- VIOLATIONS[message] ||= []
13
- VIOLATIONS[message] << token
14
- end
15
-
16
- def test_token(token, rules, dependent_rules)
17
- rules.each do |match_features, test_alternatives|
18
- f = token.features + ["\"#{token.form}\""]
19
-
20
- if (match_features - f).empty?
21
- unless test_alternatives.any? { |test_alternative| token.features.include?(test_alternative) }
22
- report_violation(token, "#{match_features.join(' ')}")
23
- end
24
- end
25
- end
26
-
27
- dependent_rules.each do |match_features, test_alternatives|
28
- f = token.features + ["\"#{token.form}\""]
29
-
30
- if (match_features - f).empty?
31
- t = token.children.all? do |dependent|
32
- test_alternatives.any? { |test_alternative| dependent.features.include?(test_alternative) }
33
- end
34
-
35
- unless t
36
- report_violation(token, "#{match_features.join(' ')} → dependents()")
37
- end
38
- end
39
- end
40
- end
41
-
42
- def load_rules
43
- rules = {}
44
- dependent_rules = {}
45
-
46
- DATA.each do |rule|
47
- rule.chomp!
48
- rule.sub!(/\s*#.*$/, '')
49
-
50
- next if rule.empty?
51
-
52
- match_features, test = rule.split(/\s*→\s*/)
53
- match_features = match_features.split(/\s+/)
54
-
55
- if test[/\s*dependents\(([^)]*)\)\s*/]
56
- dependent_rules[match_features] ||= []
57
- dependent_rules[match_features] << $1
58
- test.sub!(/\s*dependents\([^)]*\)\s*/, '')
59
- end
60
-
61
- if test != ''
62
- rules[match_features] ||= []
63
- rules[match_features] << test
64
- end
65
- end
66
-
67
- [rules, dependent_rules]
68
- end
69
-
70
- if ARGV.length < 1
71
- STDERR.puts "Usage: #{$0} treebank-files(s)"
72
- exit 1
73
- end
74
-
75
- tb = PROIEL::Treebank.new
76
- tb.load_from_xml(ARGV)
77
-
78
- rules, dependent_rules = load_rules
79
-
80
- tb.sources.each do |source|
81
- source.sentences.each do |sentence|
82
- if sentence.status == 'reviewed'
83
- sentence.tokens.each do |token|
84
- test_token(token, rules, dependent_rules)
85
- end
86
- end
87
- end
88
- end
89
-
90
- base_url = 'http://foni.uio.no:3000'
91
-
92
- puts "<h1>PROIEL lint report</h1>"
93
-
94
- VIOLATIONS.each do |rule, tokens|
95
- puts "<h2>#{rule}</h2><ul>"
96
- tokens.each do |token|
97
- puts "<li>Token <a href='#{base_url}/tokens/#{token.id}'>#{token.id}</a> in sentence <a href='#{base_url}/sentences/#{token.sentence.id}'>#{token.sentence.id}</a></li>"
98
- end
99
- puts "</ul>"
100
- end
101
-
102
- __END__
103
-
104
- # Gerundives
105
- gdv nom → xobj # modal gerundive heading a main clause
106
-
107
- gdv acc → comp # modal gerundive heading an AcI, or in the _curo faciendum_ type
108
- gdv acc → xobj # modal gerundive heading an AcI with an overt auxiliary
109
- gdv acc → obl # as argument of a preposition
110
- gdv acc → xadv # in the _do librum legendum_ type
111
-
112
- gdv gen → atr # in the _tempus dicendi_ type
113
- gdv gen → narg # in the _facultas dicendi_ type
114
-
115
- gdv abl → obl # as argument of a preposition
116
- gdv abl → abl # in circumstantial adjuncts of various types
117
-
118
- # Gerunds
119
- ger nom → 0 # invalid case for a gerundive
120
-
121
- ger acc → obl # as argument of a preposition
122
-
123
- ger gen → atr # in the _tempus dicendi_ type
124
- ger gen → narg # in the _facultas dicendi_ type
125
-
126
- ger abl → obl # as argument of a preposition
127
- ger abl → abl # in circumstantial adjuncts of various types
128
-
129
- # Reflexive pronouns
130
- persrefl nom → 0
131
-
132
- persrefl acc → sub
133
- persrefl acc → obj
134
- persrefl acc → obl
135
-
136
- persrefl dat → obl
137
- persrefl dat → adv
138
- persrefl dat → ag
139
-
140
- persrefl abl → obl
141
- persrefl abl → sub
142
-
143
- persrefl "se" → acc
144
- persrefl "se" → abl
145
-
146
- persrefl "sese" → acc
147
- persrefl "sese" → abl
148
-
149
- persrefl "sibi" → dat
150
-
151
- # Personal pronouns
152
- perspron nom → sub
153
-
154
- perspron acc → sub
155
- perspron acc → obj
156
- perspron acc → obl
157
-
158
- perspron dat → obl
159
- perspron dat → adv
160
- perspron dat → ag
161
-
162
- perspron abl → obl
163
- perspron abl → sub
164
-
165
- # The dependent of the complementisers _ut_ and _ne_ should be a PRED or an AUX
166
- subj "ut" → dependents(pred) # the standard case, a predicate heading a clause
167
- subj "ut" → dependents(aux) # some particle-like material dependent on the complementiser
168
-
169
- subj "ne" → dependents(pred)
170
- subj "ne" → dependents(aux)
171
-
172
- # Particles and adverbs
173
- "iam" → adverb adv
174
- "iam" → adverb aux # possibly
@@ -1,134 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # Does the dependency relation suffice to disambiguate ambiguous morphology?
4
- #
5
- require 'colorize'
6
- require 'proiel'
7
-
8
- if ARGV.length < 1
9
- STDERR.puts "Usage: #{$0} treebank-files(s)"
10
-
11
- exit 1
12
- end
13
-
14
- tb = PROIEL::Treebank.new
15
- tb.load_from_xml(ARGV)
16
-
17
- # Harvest morphology
18
- form_hash = {}
19
-
20
- tb.sources.reject { |source| source.language != language_tag }.each do |source|
21
- source.tokens.each do |token|
22
- next unless token.form and token.pos and token.morphology and token.relation
23
-
24
- # TODO: problem with using token.form is that sentence-initial words are sometimes capitalised
25
- relation_hash = (form_hash[token.form] ||= {})
26
- pos_hash = (relation_hash[token.relation] ||= {})
27
- morphology_hash = (pos_hash[token.pos] ||= {})
28
- morphology_hash[token.morphology] ||= 0
29
- morphology_hash[token.morphology] += 1
30
- end
31
- end
32
-
33
- # Calculate by unique forms first
34
- unique_pos = 0
35
- relation_predicts_pos = 0
36
- relation_does_not_predict_pos = 0
37
-
38
- unique_morphology = 0
39
- relation_predicts_morphology = 0
40
- relation_does_not_predict_morphology = 0
41
-
42
- form_hash.each do |form, h|
43
- number_of_poses = 0
44
- number_of_morphologies = 0
45
-
46
- h.each do |_, i|
47
- i.each do |_, j|
48
- number_of_poses += 1
49
- j.each do |_, k|
50
- number_of_morphologies += 1
51
- end
52
- end
53
- end
54
-
55
- if number_of_poses == 1
56
- unique_pos += 1
57
- elsif h.all? { |_, i| i.keys.count == 1 }
58
- relation_predicts_pos += 1
59
- else
60
- relation_does_not_predict_pos += 1
61
- end
62
-
63
- if number_of_morphologies == 1
64
- unique_morphology += 1
65
- elsif h.all? { |_, i| i.all? { |_, j| j.keys.count == 1 } }
66
- relation_predicts_morphology += 1
67
- else
68
- relation_does_not_predict_morphology += 1
69
- end
70
- end
71
-
72
- puts "By unique forms (types)"
73
- puts "======================="
74
- puts "Forms with a unique POS: #{unique_pos}"
75
- puts "Forms whose relation predicts its POS: #{relation_predicts_pos}"
76
- puts "Forms whose relation does not predict its POS: #{relation_does_not_predict_pos}"
77
-
78
- puts "Forms with a unique morphology: #{unique_morphology}"
79
- puts "Forms whose relation predicts its morphology: #{relation_predicts_morphology}"
80
- puts "Forms whose relation does not predict its morphology: #{relation_does_not_predict_morphology}"
81
-
82
- # Calculate by actual number of occurrences
83
- unique_pos = 0
84
- relation_predicts_pos = 0
85
- relation_does_not_predict_pos = 0
86
-
87
- unique_morphology = 0
88
- relation_predicts_morphology = 0
89
- relation_does_not_predict_morphology = 0
90
-
91
- form_hash.each do |form, h|
92
- n = 0
93
- number_of_poses = 0
94
- number_of_morphologies = 0
95
-
96
- h.each do |_, i|
97
- i.each do |_, j|
98
- number_of_poses += 1
99
- j.each do |_, k|
100
- number_of_morphologies += 1
101
- n += k
102
- end
103
- end
104
- end
105
-
106
- if number_of_poses == 1
107
- unique_pos += n
108
- elsif h.all? { |_, i| i.keys.count == 1 }
109
- relation_predicts_pos += n
110
- else
111
- relation_does_not_predict_pos += n
112
- end
113
-
114
- if number_of_morphologies == 1
115
- unique_morphology += n
116
- elsif h.all? { |_, i| i.all? { |_, j| j.keys.count == 1 } }
117
- relation_predicts_morphology += n
118
- else
119
- relation_does_not_predict_morphology += n
120
- end
121
- end
122
-
123
- puts
124
- puts "By occurrences of forms (tokens)"
125
- puts "================================"
126
- puts "Forms with a unique POS: #{unique_pos}"
127
- puts "Forms whose relation predicts its POS: #{relation_predicts_pos}"
128
- puts "Forms whose relation does not predict its POS: #{relation_does_not_predict_pos}"
129
-
130
- puts "Forms with a unique morphology: #{unique_morphology}"
131
- puts "Forms whose relation predicts its morphology: #{relation_predicts_morphology}"
132
- puts "Forms whose relation does not predict its morphology: #{relation_does_not_predict_morphology}"
133
-
134
- exit 0
@@ -1,30 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # Word form occurrence extraction
4
- #
5
- require 'colorize'
6
- require 'proiel'
7
-
8
- if ARGV.length < 1
9
- STDERR.puts "Usage: #{$0} treebank-files(s)"
10
-
11
- exit 1
12
- end
13
-
14
- tb = PROIEL::Treebank.new
15
- tb.load_from_xml(ARGV)
16
-
17
- form_index = {}
18
-
19
- tb.sources.each do |source|
20
- source.tokens.each do |token|
21
- unless token.form.nil?
22
- form_index[token.form] ||= []
23
- form_index[token.form] << [source.id, token.id].join(':')
24
- end
25
- end
26
- end
27
-
28
- form_index.sort_by(&:first).each do |form, ids|
29
- puts "#{form}: #{ids.join(', ')}"
30
- end