proiel 1.1.0 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +5 -5
  2. data/LICENSE +1 -1
  3. data/README.md +2 -2
  4. data/lib/proiel.rb +16 -1
  5. data/lib/proiel/alignment.rb +3 -0
  6. data/lib/proiel/alignment/builder.rb +220 -0
  7. data/lib/proiel/annotation_schema.rb +11 -4
  8. data/lib/proiel/chronology.rb +80 -0
  9. data/lib/proiel/dictionary.rb +79 -0
  10. data/lib/proiel/dictionary/builder.rb +224 -0
  11. data/lib/proiel/div.rb +22 -3
  12. data/lib/proiel/language.rb +108 -0
  13. data/lib/proiel/lemma.rb +77 -0
  14. data/lib/proiel/proiel_xml/proiel-3.0/proiel-3.0.xsd +383 -0
  15. data/lib/proiel/proiel_xml/reader.rb +138 -2
  16. data/lib/proiel/proiel_xml/schema.rb +4 -2
  17. data/lib/proiel/proiel_xml/validator.rb +76 -9
  18. data/lib/proiel/sentence.rb +27 -4
  19. data/lib/proiel/source.rb +14 -4
  20. data/lib/proiel/statistics.rb +2 -2
  21. data/lib/proiel/token.rb +14 -6
  22. data/lib/proiel/tokenization.rb +5 -3
  23. data/lib/proiel/treebank.rb +23 -6
  24. data/lib/proiel/utils.rb +0 -1
  25. data/lib/proiel/valency.rb +5 -0
  26. data/lib/proiel/valency/arguments.rb +151 -0
  27. data/lib/proiel/valency/lexicon.rb +59 -0
  28. data/lib/proiel/valency/obliqueness.rb +31 -0
  29. data/lib/proiel/version.rb +2 -3
  30. data/lib/proiel/visualization.rb +1 -0
  31. data/lib/proiel/visualization/graphviz.rb +111 -0
  32. data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb +83 -0
  33. data/lib/proiel/visualization/graphviz/classic.dot.erb +24 -0
  34. data/lib/proiel/visualization/graphviz/linearized.dot.erb +57 -0
  35. data/lib/proiel/visualization/graphviz/modern.dot.erb +39 -0
  36. data/lib/proiel/visualization/graphviz/packed.dot.erb +25 -0
  37. metadata +76 -31
@@ -0,0 +1,151 @@
1
module PROIEL::Valency::Arguments
  # Builds the argument frame of a predicate token.
  #
  # Every argument selected by +collect_arguments+ is first passed through
  # +hoist_dependents+ and then turned into a hash that holds its relation
  # merged with the features picked by +extract_features+. The resulting
  # array is returned in the order produced by
  # +PROIEL::Valency::Obliqueness.sort_arguments+.
  def self.get_argument_frame(token)
    frame =
      collect_arguments(token).map do |argument|
        hoisted = hoist_dependents(argument)
        { relation: hoisted.relation }.merge(extract_features(hoisted))
      end

    PROIEL::Valency::Obliqueness.sort_arguments(frame)
  end

  # NOTE: a bare `private` only changes the visibility of instance methods
  # defined after it. The helpers below are defined with `def self.` and
  # therefore stay publicly callable, as does the constant.
  private

  # Maps the major part of speech (or the empty-token sort) of a token to the
  # coarse class used when deciding whether conjuncts can be collapsed.
  POS_CLASSIFICATION = {
    'R' => :functor,
    'G' => :functor,
    'N' => :nominal,
    'P' => :nominal,
    'A' => :nominal,
    'M' => :nominal,
    'V' => :verbal,
  }.freeze

  # Collapses a list of dependents into a single representative dependent.
  #
  # The dependents are hoisted first, in case any of them is itself a
  # coordinator. They are then collapsed to the first dependent when they are
  # equivalent for the purposes of argument frames:
  #
  # * functors (R, G) must all share the same lemma,
  # * nominals (N, P, A, M) must all share the same case,
  # * verbs (V) must all share the same mood.
  #
  # Returns +nil+ when the dependents belong to different (or unknown)
  # classes, when they disagree on the distinguishing feature, or when the
  # list is empty. The caller then keeps the coordinator.
  def self.collapse_dependents(dependents)
    hoisted = dependents.map { |d| hoist_dependents(d) }

    classes = hoisted.map { |d| POS_CLASSIFICATION[d.part_of_speech_hash[:major] || d.empty_token_sort] }.uniq
    return nil unless classes.length == 1

    # The feature that all conjuncts of this class have to agree on.
    values =
      case classes.first
      when :functor then hoisted.map(&:lemma)
      when :nominal then hoisted.map { |d| d.morphology_hash[:case] }
      when :verbal  then hoisted.map { |d| d.morphology_hash[:mood] }
      end

    hoisted.first if values && values.uniq.length == 1
  end

  # Hoists the real argument dependent from a conjoined argument.
  #
  # Anything that is not a coordinator (part of speech 'C-' or empty token
  # sort 'C') is returned unchanged. For a coordinator, only dependents that
  # carry the same relation as the coordinator are considered; this
  # eliminates auxiliary elements like particles and repeated conjunctions as
  # well as attributes that scope over all conjuncts. If those dependents
  # cannot be collapsed, the coordinator itself is returned.
  def self.hoist_dependents(argument)
    coordinator = argument.part_of_speech == 'C-' || argument.empty_token_sort == 'C'
    return argument unless coordinator

    conjuncts = argument.dependents.select { |d| d.relation == argument.relation }

    collapse_dependents(conjuncts) || argument
  end

  # Extracts the morphosyntactic features that are relevant to the argument
  # frame and returns them as a hash (possibly empty).
  #
  # Keys are inserted in a fixed order per part of speech so that the
  # inspected form of equal frames is identical.
  def self.extract_features(argument)
    features = {}
    major = argument.part_of_speech_hash[:major]

    case major
    when 'G', 'R'
      # There may be multiple dependents and dependents may be headed by
      # coordinators. For G the relevant dependents have the relation PRED and
      # contribute their mood; for R they have the relation OBL and
      # contribute their case.
      relation, feature = major == 'G' ? ['pred', :mood] : ['obl', :case]

      features[:lemma] = argument.lemma
      features[:part_of_speech] = argument.part_of_speech

      complements = argument.dependents.select { |d| d.relation == relation }.map { |d| hoist_dependents(d) }
      complement = collapse_dependents(complements)
      value = complement && complement.morphology_hash[feature]
      features[feature] = value if value
    when 'V'
      mood = argument.morphology_hash[:mood]
      features[:mood] = mood if mood
    when 'D'
      features[:lemma] = argument.lemma
      features[:part_of_speech] = argument.part_of_speech
    when 'P'
      grammatical_case = argument.morphology_hash[:case]
      features[:case] = grammatical_case if grammatical_case

      if argument.part_of_speech == 'Pk' # reflexive personal pronoun
        features[:lemma] = argument.lemma
        features[:part_of_speech] = argument.part_of_speech
      end
    else
      grammatical_case = argument.morphology_hash[:case]
      features[:case] = grammatical_case if grammatical_case
    end

    features
  end

  # Determines the arguments of a predicate: the dependents of +token+ whose
  # relation marks them as arguments. Raises a RuntimeError for a relation
  # that is not listed below.
  def self.collect_arguments(token)
    token.dependents.select do |dependent|
      case dependent.relation
      when 'obj', 'obl', 'xobj', 'comp', 'narg', # arguments
           'arg'                                 # unspecific but always an argument
        true
      when 'aux', 'sub', 'ag', 'adv', 'xadv', 'apos', 'atr', 'part', 'expl', # non-arguments
           'adnom', 'nonsub', 'per', # unspecific and undetermined with respect to argumenthood
           'rel',                    # unspecific but never an argument
           'pred', 'parpred', 'voc', # shouldn't happen
           'pid', 'xsub'             # really shouldn't happen
        false
      else
        raise "unknown relation #{dependent.relation.inspect}"
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
module PROIEL
  module Valency
    # A valency lexicon built from the verbal tokens of one or more sources.
    class Lexicon
      # Nested hash of the collected frames:
      #
      #   lemma => part of speech => argument frame => { a: [token ids], r: [token ids] }
      #
      # The +:r+ partition holds tokens that have an 'aux' dependent with part
      # of speech 'Pk'; all other tokens go into +:a+.
      attr_reader :frames

      def initialize
        @source_ids = Set.new
        @source_languages = Set.new
        @frames = {}
      end

      # Generates a valency lexicon from the provided sources. In practice the
      # sources should be in the same language but this is not enforced. This
      # makes it possible to generate a lexicon from sources in closely related
      # languages or dialects.
      #
      # The argument frame of every verbal token in +source+ is computed with
      # +PROIEL::Valency::Arguments.get_argument_frame+ and the token ID is
      # recorded under the token's lemma, part of speech and frame.
      def add_source!(source)
        @source_ids << source.id
        @source_languages << source.language

        source.sentences.each do |sentence|
          find_verbal_nodes(sentence).each do |token|
            frame = PROIEL::Valency::Arguments.get_argument_frame(token)

            reflexive = token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }
            partition = reflexive ? :r : :a

            by_part_of_speech = (@frames[token.lemma] ||= {})
            by_frame = (by_part_of_speech[token.part_of_speech] ||= {})
            (by_frame[frame] ||= { a: [], r: [] })[partition] << token.id
          end
        end
      end

      # Returns the frames recorded for +lemma+ and +part_of_speech+ as an
      # array of <tt>{ arguments: ..., tokens: ... }</tt> hashes ordered by
      # +PROIEL::Valency::Obliqueness.sort_frames+.
      #
      # Returns an empty array when nothing has been recorded for the lemma or
      # for that part of speech (instead of failing on +nil+).
      def lookup(lemma, part_of_speech)
        entries = (@frames[lemma] || {})[part_of_speech]
        return [] if entries.nil?

        frames =
          entries.map do |arguments, token_ids|
            { arguments: arguments, tokens: token_ids }
          end

        PROIEL::Valency::Obliqueness.sort_frames(frames)
      end

      private

      # Finds the verbal nodes in a sentence: tokens whose part of speech
      # starts with 'V' and empty tokens of sort 'V'.
      def find_verbal_nodes(sentence)
        sentence.tokens.select do |token|
          # FIXME: is this test in the proiel library already?
          part_of_speech = token.part_of_speech
          (part_of_speech && part_of_speech.start_with?('V')) || token.empty_token_sort == 'V'
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module PROIEL
  module Valency
    # Ordering of arguments and argument frames by obliqueness.
    module Obliqueness
      # Relations from least to most oblique. Relations that are not listed
      # rank after all listed ones.
      OBLIQUENESS_HIERARCHY = %w(sub ag obj xobj arg obl comp narg).freeze

      # Sorts frames by obliqueness.
      #
      # Each frame is a hash with an +:arguments+ array. Frames are ordered by
      # the sorted obliqueness ranks of their arguments, then by their
      # inspected form so that we get a stable, reproducible order.
      def self.sort_frames(frames)
        frames.sort_by { |frame| [obliqueness_of_arguments(frame[:arguments]).sort, frame.inspect] }
      end

      # Sorts arguments by obliqueness.
      #
      # Arguments of equal obliqueness are ordered by their inspected form.
      # Without this tie-breaker the result depends on the input order, so the
      # same set of arguments could yield two different frames.
      def self.sort_arguments(arguments)
        arguments.sort_by { |argument| [obliqueness_of_argument(argument), argument.inspect] }
      end

      # Internal: obliqueness ranks of all arguments in a frame.
      def self.obliqueness_of_arguments(arguments)
        arguments.map { |argument| obliqueness_of_argument(argument) }
      end

      # Internal: rank of a single argument hash. Within the same relation an
      # argument that carries a +:lemma+ ranks after one that does not.
      def self.obliqueness_of_argument(argument)
        obliqueness_of_relation(argument[:relation]) * 2 + (argument[:lemma].nil? ? 0 : 1)
      end

      # Internal: position of the relation in the hierarchy, or the length of
      # the hierarchy for an unlisted relation.
      def self.obliqueness_of_relation(relation)
        OBLIQUENESS_HIERARCHY.index(relation) || OBLIQUENESS_HIERARCHY.length
      end
    end
  end
end
@@ -1,9 +1,8 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2018 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
6
6
  module PROIEL
7
- # Gem version
8
- VERSION = '1.1.0'
7
+ VERSION = '1.3.1'.freeze
9
8
  end
@@ -0,0 +1 @@
1
+ require 'proiel/visualization/graphviz'
@@ -0,0 +1,111 @@
1
module PROIEL
  module Visualization
    # Renders graphs with Graphviz using the ERB templates that live in the
    # +graphviz+ directory next to this file.
    module Graphviz
      DEFAULT_GRAPHVIZ_BINARY = 'dot'.freeze
      DEFAULT_TEMPLATES = %i(classic linearized packed modern aligned-modern).freeze
      SUPPORTED_OUTPUT_FORMATS = %i(png svg).freeze
      SUPPORTED_DIRECTIONS = %w(TD LR).freeze

      # Raised when the Graphviz binary reports errors or fails.
      #
      # Derives from StandardError so that a plain +rescue+ catches it.
      class GraphvizError < StandardError
      end

      # Generates a visualization and writes it to +output_filename+.
      #
      # Takes the same arguments as +generate+ plus the name of the file to
      # write. The file is written in binary mode so that image data is not
      # altered by newline conversion.
      #
      # Raises ArgumentError unless +output_filename+ is a string.
      def self.generate_to_file(template, graph, output_format, output_filename, options = {})
        raise ArgumentError, 'string expected' unless output_filename.is_a?(String)

        result = PROIEL::Visualization::Graphviz.generate(template, graph, output_format, options)

        File.open(output_filename, 'wb') do |f|
          f.write(result)
        end
      end

      # Generates a visualization of +graph+ using +template+.
      #
      # Returns the DOT code itself when +output_format+ is +dot+; otherwise
      # the DOT code is passed through Graphviz and the image data is
      # returned.
      #
      # Raises ArgumentError unless +template+ is a string or a symbol.
      def self.generate(template, graph, output_format, options = {})
        raise ArgumentError, 'string or symbol expected' unless template.is_a?(String) || template.is_a?(Symbol)

        dot_code = generate_dot(template, graph, options)

        if output_format.to_sym == :dot
          dot_code
        else
          generate_image(dot_code, output_format, options)
        end
      end

      # Returns the filename of the ERB file for one of DEFAULT_TEMPLATES.
      #
      # Raises ArgumentError for anything that is not a known template name.
      def self.template_filename(template)
        raise ArgumentError, 'string or symbol expected' unless template.is_a?(String) || template.is_a?(Symbol)
        raise ArgumentError, 'invalid template' unless DEFAULT_TEMPLATES.include?(template.to_sym)

        File.join(File.dirname(__FILE__), 'graphviz', "#{template}.dot.erb")
      end

      # Runs Graphviz on +dot_code+ and returns its standard output.
      #
      # +output_format+ must be one of SUPPORTED_OUTPUT_FORMATS. The binary is
      # taken from <tt>options[:graphviz_binary]</tt> and defaults to
      # DEFAULT_GRAPHVIZ_BINARY.
      #
      # Raises GraphvizError when Graphviz writes anything to standard error
      # or exits unsuccessfully.
      def self.generate_image(dot_code, output_format, options = {})
        raise ArgumentError, 'string expected' unless dot_code.is_a?(String)
        unless output_format.is_a?(String) || output_format.is_a?(Symbol)
          raise ArgumentError, 'string or symbol expected'
        end
        raise ArgumentError, 'invalid output format' unless SUPPORTED_OUTPUT_FORMATS.include?(output_format.to_sym)

        graphviz_binary = options[:graphviz_binary] || DEFAULT_GRAPHVIZ_BINARY

        # capture3 drains stdout and stderr concurrently, so a large amount of
        # output on either stream cannot block the child. Passing the command
        # as separate arguments bypasses the shell.
        result, errors, status =
          Open3.capture3(graphviz_binary, "-T#{output_format}", stdin_data: dot_code)

        raise GraphvizError, "graphviz exited with errors: #{errors}" unless errors.nil? || errors.empty?
        raise GraphvizError, "graphviz failed: #{status}" unless status.success?

        result
      end

      # Renders +template+ for +graph+ and returns the DOT code.
      #
      # <tt>options[:direction]</tt> may be 'TD' or 'LR' (string or symbol)
      # and defaults to 'TD'.
      #
      # Raises ArgumentError for any other direction.
      def self.generate_dot(template, graph, options)
        direction = options[:direction]

        unless direction.nil? || SUPPORTED_DIRECTIONS.include?(direction.to_s)
          raise ArgumentError, 'invalid direction'
        end

        filename = template_filename(template)

        content = File.read(filename)

        # Use the keyword form of ERB.new where it exists; the positional
        # trim mode argument is deprecated there. Fall back to the positional
        # form on versions of ERB without keyword arguments.
        erb =
          if ERB.instance_method(:initialize).parameters.assoc(:key)
            ERB.new(content, trim_mode: '-')
          else
            ERB.new(content, nil, '-')
          end
        erb.filename = filename

        TemplateContext.new(graph, (direction || 'TD').to_s).generate(erb)
      end

      # Evaluation context for the templates. Templates see +@graph+,
      # +@direction+ and +@title+ and can call the helpers below.
      class TemplateContext
        def initialize(graph, direction, title = '')
          @graph = graph
          @direction = direction
          @title = title
        end

        # Evaluates an ERB template in the context of this object.
        def generate(template)
          template.result(binding)
        end

        protected

        # Creates a node with an identifier and a label.
        def node(identifier, label = '', options = {})
          attrs = join_attributes(options.merge(label: label))

          "#{identifier} [#{attrs}];"
        end

        # Creates an edge with a label from one identifier to another identifier.
        def edge(identifier1, identifier2, label = '', options = {})
          attrs = join_attributes(options.merge(label: label))

          "#{identifier1} -> #{identifier2} [#{attrs}];"
        end

        # Formats attributes as a comma-separated list of key="value" pairs
        # with double quotes in the values escaped.
        def join_attributes(attrs)
          attrs.map { |a, v| %|#{a}="#{v.to_s.gsub('"', '\\"')}"| }.join(',')
        end
      end
    end
  end
end
@@ -0,0 +1,83 @@
1
<%# Two sets of dependency trees (@graph.left and @graph.right) drawn on either -%>
<%# side of their token rows, with @graph.alignments linking the token rows. -%>
<%# Empty tokens of sort 'P' are not drawn. -%>
digraph "<%= @title -%>" {
  charset="UTF-8";
  graph [truecolor=true,bgcolor=transparent];
  rankdir="<%= @direction -%>";
  nodesep=0.1;
  ranksep=0.25;

  <%# Left-hand trees: edges run from the head (or the root) to the dependent. -%>
  <%- @graph.left.each_with_index do |tokens, i| -%>
    <%= "rootL#{i}" -%> [label="",shape=point];

    <%- tokens.select { |t| t.empty_token_sort != 'P' }.each do |token| -%>
      <%- if token.empty_token_sort -%>
        <%= node token.id, token.relation.to_s.upcase, shape: :none, fontcolor: :gray -%>
      <%- else -%>
        <%= node token.id, token.relation.to_s.upcase, shape: :none -%>
      <%- end -%>

      <%- if token.relation -%>
        <%= edge (token.head ? token.head.id : "rootL#{i}"), token.id, '', weight: 1.0, color: :orange, arrowhead: :none -%>
      <%- end -%>

      <%- token.slashes.each do |(relation, target)| -%>
        <%= edge token.id, target, relation.to_s.upcase, weight: 0.0, fontcolor: :blue, color: :blue, style: :dashed %>
      <%- end -%>
    <%- end -%>

    <%- tokens.reject(&:empty_token_sort).each do |token| -%>
      <%= edge token.id, "T#{token.id}", nil, weight: 10, arrowhead: :none -%>
    <%- end -%>
  <%- end -%>

  {
    rank="same";

    <%- @graph.left.each do |tokens| -%>
      <%- tokens.reject(&:empty_token_sort).each do |token| -%>
        <%= node "T#{token.id}", token.form, shape: :none, fontcolor: :blue, tooltip: [token.lemma, token.part_of_speech, token.morphology].join("\n") -%>
      <%- end -%>

      <%# Invisible chain that keeps the token row in order. With a single -%>
      <%# token the statement would hide that token's node, and with none it -%>
      <%# would not be valid DOT, so it is only emitted for two or more. -%>
      <%- chain = tokens.reject(&:empty_token_sort).map { |token| "T#{token.id}" } -%>
      <%- if chain.length > 1 -%>
        <%= chain.join('->') -%> [style="invis"];
      <%- end -%>
    <%- end -%>
  }

  <%# Right-hand trees: mirrored, so edges run from the dependent to the head. -%>
  <%- @graph.right.each_with_index do |tokens, i| -%>
    <%= "rootR#{i}" -%> [label="",shape=point];

    <%- tokens.select { |t| t.empty_token_sort != 'P' }.each do |token| -%>
      <%- if token.empty_token_sort -%>
        <%= node token.id, token.relation.to_s.upcase, shape: :none, fontcolor: :gray -%>
      <%- else -%>
        <%= node token.id, token.relation.to_s.upcase, shape: :none -%>
      <%- end -%>

      <%- if token.relation -%>
        <%= edge token.id, (token.head ? token.head.id : "rootR#{i}"), '', weight: 1.0, color: :orange, arrowhead: :none -%>
      <%- end -%>

      <%- token.slashes.each do |(relation, target)| -%>
        <%= edge token.id, target, relation.to_s.upcase, weight: 0.0, fontcolor: :blue, color: :blue, style: :dashed %>
      <%- end -%>
    <%- end -%>

    <%- tokens.reject(&:empty_token_sort).each do |token| -%>
      <%= edge "T#{token.id}", token.id, nil, weight: 10, arrowhead: :none -%>
    <%- end -%>
  <%- end -%>

  {
    rank="same";

    <%- @graph.right.each do |tokens| -%>
      <%- tokens.reject(&:empty_token_sort).each do |token| -%>
        <%= node "T#{token.id}", token.form, shape: :none, fontcolor: :blue, tooltip: [token.lemma, token.part_of_speech, token.morphology].join("\n") -%>
      <%- end -%>

      <%# Same guard as for the left-hand token row. -%>
      <%- chain = tokens.reject(&:empty_token_sort).map { |token| "T#{token.id}" } -%>
      <%- if chain.length > 1 -%>
        <%= chain.join('->') -%> [style="invis"];
      <%- end -%>
    <%- end -%>
  }

  <%# Alignment links between token nodes. -%>
  <%- @graph.alignments.each do |x, y| -%>
    <%= "T#{x}" -%> -> <%= "T#{y}" -%> [color=blue,dir=none];
  <%- end -%>
}