proiel 1.1.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +5 -5
  2. data/LICENSE +1 -1
  3. data/README.md +2 -2
  4. data/lib/proiel.rb +16 -1
  5. data/lib/proiel/alignment.rb +3 -0
  6. data/lib/proiel/alignment/builder.rb +220 -0
  7. data/lib/proiel/annotation_schema.rb +11 -4
  8. data/lib/proiel/chronology.rb +80 -0
  9. data/lib/proiel/dictionary.rb +79 -0
  10. data/lib/proiel/dictionary/builder.rb +224 -0
  11. data/lib/proiel/div.rb +22 -3
  12. data/lib/proiel/language.rb +108 -0
  13. data/lib/proiel/lemma.rb +77 -0
  14. data/lib/proiel/proiel_xml/proiel-3.0/proiel-3.0.xsd +383 -0
  15. data/lib/proiel/proiel_xml/reader.rb +138 -2
  16. data/lib/proiel/proiel_xml/schema.rb +4 -2
  17. data/lib/proiel/proiel_xml/validator.rb +76 -9
  18. data/lib/proiel/sentence.rb +27 -4
  19. data/lib/proiel/source.rb +14 -4
  20. data/lib/proiel/statistics.rb +2 -2
  21. data/lib/proiel/token.rb +14 -6
  22. data/lib/proiel/tokenization.rb +5 -3
  23. data/lib/proiel/treebank.rb +23 -6
  24. data/lib/proiel/utils.rb +0 -1
  25. data/lib/proiel/valency.rb +5 -0
  26. data/lib/proiel/valency/arguments.rb +151 -0
  27. data/lib/proiel/valency/lexicon.rb +59 -0
  28. data/lib/proiel/valency/obliqueness.rb +31 -0
  29. data/lib/proiel/version.rb +2 -3
  30. data/lib/proiel/visualization.rb +1 -0
  31. data/lib/proiel/visualization/graphviz.rb +111 -0
  32. data/lib/proiel/visualization/graphviz/aligned-modern.dot.erb +83 -0
  33. data/lib/proiel/visualization/graphviz/classic.dot.erb +24 -0
  34. data/lib/proiel/visualization/graphviz/linearized.dot.erb +57 -0
  35. data/lib/proiel/visualization/graphviz/modern.dot.erb +39 -0
  36. data/lib/proiel/visualization/graphviz/packed.dot.erb +25 -0
  37. metadata +76 -31
@@ -0,0 +1,151 @@
1
module PROIEL
  module Valency
    # Derives argument frames for predicate tokens.
    #
    # Tokens are duck-typed: the code below calls +relation+, +dependents+,
    # +part_of_speech+, +part_of_speech_hash+, +morphology_hash+, +lemma+ and
    # +empty_token_sort+ on them.
    #
    # All methods are module-level (+def self.+) methods. A bare +private+
    # keyword has no effect on such methods, so every helper here is (and
    # always was) publicly callable; only +get_argument_frame+ is meant as the
    # entry point.
    module Arguments
      # Maps a major part-of-speech tag (or the sort of an empty token) to the
      # coarse class used when deciding whether conjuncts are equivalent.
      POS_CLASSIFICATION = {
        'R' => :functor,
        'G' => :functor,
        'N' => :nominal,
        'P' => :nominal,
        'A' => :nominal,
        'M' => :nominal,
        'V' => :verbal,
      }.freeze

      # Builds the argument frame of a predicate.
      #
      # @param token the predicate token
      # @return [Array<Hash>] one hash per argument, holding +:relation+ plus
      #   the features chosen by +extract_features+, ordered by
      #   +PROIEL::Valency::Obliqueness.sort_arguments+
      # @raise [RuntimeError] if a dependent has an unknown relation
      def self.get_argument_frame(token)
        arguments = collect_arguments(token).map { |argument| hoist_dependents(argument) }

        frame =
          arguments.map do |argument|
            { relation: argument.relation }.merge(extract_features(argument))
          end

        PROIEL::Valency::Obliqueness.sort_arguments(frame)
      end

      # Collapses dependents based on features.
      #
      # Figures out if all dependents are equivalent for the purposes of
      # argument frames. Typical examples would be coordinated, identical
      # prepositions (operationalised as same class and same lemma) or
      # coordinated nouns in the same case (operationalised as same class and
      # same case). Verbal dependents are compared by mood.
      #
      # @param dependents [Array] candidate tokens
      # @return the first (hoisted) dependent when all are equivalent, or
      #   +nil+ when they are not, when their class is unknown or when the
      #   list is empty
      def self.collapse_dependents(dependents)
        # Hoist first in case any of the dependents is itself a coordinator.
        dependents = dependents.map { |d| hoist_dependents(d) }

        classes = dependents.map { |d| POS_CLASSIFICATION[d.part_of_speech_hash[:major] || d.empty_token_sort] }.uniq
        return nil unless classes.length == 1

        # The feature that has to agree depends on the shared class; an
        # unclassified part of speech leaves +distinguishing+ nil.
        distinguishing =
          case classes.first
          when :functor then dependents.map(&:lemma)
          when :nominal then dependents.map { |d| d.morphology_hash[:case] }
          when :verbal then dependents.map { |d| d.morphology_hash[:mood] }
          end

        dependents.first if distinguishing && distinguishing.uniq.length == 1
      end

      # Hoists the real argument dependents from conjoined arguments.
      #
      # @param argument a token
      # @return the token itself unless it is a coordinator (part of speech
      #   +C-+ or empty token sort +C+) whose conjuncts can be collapsed, in
      #   which case the representative conjunct is returned
      def self.hoist_dependents(argument)
        return argument unless argument.part_of_speech == 'C-' || argument.empty_token_sort == 'C'

        # Pick dependents that have the same relation as the coordinator. This
        # eliminates auxiliary elements like particles and repeated
        # conjunctions as well as attributes that scope over all conjuncts.
        conjuncts = argument.dependents.select { |d| d.relation == argument.relation }

        # If the conjuncts cannot be reduced, keep the coordinator.
        collapse_dependents(conjuncts) || argument
      end

      # Extracts morphosyntactic features that are relevant to the argument
      # frame.
      #
      # Key insertion order is kept stable (it shows up in +Hash#inspect+,
      # which the obliqueness sorting uses as a tie-breaker).
      #
      # @param argument a (hoisted) argument token
      # @return [Hash] a subset of +:lemma+, +:part_of_speech+, +:case+ and
      #   +:mood+
      def self.extract_features(argument)
        features = {}
        major = argument.part_of_speech_hash[:major]

        case major
        when 'G', 'R'
          # Functors are lexically specific and additionally inherit one
          # feature from what they govern: 'G' takes the mood of its PRED
          # dependents, 'R' the case of its OBL dependents.
          relation, feature = major == 'G' ? ['pred', :mood] : ['obl', :case]

          features[:lemma] = argument.lemma
          features[:part_of_speech] = argument.part_of_speech

          # There may be multiple dependents and they may be headed by
          # coordinators; collapse_dependents hoists them itself, so no
          # separate hoisting pass is needed here.
          governed = collapse_dependents(argument.dependents.select { |d| d.relation == relation })
          value = governed && governed.morphology_hash[feature]
          features[feature] = value if value
        when 'V'
          mood = argument.morphology_hash[:mood]
          features[:mood] = mood if mood
        when 'D'
          features[:lemma] = argument.lemma
          features[:part_of_speech] = argument.part_of_speech
        else
          grammatical_case = argument.morphology_hash[:case]
          features[:case] = grammatical_case if grammatical_case

          if major == 'P' && argument.part_of_speech == 'Pk' # reflexive personal pronoun
            features[:lemma] = argument.lemma
            features[:part_of_speech] = argument.part_of_speech
          end
        end

        features
      end

      # Determines the arguments of a predicate.
      #
      # @param token the predicate token
      # @return [Array] the dependents whose relation marks them as arguments
      # @raise [RuntimeError] if a dependent carries a relation that is not
      #   listed below (this includes +nil+)
      def self.collect_arguments(token)
        token.dependents.select do |dependent|
          case dependent.relation
          when 'obj', 'obl', 'xobj', 'comp', 'narg' # arguments
            true
          when 'arg' # unspecific but always an argument
            true
          when 'aux', 'sub', 'ag', 'adv', 'xadv', 'apos', 'atr', 'part', 'expl' # non-arguments
            false
          when 'adnom', 'nonsub', 'per' # unspecific and undetermined with respect to argumenthood
            false
          when 'rel' # unspecific but never an argument
            false
          when 'pred', 'parpred', 'voc' # shouldn't happen
            false
          when 'pid', 'xsub' # really shouldn't happen
            false
          else
            raise "unknown relation #{dependent.relation.inspect}"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
# Set is used for bookkeeping below; requiring it here keeps this file
# loadable on its own.
require 'set'

module PROIEL
  module Valency
    # A valency lexicon: for every verbal lemma and part of speech it records
    # which argument frames occur and which tokens instantiate them.
    class Lexicon
      # Nested hash of the form
      # <tt>frames[lemma][part_of_speech][frame] = { a: [token ids], r: [token ids] }</tt>.
      attr_reader :frames

      # Creates an empty lexicon.
      def initialize
        @source_ids = Set.new
        @source_languages = Set.new
        @frames = {}
      end

      # Generates a valency lexicon from the provided sources. In practice the
      # sources should be in the same language but this is not enforced. This
      # makes it possible to generate a lexicon from sources in closely related
      # languages or dialects.
      #
      # @param source an object responding to +id+, +language+ and +sentences+
      # @return whatever +source.sentences.each+ returns
      def add_source!(source)
        @source_ids << source.id
        @source_languages << source.language

        source.sentences.each do |sentence|
          find_verbal_nodes(sentence).each do |token|
            frame = PROIEL::Valency::Arguments.get_argument_frame(token)

            # Tokens with an AUX dependent tagged 'Pk' go into partition :r,
            # everything else into partition :a.
            with_pk_aux = token.dependents.any? { |d| d.relation == 'aux' && d.part_of_speech == 'Pk' }
            partition = with_pk_aux ? :r : :a

            by_part_of_speech = (@frames[token.lemma] ||= {})
            by_frame = (by_part_of_speech[token.part_of_speech] ||= {})
            entry = (by_frame[frame] ||= { a: [], r: [] })
            entry[partition] << token.id
          end
        end
      end

      # Looks up the frames recorded for a lemma and part of speech.
      #
      # @param lemma the lemma to look up
      # @param part_of_speech the part of speech to look up
      # @return [Array<Hash>] hashes with the keys +:arguments+ and +:tokens+,
      #   ordered by +PROIEL::Valency::Obliqueness.sort_frames+; empty when
      #   nothing has been recorded for the lemma/part-of-speech pair
      def lookup(lemma, part_of_speech)
        # fetch with a default avoids calling [] on nil for unknown entries.
        recorded = @frames.fetch(lemma, {}).fetch(part_of_speech, {})

        frames =
          recorded.map do |arguments, token_ids|
            { arguments: arguments, tokens: token_ids }
          end

        PROIEL::Valency::Obliqueness.sort_frames(frames)
      end

      private

      # Finds verbal nodes in a sentence: tokens whose part of speech starts
      # with 'V' and empty tokens of sort 'V'.
      def find_verbal_nodes(sentence)
        sentence.tokens.select do |token|
          # FIXME: is this test in the proiel library already?
          (token.part_of_speech && token.part_of_speech[/^V/]) || token.empty_token_sort == 'V'
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module PROIEL
  module Valency
    # Orders arguments and argument frames by obliqueness.
    #
    # All methods are module-level (+def self.+) methods, on which a bare
    # +private+ keyword has no effect; the +obliqueness_of_*+ helpers are
    # therefore publicly callable, as they always were.
    module Obliqueness
      # Relations from least to most oblique. Relations that are not listed
      # rank after all of these.
      OBLIQUENESS_HIERARCHY = %w(sub ag obj xobj arg obl comp narg).freeze

      # Sorts frames by obliqueness.
      #
      # @param frames [Array<Hash>] hashes with an +:arguments+ array
      # @return [Array<Hash>] the frames in a reproducible order
      def self.sort_frames(frames)
        # Sort frames by obliqueness, then by inspecting them so that we get
        # a stable, reproducible order.
        frames.sort_by { |frame| [obliqueness_of_arguments(frame[:arguments]).sort, frame.inspect] }
      end

      # Sorts arguments by obliqueness.
      #
      # Arguments of equal obliqueness are ordered by their +inspect+ string.
      # +sort_by+ is not guaranteed to be stable, so without the tie-breaker
      # the same set of arguments could come out in different orders
      # depending on the input order.
      #
      # @param arguments [Array<Hash>] hashes with +:relation+ and optionally
      #   +:lemma+
      # @return [Array<Hash>] the arguments in a reproducible order
      def self.sort_arguments(arguments)
        arguments.sort_by { |argument| [obliqueness_of_argument(argument), argument.inspect] }
      end

      # Returns the obliqueness rank of each argument, in the given order.
      def self.obliqueness_of_arguments(arguments)
        arguments.map { |argument| obliqueness_of_argument(argument) }
      end

      # Returns the obliqueness rank of one argument: twice the rank of its
      # relation, plus one when the argument has a +:lemma+, so that such
      # arguments sort directly after those with the same relation and no
      # lemma.
      def self.obliqueness_of_argument(argument)
        obliqueness_of_relation(argument[:relation]) * 2 + (argument[:lemma].nil? ? 0 : 1)
      end

      # Returns the position of a relation in OBLIQUENESS_HIERARCHY, or the
      # length of the hierarchy for relations that are not part of it.
      def self.obliqueness_of_relation(relation)
        OBLIQUENESS_HIERARCHY.index(relation) || OBLIQUENESS_HIERARCHY.length
      end
    end
  end
end
@@ -1,9 +1,8 @@
1
1
  #--
2
- # Copyright (c) 2015-2016 Marius L. Jøhndal
2
+ # Copyright (c) 2015-2018 Marius L. Jøhndal
3
3
  #
4
4
  # See LICENSE in the top-level source directory for licensing terms.
5
5
  #++
6
6
  module PROIEL
7
- # Gem version
8
- VERSION = '1.1.0'
7
+ VERSION = '1.3.1'.freeze
9
8
  end
@@ -0,0 +1 @@
1
+ require 'proiel/visualization/graphviz'
@@ -0,0 +1,111 @@
1
# ERB renders the DOT templates and Open3 runs graphviz; requiring them here
# keeps this file loadable on its own.
require 'erb'
require 'open3'

module PROIEL
  module Visualization
    # Renders graphs as graphviz DOT code, or as images by piping the DOT
    # code through a graphviz binary.
    module Graphviz
      DEFAULT_GRAPHVIZ_BINARY = 'dot'.freeze
      DEFAULT_TEMPLATES = %i(classic linearized packed modern aligned-modern).freeze
      SUPPORTED_OUTPUT_FORMATS = %i(png svg).freeze
      SUPPORTED_DIRECTIONS = %w(TD LR).freeze

      # Raised when graphviz reports errors. Derives from StandardError so
      # that an ordinary +rescue+ clause can handle it.
      class GraphvizError < StandardError
      end

      # Generates a visualization and writes it to a file.
      #
      # @param template [String, Symbol] one of DEFAULT_TEMPLATES
      # @param graph the object handed to the template as +@graph+
      # @param output_format [String, Symbol] +dot+ or one of
      #   SUPPORTED_OUTPUT_FORMATS
      # @param output_filename [String] file to (over)write
      # @param options [Hash] see +generate_dot+ and +generate_image+
      # @return [Integer] the number of bytes written
      # @raise [ArgumentError] on invalid arguments
      # @raise [GraphvizError] if graphviz reports errors
      def self.generate_to_file(template, graph, output_format, output_filename, options = {})
        raise ArgumentError, 'string expected' unless output_filename.is_a?(String)

        result = PROIEL::Visualization::Graphviz.generate(template, graph, output_format, options)

        # Written byte-for-byte: the result may be binary image data, which
        # must not go through newline or encoding conversion.
        File.binwrite(output_filename, result)
      end

      # Generates a visualization and returns it as a string.
      #
      # @param template [String, Symbol] one of DEFAULT_TEMPLATES
      # @param graph the object handed to the template as +@graph+
      # @param output_format [String, Symbol] +dot+ to get the DOT code
      #   itself, otherwise one of SUPPORTED_OUTPUT_FORMATS
      # @param options [Hash] see +generate_dot+ and +generate_image+
      # @return [String] DOT code or image data
      # @raise [ArgumentError] on invalid arguments
      # @raise [GraphvizError] if graphviz reports errors
      def self.generate(template, graph, output_format, options = {})
        raise ArgumentError, 'string or symbol expected' unless template.is_a?(String) || template.is_a?(Symbol)
        unless output_format.is_a?(String) || output_format.is_a?(Symbol)
          raise ArgumentError, 'string or symbol expected'
        end

        dot_code = generate_dot(template, graph, options)

        if output_format.to_sym == :dot
          dot_code
        else
          generate_image(dot_code, output_format, options)
        end
      end

      # Returns the filename of a bundled template.
      #
      # @param template [String, Symbol] one of DEFAULT_TEMPLATES
      # @return [String] path to the +.dot.erb+ file in the +graphviz+
      #   directory next to this file
      # @raise [ArgumentError] if the template is of the wrong type or unknown
      def self.template_filename(template)
        raise ArgumentError, 'string or symbol expected' unless template.is_a?(String) || template.is_a?(Symbol)
        raise ArgumentError, 'invalid template' unless DEFAULT_TEMPLATES.include?(template.to_sym)

        File.join(File.dirname(__FILE__), 'graphviz', "#{template}.dot.erb")
      end

      # Converts DOT code to an image by running graphviz.
      #
      # @param dot_code [String] the DOT code
      # @param output_format [String, Symbol] one of SUPPORTED_OUTPUT_FORMATS
      # @param options [Hash] +:graphviz_binary+ selects the executable
      #   (default: DEFAULT_GRAPHVIZ_BINARY)
      # @return [String] what graphviz wrote to standard output
      # @raise [ArgumentError] on invalid arguments
      # @raise [GraphvizError] if graphviz wrote to standard error or exited
      #   unsuccessfully
      def self.generate_image(dot_code, output_format, options = {})
        raise ArgumentError, 'string expected' unless dot_code.is_a?(String)
        unless output_format.is_a?(String) || output_format.is_a?(Symbol)
          raise ArgumentError, 'string or symbol expected'
        end
        raise ArgumentError, 'invalid output format' unless SUPPORTED_OUTPUT_FORMATS.include?(output_format.to_sym)

        graphviz_binary = options[:graphviz_binary] || DEFAULT_GRAPHVIZ_BINARY

        # capture3 feeds stdin and drains stdout and stderr concurrently, so
        # a chatty stderr cannot block the child. Binary and flag are passed
        # as separate arguments, so no shell is involved.
        result, errors, status = Open3.capture3(graphviz_binary, "-T#{output_format}", stdin_data: dot_code)

        raise GraphvizError, "graphviz exited with errors: #{errors}" unless errors.nil? || errors == ''
        raise GraphvizError, "graphviz exited with status #{status.exitstatus.inspect}" unless status.success?

        result
      end

      # Renders a template to DOT code.
      #
      # @param template [String, Symbol] one of DEFAULT_TEMPLATES
      # @param graph the object handed to the template as +@graph+
      # @param options [Hash] +:direction+ is one of SUPPORTED_DIRECTIONS
      #   (default: 'TD')
      # @return [String] the DOT code
      # @raise [ArgumentError] if the direction or the template is invalid
      def self.generate_dot(template, graph, options)
        direction = options[:direction]
        unless direction.nil? || SUPPORTED_DIRECTIONS.include?(direction)
          raise ArgumentError, 'invalid direction'
        end

        filename = template_filename(template)
        content = File.read(filename)

        # Use the trim_mode keyword where ERB offers it and fall back to the
        # positional form on older versions.
        erb =
          if ERB.instance_method(:initialize).parameters.assoc(:key)
            ERB.new(content, trim_mode: '-')
          else
            ERB.new(content, nil, '-')
          end
        erb.filename = filename

        TemplateContext.new(graph, direction || 'TD').generate(erb)
      end

      # Evaluation context for the templates: exposes +@graph+, +@direction+
      # and +@title+ as well as the +node+ and +edge+ helpers.
      class TemplateContext
        def initialize(graph, direction, title = '')
          @graph = graph
          @direction = direction
          @title = title
        end

        # Renders an ERB template with this object as +self+.
        def generate(template)
          template.result(binding)
        end

        protected

        # Creates a node with an identifier and a label.
        def node(identifier, label = '', options = {})
          attrs = join_attributes(options.merge(label: label))

          "#{identifier} [#{attrs}];"
        end

        # Creates an edge with a label from one identifier to another identifier.
        def edge(identifier1, identifier2, label = '', options = {})
          attrs = join_attributes(options.merge(label: label))

          "#{identifier1} -> #{identifier2} [#{attrs}];"
        end

        # Formats attributes as a comma-separated list of +name="value"+
        # pairs, escaping double quotes in the values.
        def join_attributes(attrs)
          attrs.map { |a, v| %|#{a}="#{v.to_s.gsub('"', '\\"')}"| }.join(',')
        end
      end
    end
  end
end
@@ -0,0 +1,83 @@
1
<%# Draws two groups of dependency trees (@graph.left and @graph.right) with -%>
<%# the token forms of each group on a shared rank, and connects aligned -%>
<%# tokens (@graph.alignments) with undirected blue edges. -%>
<%# Relies on the node and edge helpers of the template context. -%>
digraph "<%= @title -%>" {
  charset="UTF-8";
  graph [truecolor=true,bgcolor=transparent];
  rankdir="<%= @direction -%>";
  nodesep=0.1;
  ranksep=0.25;

<%# Left-hand trees: edges run from head (or the per-sentence root) to dependent. -%>
  <%- @graph.left.each_with_index do |tokens, i| -%>
    <%= "rootL#{i}" -%> [label="",shape=point];

    <%- tokens.select { |t| t.empty_token_sort != 'P' }.each do |token| -%>
      <%- if token.empty_token_sort -%>
        <%= node token.id, token.relation.to_s.upcase, shape: :none, fontcolor: :gray -%>
      <%- else -%>
        <%= node token.id, token.relation.to_s.upcase, shape: :none -%>
      <%- end -%>

      <%- if token.relation -%>
        <%= edge((token.head ? token.head.id : "rootL#{i}"), token.id, '', weight: 1.0, color: :orange, arrowhead: :none) -%>
      <%- end -%>

      <%- token.slashes.each do |(relation, target)| -%>
        <%= edge token.id, target, relation.to_s.upcase, weight: 0.0, fontcolor: :blue, color: :blue, style: :dashed %>
      <%- end -%>
    <%- end -%>

    <%- tokens.reject(&:empty_token_sort).each do |token| -%>
      <%= edge token.id, "T#{token.id}", nil, weight: 10, arrowhead: :none -%>
    <%- end -%>
  <%- end -%>

  {
    rank="same";

    <%- @graph.left.each do |tokens| -%>
      <%- overt = tokens.reject(&:empty_token_sort) -%>
      <%- overt.each do |token| -%>
        <%= node "T#{token.id}", token.form, shape: :none, fontcolor: :blue, tooltip: [token.lemma, token.part_of_speech, token.morphology].join("\n") -%>
      <%- end -%>

<%# Invisible chain that keeps the forms in order. With fewer than two forms -%>
<%# the statement would not be an edge: a single id followed by the attribute -%>
<%# list is a node statement that hides the node, and no id at all is a -%>
<%# syntax error. -%>
      <%- if overt.length > 1 -%>
        <%= overt.map { |token| "T#{token.id}" }.join('->') -%> [style="invis"];
      <%- end -%>
    <%- end -%>
  }

<%# Right-hand trees: mirrored, edges run from dependent to head (or root). -%>
  <%- @graph.right.each_with_index do |tokens, i| -%>
    <%= "rootR#{i}" -%> [label="",shape=point];

    <%- tokens.select { |t| t.empty_token_sort != 'P' }.each do |token| -%>
      <%- if token.empty_token_sort -%>
        <%= node token.id, token.relation.to_s.upcase, shape: :none, fontcolor: :gray -%>
      <%- else -%>
        <%= node token.id, token.relation.to_s.upcase, shape: :none -%>
      <%- end -%>

      <%- if token.relation -%>
        <%= edge token.id, (token.head ? token.head.id : "rootR#{i}"), '', weight: 1.0, color: :orange, arrowhead: :none -%>
      <%- end -%>

      <%- token.slashes.each do |(relation, target)| -%>
        <%= edge token.id, target, relation.to_s.upcase, weight: 0.0, fontcolor: :blue, color: :blue, style: :dashed %>
      <%- end -%>
    <%- end -%>

    <%- tokens.reject(&:empty_token_sort).each do |token| -%>
      <%= edge "T#{token.id}", token.id, nil, weight: 10, arrowhead: :none -%>
    <%- end -%>
  <%- end -%>

  {
    rank="same";

    <%- @graph.right.each do |tokens| -%>
      <%- overt = tokens.reject(&:empty_token_sort) -%>
      <%- overt.each do |token| -%>
        <%= node "T#{token.id}", token.form, shape: :none, fontcolor: :blue, tooltip: [token.lemma, token.part_of_speech, token.morphology].join("\n") -%>
      <%- end -%>

<%# Same guard as for the left-hand side: only chain two or more forms. -%>
      <%- if overt.length > 1 -%>
        <%= overt.map { |token| "T#{token.id}" }.join('->') -%> [style="invis"];
      <%- end -%>
    <%- end -%>
  }

<%# Alignment edges between the form nodes of the two sides. -%>
  <%- @graph.alignments.each do |x, y| -%>
    <%= "T#{x}" -%> -> <%= "T#{y}" -%> [color=blue,dir=none];
  <%- end -%>
}