shalmaneser-frappe 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/frappe/Ampersand.rb +41 -0
  7. data/lib/frappe/file_parser.rb +126 -0
  8. data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
  9. data/lib/frappe/frappe.rb +217 -0
  10. data/lib/frappe/frappe_flat_syntax.rb +89 -0
  11. data/lib/frappe/frappe_read_stxml.rb +48 -0
  12. data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
  13. data/lib/frappe/interfaces/collins_interface.rb +340 -0
  14. data/lib/frappe/interfaces/counter.rb +19 -0
  15. data/lib/frappe/interfaces/stanford_interface.rb +353 -0
  16. data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
  17. data/lib/frappe/interfaces/treetagger_module.rb +111 -0
  18. data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
  19. data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
  20. data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
  21. data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
  22. data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
  23. data/lib/frappe/interpreters/headz.rb +265 -0
  24. data/lib/frappe/interpreters/headz_helpers.rb +54 -0
  25. data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
  26. data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
  27. data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
  28. data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
  29. data/lib/frappe/one_parsed_file.rb +31 -0
  30. data/lib/frappe/opt_parser.rb +92 -0
  31. data/lib/frappe/path.rb +199 -0
  32. data/lib/frappe/plain_converter.rb +59 -0
  33. data/lib/frappe/salsa_tab_converter.rb +154 -0
  34. data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
  35. data/lib/frappe/stxml_converter.rb +666 -0
  36. data/lib/frappe/syn_interface.rb +76 -0
  37. data/lib/frappe/syn_interface_stxml.rb +173 -0
  38. data/lib/frappe/syn_interface_tab.rb +39 -0
  39. data/lib/frappe/utf_iso.rb +27 -0
  40. data/lib/shalmaneser/frappe.rb +1 -0
  41. metadata +130 -0
@@ -0,0 +1,89 @@
1
+ ###############
2
+ # an interpreter that only has Treetagger, no parser
3
+
4
+ require_relative 'syn_interpreter'
5
+ require 'logging'
6
+
7
module Shalmaneser
  module Frappe
    # Interpreter for data that carries TreeTagger output only (no parser).
    class TreetaggerInterpreter < SynInterpreter
      # NOTE(review): presumably registers this class with SynInterpreter;
      # announce_me is defined outside this file -- confirm.
      TreetaggerInterpreter.announce_me

      ###
      # Names of the systems interpreted by this class.
      #
      # @return [Hash{String => String}] service -> system name,
      #   e.g. { "parser" => "collins", "lemmatizer" => "treetagger" }
      def self.systems
        { "pos_tagger" => "treetagger" }
      end

      ###
      # Names of additional systems that may be interpreted by this class.
      #
      # @return [Hash{String => String}] service -> system name,
      #   same format as the result of systems
      def self.optional_systems
        { "lemmatizer" => "treetagger" }
      end

      ###
      # Generalize over POS tags / phrase types.
      #
      # Only the part of the label before the first "-" is inspected, and the
      # first matching rule wins, so the order of the rules below matters.
      #
      # Returns one of:
      #
      # adj:   adjective (phrase)
      # adv:   adverb (phrase)
      # card:  numbers, quantity phrases
      # con:   conjunction
      # det:   determiner, including possessive/demonstrative pronouns etc.
      # for:   foreign material
      # noun:  noun (phrase), including personal pronouns, proper names, expletives
      # part:  particles, truncated words (German compound parts)
      # prep:  preposition (phrase)
      # pun:   punctuation, brackets, etc.
      # sent:  sentence
      # top:   top node of a sentence
      # trace: label starting with TRACE
      # verb:  verb (phrase)
      # nil:   something went wrong
      #
      # @param node [SynNode] node whose label is generalized
      # @return [String, nil] the category; nil if the phrase type could not
      #   be determined or is unknown (the latter case is logged as a warning)
      def self.category(node)
        pt = TreetaggerInterpreter.pt(node)
        # phrase type could not be determined
        return nil if pt.nil?

        label = pt.to_s.strip.match(/^([^-]*)/)[1]

        case label
        when /^JJ/, /(WH)?ADJP/, /^PDT/ then "adj"
        when /^RB/, /(WH)?ADVP/, /^UH/ then "adv"
        when /^CD/, /^QP/ then "card"
        when /^CC/, /^WRB/, /^CONJP/ then "con"
        when /^DT/, /^POS/ then "det"
        when /^FW/, /^SYM/ then "for"
        when /^N/, "WHAD", "WDT", /^PRP/, /^WHNP/, /^EX/, /^WP/ then "noun"
        when /^IN/, /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then "prep"
        when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then "pun"
        when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/ then "sent"
        when /^TOP/ then "top"
        when /^TRACE/ then "trace"
        when /^V/, /^MD/ then "verb"
        else
          LOGGER.warn "Unknown category/POS #{pt} (English data)."
          nil
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # -*- encoding: utf-8 -*-
2
+ # AB, 2010-11-25
3
+
4
+ ##############################
5
+ # class for managing the parses of one file
6
module Shalmaneser
  module Frappe
    # Manages the parses of one file: it pairs the name of a parse file with
    # the object that knows how to iterate over the sentences in that file.
    class OneParsedFile
      # @return [String] the core of the filename of the parse file
      attr_reader :filename

      # @param [String] filename The core of filename for the parse file.
      # @param [String] complete_filename The complete filename of the parse file.
      # @param [#each_sentence] obj_with_iterator object whose
      #   each_sentence(complete_filename) method yields the parsed sentences
      def initialize(filename, complete_filename, obj_with_iterator)
        @obj_with_iterator = obj_with_iterator
        @filename = filename
        @complete_filename = complete_filename
      end

      # Yield each parsed sentence as a single three-element array
      #   [salsa/tiger xml sentence, tab format sentence, mapping]
      # of a SalsaTigerSentence object, a FNTabSentence object,
      # and a hash: FNTab sentence lineno(integer) -> array:SynNode
      # pointing each tab word to one or more SalsaTigerSentence terminals.
      #
      # @yieldparam sentence [Array] the three-element array described above
      # @return [Enumerator] if no block is given; otherwise whatever the
      #   wrapped iterator's each_sentence returns
      def each_sentence
        # Without a block the original raised LocalJumpError on the first
        # sentence; hand out an Enumerator instead, as Ruby iterators usually do.
        return enum_for(:each_sentence) unless block_given?

        @obj_with_iterator.each_sentence(@complete_filename) do |st_sent, tab_sent, mapping|
          yield [st_sent, tab_sent, mapping]
        end
      end
    end
  end
end
@@ -0,0 +1,92 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ # @author AB
4
+ # @date 2010-11-25
5
+
6
+ require 'optparse'
7
+ require 'configuration/frappe_config_data'
8
+ require 'definitions'
9
+ require 'external_systems'
10
+ require 'logging'
11
+
12
module Shalmaneser
  module Frappe
    # Parses the command line options of Frappe and builds the experiment
    # configuration object from the experiment file named on the command line.
    # @todo Remove explicit exits in this class.
    class OptParser
      # Main class method.
      #
      # Prints a message to standard error and exits with status 1 if
      # cmd_args is empty, contains an unknown option, an unsupported or
      # missing option argument, or does not name an experiment file.
      #
      # @param cmd_args [Array<String>] command line arguments, an array like ARGV
      # @return [Shalmaneser::Configuration::FrappeConfigData] configuration
      #   object created from the experiment file
      def self.parse(cmd_args)
        @prg_name = PROGRAM_NAME
        @options = {}
        parser = create_parser

        # If no options provided print the help.
        if cmd_args.empty?
          abort_with_message("You have to provide some options.\n"\
                             "Please start with <#{@prg_name} --help>.")
        end

        # Parse the arguments and fill the options hash.
        # Any other exception raised by the parser propagates to the caller.
        begin
          parser.parse(cmd_args)
        rescue OptionParser::InvalidArgument => e
          arg = e.message.split.last
          abort_with_message("The provided argument #{arg} is currently not supported!\n"\
                             "Please consult <#{@prg_name} --help>.")
        rescue OptionParser::InvalidOption => e
          abort_with_message("You have provided an #{e.message}.\n"\
                             "Please consult <#{@prg_name} --help>.")
        rescue OptionParser::MissingArgument => e
          # e.g. "-e" given without a filename
          abort_with_message("You have provided an option with a #{e.message}.\n"\
                             "Please consult <#{@prg_name} --help>.")
        end

        # The experiment file is required; without this check nil would be
        # handed to the configuration class below.
        unless @options[:exp_file]
          abort_with_message("You have to provide an experiment file via -e|--expfile.\n"\
                             "Please consult <#{@prg_name} --help>.")
        end

        # @todo Rename the config data class.
        exp = ::Shalmaneser::Configuration::FrappeConfigData.new(@options[:exp_file])

        # @todo AB: [2015-12-28 Mon 19:22]
        #   Move this to ConfigData.
        # NOTE(review): judging by its name this aborts when a required
        # external system is missing; it is defined outside this file -- confirm.
        ExternalSystems.check_interfaces_abort_if_missing(exp)

        exp
      end

      # Build the OptionParser instance. The -e handler stores the expanded
      # path of the experiment file in @options[:exp_file]; -h prints the
      # help text and exits.
      #
      # @return [OptionParser]
      def self.create_parser
        OptionParser.new do |opts|
          opts.banner = "Fred and Rosy Preprocessor <Frappe>. Preprocessing stage before Fred and Rosy\n"\
                        "for further frame/word sense disambiguation and semantic role assignment."\
                        "\n"\
                        "Usage: #{PROGRAM_NAME.downcase} -h|-e FILENAME"
          opts.separator ''
          opts.separator 'Program specific options:'

          opts.on('-e', '--expfile FILENAME',
                  'Provide the path to an experiment file.',
                  "#{PROGRAM_NAME} will preprocess data according to the specifications",
                  'given in your experiment file.',
                  'This option is required!',
                  'Also consider the documentation on format and features.'
                 ) do |exp_file|
            @options[:exp_file] = File.expand_path(exp_file)
          end

          opts.separator ''
          opts.separator 'Common options:'

          opts.on_tail('-h', '--help', 'Show this help message.') do
            puts opts
            exit
          end
        end
      end # def self.create_parser

      # Print msg to standard error and terminate with exit status 1.
      #
      # @param msg [String] message for the user
      def self.abort_with_message(msg)
        $stderr.puts msg
        exit(1)
      end

      # A bare `private` does not affect methods defined with `def self.`,
      # so the helpers are hidden explicitly.
      private_class_method :create_parser, :abort_with_message
    end # class OptParser
  end # module Frappe
end # Shalm
@@ -0,0 +1,199 @@
1
+ #############################
2
+ # class describing a path between two nodes
3
+ #
4
+ # provides access and output facilities for different aspects of the path
5
+ #
6
+ # this is the return value of SynInterpreter.path_between
7
module Shalmaneser
  module Frappe
    # A path between two nodes: a start node plus an ordered list of steps.
    # Every step is an array [direction, edgelabel, nodelabel, endnode].
    class Path
      attr_reader :startnode

      ###
      # Create a path that starts at startnode and has no steps yet.
      def initialize(startnode)
        @path = []
        @cutoff_last_pt = false
        set_startnode(startnode)
      end

      ###
      # deep_clone:
      # returns a clone of this path object that owns its own copy of the
      # step list, so adding steps to one object does not affect the other
      def deep_clone
        clone.tap { |copy| copy.set_path(@path.clone) }
      end

      ###
      # Replace the start node.
      #
      # returns: self
      def set_startnode(startnode)
        @startnode = startnode
        self
      end

      ###
      # iterate through the current path
      #
      # yields tuples
      # [direction, edgelabel, nodelabel, endnode]
      # direction: string, U/D
      # edgelabel: string
      # nodelabel: string
      # endnode: SynNode
      def each_step
        @path.each do |step|
          yield step
        end
      end

      ###
      # empty?
      # true if the path has no steps
      def empty?
        @path.empty?
      end

      ###
      # add one step to the beginning of the current path:
      # the previous start node becomes the end node of the new first step,
      # and start_node becomes the new start node
      #
      # returns: self
      def add_first_step(start_node, # SynNode
                         direction,  # string: U, D
                         gf,         # string: edge label
                         pt)         # string: node label
        former_start = @startnode
        @path.unshift([direction, gf, pt, former_start])
        set_startnode(start_node)
        self
      end

      ###
      # add one step to the end of the current path
      #
      # returns: self
      def add_last_step(direction, # string: U, D
                        gf,        # string: edge label
                        pt,        # string: node label (of end_node)
                        end_node)  # SynNode
        @path.push([direction, gf, pt, end_node])
        self
      end

      ###
      # path length: number of steps
      def length
        @path.length
      end

      ###
      # render the whole path as a string; each selected label is followed
      # by one space, missing (nil) labels are rendered as "-"
      def print(print_direction, # boolean. true: print direction
                print_gf,        # boolean. true: print edgelabel
                print_pt)        # boolean. true: print nodelabel
        print_aux(@path, print_direction, print_gf, print_pt)
      end

      ###
      # render the path from the roof node to the end;
      # returns "" if there is no roof node or the path has no steps
      def print_downpart(print_direction,
                         print_gf,
                         print_pt)
        roof, roof_index = compute_roof
        # no roof set
        return "" if roof.nil? || @path.empty?

        # everything from the first downward step onwards
        print_aux(@path[roof_index..-1], print_direction, print_gf, print_pt)
      end

      ###
      # the roof node of the path, as determined by compute_roof
      def lca
        roof, _roof_index = compute_roof
        roof
      end

      ###
      # cut off last node label in print and print_downpart?
      def set_cutoff_last_pt_on_printing(bool) # Boolean
        @cutoff_last_pt = bool
      end

      ########
      protected

      # replace the step list (used by deep_clone)
      def set_path(new_path)
        @path = new_path
      end

      ########
      private

      ###
      # step through the path as long as direction is up.
      # when direction starts to go "D", take current node as roof node
      #
      # returns: pair [roof node, roof node index] (SynNode, integer)
      def compute_roof
        roof = @startnode

        @path.each_with_index do |(direction, _edgelabel, _nodelabel, endnode), position|
          # down! the node reached so far was the roof
          return [roof, position] if direction =~ /D/

          roof = endnode
        end

        # the path never went down: its last node is the roof
        [roof, @path.length]
      end

      ###
      # render the given steps; see print for the output format.
      # If cutting off the last node label is switched on and node labels
      # are printed, the trailing label is removed, provided the rendered
      # string matches the pattern below.
      def print_aux(path,
                    print_direction,
                    print_gf,
                    print_pt)
        rendered = ''.dup

        path.each do |step|
          direction, gf, pt, _node = step.map { |entry| entry.nil? ? "-" : entry }

          [[print_direction, direction], [print_gf, gf], [print_pt, pt]].each do |wanted, label|
            rendered << label + " " if wanted
          end
        end

        shortened = rendered.match(/^(.+ )\w+ $/) if @cutoff_last_pt && print_pt
        shortened ? shortened[1] : rendered
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ require 'tokenizer'
2
+ require 'tabular_format/fn_tab_format_file'
3
+
4
module Shalmaneser
  module Frappe
    # A converter from plain text to Salsa Tab Format.
    # Performs tokenization.
    class PlainConverter
      def initialize
        # suffixes for different types of output files
        @file_suffixes = {"lemma" => ".lemma", "pos" => ".pos", "tab" => ".tab", "stxml" => ".xml"}
      end

      ###############
      # transform_plain_dir:
      #
      # Converts every entry matching input_dir + "*" into a Tab format file
      # in output_dir. The output file gets the base name of the input file
      # with its extension replaced by the "tab" suffix.
      #
      # Both directory names are concatenated with file names as they are,
      # so they have to end with a path separator.
      #
      # @param input_dir [String] input directory
      # @param output_dir [String] output directory
      def transform_plain_dir(input_dir, output_dir)
        tab_suffix = @file_suffixes["tab"]

        Dir[input_dir + "*"].each do |plain_path|
          # the output file name has to end in "tab": that is, at the moment, required
          tab_path = output_dir + File.basename(plain_path, '.*') + tab_suffix
          plain_to_tab_file(plain_path, tab_path)
        end
      end

      ####
      # Transform a plaintext file into a Tab format file.
      #
      # The input is expected to contain one sentence per line; lines that
      # are empty after stripping whitespace are skipped. Sentence IDs have
      # the form "<input base name>_<index>", counting the remaining
      # sentences from 0. Every sentence is followed by an empty line.
      #
      # @param [String] input_filename name of input file
      # @param [String] output_filename name of output file
      def plain_to_tab_file(input_filename, output_filename)
        raw_lines = File.open(input_filename) { |input| input.readlines }
        sentences = raw_lines.map { |line| line.chomp.strip }.reject(&:empty?)

        doc_id = File.basename(input_filename, '.*')
        tokenizer = Tokenizer::Tokenizer.new

        File.open(output_filename, "w") do |out|
          sentences.each_with_index do |sentence, position|
            sentence_id = "#{doc_id}_#{position}"

            tokenizer.tokenize(sentence).each do |token|
              # one line per word: only the "word" and "sent_id" entries are
              # passed on, all other entries (gf, pt, frame etc.) are not set
              out.puts FNTabFormatFile.format_str("word" => token, "sent_id" => sentence_id)
            end

            # an empty line terminates the sentence
            out.puts
          end
        end
      end
    end
  end
end