shalmaneser-frappe 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/frappe/Ampersand.rb +41 -0
- data/lib/frappe/file_parser.rb +126 -0
- data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
- data/lib/frappe/frappe.rb +217 -0
- data/lib/frappe/frappe_flat_syntax.rb +89 -0
- data/lib/frappe/frappe_read_stxml.rb +48 -0
- data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
- data/lib/frappe/interfaces/collins_interface.rb +340 -0
- data/lib/frappe/interfaces/counter.rb +19 -0
- data/lib/frappe/interfaces/stanford_interface.rb +353 -0
- data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
- data/lib/frappe/interfaces/treetagger_module.rb +111 -0
- data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
- data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
- data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
- data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
- data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
- data/lib/frappe/interpreters/headz.rb +265 -0
- data/lib/frappe/interpreters/headz_helpers.rb +54 -0
- data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
- data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
- data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
- data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
- data/lib/frappe/one_parsed_file.rb +31 -0
- data/lib/frappe/opt_parser.rb +92 -0
- data/lib/frappe/path.rb +199 -0
- data/lib/frappe/plain_converter.rb +59 -0
- data/lib/frappe/salsa_tab_converter.rb +154 -0
- data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
- data/lib/frappe/stxml_converter.rb +666 -0
- data/lib/frappe/syn_interface.rb +76 -0
- data/lib/frappe/syn_interface_stxml.rb +173 -0
- data/lib/frappe/syn_interface_tab.rb +39 -0
- data/lib/frappe/utf_iso.rb +27 -0
- data/lib/shalmaneser/frappe.rb +1 -0
- metadata +130 -0
@@ -0,0 +1,89 @@
|
|
1
|
+
###############
|
2
|
+
# an interpreter that only has Treetagger, no parser
|
3
|
+
|
4
|
+
require_relative 'syn_interpreter'
|
5
|
+
require 'logging'
|
6
|
+
|
7
|
+
module Shalmaneser
|
8
|
+
module Frappe
|
9
|
+
class TreetaggerInterpreter < SynInterpreter
  TreetaggerInterpreter.announce_me

  ###
  # Names of the systems interpreted by this class.
  #
  # Returns a hash service(string) -> system name(string), e.g.
  #   { "parser" => "collins", "lemmatizer" => "treetagger" }
  def self.systems
    { "pos_tagger" => "treetagger" }
  end

  ###
  # Names of additional systems that may be interpreted by this class.
  #
  # Returns a hash service(string) -> system name(string),
  # in the same format as the one returned by systems().
  def self.optional_systems
    { "lemmatizer" => "treetagger" }
  end

  ###
  # Generalize over POS tags / phrase types.
  #
  # Returns one of:
  #
  # adj:   adjective (phrase)
  # adv:   adverb (phrase)
  # card:  numbers, quantity phrases
  # con:   conjunction
  # det:   determiner, including possessive/demonstrative pronouns etc.
  # for:   foreign material
  # noun:  noun (phrase), including personal pronouns, proper names, expletives
  # part:  particles, truncated words (German compound parts)
  # prep:  preposition (phrase)
  # pun:   punctuation, brackets, etc.
  # sent:  sentence
  # top:   top node of a sentence
  # trace: trace node
  # verb:  verb (phrase)
  # nil:   something went wrong
  #
  # The first matching rule wins, so the order of the rules below matters.
  #
  # node: SynNode
  # returns: string, or nil
  def self.category(node)
    label = TreetaggerInterpreter.pt(node)
    # phrase type could not be determined
    return if label.nil?

    # Only the part of the label in front of the first hyphen is classified.
    # NOTE(review): a label that starts with a hyphen (e.g. "-LRB-") is
    # reduced to the empty string here and ends up in the "unknown" branch;
    # confirm which bracket tags actually arrive from the tagger.
    stem = label.to_s.strip[/^([^-]*)/, 1]

    case stem
    when /^JJ/, /(WH)?ADJP/, /^PDT/ then "adj"
    when /^RB/, /(WH)?ADVP/, /^UH/ then "adv"
    when /^CD/, /^QP/ then "card"
    when /^CC/, /^WRB/, /^CONJP/ then "con"
    when /^DT/, /^POS/ then "det"
    when /^FW/, /^SYM/ then "for"
    when /^N/, "WHAD", "WDT", /^PRP/, /^WHNP/, /^EX/, /^WP/ then "noun"
    when /^IN/, /^TO/, /(WH)?PP/, "RP", /^PR(T|N)/ then "prep"
    when /^PUNC/, /LRB/, /RRB/, /[,'".:;!?\(\)]/ then "pun"
    when /^S(s|bar|BAR|G|Q|BARQ|INV)?$/, /^UCP/, /^FRAG/, /^X/, /^INTJ/ then "sent"
    when /^TOP/ then "top"
    when /^TRACE/ then "trace"
    when /^V/, /^MD/ then "verb"
    else
      LOGGER.warn "Unknown category/POS #{label} (English data)."
      nil
    end
  end
end
|
88
|
+
end
|
89
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# AB, 2010-11-25
|
3
|
+
|
4
|
+
##############################
|
5
|
+
# class for managing the parses of one file
|
6
|
+
module Shalmaneser
|
7
|
+
module Frappe
|
8
|
+
class OneParsedFile
  # @return [String] the core of the filename of the parse file
  attr_reader :filename

  # @param [String] filename The core of filename for the parse file.
  # @param [String] complete_filename The complete filename of the parse file.
  # @param [#each_sentence] obj_with_iterator Object whose
  #   each_sentence(complete_filename) method yields the triple
  #   (SalsaTiger sentence, tab sentence, mapping) for every parsed sentence.
  def initialize(filename, complete_filename, obj_with_iterator)
    @filename = filename
    @complete_filename = complete_filename
    @obj_with_iterator = obj_with_iterator
  end

  # Yield each parsed sentence as a tuple
  #   [salsa/tiger xml sentence, tab format sentence, mapping]
  # consisting of a SalsaTigerSentence object, a FNTabSentence object,
  # and a hash: FNTab sentence lineno(integer) -> array:SynNode
  # pointing each tab word to one or more SalsaTigerSentence terminals.
  #
  # The iteration itself is delegated to the object passed to the
  # constructor, which is called with the complete filename.
  #
  # @yieldparam [Array] triple the three-element tuple described above
  # @return [Enumerator] if no block is given; otherwise whatever the
  #   underlying each_sentence call returns
  def each_sentence
    # Without a block the yield below would only blow up once the
    # underlying iterator produces its first sentence; hand out an
    # Enumerator instead so callers can chain Enumerable methods.
    return enum_for(:each_sentence) unless block_given?

    @obj_with_iterator.each_sentence(@complete_filename) do |st_sent, tab_sent, mapping|
      yield [st_sent, tab_sent, mapping]
    end
  end
end
|
30
|
+
end
|
31
|
+
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# @author AB
|
4
|
+
# @date 2010-11-25
|
5
|
+
|
6
|
+
require 'optparse'
|
7
|
+
require 'configuration/frappe_config_data'
|
8
|
+
require 'definitions'
|
9
|
+
require 'external_systems'
|
10
|
+
require 'logging'
|
11
|
+
|
12
|
+
module Shalmaneser
|
13
|
+
module Frappe
|
14
|
+
# This class parses the command line options for Frappe.
# @todo Remove explicit exits in this class.
class OptParser
  # Main class method.
  # OP expects cmd_args to be an array like ARGV.
  #
  # On any usage problem (no arguments, unknown option, unsupported or
  # missing option argument, no experiment file given) a message is written
  # to $stderr and the process exits with status 1.
  #
  # @param [Array<String>] cmd_args command line arguments
  # @return [Shalmaneser::Configuration::FrappeConfigData] the configuration
  #   object built from the experiment file given with -e/--expfile
  def self.parse(cmd_args)
    @prg_name = PROGRAM_NAME
    @options = {}
    parser = create_parser

    # If no options provided print the help.
    if cmd_args.empty?
      fail_with("You have to provide some options.\n"\
                "Please start with <#{@prg_name} --help>.")
    end

    # Parse ARGV and provide the options hash.
    # Usage errors are turned into a short message; anything else propagates.
    begin
      parser.parse(cmd_args)
    rescue OptionParser::InvalidArgument => e
      arg = e.message.split.last
      fail_with("The provided argument #{arg} is currently not supported!\n"\
                "Please consult <#{@prg_name} --help>.")
    rescue OptionParser::InvalidOption => e
      fail_with("You have provided an #{e.message}.\n"\
                "Please consult <#{@prg_name} --help>.")
    rescue OptionParser::MissingArgument => e
      # e.g. a bare "-e" without a filename
      fail_with("You have provided an option with a #{e.message}.\n"\
                "Please consult <#{@prg_name} --help>.")
    end

    # The experiment file is mandatory; without this check the
    # configuration object below would be built from nil.
    unless @options[:exp_file]
      fail_with("You have to provide an experiment file with -e|--expfile.\n"\
                "Please consult <#{@prg_name} --help>.")
    end

    # @todo Rename the config data class.
    exp = ::Shalmaneser::Configuration::FrappeConfigData.new(@options[:exp_file])

    # @todo AB: [2015-12-28 Mon 19:22]
    #   Move this to ConfigData.
    ExternalSystems.check_interfaces_abort_if_missing(exp)

    exp
  end

  # Build the OptionParser instance. Its option handlers store their
  # results in the class-level @options hash that self.parse initializes.
  #
  # @return [OptionParser]
  def self.create_parser
    OptionParser.new do |opts|
      opts.banner = "Fred and Rosy Preprocessor <Frappe>. Preprocessing stage before Fred and Rosy\n"\
                    "for further frame/word sense disambiguation and semantic role assignment."\
                    "\n"\
                    "Usage: #{PROGRAM_NAME.downcase} -h|-e FILENAME"
      opts.separator ''
      opts.separator 'Program specific options:'

      opts.on('-e', '--expfile FILENAME',
              'Provide the path to an experiment file.',
              "#{PROGRAM_NAME} will preprocess data according to the specifications",
              'given in your experiment file.',
              'This option is required!',
              'Also consider the documentation on format and features.'
             ) do |exp_file|
        @options[:exp_file] = File.expand_path(exp_file)
      end

      opts.separator ''
      opts.separator 'Common options:'

      opts.on_tail('-h', '--help', 'Show this help message.') do
        puts opts
        exit
      end
    end
  end # def self.create_parser

  # Report a usage problem on $stderr and terminate with exit status 1.
  #
  # @param [String] msg the message to print
  def self.fail_with(msg)
    $stderr.puts msg
    exit(1)
  end

  # A bare `private` does not affect methods defined with `def self.`,
  # so the helpers are hidden explicitly.
  private_class_method :create_parser, :fail_with
end # class OptParser
|
91
|
+
end # module Frappe
|
92
|
+
end # Shalm
|
data/lib/frappe/path.rb
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
#############################
|
2
|
+
# class describing a path between two nodes
|
3
|
+
#
|
4
|
+
# provides access and output facilities for different aspects of the path
|
5
|
+
#
|
6
|
+
# this is the return value of SynInterpreter.path_between
|
7
|
+
module Shalmaneser
|
8
|
+
module Frappe
|
9
|
+
class Path
  # The node at which the path starts.
  attr_reader :startnode

  ###
  # initialize to empty path
  #
  # startnode: SynNode at which the path starts
  def initialize(startnode)
    @path = []
    @cutoff_last_pt = false
    set_startnode(startnode)
  end

  ###
  # deep_clone:
  # return clone of this path object,
  # with clone of this path rather than the same path
  #
  # The list of steps is copied, so adding steps to the copy does not
  # affect the original. The step tuples themselves are shared.
  def deep_clone
    copy = clone
    copy.set_path(@path.clone)
    copy
  end

  ###
  # set the start node of the path
  #
  # returns: self
  def set_startnode(startnode)
    @startnode = startnode
    self
  end

  ###
  # iterate through the current path
  #
  # yield tuples
  # [direction, edgelabel, nodelabel, endnode]
  # direction: string, U/D
  # edgelabel: string
  # nodelabel: string
  # endnode: SynNode
  def each_step
    @path.each { |step| yield step }
  end

  ###
  # empty?
  # any steps in here?
  def empty?
    @path.empty?
  end

  ###
  # add one step to the beginning of the current path
  #
  # The previous start node becomes the end node of the new first step,
  # and start_node becomes the new start node of the path.
  #
  # start_node: SynNode
  # direction:  string: U, D
  # gf:         string: edge label
  # pt:         string: node label
  # returns: self
  def add_first_step(start_node, direction, gf, pt)
    @path.unshift([direction, gf, pt, @startnode])
    set_startnode(start_node)
    self
  end

  ###
  # add one step to the end of the current path
  #
  # direction: string: U, D
  # gf:        string: edge label
  # pt:        string: node label (of end_node)
  # end_node:  SynNode
  # returns: self
  def add_last_step(direction, gf, pt, end_node)
    @path << [direction, gf, pt, end_node]
    self
  end

  ###
  # path length (number of steps)
  def length
    @path.length
  end

  ###
  # render the whole path as a string
  #
  # print_direction: boolean. true: print direction
  # print_gf:        boolean. true: print edgelabel
  # print_pt:        boolean. true: print nodelabel
  # returns: string, the selected entries of each step,
  #          each followed by one space
  def print(print_direction, print_gf, print_pt)
    print_aux(@path, print_direction, print_gf, print_pt)
  end

  ###
  # print path from roof node to end
  #
  # parameters as for print
  # returns: string; empty if the path has no steps or no roof node
  def print_downpart(print_direction, print_gf, print_pt)
    roof, roof_index = compute_roof
    # no roof set
    return "" if roof.nil? || @path.empty?

    # roof node is in the middle: print the steps from there on
    print_aux(@path[roof_index..-1], print_direction, print_gf, print_pt)
  end

  ###
  # roof node of the path: the node reached
  # before the first step with direction "D"
  def lca
    compute_roof.first
  end

  ###
  # cut off last node label in print and print_downpart?
  #
  # bool: Boolean
  def set_cutoff_last_pt_on_printing(bool)
    @cutoff_last_pt = bool
  end

  ########
  protected

  # replace the list of steps; used by deep_clone on the copy
  def set_path(new_path)
    @path = new_path
  end

  ########
  private

  ###
  # step through the path as long as direction is up.
  # when direction starts to go "D", take current node as roof node
  #
  # returns: pair [roof node, roof node index] (SynNode, integer)
  # The index is the position of the first "D" step,
  # or the path length if there is no such step.
  def compute_roof
    first_down = @path.index { |direction, _gf, _pt, _node| direction =~ /D/ }
    roof_index = first_down || @path.length

    # The roof is the end node of the step before the first "D" step,
    # or the start node if there is no such step.
    roof = roof_index.zero? ? @startnode : @path[roof_index - 1].last
    [roof, roof_index]
  end

  ###
  # render the given steps as a string
  #
  # path: array of steps [direction, edgelabel, nodelabel, endnode]
  # print_direction, print_gf, print_pt: booleans, see print
  #
  # nil entries are printed as "-".
  def print_aux(path, print_direction, print_gf, print_pt)
    retv = ''
    path.each do |step|
      direction, gf, pt, _node = step.map { |entry| entry.nil? ? "-" : entry }

      retv << direction + " " if print_direction
      retv << gf + " " if print_gf
      retv << pt + " " if print_pt
    end

    return retv unless @cutoff_last_pt && print_pt

    # Drop the final token, which is the last node label. Any run of
    # non-blank characters counts as a token, so labels such as "PP-OBJ",
    # "$." or the "-" placeholder are cut off as well (matching word
    # characters only used to leave them in place).
    # As before, a string consisting of a single token is returned as is.
    retv[/\A(.+ )\S+ \z/, 1] || retv
  end
end
|
198
|
+
end
|
199
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'tokenizer'
|
2
|
+
require 'tabular_format/fn_tab_format_file'
|
3
|
+
|
4
|
+
module Shalmaneser
|
5
|
+
module Frappe
|
6
|
+
# A converter from plain text to Salsa Tab Format.
# Performs tokenization.
class PlainConverter
  def initialize
    # suffixes for different types of output files
    @file_suffixes = {"lemma" => ".lemma", "pos" => ".pos", "tab" => ".tab", "stxml" => ".xml"}
  end

  ###############
  # transform_plain_dir:
  #
  # transformation for plaintext:
  #
  # transform to Tab format, separating punctuation from adjacent words.
  # Every regular file directly inside input_dir is converted to a file
  # with the same basename and the "tab" suffix in output_dir.
  # Both directories may be given with or without a trailing separator.
  # @param input_dir [String] input directory
  # @param output_dir [String] output directory
  def transform_plain_dir(input_dir, output_dir)
    Dir[File.join(input_dir, "*")].each do |plainfilename|
      # the glob also matches subdirectories, which cannot be read as text
      next unless File.file?(plainfilename)

      # end output file name in "tab" because that is, at the moment, required
      tab_name = File.basename(plainfilename, '.*') + @file_suffixes["tab"]
      plain_to_tab_file(plainfilename, File.join(output_dir, tab_name))
    end
  end

  ####
  # transform plaintext file to Tab format file
  #
  # Each non-blank input line is treated as one sentence. The sentence ID is
  # "<input basename without extension>_<index among non-blank lines>".
  # @param [String] input_filename string: name of input file
  # @param [String] output_filename string: name of output file
  def plain_to_tab_file(input_filename, output_filename)
    # The file is supposed to contain one sentence per line.
    sentences = File.readlines(input_filename).map(&:strip).reject(&:empty?)
    id = File.basename(input_filename, '.*')
    tokenizer = Tokenizer::Tokenizer.new

    File.open(output_filename, "w") do |f|
      sentences.each_with_index do |sentence, idx|
        sentid = "#{id}_#{idx}"
        tokenizer.tokenize(sentence).each do |word|
          # for each word, one line, entries in the line tab-separated
          # the 'word' entry is the word, the 'sent_id' entry is the sentence ID sentid,
          # all other entries (gf, pt, frame etc.) are not set
          f.puts FNTabFormatFile.format_str("word" => word, "sent_id" => sentid)
        end
        # an empty line terminates the sentence
        f.puts
      end
    end
  end
end
|
58
|
+
end
|
59
|
+
end
|