shalmaneser-lib 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/configuration/config_data.rb +457 -0
- data/lib/configuration/config_format_element.rb +210 -0
- data/lib/configuration/configuration_error.rb +15 -0
- data/lib/configuration/external_config_data.rb +56 -0
- data/lib/configuration/frappe_config_data.rb +134 -0
- data/lib/configuration/fred_config_data.rb +199 -0
- data/lib/configuration/rosy_config_data.rb +126 -0
- data/lib/db/db_interface.rb +50 -0
- data/lib/db/db_mysql.rb +141 -0
- data/lib/db/db_sqlite.rb +280 -0
- data/lib/db/db_table.rb +237 -0
- data/lib/db/db_view.rb +416 -0
- data/lib/db/db_wrapper.rb +175 -0
- data/lib/db/select_table_and_columns.rb +10 -0
- data/lib/db/sql_query.rb +243 -0
- data/lib/definitions.rb +19 -0
- data/lib/eval.rb +482 -0
- data/lib/ext/maxent/Classify.class +0 -0
- data/lib/ext/maxent/Train.class +0 -0
- data/lib/external_systems.rb +251 -0
- data/lib/framenet_format/fn_corpus_aset.rb +209 -0
- data/lib/framenet_format/fn_corpus_xml_file.rb +120 -0
- data/lib/framenet_format/fn_corpus_xml_sentence.rb +299 -0
- data/lib/framenet_format/fn_database.rb +143 -0
- data/lib/framenet_format/frame_xml_file.rb +104 -0
- data/lib/framenet_format/frame_xml_sentence.rb +411 -0
- data/lib/logging.rb +25 -0
- data/lib/ml/classifier.rb +189 -0
- data/lib/ml/mallet.rb +236 -0
- data/lib/ml/maxent.rb +229 -0
- data/lib/ml/optimize.rb +195 -0
- data/lib/ml/timbl.rb +140 -0
- data/lib/monkey_patching/array.rb +82 -0
- data/lib/monkey_patching/enumerable_bool.rb +24 -0
- data/lib/monkey_patching/enumerable_distribute.rb +18 -0
- data/lib/monkey_patching/file.rb +131 -0
- data/lib/monkey_patching/subsumed.rb +24 -0
- data/lib/ruby_class_extensions.rb +4 -0
- data/lib/salsa_tiger_xml/corpus.rb +24 -0
- data/lib/salsa_tiger_xml/fe_node.rb +98 -0
- data/lib/salsa_tiger_xml/file_parts_parser.rb +214 -0
- data/lib/salsa_tiger_xml/frame_node.rb +145 -0
- data/lib/salsa_tiger_xml/graph_node.rb +347 -0
- data/lib/salsa_tiger_xml/reg_xml.rb +285 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence.rb +596 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_graph.rb +333 -0
- data/lib/salsa_tiger_xml/salsa_tiger_sentence_sem.rb +438 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_helper.rb +84 -0
- data/lib/salsa_tiger_xml/salsa_tiger_xml_node.rb +161 -0
- data/lib/salsa_tiger_xml/sem_node.rb +58 -0
- data/lib/salsa_tiger_xml/string_terminals_in_right_order.rb +192 -0
- data/lib/salsa_tiger_xml/syn_node.rb +169 -0
- data/lib/salsa_tiger_xml/tree_node.rb +59 -0
- data/lib/salsa_tiger_xml/ts_syn_node.rb +47 -0
- data/lib/salsa_tiger_xml/usp_node.rb +72 -0
- data/lib/salsa_tiger_xml/xml_node.rb +163 -0
- data/lib/shalmaneser/lib.rb +1 -0
- data/lib/tabular_format/fn_tab_format_file.rb +38 -0
- data/lib/tabular_format/fn_tab_frame.rb +67 -0
- data/lib/tabular_format/fn_tab_sentence.rb +169 -0
- data/lib/tabular_format/tab_format_file.rb +91 -0
- data/lib/tabular_format/tab_format_named_args.rb +184 -0
- data/lib/tabular_format/tab_format_sentence.rb +119 -0
- data/lib/value_restriction.rb +49 -0
- metadata +131 -0
@@ -0,0 +1 @@
|
|
1
|
+
# A dummy file to require for now.
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require_relative 'tab_format_file'
|
2
|
+
require_relative 'fn_tab_sentence'
|
3
|
+
require_relative 'tab_format_named_args'
|
4
|
+
|
5
|
+
########################################################
# TabFormat files containing everything that's in the FN lexunit files.
#
# On top of the main FrameNet tab file, optional companion files holding a
# "lemma" column and a "pos" column can be read in parallel. Sentences are
# handed out as FNTabSentence objects.
class FNTabFormatFile < TabFormatFile
  # @param filename [String] path of the FrameNet tab file
  # @param tag_suffix [String, nil] suffix appended to the corpus name
  #   (filename without its ".tab" extension) to locate the file providing
  #   the "pos" column; nil to read no such file
  # @param lemma_suffix [String, nil] suffix appended to the corpus name to
  #   locate the file providing the "lemma" column; nil to read no such file
  # @raise [RuntimeError] from the parent initializer if one of the files
  #   (e.g. a missing lemmatisation or tagging file) cannot be opened
  def initialize(filename, tag_suffix = nil, lemma_suffix = nil)
    corpusname = "#{File.dirname(filename)}/#{File.basename(filename, '.tab')}"

    # Flat list of alternating file names and column formats, as the parent
    # class expects it. The lemma file comes before the tag file.
    sources = [filename, FNTabFormatFile.fntab_format]
    sources.push(corpusname + lemma_suffix, ["lemma"]) if lemma_suffix
    sources.push(corpusname + tag_suffix, ["pos"]) if tag_suffix

    super(sources)

    @my_sentence_class = FNTabSentence
  end

  # @return [Array] column layout of a FrameNet tab line; the nested array is
  #   the repeated per-frame column group
  def self.fntab_format
    ["word", FNTabFormatFile.frametab_format, "ne", "sent_id"]
  end

  # @return [Array<String>] column names of one per-frame group
  def self.frametab_format
    ["pt", "gf", "role", "target", "frame", "stuff"]
  end

  ##########
  # Given a hash mapping features to values,
  # format according to fntab_format.
  #
  # @param hash [Hash] feature name -> value
  # @return [String] the formatted tab-separated line
  def self.format_str(hash)
    TabFormatNamedArgs.format_str(hash, FNTabFormatFile.fntab_format)
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require_relative 'fn_tab_sentence'
|
2
|
+
require "ruby_class_extensions"
|
3
|
+
|
4
|
+
# A FrameNet tab sentence seen through exactly one of its frames: it holds
# the same lines as its parent class but remembers which frame group to read.
class FNTabFrame < FNTabSentence
  ############
  # Same as the parent initializer, except that we also get a frame number
  # such that we can access the features of "our" frame.
  #
  # Original author's note: by setting @group_no to frameno, each
  # TabFormatNamedArgs object created in each_line_parsed() or
  # read_one_line_parsed() is initialized with the right group number, so
  # that all calls to TabFormatNamedArgs.get() access the right group.
  #
  # @param pattern passed through unchanged to the parent initializer
  # @param frameno [Integer] number of the frame group this object exposes
  def initialize(pattern, frameno)
    super(pattern)
    @group_no = frameno
  end

  # Returns the frame introduced by the target word(s) of this frame group,
  # taken from the first parsed line.
  #
  # @return [String] the "frame" feature of the first line
  def get_frame
    sanity_check
    each_line_parsed do |line|
      return line.get("frame")
    end
  end

  ####
  # Returns the indices of the target of the frame. These are the "lineno"
  # values of the lines (line numbers, which start counting at 0).
  # A target may span more than one word.
  #
  # @return [Array] "lineno" of every line whose "target" is not "-"
  def get_target_indices
    sanity_check
    target_lines = []
    each_line_parsed do |line|
      next if line.get("target") == "-"

      target_lines << line.get("lineno")
    end

    target_lines
  end

  ####
  # Returns the target. In the case of multiword targets, we find the
  # complete target at all indices, i.e. we can just take the first one
  # we find.
  #
  # @return [String] the first "target" value that is not "-"
  def get_target
    each_line_parsed do |line|
      candidate = line.get("target")
      return candidate unless candidate == "-"
    end
  end

  ####
  # Get the target POS, according to FrameNet: the word characters after the
  # single dot in the target string.
  #
  # @return [String, nil] the captured POS, or nil if the target does not
  #   have the form "<something>.<pos>"
  def get_target_fn_pos
    get_target =~ /^[^\.]+\.(\w+)$/
    Regexp.last_match(1)
  end
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
require_relative 'tab_format_sentence'
|
2
|
+
# require_relative 'fn_tab_frame'
|
3
|
+
require "ruby_class_extensions"
|
4
|
+
############################################
# A sentence in FrameNet tabular format. Adds FrameNet-specific accessors
# (sentence ID, frames, markables) on top of TabFormatSentence.
class FNTabSentence < TabFormatSentence
  ####
  # Reads one feature from a parsed line.
  # Overwrite this to get a feature from a group rather than from the main
  # feature list.
  #
  # @param l a parsed line responding to #get
  # @param feature_name [String] name of the feature to read
  # @return the value returned by l.get(feature_name)
  def get_this(l, feature_name)
    l.get(feature_name)
  end

  ####
  # Checks that the first parsed line carries a "sent_id" feature.
  # Only the first line is inspected.
  #
  # @raise [RuntimeError] if the first line has no "sent_id"
  def sanity_check
    each_line_parsed do |l|
      raise "Error: corpus file does not conform to FN format." if l.get("sent_id").nil?

      return
    end
  end

  ####
  # Returns the sentence ID, a string, as set by FrameNet.
  # It is read from the first parsed line.
  def get_sent_id
    sanity_check
    each_line_parsed do |l|
      return l.get("sent_id")
    end
  end

  ####
  # Iterator, yields each frame of the sentence as a FNTabFrame
  # object. They contain the complete sentence, but provide
  # access to exactly one frame of that sentence.
  def each_frame
    # How many frames? Assume that each line has the same number of frames
    # and take the count from line 0.
    num_frames = read_one_line_parsed(0).num_groups

    0.upto(num_frames - 1) do |frame_no|
      frame_obj = FNTabFrame.new(@pattern, frame_no)
      each_line { |l| frame_obj.add_line(l) }
      yield frame_obj
    end
  end

  ####
  # Computes a mapping from word indices to labels on these words.
  #
  # An entry il->label means that all the lines whose line
  # numbers are listed in il are labeled with label.
  # Line numbers correspond to words of the sentence. Counting starts at 0.
  #
  # By default, "markables" looks for role labels, i.e. labels in the
  # column "role", but it can also look in another column.
  # To change the default, give the column name as a parameter.
  #
  # Column entries are expected to be "-" or "--" (no change), "B-<label>"
  # (markable starts), "E-<label>" (markable ends) or "B-<label>:E-<label>"
  # (one-word markable). Anything else, and any overlapping markable, only
  # produces a warning on $stderr; the offending entry is ignored.
  #
  # @param use_this_column [String] name of the column to read labels from
  # @return [Hash{Array => String}] index list -> markup label
  # @raise [RuntimeError] via #sanity_check if the sentence has no "sent_id"
  def markables(use_this_column = "role")
    sanity_check

    idlist_to_annotation_list = {}

    # add entry for the target word
    # idlist_to_annotation_list[get_target_indices()] = "target"

    # Determine the span of each frame element.
    # If we find overlapping FEs, we write a warning to STDERR,
    # ignore the 2nd label and attempt to "close" the 1st label.
    ids = []    # line numbers collected for the markable that is currently open
    label = nil # label of the currently open markable, nil if none is open

    each_line_parsed do |l|
      this_id = get_this(l, "lineno")

      this_col = get_this(l, use_this_column)
      unless this_col
        $stderr.puts "nil entry #{use_this_column} in line #{this_id} of sent #{get_sent_id}. Skipping."
        next
      end
      this_fe_ann = this_col.split(":")

      case this_fe_ann.length
      when 1 # nothing at all, or a single begin or end
        markup = this_fe_ann.first
        if markup == "-" || markup == "--" # no change
          ids << this_id if label
        elsif markup =~ /^B-(\S+)$/
          if label # are we within a markable right now?
            $stderr.puts "[TabFormat] Warning: Markable #{Regexp.last_match(1)} starts while within markable #{label}"
            $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
          else
            label = Regexp.last_match(1)
            ids << this_id
          end
        elsif markup =~ /^E-(\S+)$/
          if label == Regexp.last_match(1) # we close the markable we've opened before
            ids << this_id
            # store information
            idlist_to_annotation_list[ids] = label
            # reset memory (a new array, so the stored hash key stays untouched)
            label = nil
            ids = []
          else
            $stderr.puts "[TabFormat] Warning: Markable #{Regexp.last_match(1)} closes while within markable #{label}"
            $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
          end
        else
          $stderr.puts "[TabFormat] Warning: cannot analyse markup #{markup}"
          $stderr.puts "Debug data: Sentence id #{get_sent_id}"
        end
      when 2 # this should be a one-word markable
        b_markup, e_markup = this_fe_ann
        b_label = b_markup[/^B-(\S+)$/, 1]
        e_label = e_markup[/^E-(\S+)$/, 1]

        if label
          $stderr.puts "[TabFormat] Warning: Finding new markable at word #{this_id} while within markable #{label}"
          $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
        elsif b_label.nil?
          $stderr.puts "[TabFormat] Warning: Unknown start markup #{b_markup}"
          $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
        elsif e_label.nil?
          $stderr.puts "[TabFormat] Warning: Unknown end markup #{e_markup}"
          $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
        elsif b_label == e_label
          idlist_to_annotation_list[[this_id]] = b_label
        else
          $stderr.puts "[TabFormat] Warning: Starting markable #{b_label}, closing markable #{e_label}"
          $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
        end
      else
        # Also reached for an empty column, which splits into zero parts.
        $stderr.puts "Warning: cannot analyse markup with more than two colon-separated parts like #{this_fe_ann.join(":")}"
        $stderr.puts "Debug data: Sentence id #{get_sent_id}"
      end
    end

    unless label.nil?
      $stderr.puts "[TabFormat] Warning: Markable #{label} did not end in sentence."
      $stderr.puts "Debug data: Sentence id #{get_sent_id}, current ID list #{ids.join(" ")}"
    end

    idlist_to_annotation_list
  end

  #######
  # @return [String] A tokenized sentence: the "word" features of all lines,
  #   joined by single spaces.
  def to_s
    sanity_check
    words = []
    each_line_parsed { |l| words << l.get("word") }

    words.join(' ')
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require_relative 'tab_format_sentence'
|
2
|
+
|
3
|
+
require "ruby_class_extensions"
|
4
|
+
|
5
|
+
#######################
# Reads one or more parallel tab-format files line by line and groups the
# lines into sentences.
class TabFormatFile
  #######
  # Opens the files for reading.
  #
  # fp is a flat list of alternating entries filename, format, filename,
  # format, ... where format is a list of strings that will be used
  # to address columns of the file, the 1st string for the 1st column.
  #
  # format may contain _one_ entry that is an array (or a call to repeat())
  # e.g.:
  # ["word", "pos", "lemma", repeat("frame", "target", "gf", "pt")]
  #
  # @param fp [Array] alternating file names and column formats
  # @raise [RuntimeError] if one of the files cannot be opened; files opened
  #   before the failing one are closed again
  def initialize(fp)
    @files = []
    @patterns = []
    @no_of_read_lines = 0

    fp.each_slice(2) do |filename, pattern|
      begin
        @files << File.new(filename)
      rescue StandardError
        # Do not leak the handles that were opened successfully so far.
        @files.each(&:close)
        raise "Sorry, could not read input file #{filename}\n"
      end
      @patterns += pattern if pattern
    end

    @my_sentence_class = TabFormatSentence
  end

  ########
  # Yields each sentence of the files in turn.
  #
  # Sentences are expected to be separated by a line containing nothing but
  # whitespace. The last sentence may or may not be followed by an empty
  # line. Reading ends as soon as EOF is encountered on any of the files;
  # all files are expected to have the same number of lines. Lines of the
  # parallel files are joined by a tab before being added to the sentence.
  #
  # Once the input has been read completely, the files are closed and
  # further calls yield nothing.
  #
  # If an empty line in one file is matched by a non-empty line in another
  # file, an error is printed to STDERR and the process exits with status 1.
  #
  # @yieldparam sentence an instance of the configured sentence class
  #   (TabFormatSentence unless a subclass changed @my_sentence_class)
  # @return [Enumerator] if no block is given
  def each_sentence
    return enum_for(:each_sentence) unless block_given?
    return if @read_completely

    sentence = @my_sentence_class.new(@patterns)

    # IO#gets is used instead of IO#readline so that end of input is a plain
    # nil and an EOFError raised by the caller's block is never mistaken
    # for the end of our own files.
    while (linearray = read_parallel_lines)
      @no_of_read_lines += 1

      if linearray.any? { |x| x.strip == '' }
        unless linearray.all? { |x| x.strip == '' }
          STDERR.puts "Error: Mismatching empty lines! <from lib/common>"
          exit(1)
        end

        # sentence finished. yield it and start a new one
        yield sentence unless sentence.empty?
        sentence = @my_sentence_class.new(@patterns)
      else
        # sentence not yet finished. add this line to it
        sentence.add_line(linearray.join("\t"))
      end
    end

    # maybe we haven't yielded the last sentence yet.
    yield sentence unless sentence.empty?

    @files.each(&:close)
    @read_completely = true
  end

  private

  # Reads the next line from every file, in order, stopping at the first
  # file that has reached EOF.
  #
  # @return [Array<String>, nil] the chomped lines, one per file, or nil if
  #   there are no files or any file is exhausted
  def read_parallel_lines
    return nil if @files.empty?

    lines = []
    @files.each do |f|
      line = f.gets
      return nil if line.nil?

      lines << line.chomp
    end
    lines
  end
end
|
@@ -0,0 +1,184 @@
|
|
1
|
+
require "ruby_class_extensions"
|
2
|
+
#################################################
# Class for keeping one line, parsed.
#
# The line is kept as follows:
# - normal features: in a hash @f mapping feature names to values
# - features of the repeated group: in an array @r of
#   TabFormatNamedArgs objects, one per group
#
# Each feature of the line is available by name via the method "get".
# Additional features (from other input files) can be added to the
# TabFormatNamedArgs object via the method add_feature.
##
class TabFormatNamedArgs
  ############
  # @param values [Array<String>] the column values of the line
  # @param features [Array<String, Array<String>>] how to access the values
  #   by name: the i-th feature names the i-th value, except for a repeated
  #   group. At most one entry may itself be an array; it names the columns
  #   of a group that may occur zero or more times at that position.
  # @param group [Integer, nil] number of the group that #get falls back to
  #   for names that are not plain features; nil for no group access
  # @raise [RuntimeError] if features contains more than one group
  def initialize(values, features, group = nil)
    @f = {}
    @r = []
    @group = group

    # record the feature names, give special attention to a group
    # if we have one
    @group_feature_names = nil
    @feature_names = features.map do |feature|
      if feature.instance_of?(Array)
        # found a group
        @group_feature_names = feature
        "GROUP"
      else
        feature
      end
    end

    if @feature_names.count("GROUP") > 1
      $stderr.puts "More than one group in feature set:" + features.join(" ")
      raise "Cannot handle this."
    end

    # group_index: position of group in overall feature list
    # (one past the end if there is no group)
    group_index = @feature_names.index("GROUP") || @feature_names.length
    num_features_after_group = [0, (@feature_names.length - 1) - group_index].max
    # Everything between group_index and index_after_groups belongs to groups.
    index_after_groups = values.length - num_features_after_group

    # features before group: put feature/value pairs in @f hash
    0.upto(group_index - 1) { |i| @f[features[i]] = values[i] }

    # group: store each group occurrence in the @r array
    if @group_feature_names
      group_len = @group_feature_names.length
      group_index.step(index_after_groups - 1, group_len).each_with_index do |group_start, group_no|
        @r << TabFormatNamedArgs.new(values.slice(group_start, group_len),
                                     @group_feature_names,
                                     group_no)
      end
    end

    # features after group: put feature/value pairs in @f hash
    feature_index = group_index + 1
    index_after_groups.upto(values.length - 1) do |i|
      @f[features[feature_index]] = values[i]
      feature_index += 1
    end
  end

  ############
  # Returns feature/value pairs as a tab format line,
  # order of features as given in the 'features' list.
  # Features not set in the hash: their entry will be "-".
  #
  # If the feature list includes a group,
  # assume zero entries for that group.
  # Keys of the hash that are not plain entries of the feature list are
  # reported on $stderr and ignored.
  #
  # @param hash [Hash] feature -> value
  # @param features [Array, nil] feature list, as for new()
  # @return [String] the tab-separated line; "" if features is nil
  def self.format_str(hash, features)
    return "" if features.nil?

    # sanity check: does the hash contain keys that are not in the feature list?
    hash.keys.reject { |f| features.include? f }.each do |bad_feature|
      $stderr.puts "Error: unknown feature #{bad_feature} in format_str: ignoring."
    end

    # remove the group feature, if it's there
    features.reject { |f| f.instance_of?(Array) }
            .map { |feature| hash.fetch(feature, '-') }
            .join("\t")
  end

  #############
  # Adds another feature to this object, which can be accessed via "get".
  #
  # @param name name for the new feature, must be distinct from the plain
  #   feature names already stored
  # @param feature the value of the feature
  # @raise [RuntimeError] if a plain feature of that name already exists
  def add_feature(name, feature)
    raise "Trying to add a feature twice: #{name}." if @f.key? name

    @f[name] = feature
  end

  #############
  # Get feature value, identified by feature name. Plain features win;
  # otherwise the name is looked up in the group selected at construction.
  #
  # @param name [String] feature name
  # @return [String, nil] feature value, nil if it cannot be found
  def get(name)
    get_nongroup(name) || get_from_group(name, @group)
  end

  #############
  # Sets (or overwrites) a plain feature.
  def set(name, feature)
    @f[name] = feature
  end

  #############
  # @return [Integer] number of occurrences of the repeated group
  def num_groups
    @r.length
  end

  #############
  # Returns the line as string, entries connected by tab,
  # in the order that the entries were in originally.
  #
  # A group slot contributes one entry per group occurrence, so a line with
  # zero groups yields no empty column and round-trips unchanged.
  #
  # @return [String]
  def to_s
    @feature_names.flat_map do |feature|
      feature == "GROUP" ? @r.map(&:to_s) : [@f[feature]]
    end.join("\t")
  end

  protected

  # get feature, non-group
  # return: feature value (string)
  def get_nongroup(feature)
    @f[feature]
  end

  # get feature from one of the groups
  # return: feature value (string), nil if there is no group with that number
  def get_from_group(name, group_no)
    return nil if !group_no || group_no >= @r.length

    @r[group_no].get_nongroup(name)
  end
end
|