frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,94 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'stringio' # for helper methods
|
5
|
+
require 'frprep/opt_parser'
|
6
|
+
|
7
|
+
include FrPrep
|
8
|
+
|
9
|
+
# Unit tests for OptParser.parse, the command line option parser of frprep.
#
# The tests rely on the top-level helpers +quietly+ and +intercept_output+
# to silence or capture whatever the parser prints before it exits.
class TestOptParser < Test::Unit::TestCase

  # Prepare the experiment file path and a list of options
  # which the parser is expected to accept.
  def setup
    @exp_file = 'test/frprep/data/prp_test.salsa'
    @valid_opts = ['--expfile', @exp_file,
                   '--help'
                  ]
  end

  # The parser should expose a class level <parse> method.
  def test_public_methods
    assert_respond_to(OptParser, :parse)
  end

  # It should return a FrPrepConfigData object.
  def test_parse_method
    input = ['-e', @exp_file]
    return_value = OptParser.parse(input)
    # assert_instance_of reports the expected and the actual class on failure.
    assert_instance_of(FrPrepConfigData, return_value)
  end

  # It should reject the empty input and exit.
  def test_empty_input
    _out, err = intercept_output do
      assert_raises(SystemExit) { OptParser.parse([]) }
    end
    assert_match(/You have to provide some options./, err)
  end

  # It should accept correct options.
  # Invalid options are the matter of OptionParser itself,
  # we do not test them here.
  # We only test that OP exits and does not raise an exception.
  def test_accept_correct_options
    # '--help' is treated separately (see test_successful_exit).
    # Filter it out without modifying the fixture built in <setup>.
    opts = @valid_opts - ['--help']
    assert_nothing_raised { OptParser.parse(opts) }

    _stdout, stderr = intercept_output do
      assert_raises(SystemExit) { OptParser.parse(['--invalid-option']) }
    end

    assert_match(/You have provided an invalid option:/, stderr)
  end

  # It should successfully exit with some options.
  def test_successful_exit
    quietly do
      success_args = ['-h', '--help']
      success_args.each do |arg|
        assert_raises(SystemExit) { OptParser.parse(arg.split) }
      end
    end
  end

end
|
64
|
+
################################################################################
|
65
|
+
# It is a helper method, many testable units provide some verbose output
# to stderr and/or stdout. It is useful to suppress any kind of verbosity.
#
# Both streams are redirected to the null device while the block runs and
# are restored afterwards, even if the block raises.
# Returns the value of the block.
def quietly(&b)
  # Keep duplicates of the real streams so that they can be restored later.
  orig_stderr = $stderr.clone
  orig_stdout = $stdout.clone

  # File::NULL is the platform specific null device ('/dev/null' on Unix).
  # <reopen> duplicates the descriptor, so the temporary handle
  # can be closed right after the redirection.
  File.open(File::NULL, 'w') do |null|
    $stderr.reopen(null)
    $stdout.reopen(null)
  end

  b.call
ensure
  # Restore only what has been saved; a failure before the redirection
  # must not be masked by an error raised here.
  if orig_stderr
    $stderr.reopen(orig_stderr)
    orig_stderr.close
  end
  if orig_stdout
    $stdout.reopen(orig_stdout)
    orig_stdout.close
  end
end
|
79
|
+
|
80
|
+
# Helper method: captures everything written to $stdout and $stderr
# while the given block runs and hands it back as two strings,
# <stdout text> first, <stderr text> second.
# The previous streams are put back in any case.
def intercept_output
  saved_streams = [$stdout, $stderr]
  $stdout, $stderr = StringIO.new, StringIO.new

  yield

  [$stdout.string, $stderr.string]
ensure
  $stdout, $stderr = saved_streams
end
|
94
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'erb'
|
2
|
+
|
3
|
+
# Shared constants and helper methods for the functional tests.
#
# The constants name the experiment files; every file is generated from
# a template of the same name with the additional suffix '.erb'.
# The module is meant to be mixed into a test case: <execute> relies on
# <assert> and on <@msg> provided by the including object.
module FunctionalTestHelper
  # Directory with the experiment file templates.
  PREF = 'test/functional/sample_experiment_files'

  PRP_TEST_FILE = "#{PREF}/prp_test.salsa"
  PRP_TEST_FILE_FRED_STD = "#{PREF}/prp_test.salsa.fred.standalone"
  PRP_TEST_FILE_ROSY_STD = "#{PREF}/prp_test.salsa.rosy.standalone"
  PRP_TRAIN_FILE = "#{PREF}/prp_train.salsa"
  PRP_TRAIN_FILE_FRED_STD = "#{PREF}/prp_train.salsa.fred.standalone"
  PRP_TRAIN_FILE_ROSY_STD = "#{PREF}/prp_train.salsa.rosy.standalone"

  FRED_TEST_FILE = "#{PREF}/fred_test.salsa"
  FRED_TRAIN_FILE = "#{PREF}/fred_train.salsa"
  ROSY_TEST_FILE = "#{PREF}/rosy_test.salsa"
  ROSY_TRAIN_FILE = "#{PREF}/rosy_train.salsa"

  # Run an external process for functional testing and check the return code.
  # <system> returns <true> if the external code exposes no errors.
  # <@msg> is defined for every test object.
  def execute(cmd)
    status = system(cmd)
    assert(status, @msg)
  end

  # Create a temporary exp file only for this test.
  # Shalmaneser needs absolute paths, we provide them in exp files
  # using templating.
  #
  # The template is read from "<file>.erb", rendered with ERB
  # and the result is written to <file>.
  def create_exp_file(file)
    template = File.read("#{file}.erb")
    text = ERB.new(template).result
    File.write(file, text)
  end

  # Delete an exp file created by <create_exp_file>.
  def remove_exp_file(file)
    File.delete(file)
  end
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
experiment_ID = fred_test
|
2
|
+
|
3
|
+
apply_to_all_known_targets = true
|
4
|
+
|
5
|
+
enduser_mode = false
|
6
|
+
|
7
|
+
verbose = true
|
8
|
+
|
9
|
+
|
10
|
+
############################
|
11
|
+
# Paths
|
12
|
+
# - fred_directory: directory where Fred puts its internal data
|
13
|
+
# - directory_output:
|
14
|
+
# redirect system output of disambiguated text (in SalsaTigerXML)
|
15
|
+
# to another directory.
|
16
|
+
# If you do not set anything here, output is to
|
17
|
+
# <fred_directory>/<experiment_ID>/output/stxml
|
18
|
+
# - classifier_dir:
|
19
|
+
# Write trained classifiers to this directory.
|
20
|
+
# If you do not set this parameter, classifiers are written to
|
21
|
+
# <fred_directory>/<experiment_ID>/classifiers
|
22
|
+
|
23
|
+
fred_directory = <%= File.expand_path('test/functional/output') %>
|
24
|
+
classifier_dir = <%= File.expand_path('test/functional/input/fred/cls') %>
|
25
|
+
# - preproc_descr_file_train / ...test
|
26
|
+
# where the experiment file for frprep is located
|
27
|
+
# (preprocessing for Fred and Rosy)
|
28
|
+
# for the preprocessing of the data used in this experiment
|
29
|
+
#
|
30
|
+
# give one preprocessing file name for the training data
|
31
|
+
# and one for the test data
|
32
|
+
# (If you only ever use test data in this experiment, you only
|
33
|
+
# need to give preproc_descr_file_test, and vice versa for training data.)
|
34
|
+
|
35
|
+
preproc_descr_file_test = <%= File.expand_path('test/functional/sample_experiment_files/prp_test.salsa.fred.standalone') %>
|
36
|
+
|
37
|
+
#####################
|
38
|
+
# noncontiguous input?
|
39
|
+
# if so, set 'noncontiguous_input' to 'true' (default is 'false')
|
40
|
+
# Also give the larger corpus from which the input sentences are:
|
41
|
+
# - directory
|
42
|
+
# - format: same possibilities as for frprep format
|
43
|
+
# - encoding: same possibilities as for frprep encoding
|
44
|
+
|
45
|
+
noncontiguous_input = false
|
46
|
+
#larger_corpus_dir =
|
47
|
+
larger_corpus_format = SalsaTigerXML
|
48
|
+
#larger_corpus_encoding = iso
|
49
|
+
|
50
|
+
|
51
|
+
#################
|
52
|
+
# Features
|
53
|
+
|
54
|
+
# bag-of-words context, with given context size,
|
55
|
+
# for example:
|
56
|
+
feature = context 50
|
57
|
+
feature = context 2
|
58
|
+
#
|
59
|
+
# (you can give more than one context feature line!)
|
60
|
+
#
|
61
|
+
# other possible features:
|
62
|
+
# feature = syntax
|
63
|
+
# feature = synsem
|
64
|
+
#
|
65
|
+
# syntax: grammatical functions
|
66
|
+
# synsem: grammatical functions plus headwords
|
67
|
+
|
68
|
+
#feature = context % %contextsize%
|
69
|
+
feature = syntax
|
70
|
+
|
71
|
+
# How to handle training data that is labeled
|
72
|
+
# with multiple sense labels?
|
73
|
+
# - binarize (default): This works only with binary classifiers.
|
74
|
+
# When featurizing for the binary classifiers, consider an item
|
75
|
+
# positive if its set of assigned labels includes the
|
76
|
+
# label for this binary classifier.
|
77
|
+
# - repeat: Repeat the instance, once for each
|
78
|
+
# sense label that has been assigned. (Basically, treat it
|
79
|
+
# as N instances with equal features but different labels.)
|
80
|
+
# - join: join all the assigned senses into one combined sense
|
81
|
+
# and treat that as a separate sense to train on.
|
82
|
+
# - keep: keep as multiple sense labels. (Note that this
|
83
|
+
# makes sense only for classifiers that can deal with
|
84
|
+
# multiple labels.)
|
85
|
+
|
86
|
+
#handle_multilabel = binarize
|
87
|
+
handle_multilabel = repeat
|
88
|
+
|
89
|
+
# What to do with numerical features?
|
90
|
+
# - keep: just leave as is
|
91
|
+
# - repeat: for a feature with max. numerical value N,
|
92
|
+
# use N binary features
|
93
|
+
# - bin: use a fixed number of bins, e.g. 5, then
|
94
|
+
# if feature value > 20: set all bins to 1,
|
95
|
+
# if feature value > 10: set the first four bins to 1,
|
96
|
+
# etc.
|
97
|
+
# default: bin.
|
98
|
+
#numerical_features = bin
|
99
|
+
numerical_features = keep
|
100
|
+
|
101
|
+
# Binary classifiers, or n-ary classifiers?
|
102
|
+
# if binary classifiers, set 'binary_classifiers = true'
|
103
|
+
# default is 'false'.
|
104
|
+
binary_classifiers = false
|
105
|
+
|
106
|
+
#################
|
107
|
+
# Fred internal settings
|
108
|
+
|
109
|
+
# what kind of classifier to use?
|
110
|
+
#
|
111
|
+
# format:
|
112
|
+
# <classifier type> <path> <optionally another path>
|
113
|
+
#
|
114
|
+
# for maxent, give first the path where maxent resides,
|
115
|
+
# then <where_shalmaneser_resides>/program/tools/maxent
|
116
|
+
classifier = maxent /opt/OpenNLP-maxent/2.4.0 <%= File.expand_path('tools/maxent/') %>
|
117
|
+
|
118
|
+
|
119
|
+
# for binary classifiers, you can set the pseudolabel
|
120
|
+
# on the 'negative' sense.
|
121
|
+
# Default is 'NONE'
|
122
|
+
negsense = NONE
|
@@ -0,0 +1,135 @@
|
|
1
|
+
# ID identifying this experiment and all its data
|
2
|
+
# please do not use spaces inside the experiment ID
|
3
|
+
experiment_ID = fred_train
|
4
|
+
|
5
|
+
# targets:
|
6
|
+
# if apply_to_all_known_targets is set to true,
|
7
|
+
# disambiguate all words for which we have training data
|
8
|
+
# when performing task "test" (i.e. applying trained classifiers)
|
9
|
+
apply_to_all_known_targets = true
|
10
|
+
|
11
|
+
# Enduser mode?
|
12
|
+
# The idea is that the enduser will only _apply_
|
13
|
+
# pre-trained classifiers. So in enduser mode many
|
14
|
+
# options are disallowed.
|
15
|
+
enduser_mode = false
|
16
|
+
|
17
|
+
|
18
|
+
# print warnings and
|
19
|
+
# give detailed progress reports
|
20
|
+
verbose = true
|
21
|
+
|
22
|
+
|
23
|
+
############################
|
24
|
+
# Paths
|
25
|
+
# - fred_directory: directory where Fred puts its internal data
|
26
|
+
# - directory_output:
|
27
|
+
# redirect system output of disambiguated text (in SalsaTigerXML)
|
28
|
+
# to another directory.
|
29
|
+
# If you do not set anything here, output is to
|
30
|
+
# <fred_directory>/<experiment_ID>/output/stxml
|
31
|
+
# - classifier_dir:
|
32
|
+
# Write trained classifiers to this directory.
|
33
|
+
# If you do not set this parameter, classifiers are written to
|
34
|
+
# <fred_directory>/<experiment_ID>/classifiers
|
35
|
+
|
36
|
+
fred_directory = <%= File.expand_path('test/functional/output') %>
|
37
|
+
|
38
|
+
# - preproc_descr_file_train / ...test
|
39
|
+
# where the experiment file for frprep is located
|
40
|
+
# (preprocessing for Fred and Rosy)
|
41
|
+
# for the preprocessing of the data used in this experiment
|
42
|
+
#
|
43
|
+
# give one preprocessing file name for the training data
|
44
|
+
# and one for the test data
|
45
|
+
# (If you only ever use test data in this experiment, you only
|
46
|
+
# need to give preproc_descr_file_test, and vice versa for training data.)
|
47
|
+
preproc_descr_file_train = <%= File.expand_path('test/functional/sample_experiment_files/prp_train.salsa.fred.standalone') %>
|
48
|
+
|
49
|
+
|
50
|
+
|
51
|
+
#####################
|
52
|
+
# noncontiguous input?
|
53
|
+
# if so, set 'noncontiguous_input' to 'true' (default is 'false')
|
54
|
+
# Also give the larger corpus from which the input sentences are:
|
55
|
+
# - directory
|
56
|
+
# - format: same possibilities as for frprep format
|
57
|
+
# - encoding: same possibilities as for frprep encoding
|
58
|
+
|
59
|
+
noncontiguous_input = false
|
60
|
+
#larger_corpus_dir =
|
61
|
+
larger_corpus_format = SalsaTigerXML
|
62
|
+
#larger_corpus_encoding = iso
|
63
|
+
|
64
|
+
|
65
|
+
#################
|
66
|
+
# Features
|
67
|
+
|
68
|
+
# bag-of-words context, with given context size,
|
69
|
+
# for example:
|
70
|
+
feature = context 50
|
71
|
+
feature = context 2
|
72
|
+
#
|
73
|
+
# (you can give more than one context feature line!)
|
74
|
+
#
|
75
|
+
# other possible features:
|
76
|
+
# feature = syntax
|
77
|
+
# feature = synsem
|
78
|
+
#
|
79
|
+
# syntax: grammatical functions
|
80
|
+
# synsem: grammatical functions plus headwords
|
81
|
+
|
82
|
+
#feature = context % %contextsize%
|
83
|
+
feature = syntax
|
84
|
+
|
85
|
+
# How to handle training data that is labeled
|
86
|
+
# with multiple sense labels?
|
87
|
+
# - binarize (default): This works only with binary classifiers.
|
88
|
+
# When featurizing for the binary classifiers, consider an item
|
89
|
+
# positive if its set of assigned labels includes the
|
90
|
+
# label for this binary classifier.
|
91
|
+
# - repeat: Repeat the instance, once for each
|
92
|
+
# sense label that has been assigned. (Basically, treat it
|
93
|
+
# as N instances with equal features but different labels.)
|
94
|
+
# - join: join all the assigned senses into one combined sense
|
95
|
+
# and treat that as a separate sense to train on.
|
96
|
+
# - keep: keep as multiple sense labels. (Note that this
|
97
|
+
# makes sense only for classifiers that can deal with
|
98
|
+
# multiple labels.)
|
99
|
+
|
100
|
+
#handle_multilabel = binarize
|
101
|
+
handle_multilabel = repeat
|
102
|
+
|
103
|
+
# What to do with numerical features?
|
104
|
+
# - keep: just leave as is
|
105
|
+
# - repeat: for a feature with max. numerical value N,
|
106
|
+
# use N binary features
|
107
|
+
# - bin: use a fixed number of bins, e.g. 5, then
|
108
|
+
# if feature value > 20: set all bins to 1,
|
109
|
+
# if feature value > 10: set the first four bins to 1,
|
110
|
+
# etc.
|
111
|
+
# default: bin.
|
112
|
+
#numerical_features = bin
|
113
|
+
numerical_features = keep
|
114
|
+
# Binary classifiers, or n-ary classifiers?
|
115
|
+
# if binary classifiers, set 'binary_classifiers = true'
|
116
|
+
# default is 'false'.
|
117
|
+
binary_classifiers = false
|
118
|
+
|
119
|
+
#################
|
120
|
+
# Fred internal settings
|
121
|
+
|
122
|
+
# what kind of classifier to use?
|
123
|
+
#
|
124
|
+
# format:
|
125
|
+
# <classifier type> <path> <optionally another path>
|
126
|
+
#
|
127
|
+
# for maxent, give first the path where maxent resides,
|
128
|
+
# then <where_shalmaneser_resides>/program/tools/maxent
|
129
|
+
classifier = maxent /opt/OpenNLP-maxent/2.4.0 <%= File.expand_path('tools/maxent/') %>
|
130
|
+
|
131
|
+
|
132
|
+
# for binary classifiers, you can set the pseudolabel
|
133
|
+
# on the 'negative' sense.
|
134
|
+
# Default is 'NONE'
|
135
|
+
negsense = NONE
|
@@ -0,0 +1,138 @@
|
|
1
|
+
#################################################
|
2
|
+
# This is a sample experiment file
|
3
|
+
# with explanations of all features
|
4
|
+
# that can be set for the frprep preprocessing system for Fred and Rosy.
|
5
|
+
#
|
6
|
+
# To start your own experiment,
|
7
|
+
# replace all occurrences of
|
8
|
+
# %...% by values of your choice.
|
9
|
+
#
|
10
|
+
# Boolean features may be omitted and are false by default.
|
11
|
+
#
|
12
|
+
# Experiment file lines that start with '#'
|
13
|
+
# are comments and are ignored. Empty lines are ignored as well.
|
14
|
+
|
15
|
+
########################
|
16
|
+
# Experiment description
|
17
|
+
#
|
18
|
+
|
19
|
+
# ID identifying this experiment and all its data
|
20
|
+
# please do not use spaces inside the experiment ID
|
21
|
+
prep_experiment_ID = prp_test
|
22
|
+
|
23
|
+
# YOUR INPUT DATA:
|
24
|
+
# frprep accepts an input directory rather than an input file.
|
25
|
+
# It will process all files in the directory directory_input
|
26
|
+
# and write the results to directory_preprocessed.
|
27
|
+
#
|
28
|
+
# For input formats see the discussion of "format" below.
|
29
|
+
directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
|
30
|
+
directory_preprocessed = <%= File.expand_path('test/functional/output/frprep/test.salsa') %>
|
31
|
+
|
32
|
+
##
|
33
|
+
# Experimental data is described by the following parameters:
|
34
|
+
#
|
35
|
+
# - language: en / de
|
36
|
+
# en for English or de for German
|
37
|
+
#
|
38
|
+
# - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
|
39
|
+
#
|
40
|
+
# Format of the input data, training/test set
|
41
|
+
# SalsaTigerXML: Parsed data, English or German
|
42
|
+
# FNXml: FrameNet Lexical Unit files in FrameNet XML format
|
43
|
+
# FNCorpusXML: FrameNet files in the FrameNet corpus XML format
|
44
|
+
# SalsaTab: tabular format (internal)
|
45
|
+
# BNC: BNC XML format, alternating words and POS tags
|
46
|
+
# Plain: Plain text, ONE SENTENCE PER LINE.
|
47
|
+
#
|
48
|
+
# Preprocessing transforms all data to SalsaTigerXML.
|
49
|
+
#
|
50
|
+
# - origin: SalsaTiger / FrameNet / <not specified>
|
51
|
+
# This is the origin of the training/test data.
|
52
|
+
# SalsaTiger: data from the Tiger corpus, possibly semantically
|
53
|
+
# annotated by Salsa
|
54
|
+
# FrameNet: data from the FrameNet project
|
55
|
+
#
|
56
|
+
# Don't set 'origin' if none of these origins apply
|
57
|
+
#
|
58
|
+
# - encoding: utf8 / iso / hex / <not specified>
|
59
|
+
# Default: iso
|
60
|
+
|
61
|
+
language = de
|
62
|
+
#origin =
|
63
|
+
format = Plain
|
64
|
+
encoding = iso
|
65
|
+
|
66
|
+
#############################
|
67
|
+
# Which preprocessing steps to take?
|
68
|
+
#
|
69
|
+
# Data can be parsed, lemmatized and POS-tagged,
|
70
|
+
# but this happens only if it is specified in the
|
71
|
+
# experiment file.
|
72
|
+
#
|
73
|
+
# Set these booleans to true to trigger the respective
|
74
|
+
# type of preprocessing. The default value is false.
|
75
|
+
|
76
|
+
do_lemmatize = true
|
77
|
+
do_postag = false
|
78
|
+
do_parse = true
|
79
|
+
|
80
|
+
#############################
|
81
|
+
# directory where frprep puts its internal data
|
82
|
+
#
|
83
|
+
|
84
|
+
frprep_directory = <%= File.expand_path('test/functional/output/') %>
|
85
|
+
|
86
|
+
#############################
|
87
|
+
# Syntax/semantics interface repair:
|
88
|
+
# FrameNet annotated data has some annotation choices
|
89
|
+
# that may make it harder to learn the mapping from
|
90
|
+
# syntactic structure to semantic roles.
|
91
|
+
#
|
92
|
+
# If you are using FrameNet data for training a
|
93
|
+
# semantic role labeler, set the following two settings
|
94
|
+
# to true (default is false) to 'repair' semantic role labels
|
95
|
+
# to closer match the syntactic structure
|
96
|
+
|
97
|
+
fe_syn_repair = true
|
98
|
+
fe_rel_repair = false
|
99
|
+
|
100
|
+
|
101
|
+
#################
|
102
|
+
# Location of tools and resources used by Fred
|
103
|
+
|
104
|
+
# currently known to the system:
|
105
|
+
# (Saarbruecken paths given)
|
106
|
+
#
|
107
|
+
# - POS tagging:
|
108
|
+
# - pos_tagger = treetagger
|
109
|
+
# pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
|
110
|
+
#
|
111
|
+
# - Lemmatization:
|
112
|
+
# - lemmatizer = treetagger
|
113
|
+
# lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
|
114
|
+
# lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
|
115
|
+
#
|
116
|
+
# - Parser:
|
117
|
+
# - parser = collins (English)
|
118
|
+
# parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
|
119
|
+
# - parser = sleepy (German)
|
120
|
+
# parser_path = /proj/corpora/sleepy3/
|
121
|
+
# - parser = minipar (English)
|
122
|
+
# parser_path = /proj/llx/Software/Parsers/minipar-linux/
|
123
|
+
#
|
124
|
+
pos_tagger = treetagger
|
125
|
+
pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
|
126
|
+
|
127
|
+
lemmatizer = treetagger
|
128
|
+
lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
|
129
|
+
|
130
|
+
parser = berkeley
|
131
|
+
parser_path = <%= File.expand_path('tools/berkeleyParser') %>
|
132
|
+
|
133
|
+
# parser:
|
134
|
+
# maximum no. of sentences in a parse file,
|
135
|
+
# maximum sentence length to be parsed
|
136
|
+
|
137
|
+
parser_max_sent_num = 2000
|
138
|
+
parser_max_sent_len = 80
|