frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,3 @@
1
+ module Shalmaneser
2
+ VERSION = '0.0.1.prealpha'
3
+ end
@@ -0,0 +1,94 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'test/unit'
4
+ require 'stringio' # for helper methods
5
+ require 'frprep/opt_parser'
6
+
7
+ include FrPrep
8
+
9
+ class TestOptParser < Test::Unit::TestCase
10
+
11
+ def setup
12
+ @exp_file = 'test/frprep/data/prp_test.salsa'
13
+ @valid_opts = ['--expfile', @exp_file,
14
+ '--help'
15
+ ]
16
+ end
17
+
18
+ def test_public_methods
19
+ assert_respond_to(OptParser, :parse)
20
+ end
21
+
22
+ # It should return a FrPrepConfigData object.
23
+ def test_parse_method
24
+ input = ['-e', @exp_file]
25
+ return_value = OptParser.parse(input)
26
+ assert(return_value.instance_of?(FrPrepConfigData))
27
+ end
28
+
29
+ # It should reject the empty input and exit.
30
+ def test_empty_input
31
+ out, err = intercept_output do
32
+ assert_raises(SystemExit) { OptParser.parse([]) }
33
+ end
34
+ assert_match(/You have to provide some options./, err)
35
+ end
36
+
37
+ # It should accept correct options.
38
+ # Invalid options are a matter for OptionParser itself,
39
+ # do not test them here.
40
+ # We only test that OP exits and does not raise an exception.
41
+ def test_accept_correct_options
42
+ # this option we should treat separately
43
+ @valid_opts.delete('--help')
44
+ assert_nothing_raised { OptParser.parse(@valid_opts) }
45
+
46
+ stdout, stderr = intercept_output do
47
+ assert_raises(SystemExit) { OptParser.parse(['--invalid-option']) }
48
+ end
49
+
50
+ assert_match(/You have provided an invalid option:/, stderr)
51
+ end
52
+
53
+ # It should successfully exit with some options.
54
+ def test_successful_exit
55
+ quietly do
56
+ success_args = ['-h', '--help']
57
+ success_args.each do |arg|
58
+ assert_raises(SystemExit) { OptParser.parse(arg.split) }
59
+ end
60
+ end
61
+ end
62
+
63
+ end
64
+ ################################################################################
65
+ # It is a helper method, many testable units provide some verbose output
66
+ # to stderr and/or stdout. It is useful to suppress any kind of verbosity.
67
+ def quietly(&b)
68
+ begin
69
+ orig_stderr = $stderr.clone
70
+ orig_stdout = $stdout.clone
71
+ $stderr.reopen(File.new('/dev/null', 'w'))
72
+ $stdout.reopen(File.new('/dev/null', 'w'))
73
+ b.call
74
+ ensure
75
+ $stderr.reopen(orig_stderr)
76
+ $stdout.reopen(orig_stdout)
77
+ end
78
+ end
79
+
80
+ # It is a helper method for handling stdout and stderr as strings.
81
+ def intercept_output
82
+ orig_stdout = $stdout
83
+ orig_stderr = $stderr
84
+ $stdout = StringIO.new
85
+ $stderr = StringIO.new
86
+
87
+ yield
88
+
89
+ return $stdout.string, $stderr.string
90
+ ensure
91
+ $stdout = orig_stdout
92
+ $stderr = orig_stderr
93
+ end
94
+
@@ -0,0 +1,40 @@
1
+ require 'erb'
2
+
3
+ module FunctionalTestHelper
4
+ PREF = 'test/functional/sample_experiment_files'
5
+
6
+ PRP_TEST_FILE = 'test/functional/sample_experiment_files/prp_test.salsa'
7
+ PRP_TEST_FILE_FRED_STD = "#{PREF}/prp_test.salsa.fred.standalone"
8
+ PRP_TEST_FILE_ROSY_STD = "#{PREF}/prp_test.salsa.rosy.standalone"
9
+ PRP_TRAIN_FILE = 'test/functional/sample_experiment_files/prp_train.salsa'
10
+ PRP_TRAIN_FILE_FRED_STD = "#{PREF}/prp_train.salsa.fred.standalone"
11
+ PRP_TRAIN_FILE_ROSY_STD = "#{PREF}/prp_train.salsa.rosy.standalone"
12
+
13
+ FRED_TEST_FILE = 'test/functional/sample_experiment_files/fred_test.salsa'
14
+ FRED_TRAIN_FILE = 'test/functional/sample_experiment_files/fred_train.salsa'
15
+ ROSY_TEST_FILE = 'test/functional/sample_experiment_files/rosy_test.salsa'
16
+ ROSY_TRAIN_FILE = 'test/functional/sample_experiment_files/rosy_train.salsa'
17
+
18
+ # Run an external process for functional testing and check the return code.
19
+ # <system> returns <true> if the external code exposes no errors.
20
+ # <@msg> is defined for every test object.
21
+ def execute(cmd)
22
+ status = system(cmd)
23
+ assert(status, @msg)
24
+ end
25
+
26
+ # Create a temporary exp file only for this test.
27
+ # Shalmaneser needs absolute paths, we provide them in exp files
28
+ # using templating.
29
+ def create_exp_file(file)
30
+ template = File.read("#{file}.erb")
31
+ text = ERB.new(template).result
32
+ File.open(file, 'w') do |f|
33
+ f.write(text)
34
+ end
35
+ end
36
+
37
+ def remove_exp_file(file)
38
+ File.delete(file)
39
+ end
40
+ end
@@ -0,0 +1,122 @@
1
+ experiment_ID = fred_test
2
+
3
+ apply_to_all_known_targets = true
4
+
5
+ enduser_mode = false
6
+
7
+ verbose = true
8
+
9
+
10
+ ############################
11
+ # Paths
12
+ # - fred_directory: directory where Fred puts its internal data
13
+ # - directory_output:
14
+ # redirect system output of disambiguated text (in SalsaTigerXML)
15
+ # to another directory.
16
+ # If you do not set anything here, output is to
17
+ # <fred_directory>/<experiment_ID>/output/stxml
18
+ # - classifier_dir:
19
+ # Write trained classifiers to this directory.
20
+ # If you do not set this parameter, classifiers are written to
21
+ # <fred_directory>/<experiment_ID>/classifiers
22
+
23
+ fred_directory = <%= File.expand_path('test/functional/output') %>
24
+ classifier_dir = <%= File.expand_path('test/functional/input/fred/cls') %>
25
+ # - preproc_descr_file_train / ...test
26
+ # where the experiment file for frprep is located
27
+ # (preprocessing for Fred and Rosy)
28
+ # for the preprocessing of the data used in this experiment
29
+ #
30
+ # give one preprocessing file name for the training data
31
+ # and one for the test data
32
+ # (If you only ever use test data in this experiment, you only
33
+ # need to give preproc_descr_file_test, and vice versa for training data.)
34
+
35
+ preproc_descr_file_test = <%= File.expand_path('test/functional/sample_experiment_files/prp_test.salsa.fred.standalone') %>
36
+
37
+ #####################
38
+ # noncontiguous input?
39
+ # if so, set 'noncontiguous_input' to 'true' (default is 'false')
40
+ # Also give the larger corpus from which the input sentences are taken:
41
+ # - directory
42
+ # - format: same possibilities as for frprep format
43
+ # - encoding: same possibilities as for frprep encoding
44
+
45
+ noncontiguous_input = false
46
+ #larger_corpus_dir =
47
+ larger_corpus_format = SalsaTigerXML
48
+ #larger_corpus_encoding = iso
49
+
50
+
51
+ #################
52
+ # Features
53
+
54
+ # bag-of-words context, with given context size,
55
+ # for example:
56
+ feature = context 50
57
+ feature = context 2
58
+ #
59
+ # (you can give more than one context feature line!)
60
+ #
61
+ # other possible features:
62
+ # feature = syntax
63
+ # feature = synsem
64
+ #
65
+ # syntax: grammatical functions
66
+ # synsem: grammatical functions plus headwords
67
+
68
+ #feature = context % %contextsize%
69
+ feature = syntax
70
+
71
+ # How to handle training data that is labeled
72
+ # with multiple sense labels?
73
+ # - binarize (default): This works only with binary classifiers.
74
+ # When featurizing for the binary classifiers, consider an item
75
+ # positive if its set of assigned labels includes the
76
+ # label for this binary classifier.
77
+ # - repeat: Repeat the instance, once for each
78
+ # sense label that has been assigned. (Basically, treat it
79
+ # as N instances with equal features but different labels.)
80
+ # - join: join all the assigned senses into one combined sense
81
+ # and treat that as a separate sense to train on.
82
+ # - keep: keep as multiple sense labels. (Note that this
83
+ # makes sense only for classifiers that can deal with
84
+ # multiple labels.)
85
+
86
+ #handle_multilabel = binarize
87
+ handle_multilabel = repeat
88
+
89
+ # What to do with numerical features?
90
+ # - keep: just leave as is
91
+ # - repeat: for a feature with max. numerical value N,
92
+ # use N binary features
93
+ # - bin: use a fixed number of bins, e.g. 5, then
94
+ # if feature value > 20: set all bins to 1,
95
+ # if feature value > 10: set the first four bins to 1,
96
+ # etc.
97
+ # default: bin.
98
+ #numerical_features = bin
99
+ numerical_features = keep
100
+
101
+ # Binary classifiers, or n-ary classifiers?
102
+ # if binary classifiers, set 'binary_classifiers = true'
103
+ # default is 'false'.
104
+ binary_classifiers = false
105
+
106
+ #################
107
+ # Fred internal settings
108
+
109
+ # what kind of classifier to use?
110
+ #
111
+ # format:
112
+ # <classifier type> <path> <optionally another path>
113
+ #
114
+ # for maxent, give first the path where maxent resides,
115
+ # then <where_shalmaneser_resides>/program/tools/maxent
116
+ classifier = maxent /opt/OpenNLP-maxent/2.4.0 <%= File.expand_path('tools/maxent/') %>
117
+
118
+
119
+ # for binary classifiers, you can set the pseudolabel
120
+ # on the 'negative' sense.
121
+ # Default is 'NONE'
122
+ negsense = NONE
@@ -0,0 +1,135 @@
1
+ # ID identifying this experiment and all its data
2
+ # please do not use spaces inside the experiment ID
3
+ experiment_ID = fred_train
4
+
5
+ # targets:
6
+ # if apply_to_all_known_targets is set to true,
7
+ # disambiguate all words for which we have training data
8
+ # when performing task "test" (i.e. applying trained classifiers)
9
+ apply_to_all_known_targets = true
10
+
11
+ # Enduser mode?
12
+ # The idea is that the enduser will only _apply_
13
+ # pre-trained classifiers. So in enduser mode many
14
+ # options are disallowed.
15
+ enduser_mode = false
16
+
17
+
18
+ # print warnings and
19
+ # give detailed progress reports
20
+ verbose = true
21
+
22
+
23
+ ############################
24
+ # Paths
25
+ # - fred_directory: directory where Fred puts its internal data
26
+ # - directory_output:
27
+ # redirect system output of disambiguated text (in SalsaTigerXML)
28
+ # to another directory.
29
+ # If you do not set anything here, output is to
30
+ # <fred_directory>/<experiment_ID>/output/stxml
31
+ # - classifier_dir:
32
+ # Write trained classifiers to this directory.
33
+ # If you do not set this parameter, classifiers are written to
34
+ # <fred_directory>/<experiment_ID>/classifiers
35
+
36
+ fred_directory = <%= File.expand_path('test/functional/output') %>
37
+
38
+ # - preproc_descr_file_train / ...test
39
+ # where the experiment file for frprep is located
40
+ # (preprocessing for Fred and Rosy)
41
+ # for the preprocessing of the data used in this experiment
42
+ #
43
+ # give one preprocessing file name for the training data
44
+ # and one for the test data
45
+ # (If you only ever use test data in this experiment, you only
46
+ # need to give preproc_descr_file_test, and vice versa for training data.)
47
+ preproc_descr_file_train = <%= File.expand_path('test/functional/sample_experiment_files/prp_train.salsa.fred.standalone') %>
48
+
49
+
50
+
51
+ #####################
52
+ # noncontiguous input?
53
+ # if so, set 'noncontiguous_input' to 'true' (default is 'false')
54
+ # Also give the larger corpus from which the input sentences are taken:
55
+ # - directory
56
+ # - format: same possibilities as for frprep format
57
+ # - encoding: same possibilities as for frprep encoding
58
+
59
+ noncontiguous_input = false
60
+ #larger_corpus_dir =
61
+ larger_corpus_format = SalsaTigerXML
62
+ #larger_corpus_encoding = iso
63
+
64
+
65
+ #################
66
+ # Features
67
+
68
+ # bag-of-words context, with given context size,
69
+ # for example:
70
+ feature = context 50
71
+ feature = context 2
72
+ #
73
+ # (you can give more than one context feature line!)
74
+ #
75
+ # other possible features:
76
+ # feature = syntax
77
+ # feature = synsem
78
+ #
79
+ # syntax: grammatical functions
80
+ # synsem: grammatical functions plus headwords
81
+
82
+ #feature = context % %contextsize%
83
+ feature = syntax
84
+
85
+ # How to handle training data that is labeled
86
+ # with multiple sense labels?
87
+ # - binarize (default): This works only with binary classifiers.
88
+ # When featurizing for the binary classifiers, consider an item
89
+ # positive if its set of assigned labels includes the
90
+ # label for this binary classifier.
91
+ # - repeat: Repeat the instance, once for each
92
+ # sense label that has been assigned. (Basically, treat it
93
+ # as N instances with equal features but different labels.)
94
+ # - join: join all the assigned senses into one combined sense
95
+ # and treat that as a separate sense to train on.
96
+ # - keep: keep as multiple sense labels. (Note that this
97
+ # makes sense only for classifiers that can deal with
98
+ # multiple labels.)
99
+
100
+ #handle_multilabel = binarize
101
+ handle_multilabel = repeat
102
+
103
+ # What to do with numerical features?
104
+ # - keep: just leave as is
105
+ # - repeat: for a feature with max. numerical value N,
106
+ # use N binary features
107
+ # - bin: use a fixed number of bins, e.g. 5, then
108
+ # if feature value > 20: set all bins to 1,
109
+ # if feature value > 10: set the first four bins to 1,
110
+ # etc.
111
+ # default: bin.
112
+ #numerical_features = bin
113
+ numerical_features = keep
114
+ # Binary classifiers, or n-ary classifiers?
115
+ # if binary classifiers, set 'binary_classifiers = true'
116
+ # default is 'false'.
117
+ binary_classifiers = false
118
+
119
+ #################
120
+ # Fred internal settings
121
+
122
+ # what kind of classifier to use?
123
+ #
124
+ # format:
125
+ # <classifier type> <path> <optionally another path>
126
+ #
127
+ # for maxent, give first the path where maxent resides,
128
+ # then <where_shalmaneser_resides>/program/tools/maxent
129
+ classifier = maxent /opt/OpenNLP-maxent/2.4.0 <%= File.expand_path('tools/maxent/') %>
130
+
131
+
132
+ # for binary classifiers, you can set the pseudolabel
133
+ # on the 'negative' sense.
134
+ # Default is 'NONE'
135
+ negsense = NONE
@@ -0,0 +1,138 @@
1
+ #################################################
2
+ # This is a sample experiment file
3
+ # with explanations of all features
4
+ # that can be set for the frprep preprocessing system for Fred and Rosy.
5
+ #
6
+ # To start your own experiment,
7
+ # replace all occurrences of
8
+ # %...% by values of your choice.
9
+ #
10
+ # Boolean features may be omitted and are false by default.
11
+ #
12
+ # Experiment file lines that start with '#'
13
+ # are comments and are ignored. Empty lines are ignored as well.
14
+
15
+ ########################
16
+ # Experiment description
17
+ #
18
+
19
+ # ID identifying this experiment and all its data
20
+ # please do not use spaces inside the experiment ID
21
+ prep_experiment_ID = prp_test
22
+
23
+ # YOUR INPUT DATA:
24
+ # frprep accepts an input directory rather than an input file.
25
+ # It will process all files in the directory directory_input
26
+ # and write the results to directory_preprocessed.
27
+ #
28
+ # For input formats see the discussion of "format" below.
29
+ directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
30
+ directory_preprocessed = <%= File.expand_path('test/functional/output/frprep/test.salsa') %>
31
+
32
+ ##
33
+ # Experimental data is described by the following parameters:
34
+ #
35
+ # - language: en / de
36
+ # en for English or de for German
37
+ #
38
+ # - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
39
+ #
40
+ # Format of the input data, training/test set
41
+ # SalsaTigerXML: Parsed data, English or German
42
+ # FNXml: FrameNet Lexical Unit files in FrameNet XML format
43
+ # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
44
+ # SalsaTab: tabular format (internal)
45
+ # BNC: BNC XML format, alternating words and POS tags
46
+ # Plain: Plain text, ONE SENTENCE PER LINE.
47
+ #
48
+ # Preprocessing transforms all data to SalsaTigerXML.
49
+ #
50
+ # - origin: SalsaTiger / FrameNet / <not specified>
51
+ # This is the origin of the training/test data.
52
+ # SalsaTiger: data from the Tiger corpus, possibly semantically
53
+ # annotated by Salsa
54
+ # FrameNet: data from the FrameNet project
55
+ #
56
+ # Don't set 'origin' if none of these origins apply
57
+ #
58
+ # - encoding: utf8 / iso / hex / <not specified>
59
+ # Default: iso
60
+
61
+ language = de
62
+ #origin =
63
+ format = Plain
64
+ encoding = iso
65
+
66
+ #############################
67
+ # Which preprocessing steps to take?
68
+ #
69
+ # Data can be parsed, lemmatized and POS-tagged,
70
+ # but this happens only if it is specified in the
71
+ # experiment file.
72
+ #
73
+ # Set these booleans to true to trigger the respective
74
+ # type of preprocessing. The default value is false.
75
+
76
+ do_lemmatize = true
77
+ do_postag = false
78
+ do_parse = true
79
+
80
+ #############################
81
+ # directory where frprep puts its internal data
82
+ #
83
+
84
+ frprep_directory = <%= File.expand_path('test/functional/output/') %>
85
+
86
+ #############################
87
+ # Syntax/semantics interface repair:
88
+ # FrameNet annotated data has some annotation choices
89
+ # that may make it harder to learn the mapping from
90
+ # syntactic structure to semantic roles.
91
+ #
92
+ # If you are using FrameNet data for training a
93
+ # semantic role labeler, set the following two settings
94
+ # to true (default is false) to 'repair' semantic role labels
95
+ # to more closely match the syntactic structure
96
+
97
+ fe_syn_repair = true
98
+ fe_rel_repair = false
99
+
100
+
101
+ #################
102
+ # Location of tools and resources used by Fred
103
+
104
+ # currently known to the system:
105
+ # (Saarbruecken paths given)
106
+ #
107
+ # - POS tagging:
108
+ # - pos_tagger = treetagger
109
+ # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
110
+ #
111
+ # - Lemmatization:
112
+ # - lemmatizer = treetagger
113
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
114
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
115
+ #
116
+ # - Parser:
117
+ # - parser = collins (English)
118
+ # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
119
+ # - parser = sleepy (German)
120
+ # parser_path = /proj/corpora/sleepy3/
121
+ # - parser = minipar (English)
122
+ # parser_path = /proj/llx/Software/Parsers/minipar-linux/
123
+ #
124
+ pos_tagger = treetagger
125
+ pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
126
+
127
+ lemmatizer = treetagger
128
+ lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
129
+
130
+ parser = berkeley
131
+ parser_path = <%= File.expand_path('tools/berkeleyParser') %>
132
+
133
+ # parser:
134
+ # maximum no. of sentences in a parse file,
135
+ # maximum sentence length to be parsed
136
+
137
+ parser_max_sent_num = 2000
138
+ parser_max_sent_len = 80