frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,3 @@
1
# Version information for the package.
module Shalmaneser
  # Release version string. Frozen so that no caller can mutate the
  # shared constant in place.
  VERSION = '0.0.1.prealpha'.freeze
end
@@ -0,0 +1,94 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'test/unit'
4
+ require 'stringio' # for helper methods
5
+ require 'frprep/opt_parser'
6
+
7
+ include FrPrep
8
+
9
# Unit tests for the command line option parser (OptParser).
#
# The tests rely on two helper methods defined at the top level of this file:
# <quietly> (suppresses stdout/stderr) and <intercept_output> (captures
# stdout/stderr as strings).
class TestOptParser < Test::Unit::TestCase

  # Prepare an experiment file path and a list of options which the parser
  # is expected to accept.
  def setup
    @exp_file = 'test/frprep/data/prp_test.salsa'
    @valid_opts = ['--expfile', @exp_file,
                   '--help'
                  ]
  end

  # The parser has to expose the class method <parse>.
  def test_public_methods
    assert_respond_to(OptParser, :parse)
  end

  # It should return a FrPrepConfigData object.
  def test_parse_method
    input = ['-e', @exp_file]
    return_value = OptParser.parse(input)
    # assert_instance_of reports the actual class on failure.
    assert_instance_of(FrPrepConfigData, return_value)
  end

  # It should reject the empty input and exit.
  def test_empty_input
    _out, err = intercept_output do
      assert_raises(SystemExit) { OptParser.parse([]) }
    end
    # The trailing dot is escaped so that it matches a literal period only.
    assert_match(/You have to provide some options\./, err)
  end

  # It should accept correct options.
  # Invalid options are the matter of OptionParser itself,
  # we do not test them here.
  # We only test that the parser exits and does not raise an exception.
  def test_accept_correct_options
    # '--help' is treated separately (see test_successful_exit).
    # Build a new list instead of mutating the fixture from <setup>.
    opts = @valid_opts - ['--help']
    assert_nothing_raised { OptParser.parse(opts) }

    _stdout, stderr = intercept_output do
      assert_raises(SystemExit) { OptParser.parse(['--invalid-option']) }
    end

    assert_match(/You have provided an invalid option:/, stderr)
  end

  # It should successfully exit with some options.
  def test_successful_exit
    quietly do
      %w[-h --help].each do |arg|
        # Every option is passed as a one-element argument list.
        assert_raises(SystemExit) { OptParser.parse([arg]) }
      end
    end
  end

end
64
+ ################################################################################
65
# It is a helper method. Many testable units provide some verbose output
# to stderr and/or stdout; it is useful to suppress any kind of verbosity.
#
# Redirects $stderr and $stdout to the null device while the given block
# runs and restores both streams afterwards, even if the block raises.
#
# Returns the value of the block.
def quietly(&b)
  orig_stderr = $stderr.clone
  orig_stdout = $stdout.clone
  # IO#reopen duplicates the underlying descriptor, so the handle on the
  # null device can be closed right after both streams are redirected.
  # File::NULL is used instead of a hard-coded '/dev/null'.
  File.open(File::NULL, 'w') do |null|
    $stderr.reopen(null)
    $stdout.reopen(null)
  end
  b.call
ensure
  # Restore each stream only if its backup exists, then release the backup
  # so that no file descriptor is left open.
  if orig_stderr
    $stderr.reopen(orig_stderr)
    orig_stderr.close
  end
  if orig_stdout
    $stdout.reopen(orig_stdout)
    orig_stdout.close
  end
end
79
+
80
# Helper for inspecting what a piece of code prints.
#
# Swaps $stdout and $stderr for fresh StringIO buffers, yields to the
# given block and returns the collected text as a two-element array:
# first the stdout content, then the stderr content.
# The previous streams are put back in place in any case.
def intercept_output
  saved_out, saved_err = $stdout, $stderr
  $stdout, $stderr = StringIO.new, StringIO.new

  yield

  [$stdout.string, $stderr.string]
ensure
  $stdout = saved_out
  $stderr = saved_err
end
94
+
@@ -0,0 +1,40 @@
1
+ require 'erb'
2
+
3
# Shared constants and helper methods for the functional tests.
#
# <execute> calls <assert> and reads <@msg>, so the object that mixes this
# module in has to provide both.
module FunctionalTestHelper
  # Directory with the sample experiment files and their ERB templates.
  PREF = 'test/functional/sample_experiment_files'.freeze

  # All experiment file paths are derived from PREF.
  PRP_TEST_FILE = "#{PREF}/prp_test.salsa".freeze
  PRP_TEST_FILE_FRED_STD = "#{PREF}/prp_test.salsa.fred.standalone".freeze
  PRP_TEST_FILE_ROSY_STD = "#{PREF}/prp_test.salsa.rosy.standalone".freeze
  PRP_TRAIN_FILE = "#{PREF}/prp_train.salsa".freeze
  PRP_TRAIN_FILE_FRED_STD = "#{PREF}/prp_train.salsa.fred.standalone".freeze
  PRP_TRAIN_FILE_ROSY_STD = "#{PREF}/prp_train.salsa.rosy.standalone".freeze

  FRED_TEST_FILE = "#{PREF}/fred_test.salsa".freeze
  FRED_TRAIN_FILE = "#{PREF}/fred_train.salsa".freeze
  ROSY_TEST_FILE = "#{PREF}/rosy_test.salsa".freeze
  ROSY_TRAIN_FILE = "#{PREF}/rosy_train.salsa".freeze

  # Run an external process for functional testing and check the return code.
  # <system> returns <true> if the external code exposes no errors.
  # <@msg> is defined for every test object.
  def execute(cmd)
    status = system(cmd)
    assert(status, @msg)
  end

  # Create a temporary exp file only for this test.
  # Shalmaneser needs absolute paths, we provide them in exp files
  # using templating.
  #
  # Reads the template "<file>.erb", renders it with ERB and writes the
  # result to <file>.
  def create_exp_file(file)
    template = File.read("#{file}.erb")
    File.write(file, ERB.new(template).result)
  end

  # Delete an exp file generated by <create_exp_file>.
  def remove_exp_file(file)
    File.delete(file)
  end
end
@@ -0,0 +1,122 @@
1
+ experiment_ID = fred_test
2
+
3
+ apply_to_all_known_targets = true
4
+
5
+ enduser_mode = false
6
+
7
+ verbose = true
8
+
9
+
10
+ ############################
11
+ # Paths
12
+ # - fred_directory: directory where Fred puts its internal data
13
+ # - directory_output:
14
+ # redirect system output of disambiguated text (in SalsaTigerXML)
15
+ # to another directory.
16
+ # If you do not set anything here, output is to
17
+ # <fred_directory>/<experiment_ID>/output/stxml
18
+ # - classifier_dir:
19
+ # Write trained classifiers to this directory.
20
+ # If you do not set this parameter, classifiers are written to
21
+ # <fred_directory>/<experiment_ID>/classifiers
22
+
23
+ fred_directory = <%= File.expand_path('test/functional/output') %>
24
+ classifier_dir = <%= File.expand_path('test/functional/input/fred/cls') %>
25
+ # - preproc_descr_file_train / ...test
26
+ # where the experiment file for frprep is located
27
+ # (preprocessing for Fred and Rosy)
28
+ # for the preprocessing of the data used in this experiment
29
+ #
30
+ # give one preprocessing file name for the training data
31
+ # and one for the test data
32
+ # (If you only ever use test data in this experiment, you only
33
+ # need to give preproc_descr_file_test, and vice versa for training data.)
34
+
35
+ preproc_descr_file_test = <%= File.expand_path('test/functional/sample_experiment_files/prp_test.salsa.fred.standalone') %>
36
+
37
+ #####################
38
+ # noncontiguous input?
39
+ # if so, set 'noncontiguous_input' to 'true' (default is 'false')
40
+ # Also give the larger corpus from which the input sentences are taken:
41
+ # - directory
42
+ # - format: same possibilities as for frprep format
43
+ # - encoding: same possibilities as for frprep encoding
44
+
45
+ noncontiguous_input = false
46
+ #larger_corpus_dir =
47
+ larger_corpus_format = SalsaTigerXML
48
+ #larger_corpus_encoding = iso
49
+
50
+
51
+ #################
52
+ # Features
53
+
54
+ # bag-of-words context, with given context size,
55
+ # for example:
56
+ feature = context 50
57
+ feature = context 2
58
+ #
59
+ # (you can give more than one context feature line!)
60
+ #
61
+ # other possible features:
62
+ # feature = syntax
63
+ # feature = synsem
64
+ #
65
+ # syntax: grammatical functions
66
+ # synsem: grammatical functions plus headwords
67
+
68
+ #feature = context % %contextsize%
69
+ feature = syntax
70
+
71
+ # How to handle training data that is labeled
72
+ # with multiple sense labels?
73
+ # - binarize (default): This works only with binary classifiers.
74
+ # When featurizing for the binary classifiers, consider an item
75
+ # positive if its set of assigned labels includes the
76
+ # label for this binary classifier.
77
+ # - repeat: Repeat the instance, once for each
78
+ # sense label that has been assigned. (Basically, treat it
79
+ # as N instances with equal features but different labels.)
80
+ # - join: join all the assigned senses into one combined sense
81
+ # and treat that as a separate sense to train on.
82
+ # - keep: keep as multiple sense labels. (Note that this
83
+ # makes sense only for classifiers that can deal with
84
+ # multiple labels.)
85
+
86
+ #handle_multilabel = binarize
87
+ handle_multilabel = repeat
88
+
89
+ # What to do with numerical features?
90
+ # - keep: just leave as is
91
+ # - repeat: for a feature with max. numerical value N,
92
+ # use N binary features
93
+ # - bin: use a fixed number of bins, e.g. 5, then
94
+ # if feature value > 20: set all bins to 1,
95
+ # if feature value > 10: set the first four bins to 1,
96
+ # etc.
97
+ # default: bin.
98
+ #numerical_features = bin
99
+ numerical_features = keep
100
+
101
+ # Binary classifiers, or n-ary classifiers?
102
+ # if binary classifiers, set 'binary_classifiers = true'
103
+ # default is 'false'.
104
+ binary_classifiers = false
105
+
106
+ #################
107
+ # Fred internal settings
108
+
109
+ # what kind of classifier to use?
110
+ #
111
+ # format:
112
+ # <classifier type> <path> <optionally another path>
113
+ #
114
+ # for maxent, give first the path where maxent resides,
115
+ # then <where_shalmaneser_resides>/program/tools/maxent
116
+ classifier = maxent /opt/OpenNLP-maxent/2.4.0 <%= File.expand_path('tools/maxent/') %>
117
+
118
+
119
+ # for binary classifiers, you can set the pseudolabel
120
+ # on the 'negative' sense.
121
+ # Default is 'NONE'
122
+ negsense = NONE
@@ -0,0 +1,135 @@
1
+ # ID identifying this experiment and all its data
2
+ # please do not use spaces inside the experiment ID
3
+ experiment_ID = fred_train
4
+
5
+ # targets:
6
+ # if apply_to_all_known_targets is set to true,
7
+ # disambiguate all words for which we have training data
8
+ # when performing task "test" (i.e. applying trained classifiers)
9
+ apply_to_all_known_targets = true
10
+
11
+ # Enduser mode?
12
+ # The idea is that the enduser will only _apply_
13
+ # pre-trained classifiers. So in enduser mode many
14
+ # options are disallowed.
15
+ enduser_mode = false
16
+
17
+
18
+ # print warnings and
19
+ # give detailed progress reports
20
+ verbose = true
21
+
22
+
23
+ ############################
24
+ # Paths
25
+ # - fred_directory: directory where Fred puts its internal data
26
+ # - directory_output:
27
+ # redirect system output of disambiguated text (in SalsaTigerXML)
28
+ # to another directory.
29
+ # If you do not set anything here, output is to
30
+ # <fred_directory>/<experiment_ID>/output/stxml
31
+ # - classifier_dir:
32
+ # Write trained classifiers to this directory.
33
+ # If you do not set this parameter, classifiers are written to
34
+ # <fred_directory>/<experiment_ID>/classifiers
35
+
36
+ fred_directory = <%= File.expand_path('test/functional/output') %>
37
+
38
+ # - preproc_descr_file_train / ...test
39
+ # where the experiment file for frprep is located
40
+ # (preprocessing for Fred and Rosy)
41
+ # for the preprocessing of the data used in this experiment
42
+ #
43
+ # give one preprocessing file name for the training data
44
+ # and one for the test data
45
+ # (If you only ever use test data in this experiment, you only
46
+ # need to give preproc_descr_file_test, and vice versa for training data.)
47
+ preproc_descr_file_train = <%= File.expand_path('test/functional/sample_experiment_files/prp_train.salsa.fred.standalone') %>
48
+
49
+
50
+
51
+ #####################
52
+ # noncontiguous input?
53
+ # if so, set 'noncontiguous_input' to 'true' (default is 'false')
54
+ # Also give the larger corpus from which the input sentences are taken:
55
+ # - directory
56
+ # - format: same possibilities as for frprep format
57
+ # - encoding: same possibilities as for frprep encoding
58
+
59
+ noncontiguous_input = false
60
+ #larger_corpus_dir =
61
+ larger_corpus_format = SalsaTigerXML
62
+ #larger_corpus_encoding = iso
63
+
64
+
65
+ #################
66
+ # Features
67
+
68
+ # bag-of-words context, with given context size,
69
+ # for example:
70
+ feature = context 50
71
+ feature = context 2
72
+ #
73
+ # (you can give more than one context feature line!)
74
+ #
75
+ # other possible features:
76
+ # feature = syntax
77
+ # feature = synsem
78
+ #
79
+ # syntax: grammatical functions
80
+ # synsem: grammatical functions plus headwords
81
+
82
+ #feature = context % %contextsize%
83
+ feature = syntax
84
+
85
+ # How to handle training data that is labeled
86
+ # with multiple sense labels?
87
+ # - binarize (default): This works only with binary classifiers.
88
+ # When featurizing for the binary classifiers, consider an item
89
+ # positive if its set of assigned labels includes the
90
+ # label for this binary classifier.
91
+ # - repeat: Repeat the instance, once for each
92
+ # sense label that has been assigned. (Basically, treat it
93
+ # as N instances with equal features but different labels.)
94
+ # - join: join all the assigned senses into one combined sense
95
+ # and treat that as a separate sense to train on.
96
+ # - keep: keep as multiple sense labels. (Note that this
97
+ # makes sense only for classifiers that can deal with
98
+ # multiple labels.)
99
+
100
+ #handle_multilabel = binarize
101
+ handle_multilabel = repeat
102
+
103
+ # What to do with numerical features?
104
+ # - keep: just leave as is
105
+ # - repeat: for a feature with max. numerical value N,
106
+ # use N binary features
107
+ # - bin: use a fixed number of bins, e.g. 5, then
108
+ # if feature value > 20: set all bins to 1,
109
+ # if feature value > 10: set the first four bins to 1,
110
+ # etc.
111
+ # default: bin.
112
+ #numerical_features = bin
113
+ numerical_features = keep
114
+ # Binary classifiers, or n-ary classifiers?
115
+ # if binary classifiers, set 'binary_classifiers = true'
116
+ # default is 'false'.
117
+ binary_classifiers = false
118
+
119
+ #################
120
+ # Fred internal settings
121
+
122
+ # what kind of classifier to use?
123
+ #
124
+ # format:
125
+ # <classifier type> <path> <optionally another path>
126
+ #
127
+ # for maxent, give first the path where maxent resides,
128
+ # then <where_shalmaneser_resides>/program/tools/maxent
129
+ classifier = maxent /opt/OpenNLP-maxent/2.4.0 <%= File.expand_path('tools/maxent/') %>
130
+
131
+
132
+ # for binary classifiers, you can set the pseudolabel
133
+ # on the 'negative' sense.
134
+ # Default is 'NONE'
135
+ negsense = NONE
@@ -0,0 +1,138 @@
1
+ #################################################
2
+ # This is a sample experiment file
3
+ # with explanations of all features
4
+ # that can be set for the frprep preprocessing system for Fred and Rosy.
5
+ #
6
+ # To start your own experiment,
7
+ # replace all occurrences of
8
+ # %...% by values of your choice.
9
+ #
10
+ # Boolean features may be omitted and are false by default.
11
+ #
12
+ # Experiment file lines that start with '#'
13
+ # are comments and are ignored. Empty lines are ignored as well.
14
+
15
+ ########################
16
+ # Experiment description
17
+ #
18
+
19
+ # ID identifying this experiment and all its data
20
+ # please do not use spaces inside the experiment ID
21
+ prep_experiment_ID = prp_test
22
+
23
+ # YOUR INPUT DATA:
24
+ # frprep accepts an input directory rather than an input file.
25
+ # It will process all files in the directory directory_input
26
+ # and write the results to directory_preprocessed.
27
+ #
28
+ # For input formats see the discussion of "format" below.
29
+ directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
30
+ directory_preprocessed = <%= File.expand_path('test/functional/output/frprep/test.salsa') %>
31
+
32
+ ##
33
+ # Experimental data is described by the following parameters:
34
+ #
35
+ # - language: en / de
36
+ # en for English or de for German
37
+ #
38
+ # - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
39
+ #
40
+ # Format of the input data, training/test set
41
+ # SalsaTigerXML: Parsed data, English or German
42
+ # FNXml: FrameNet Lexical Unit files in FrameNet XML format
43
+ # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
44
+ # SalsaTab: tabular format (internal)
45
+ # BNC: BNC XML format, alternating words and POS tags
46
+ # Plain: Plain text, ONE SENTENCE PER LINE.
47
+ #
48
+ # Preprocessing transforms all data to SalsaTigerXML.
49
+ #
50
+ # - origin: SalsaTiger / FrameNet / <not specified>
51
+ # This is the origin of the training/test data.
52
+ # SalsaTiger: data from the Tiger corpus, possibly semantically
53
+ # annotated by Salsa
54
+ # FrameNet: data from the FrameNet project
55
+ #
56
+ # Don't set 'origin' if none of these origins apply
57
+ #
58
+ # - encoding: utf8 / iso / hex / <not specified>
59
+ # Default: iso
60
+
61
+ language = de
62
+ #origin =
63
+ format = Plain
64
+ encoding = iso
65
+
66
+ #############################
67
+ # Which preprocessing steps to take?
68
+ #
69
+ # Data can be parsed, lemmatized and POS-tagged,
70
+ # but this happens only if it is specified in the
71
+ # experiment file.
72
+ #
73
+ # Set these booleans to true to trigger the respective
74
+ # type of preprocessing. The default value is false.
75
+
76
+ do_lemmatize = true
77
+ do_postag = false
78
+ do_parse = true
79
+
80
+ #############################
81
+ # directory where frprep puts its internal data
82
+ #
83
+
84
+ frprep_directory = <%= File.expand_path('test/functional/output/') %>
85
+
86
+ #############################
87
+ # Syntax/semantics interface repair:
88
+ # FrameNet annotated data has some annotation choices
89
+ # that may make it harder to learn the mapping from
90
+ # syntactic structure to semantic roles.
91
+ #
92
+ # If you are using FrameNet data for training a
93
+ # semantic role labeler, set the following two settings
94
+ # to true (default is false) to 'repair' semantic role labels
95
+ # to more closely match the syntactic structure
96
+
97
+ fe_syn_repair = true
98
+ fe_rel_repair = false
99
+
100
+
101
+ #################
102
+ # Location of tools and resources used by Fred
103
+
104
+ # currently known to the system:
105
+ # (Saarbruecken paths given)
106
+ #
107
+ # - POS tagging:
108
+ # - pos_tagger = treetagger
109
+ # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
110
+ #
111
+ # - Lemmatization:
112
+ # - lemmatizer = treetagger
113
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
114
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
115
+ #
116
+ # - Parser:
117
+ # - parser = collins (English)
118
+ # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
119
+ # - parser = sleepy (German)
120
+ # parser_path = /proj/corpora/sleepy3/
121
+ # - parser = minipar (English)
122
+ # parser_path = /proj/llx/Software/Parsers/minipar-linux/
123
+ #
124
+ pos_tagger = treetagger
125
+ pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
126
+
127
+ lemmatizer = treetagger
128
+ lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
129
+
130
+ parser = berkeley
131
+ parser_path = <%= File.expand_path('tools/berkeleyParser') %>
132
+
133
+ # parser:
134
+ # maximum no. of sentences in a parse file,
135
+ # maximum sentence length to be parsed
136
+
137
+ parser_max_sent_num = 2000
138
+ parser_max_sent_len = 80