frprep 0.0.1.prealpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,120 @@
1
+ # ID identifying this experiment and all its data
2
+ # please do not use spaces inside the experiment ID
3
+ prep_experiment_ID = prp_test
4
+
5
+ # YOUR INPUT DATA:
6
+ # frprep accepts an input directory rather than an input file.
7
+ # It will process all files in the directory directory_input
8
+ # and write the results to directory_preprocessed.
9
+ #
10
+ # For input formats see the discussion of "format" below.
11
+ #directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
12
+ directory_preprocessed = <%= File.expand_path('test/functional/input/fred/frprep/test.salsa') %>
13
+
14
+ ##
15
+ # Experimental data is described by the following parameters:
16
+ #
17
+ # - language: en / de
18
+ # en for English or de for German
19
+ #
20
+ # - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
21
+ #
22
+ # Format of the input data, training/test set
23
+ # SalsaTigerXML: Parsed data, English or German
24
+ # FNXml: FrameNet Lexical Unit files in FrameNet XML format
25
+ # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
26
+ # SalsaTab: tabular format (internal)
27
+ # BNC: BNC XML format, alternating words and POS tags
28
+ # Plain: Plain text, ONE SENTENCE PER LINE.
29
+ #
30
+ # Preprocessing transforms all data to SalsaTigerXML.
31
+ #
32
+ # - origin: SalsaTiger / FrameNet / <not specified>
33
+ # This is the origin of the training/test data.
34
+ # SalsaTiger: data from the Tiger corpus, possibly semantically
35
+ # annotated by Salsa
36
+ # FrameNet: data from the FrameNet project
37
+ #
38
+ # Don't set 'origin' if none of these origins apply
39
+ #
40
+ # - encoding: utf8 / iso / hex / <not specified>
41
+ # Default: iso
42
+
43
+ language = de
44
+ #origin =
45
+ format = Plain
46
+ encoding = iso
47
+
48
+ #############################
49
+ # Which preprocessing steps to take?
50
+ #
51
+ # Data can be parsed, lemmatized and POS-tagged,
52
+ # but this happens only if it is specified in the
53
+ # experiment file.
54
+ #
55
+ # Set these booleans to true to trigger the respective
56
+ # type of preprocessing. The default value is false.
57
+
58
+ do_lemmatize = true
59
+ do_postag = false
60
+ do_parse = true
61
+
62
+ #############################
63
+ # directory where frprep puts its internal data
64
+ #
65
+
66
+ #frprep_directory = <%= File.expand_path('test/functional/input/fred/frprep') %>
67
+
68
+ #############################
69
+ # Syntax/semantics interface repair:
70
+ # FrameNet annotated data has some annotation choices
71
+ # that may make it harder to learn the mapping from
72
+ # syntactic structure to semantic roles.
73
+ #
74
+ # If you are using FrameNet data for training a
75
+ # semantic role labeler, set the following two settings
76
+ # to true (default is false) to 'repair' semantic role labels
77
+ # to more closely match the syntactic structure
78
+
79
+ fe_syn_repair = true
80
+ fe_rel_repair = false
81
+
82
+
83
+ #################
84
+ # Location of tools and resources used by Fred
85
+
86
+ # currently known to the system:
87
+ # (Saarbruecken paths given)
88
+ #
89
+ # - POS tagging:
90
+ # - pos_tagger = treetagger
91
+ # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
92
+ #
93
+ # - Lemmatization:
94
+ # - lemmatizer = treetagger
95
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
96
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
97
+ #
98
+ # - Parser:
99
+ # - parser = collins (English)
100
+ # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
101
+ # - parser = sleepy (German)
102
+ # parser_path = /proj/corpora/sleepy3/
103
+ # - parser = minipar (English)
104
+ # parser_path = /proj/llx/Software/Parsers/minipar-linux/
105
+ #
106
+ pos_tagger = treetagger
107
+ pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
108
+
109
+ lemmatizer = treetagger
110
+ lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
111
+
112
+ parser = berkeley
113
+ parser_path = <%= File.expand_path('tools/berkeleyParser') %>
114
+
115
+ # parser:
116
+ # maximum no. of sentences in a parse file,
117
+ # maximum sentence length to be parsed
118
+
119
+ parser_max_sent_num = 2000
120
+ parser_max_sent_len = 80
@@ -0,0 +1,120 @@
1
+ # ID identifying this experiment and all its data
2
+ # please do not use spaces inside the experiment ID
3
+ prep_experiment_ID = prp_test
4
+
5
+ # YOUR INPUT DATA:
6
+ # frprep accepts an input directory rather than an input file.
7
+ # It will process all files in the directory directory_input
8
+ # and write the results to directory_preprocessed.
9
+ #
10
+ # For input formats see the discussion of "format" below.
11
+ #directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
12
+ directory_preprocessed = <%= File.expand_path('test/functional/input/rosy/frprep/test.salsa') %>
13
+
14
+ ##
15
+ # Experimental data is described by the following parameters:
16
+ #
17
+ # - language: en / de
18
+ # en for English or de for German
19
+ #
20
+ # - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
21
+ #
22
+ # Format of the input data, training/test set
23
+ # SalsaTigerXML: Parsed data, English or German
24
+ # FNXml: FrameNet Lexical Unit files in FrameNet XML format
25
+ # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
26
+ # SalsaTab: tabular format (internal)
27
+ # BNC: BNC XML format, alternating words and POS tags
27
+ # Plain: Plain text, ONE SENTENCE PER LINE.
29
+ #
30
+ # Preprocessing transforms all data to SalsaTigerXML.
31
+ #
32
+ # - origin: SalsaTiger / FrameNet / <not specified>
33
+ # This is the origin of the training/test data.
34
+ # SalsaTiger: data from the Tiger corpus, possibly semantically
35
+ # annotated by Salsa
36
+ # FrameNet: data from the FrameNet project
37
+ #
38
+ # Don't set 'origin' if none of these origins apply
39
+ #
40
+ # - encoding: utf8 / iso / hex / <not specified>
41
+ # Default: iso
42
+
43
+ language = de
44
+ #origin =
45
+ format = Plain
46
+ encoding = iso
47
+
48
+ #############################
49
+ # Which preprocessing steps to take?
50
+ #
51
+ # Data can be parsed, lemmatized and POS-tagged,
52
+ # but this happens only if it is specified in the
53
+ # experiment file.
54
+ #
55
+ # Set these booleans to true to trigger the respective
56
+ # type of preprocessing. The default value is false.
57
+
58
+ do_lemmatize = true
59
+ do_postag = false
60
+ do_parse = true
61
+
62
+ #############################
63
+ # directory where frprep puts its internal data
64
+ #
65
+
66
+ #frprep_directory = <%= File.expand_path('test/functional/input/rosy/frprep') %>
67
+
68
+ #############################
69
+ # Syntax/semantics interface repair:
70
+ # FrameNet annotated data has some annotation choices
71
+ # that may make it harder to learn the mapping from
72
+ # syntactic structure to semantic roles.
73
+ #
74
+ # If you are using FrameNet data for training a
75
+ # semantic role labeler, set the following two settings
76
+ # to true (default is false) to 'repair' semantic role labels
77
+ # to more closely match the syntactic structure
78
+
79
+ fe_syn_repair = true
80
+ fe_rel_repair = false
81
+
82
+
83
+ #################
84
+ # Location of tools and resources used by Fred
85
+
86
+ # currently known to the system:
87
+ # (Saarbruecken paths given)
88
+ #
89
+ # - POS tagging:
90
+ # - pos_tagger = treetagger
91
+ # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
92
+ #
93
+ # - Lemmatization:
94
+ # - lemmatizer = treetagger
95
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
96
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
97
+ #
98
+ # - Parser:
99
+ # - parser = collins (English)
100
+ # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
101
+ # - parser = sleepy (German)
102
+ # parser_path = /proj/corpora/sleepy3/
103
+ # - parser = minipar (English)
104
+ # parser_path = /proj/llx/Software/Parsers/minipar-linux/
105
+ #
106
+ pos_tagger = treetagger
107
+ pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
108
+
109
+ lemmatizer = treetagger
110
+ lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
111
+
112
+ parser = berkeley
113
+ parser_path = <%= File.expand_path('tools/berkeleyParser') %>
114
+
115
+ # parser:
116
+ # maximum no. of sentences in a parse file,
117
+ # maximum sentence length to be parsed
118
+
119
+ parser_max_sent_num = 2000
120
+ parser_max_sent_len = 80
@@ -0,0 +1,138 @@
1
+ #################################################
2
+ # This is a sample experiment file
3
+ # with explanations of all features
4
+ # that can be set for the frprep preprocessing system for Fred and Rosy.
5
+ #
6
+ # To start your own experiment,
7
+ # replace all occurrences of
8
+ # %...% by values of your choice.
9
+ #
10
+ # Boolean features may be omitted and are false by default.
11
+ #
12
+ # Experiment file lines that start with '#'
13
+ # are comments and are ignored. Empty lines are ignored as well.
14
+
15
+ ########################
16
+ # Experiment description
17
+ #
18
+
19
+ # ID identifying this experiment and all its data
20
+ # please do not use spaces inside the experiment ID
21
+ prep_experiment_ID = prp_train
22
+
23
+ # YOUR INPUT DATA:
24
+ # frprep accepts an input directory rather than an input file.
25
+ # It will process all files in the directory directory_input
26
+ # and write the results to directory_preprocessed.
27
+ #
28
+ # For input formats see the discussion of "format" below.
29
+ directory_input = <%= File.expand_path('test/functional/input/frprep/train.salsa') %>
30
+ directory_preprocessed = <%= File.expand_path('test/functional/output/frprep/train.salsa') %>
31
+
32
+ ##
33
+ # Experimental data is described by the following parameters:
34
+ #
35
+ # - language: en / de
36
+ # en for English or de for German
37
+ #
38
+ # - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
39
+ #
40
+ # Format of the input data, training/test set
41
+ # SalsaTigerXML: Parsed data, English or German
42
+ # FNXml: FrameNet Lexical Unit files in FrameNet XML format
43
+ # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
44
+ # SalsaTab: tabular format (internal)
45
+ # BNC: BNC XML format, alternating words and POS tags
45
+ # Plain: Plain text, ONE SENTENCE PER LINE.
47
+ #
48
+ # Preprocessing transforms all data to SalsaTigerXML.
49
+ #
50
+ # - origin: SalsaTiger / FrameNet / <not specified>
51
+ # This is the origin of the training/test data.
52
+ # SalsaTiger: data from the Tiger corpus, possibly semantically
53
+ # annotated by Salsa
54
+ # FrameNet: data from the FrameNet project
55
+ #
56
+ # Don't set 'origin' if none of these origins apply
57
+ #
58
+ # - encoding: utf8 / iso / hex / <not specified>
59
+ # Default: iso
60
+
61
+ language = de
62
+ #origin =
63
+ format = SalsaTigerXML
64
+ encoding = utf8
65
+
66
+ #############################
67
+ # Which preprocessing steps to take?
68
+ #
69
+ # Data can be parsed, lemmatized and POS-tagged,
70
+ # but this happens only if it is specified in the
71
+ # experiment file.
72
+ #
73
+ # Set these booleans to true to trigger the respective
74
+ # type of preprocessing. The default value is false.
75
+
76
+ do_lemmatize = true
77
+ do_postag = false
78
+ do_parse = true
79
+
80
+ #############################
81
+ # directory where frprep puts its internal data
82
+ #
83
+
84
+ frprep_directory = <%= File.expand_path('test/functional/output/') %>
85
+
86
+ #############################
87
+ # Syntax/semantics interface repair:
88
+ # FrameNet annotated data has some annotation choices
89
+ # that may make it harder to learn the mapping from
90
+ # syntactic structure to semantic roles.
91
+ #
92
+ # If you are using FrameNet data for training a
93
+ # semantic role labeler, set the following two settings
94
+ # to true (default is false) to 'repair' semantic role labels
95
+ # to more closely match the syntactic structure
96
+
97
+ fe_syn_repair = true
98
+ fe_rel_repair = false
99
+
100
+
101
+ #################
102
+ # Location of tools and resources used by Fred
103
+
104
+ # currently known to the system:
105
+ # (Saarbruecken paths given)
106
+ #
107
+ # - POS tagging:
108
+ # - pos_tagger = treetagger
109
+ # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
110
+ #
111
+ # - Lemmatization:
112
+ # - lemmatizer = treetagger
113
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
114
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
115
+ #
116
+ # - Parser:
117
+ # - parser = collins (English)
118
+ # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
119
+ # - parser = sleepy (German)
120
+ # parser_path = /proj/corpora/sleepy3/
121
+ # - parser = minipar (English)
122
+ # parser_path = /proj/llx/Software/Parsers/minipar-linux/
123
+ #
124
+ pos_tagger = treetagger
125
+ pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
126
+
127
+ lemmatizer = treetagger
128
+ lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
129
+
130
+ parser = berkeley
131
+ parser_path = <%= File.expand_path('tools/berkeleyParser') %>
132
+
133
+ # parser:
134
+ # maximum no. of sentences in a parse file,
135
+ # maximum sentence length to be parsed
136
+
137
+ parser_max_sent_num = 2000
138
+ parser_max_sent_len = 80
@@ -0,0 +1,138 @@
1
+ #################################################
2
+ # This is a sample experiment file
3
+ # with explanations of all features
4
+ # that can be set for the frprep preprocessing system for Fred and Rosy.
5
+ #
6
+ # To start your own experiment,
7
+ # replace all occurrences of
8
+ # %...% by values of your choice.
9
+ #
10
+ # Boolean features may be omitted and are false by default.
11
+ #
12
+ # Experiment file lines that start with '#'
13
+ # are comments and are ignored. Empty lines are ignored as well.
14
+
15
+ ########################
16
+ # Experiment description
17
+ #
18
+
19
+ # ID identifying this experiment and all its data
20
+ # please do not use spaces inside the experiment ID
21
+ prep_experiment_ID = prp_train
22
+
23
+ # YOUR INPUT DATA:
24
+ # frprep accepts an input directory rather than an input file.
25
+ # It will process all files in the directory directory_input
26
+ # and write the results to directory_preprocessed.
27
+ #
28
+ # For input formats see the discussion of "format" below.
29
+ #directory_input = <%= File.expand_path('test/functional/input/frprep/train.salsa') %>
30
+ directory_preprocessed = <%= File.expand_path('test/functional/input/fred/frprep/train.salsa') %>
31
+
32
+ ##
33
+ # Experimental data is described by the following parameters:
34
+ #
35
+ # - language: en / de
36
+ # en for English or de for German
37
+ #
38
+ # - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
39
+ #
40
+ # Format of the input data, training/test set
41
+ # SalsaTigerXML: Parsed data, English or German
42
+ # FNXml: FrameNet Lexical Unit files in FrameNet XML format
43
+ # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
44
+ # SalsaTab: tabular format (internal)
45
+ # BNC: BNC XML format, alternating words and POS tags
45
+ # Plain: Plain text, ONE SENTENCE PER LINE.
47
+ #
48
+ # Preprocessing transforms all data to SalsaTigerXML.
49
+ #
50
+ # - origin: SalsaTiger / FrameNet / <not specified>
51
+ # This is the origin of the training/test data.
52
+ # SalsaTiger: data from the Tiger corpus, possibly semantically
53
+ # annotated by Salsa
54
+ # FrameNet: data from the FrameNet project
55
+ #
56
+ # Don't set 'origin' if none of these origins apply
57
+ #
58
+ # - encoding: utf8 / iso / hex / <not specified>
59
+ # Default: iso
60
+
61
+ language = de
62
+ #origin =
63
+ format = SalsaTigerXML
64
+ encoding = utf8
65
+
66
+ #############################
67
+ # Which preprocessing steps to take?
68
+ #
69
+ # Data can be parsed, lemmatized and POS-tagged,
70
+ # but this happens only if it is specified in the
71
+ # experiment file.
72
+ #
73
+ # Set these booleans to true to trigger the respective
74
+ # type of preprocessing. The default value is false.
75
+
76
+ do_lemmatize = true
77
+ do_postag = false
78
+ do_parse = true
79
+
80
+ #############################
81
+ # directory where frprep puts its internal data
82
+ #
83
+
84
+ #frprep_directory = <%= File.expand_path('test/functional/input/fred/') %>
85
+
86
+ #############################
87
+ # Syntax/semantics interface repair:
88
+ # FrameNet annotated data has some annotation choices
89
+ # that may make it harder to learn the mapping from
90
+ # syntactic structure to semantic roles.
91
+ #
92
+ # If you are using FrameNet data for training a
93
+ # semantic role labeler, set the following two settings
94
+ # to true (default is false) to 'repair' semantic role labels
95
+ # to more closely match the syntactic structure
96
+
97
+ fe_syn_repair = true
98
+ fe_rel_repair = false
99
+
100
+
101
+ #################
102
+ # Location of tools and resources used by Fred
103
+
104
+ # currently known to the system:
105
+ # (Saarbruecken paths given)
106
+ #
107
+ # - POS tagging:
108
+ # - pos_tagger = treetagger
109
+ # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
110
+ #
111
+ # - Lemmatization:
112
+ # - lemmatizer = treetagger
113
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
114
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
115
+ #
116
+ # - Parser:
117
+ # - parser = collins (English)
118
+ # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
119
+ # - parser = sleepy (German)
120
+ # parser_path = /proj/corpora/sleepy3/
121
+ # - parser = minipar (English)
122
+ # parser_path = /proj/llx/Software/Parsers/minipar-linux/
123
+ #
124
+ pos_tagger = treetagger
125
+ pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
126
+
127
+ lemmatizer = treetagger
128
+ lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
129
+
130
+ parser = berkeley
131
+ parser_path = <%= File.expand_path('tools/berkeleyParser') %>
132
+
133
+ # parser:
134
+ # maximum no. of sentences in a parse file,
135
+ # maximum sentence length to be parsed
136
+
137
+ parser_max_sent_num = 2000
138
+ parser_max_sent_len = 80