frprep 0.0.1.prealpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (138) hide show
  1. data/.yardopts +8 -0
  2. data/CHANGELOG.rdoc +0 -0
  3. data/LICENSE.rdoc +0 -0
  4. data/README.rdoc +0 -0
  5. data/lib/common/AbstractSynInterface.rb +1227 -0
  6. data/lib/common/BerkeleyInterface.rb +375 -0
  7. data/lib/common/CollinsInterface.rb +1165 -0
  8. data/lib/common/ConfigData.rb +694 -0
  9. data/lib/common/Counter.rb +18 -0
  10. data/lib/common/DBInterface.rb +48 -0
  11. data/lib/common/EnduserMode.rb +27 -0
  12. data/lib/common/Eval.rb +480 -0
  13. data/lib/common/FixSynSemMapping.rb +196 -0
  14. data/lib/common/FrPrepConfigData.rb +66 -0
  15. data/lib/common/FrprepHelper.rb +1324 -0
  16. data/lib/common/Graph.rb +345 -0
  17. data/lib/common/ISO-8859-1.rb +24 -0
  18. data/lib/common/ML.rb +186 -0
  19. data/lib/common/Maxent.rb +215 -0
  20. data/lib/common/MiniparInterface.rb +1388 -0
  21. data/lib/common/Optimise.rb +195 -0
  22. data/lib/common/Parser.rb +213 -0
  23. data/lib/common/RegXML.rb +269 -0
  24. data/lib/common/RosyConventions.rb +171 -0
  25. data/lib/common/SQLQuery.rb +243 -0
  26. data/lib/common/STXmlTerminalOrder.rb +194 -0
  27. data/lib/common/SalsaTigerRegXML.rb +2347 -0
  28. data/lib/common/SalsaTigerXMLHelper.rb +99 -0
  29. data/lib/common/SleepyInterface.rb +384 -0
  30. data/lib/common/SynInterfaces.rb +275 -0
  31. data/lib/common/TabFormat.rb +720 -0
  32. data/lib/common/Tiger.rb +1448 -0
  33. data/lib/common/TntInterface.rb +44 -0
  34. data/lib/common/Tree.rb +61 -0
  35. data/lib/common/TreetaggerInterface.rb +303 -0
  36. data/lib/common/headz.rb +338 -0
  37. data/lib/common/option_parser.rb +13 -0
  38. data/lib/common/ruby_class_extensions.rb +310 -0
  39. data/lib/fred/Baseline.rb +150 -0
  40. data/lib/fred/FileZipped.rb +31 -0
  41. data/lib/fred/FredBOWContext.rb +863 -0
  42. data/lib/fred/FredConfigData.rb +182 -0
  43. data/lib/fred/FredConventions.rb +232 -0
  44. data/lib/fred/FredDetermineTargets.rb +324 -0
  45. data/lib/fred/FredEval.rb +312 -0
  46. data/lib/fred/FredFeatureExtractors.rb +321 -0
  47. data/lib/fred/FredFeatures.rb +1061 -0
  48. data/lib/fred/FredFeaturize.rb +596 -0
  49. data/lib/fred/FredNumTrainingSenses.rb +27 -0
  50. data/lib/fred/FredParameters.rb +402 -0
  51. data/lib/fred/FredSplit.rb +84 -0
  52. data/lib/fred/FredSplitPkg.rb +180 -0
  53. data/lib/fred/FredTest.rb +607 -0
  54. data/lib/fred/FredTrain.rb +144 -0
  55. data/lib/fred/PlotAndREval.rb +480 -0
  56. data/lib/fred/fred.rb +45 -0
  57. data/lib/fred/md5.rb +23 -0
  58. data/lib/fred/opt_parser.rb +250 -0
  59. data/lib/frprep/AbstractSynInterface.rb +1227 -0
  60. data/lib/frprep/Ampersand.rb +37 -0
  61. data/lib/frprep/BerkeleyInterface.rb +375 -0
  62. data/lib/frprep/CollinsInterface.rb +1165 -0
  63. data/lib/frprep/ConfigData.rb +694 -0
  64. data/lib/frprep/Counter.rb +18 -0
  65. data/lib/frprep/FNCorpusXML.rb +643 -0
  66. data/lib/frprep/FNDatabase.rb +144 -0
  67. data/lib/frprep/FixSynSemMapping.rb +196 -0
  68. data/lib/frprep/FrPrepConfigData.rb +66 -0
  69. data/lib/frprep/FrameXML.rb +513 -0
  70. data/lib/frprep/FrprepHelper.rb +1324 -0
  71. data/lib/frprep/Graph.rb +345 -0
  72. data/lib/frprep/ISO-8859-1.rb +24 -0
  73. data/lib/frprep/MiniparInterface.rb +1388 -0
  74. data/lib/frprep/Parser.rb +213 -0
  75. data/lib/frprep/RegXML.rb +269 -0
  76. data/lib/frprep/STXmlTerminalOrder.rb +194 -0
  77. data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
  78. data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
  79. data/lib/frprep/SleepyInterface.rb +384 -0
  80. data/lib/frprep/SynInterfaces.rb +275 -0
  81. data/lib/frprep/TabFormat.rb +720 -0
  82. data/lib/frprep/Tiger.rb +1448 -0
  83. data/lib/frprep/TntInterface.rb +44 -0
  84. data/lib/frprep/Tree.rb +61 -0
  85. data/lib/frprep/TreetaggerInterface.rb +303 -0
  86. data/lib/frprep/do_parses.rb +142 -0
  87. data/lib/frprep/frprep.rb +686 -0
  88. data/lib/frprep/headz.rb +338 -0
  89. data/lib/frprep/one_parsed_file.rb +28 -0
  90. data/lib/frprep/opt_parser.rb +94 -0
  91. data/lib/frprep/ruby_class_extensions.rb +310 -0
  92. data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
  93. data/lib/rosy/DBMySQL.rb +146 -0
  94. data/lib/rosy/DBSQLite.rb +280 -0
  95. data/lib/rosy/DBTable.rb +239 -0
  96. data/lib/rosy/DBWrapper.rb +176 -0
  97. data/lib/rosy/ExternalConfigData.rb +58 -0
  98. data/lib/rosy/FailedParses.rb +130 -0
  99. data/lib/rosy/FeatureInfo.rb +242 -0
  100. data/lib/rosy/GfInduce.rb +1115 -0
  101. data/lib/rosy/GfInduceFeature.rb +148 -0
  102. data/lib/rosy/InputData.rb +294 -0
  103. data/lib/rosy/RosyConfigData.rb +115 -0
  104. data/lib/rosy/RosyConfusability.rb +338 -0
  105. data/lib/rosy/RosyEval.rb +465 -0
  106. data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
  107. data/lib/rosy/RosyFeaturize.rb +280 -0
  108. data/lib/rosy/RosyInspect.rb +336 -0
  109. data/lib/rosy/RosyIterator.rb +477 -0
  110. data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
  111. data/lib/rosy/RosyPruning.rb +165 -0
  112. data/lib/rosy/RosyServices.rb +744 -0
  113. data/lib/rosy/RosySplit.rb +232 -0
  114. data/lib/rosy/RosyTask.rb +19 -0
  115. data/lib/rosy/RosyTest.rb +826 -0
  116. data/lib/rosy/RosyTrain.rb +232 -0
  117. data/lib/rosy/RosyTrainingTestTable.rb +786 -0
  118. data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
  119. data/lib/rosy/View.rb +418 -0
  120. data/lib/rosy/opt_parser.rb +379 -0
  121. data/lib/rosy/rosy.rb +77 -0
  122. data/lib/shalmaneser/version.rb +3 -0
  123. data/test/frprep/test_opt_parser.rb +94 -0
  124. data/test/functional/functional_test_helper.rb +40 -0
  125. data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
  126. data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
  127. data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
  128. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
  129. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
  130. data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
  131. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
  132. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
  133. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
  134. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
  135. data/test/functional/test_fred.rb +47 -0
  136. data/test/functional/test_frprep.rb +52 -0
  137. data/test/functional/test_rosy.rb +20 -0
  138. metadata +270 -0
@@ -0,0 +1,120 @@
1
+ # ID identifying this experiment and all its data
2
+ # please do not use spaces inside the experiment ID
3
+ prep_experiment_ID = prp_test
4
+
5
+ # YOUR INPUT DATA:
6
+ # frprep accepts an input directory rather than an input file.
7
+ # It will process all files in the directory directory_input
8
+ # and write the results to directory_preprocessed.
9
+ #
10
+ # For input formats see the discussion of "format" below.
11
+ #directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
12
+ directory_preprocessed = <%= File.expand_path('test/functional/input/fred/frprep/test.salsa') %>
13
+
14
+ ##
15
+ # Experimental data is described by the following parameters:
16
+ #
17
+ # - language: en / de
18
+ # en for English or de for German
19
+ #
20
+ # - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
21
+ #
22
+ # Format of the input data, training/test set
23
+ # SalsaTigerXML: Parsed data, English or German
24
+ # FNXml: FrameNet Lexical Unit files in FrameNet XML format
25
+ # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
26
+ # SalsaTab: tabular format (internal)
27
+ # BNC: BNC XML format, alternating words and POS tags
28
+ # Plain: Plain text, ONE SENTENCE PER LINE.
29
+ #
30
+ # Preprocessing transforms all data to SalsaTigerXML.
31
+ #
32
+ # - origin: SalsaTiger / FrameNet / <not specified>
33
+ # This is the origin of the training/test data.
34
+ # SalsaTiger: data from the Tiger corpus, possibly semantically
35
+ # annotated by Salsa
36
+ # FrameNet: data from the FrameNet project
37
+ #
38
+ # Don't set 'origin' if none of these origins apply
39
+ #
40
+ # - encoding: utf8 / iso / hex / <not specified>
41
+ # Default: iso
42
+
43
+ language = de
44
+ #origin =
45
+ format = Plain
46
+ encoding = iso
47
+
48
+ #############################
49
+ # Which preprocessing steps to take?
50
+ #
51
+ # Data can be parsed, lemmatized and POS-tagged,
52
+ # but this happens only if it is specified in the
53
+ # experiment file.
54
+ #
55
+ # Set these booleans to true to trigger the respective
56
+ # type of preprocessing. The default value is false.
57
+
58
+ do_lemmatize = true
59
+ do_postag = false
60
+ do_parse = true
61
+
62
+ #############################
63
+ # directory where frprep puts its internal data
64
+ #
65
+
66
+ #frprep_directory = <%= File.expand_path('test/functional/input/fred/frprep') %>
67
+
68
+ #############################
69
+ # Syntax/semantics interface repair:
70
+ # FrameNet annotated data has some annotation choices
71
+ # that may make it harder to learn the mapping from
72
+ # syntactic structure to semantic roles.
73
+ #
74
+ # If you are using FrameNet data for training a
75
+ # semantic role labeler, set the following two settings
76
+ # to true (default is false) to 'repair' semantic role labels
77
+ # to more closely match the syntactic structure
78
+
79
+ fe_syn_repair = true
80
+ fe_rel_repair = false
81
+
82
+
83
+ #################
84
+ # Location of tools and resources used by frprep
85
+
86
+ # currently known to the system:
87
+ # (Saarbruecken paths given)
88
+ #
89
+ # - POS tagging:
90
+ # - pos_tagger = treetagger
91
+ # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
92
+ #
93
+ # - Lemmatization:
94
+ # - lemmatizer = treetagger
95
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
96
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
97
+ #
98
+ # - Parser:
99
+ # - parser = collins (English)
100
+ # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
101
+ # - parser = sleepy (German)
102
+ # parser_path = /proj/corpora/sleepy3/
103
+ # - parser = minipar (English)
104
+ # parser_path = /proj/llx/Software/Parsers/minipar-linux/
105
+ #
106
+ pos_tagger = treetagger
107
+ pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
108
+
109
+ lemmatizer = treetagger
110
+ lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
111
+
112
+ parser = berkeley
113
+ parser_path = <%= File.expand_path('tools/berkeleyParser') %>
114
+
115
+ # parser:
116
+ # maximum no. of sentences in a parse file,
117
+ # maximum sentence length to be parsed
118
+
119
+ parser_max_sent_num = 2000
120
+ parser_max_sent_len = 80
@@ -0,0 +1,120 @@
1
+ # ID identifying this experiment and all its data
2
+ # please do not use spaces inside the experiment ID
3
+ prep_experiment_ID = prp_test
4
+
5
+ # YOUR INPUT DATA:
6
+ # frprep accepts an input directory rather than an input file.
7
+ # It will process all files in the directory directory_input
8
+ # and write the results to directory_preprocessed.
9
+ #
10
+ # For input formats see the discussion of "format" below.
11
+ #directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
12
+ directory_preprocessed = <%= File.expand_path('test/functional/input/rosy/frprep/test.salsa') %>
13
+
14
+ ##
15
+ # Experimental data is described by the following parameters:
16
+ #
17
+ # - language: en / de
18
+ # en for English or de for German
19
+ #
20
+ # - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
21
+ #
22
+ # Format of the input data, training/test set
23
+ # SalsaTigerXML: Parsed data, English or German
24
+ # FNXml: FrameNet Lexical Unit files in FrameNet XML format
25
+ # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
26
+ # SalsaTab: tabular format (internal)
27
+ # BNC: BNC XML format, alternating words and POS tags
28
+ # Plain: Plain text, ONE SENTENCE PER LINE.
29
+ #
30
+ # Preprocessing transforms all data to SalsaTigerXML.
31
+ #
32
+ # - origin: SalsaTiger / FrameNet / <not specified>
33
+ # This is the origin of the training/test data.
34
+ # SalsaTiger: data from the Tiger corpus, possibly semantically
35
+ # annotated by Salsa
36
+ # FrameNet: data from the FrameNet project
37
+ #
38
+ # Don't set 'origin' if none of these origins apply
39
+ #
40
+ # - encoding: utf8 / iso / hex / <not specified>
41
+ # Default: iso
42
+
43
+ language = de
44
+ #origin =
45
+ format = Plain
46
+ encoding = iso
47
+
48
+ #############################
49
+ # Which preprocessing steps to take?
50
+ #
51
+ # Data can be parsed, lemmatized and POS-tagged,
52
+ # but this happens only if it is specified in the
53
+ # experiment file.
54
+ #
55
+ # Set these booleans to true to trigger the respective
56
+ # type of preprocessing. The default value is false.
57
+
58
+ do_lemmatize = true
59
+ do_postag = false
60
+ do_parse = true
61
+
62
+ #############################
63
+ # directory where frprep puts its internal data
64
+ #
65
+
66
+ #frprep_directory = <%= File.expand_path('test/functional/input/rosy/frprep') %>
67
+
68
+ #############################
69
+ # Syntax/semantics interface repair:
70
+ # FrameNet annotated data has some annotation choices
71
+ # that may make it harder to learn the mapping from
72
+ # syntactic structure to semantic roles.
73
+ #
74
+ # If you are using FrameNet data for training a
75
+ # semantic role labeler, set the following two settings
76
+ # to true (default is false) to 'repair' semantic role labels
77
+ # to more closely match the syntactic structure
78
+
79
+ fe_syn_repair = true
80
+ fe_rel_repair = false
81
+
82
+
83
+ #################
84
+ # Location of tools and resources used by frprep
85
+
86
+ # currently known to the system:
87
+ # (Saarbruecken paths given)
88
+ #
89
+ # - POS tagging:
90
+ # - pos_tagger = treetagger
91
+ # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
92
+ #
93
+ # - Lemmatization:
94
+ # - lemmatizer = treetagger
95
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
96
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
97
+ #
98
+ # - Parser:
99
+ # - parser = collins (English)
100
+ # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
101
+ # - parser = sleepy (German)
102
+ # parser_path = /proj/corpora/sleepy3/
103
+ # - parser = minipar (English)
104
+ # parser_path = /proj/llx/Software/Parsers/minipar-linux/
105
+ #
106
+ pos_tagger = treetagger
107
+ pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
108
+
109
+ lemmatizer = treetagger
110
+ lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
111
+
112
+ parser = berkeley
113
+ parser_path = <%= File.expand_path('tools/berkeleyParser') %>
114
+
115
+ # parser:
116
+ # maximum no. of sentences in a parse file,
117
+ # maximum sentence length to be parsed
118
+
119
+ parser_max_sent_num = 2000
120
+ parser_max_sent_len = 80
@@ -0,0 +1,138 @@
1
+ #################################################
2
+ # This is a sample experiment file
3
+ # with explanations of all features
4
+ # that can be set for the frprep preprocessing system for Fred and Rosy.
5
+ #
6
+ # To start your own experiment,
7
+ # replace all occurrences of
8
+ # %...% with values of your choice.
9
+ #
10
+ # Boolean features may be omitted and are false by default.
11
+ #
12
+ # Experiment file lines that start with '#'
13
+ # are comments and are ignored. Empty lines are ignored as well.
14
+
15
+ ########################
16
+ # Experiment description
17
+ #
18
+
19
+ # ID identifying this experiment and all its data
20
+ # please do not use spaces inside the experiment ID
21
+ prep_experiment_ID = prp_train
22
+
23
+ # YOUR INPUT DATA:
24
+ # frprep accepts an input directory rather than an input file.
25
+ # It will process all files in the directory directory_input
26
+ # and write the results to directory_preprocessed.
27
+ #
28
+ # For input formats see the discussion of "format" below.
29
+ directory_input = <%= File.expand_path('test/functional/input/frprep/train.salsa') %>
30
+ directory_preprocessed = <%= File.expand_path('test/functional/output/frprep/train.salsa') %>
31
+
32
+ ##
33
+ # Experimental data is described by the following parameters:
34
+ #
35
+ # - language: en / de
36
+ # en for English or de for German
37
+ #
38
+ # - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
39
+ #
40
+ # Format of the input data, training/test set
41
+ # SalsaTigerXML: Parsed data, English or German
42
+ # FNXml: FrameNet Lexical Unit files in FrameNet XML format
43
+ # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
44
+ # SalsaTab: tabular format (internal)
45
+ # BNC: BNC XML format, alternating words and POS tags
46
+ # Plain: Plain text, ONE SENTENCE PER LINE.
47
+ #
48
+ # Preprocessing transforms all data to SalsaTigerXML.
49
+ #
50
+ # - origin: SalsaTiger / FrameNet / <not specified>
51
+ # This is the origin of the training/test data.
52
+ # SalsaTiger: data from the Tiger corpus, possibly semantically
53
+ # annotated by Salsa
54
+ # FrameNet: data from the FrameNet project
55
+ #
56
+ # Don't set 'origin' if none of these origins apply
57
+ #
58
+ # - encoding: utf8 / iso / hex / <not specified>
59
+ # Default: iso
60
+
61
+ language = de
62
+ #origin =
63
+ format = SalsaTigerXML
64
+ encoding = utf8
65
+
66
+ #############################
67
+ # Which preprocessing steps to take?
68
+ #
69
+ # Data can be parsed, lemmatized and POS-tagged,
70
+ # but this happens only if it is specified in the
71
+ # experiment file.
72
+ #
73
+ # Set these booleans to true to trigger the respective
74
+ # type of preprocessing. The default value is false.
75
+
76
+ do_lemmatize = true
77
+ do_postag = false
78
+ do_parse = true
79
+
80
+ #############################
81
+ # directory where frprep puts its internal data
82
+ #
83
+
84
+ frprep_directory = <%= File.expand_path('test/functional/output/') %>
85
+
86
+ #############################
87
+ # Syntax/semantics interface repair:
88
+ # FrameNet annotated data has some annotation choices
89
+ # that may make it harder to learn the mapping from
90
+ # syntactic structure to semantic roles.
91
+ #
92
+ # If you are using FrameNet data for training a
93
+ # semantic role labeler, set the following two settings
94
+ # to true (default is false) to 'repair' semantic role labels
95
+ # to more closely match the syntactic structure
96
+
97
+ fe_syn_repair = true
98
+ fe_rel_repair = false
99
+
100
+
101
+ #################
102
+ # Location of tools and resources used by frprep
103
+
104
+ # currently known to the system:
105
+ # (Saarbruecken paths given)
106
+ #
107
+ # - POS tagging:
108
+ # - pos_tagger = treetagger
109
+ # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
110
+ #
111
+ # - Lemmatization:
112
+ # - lemmatizer = treetagger
113
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
114
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
115
+ #
116
+ # - Parser:
117
+ # - parser = collins (English)
118
+ # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
119
+ # - parser = sleepy (German)
120
+ # parser_path = /proj/corpora/sleepy3/
121
+ # - parser = minipar (English)
122
+ # parser_path = /proj/llx/Software/Parsers/minipar-linux/
123
+ #
124
+ pos_tagger = treetagger
125
+ pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
126
+
127
+ lemmatizer = treetagger
128
+ lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
129
+
130
+ parser = berkeley
131
+ parser_path = <%= File.expand_path('tools/berkeleyParser') %>
132
+
133
+ # parser:
134
+ # maximum no. of sentences in a parse file,
135
+ # maximum sentence length to be parsed
136
+
137
+ parser_max_sent_num = 2000
138
+ parser_max_sent_len = 80
@@ -0,0 +1,138 @@
1
+ #################################################
2
+ # This is a sample experiment file
3
+ # with explanations of all features
4
+ # that can be set for the frprep preprocessing system for Fred and Rosy.
5
+ #
6
+ # To start your own experiment,
7
+ # replace all occurrences of
8
+ # %...% with values of your choice.
9
+ #
10
+ # Boolean features may be omitted and are false by default.
11
+ #
12
+ # Experiment file lines that start with '#'
13
+ # are comments and are ignored. Empty lines are ignored as well.
14
+
15
+ ########################
16
+ # Experiment description
17
+ #
18
+
19
+ # ID identifying this experiment and all its data
20
+ # please do not use spaces inside the experiment ID
21
+ prep_experiment_ID = prp_train
22
+
23
+ # YOUR INPUT DATA:
24
+ # frprep accepts an input directory rather than an input file.
25
+ # It will process all files in the directory directory_input
26
+ # and write the results to directory_preprocessed.
27
+ #
28
+ # For input formats see the discussion of "format" below.
29
+ #directory_input = <%= File.expand_path('test/functional/input/frprep/train.salsa') %>
30
+ directory_preprocessed = <%= File.expand_path('test/functional/input/fred/frprep/train.salsa') %>
31
+
32
+ ##
33
+ # Experimental data is described by the following parameters:
34
+ #
35
+ # - language: en / de
36
+ # en for English or de for German
37
+ #
38
+ # - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
39
+ #
40
+ # Format of the input data, training/test set
41
+ # SalsaTigerXML: Parsed data, English or German
42
+ # FNXml: FrameNet Lexical Unit files in FrameNet XML format
43
+ # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
44
+ # SalsaTab: tabular format (internal)
45
+ # BNC: BNC XML format, alternating words and POS tags
46
+ # Plain: Plain text, ONE SENTENCE PER LINE.
47
+ #
48
+ # Preprocessing transforms all data to SalsaTigerXML.
49
+ #
50
+ # - origin: SalsaTiger / FrameNet / <not specified>
51
+ # This is the origin of the training/test data.
52
+ # SalsaTiger: data from the Tiger corpus, possibly semantically
53
+ # annotated by Salsa
54
+ # FrameNet: data from the FrameNet project
55
+ #
56
+ # Don't set 'origin' if none of these origins apply
57
+ #
58
+ # - encoding: utf8 / iso / hex / <not specified>
59
+ # Default: iso
60
+
61
+ language = de
62
+ #origin =
63
+ format = SalsaTigerXML
64
+ encoding = utf8
65
+
66
+ #############################
67
+ # Which preprocessing steps to take?
68
+ #
69
+ # Data can be parsed, lemmatized and POS-tagged,
70
+ # but this happens only if it is specified in the
71
+ # experiment file.
72
+ #
73
+ # Set these booleans to true to trigger the respective
74
+ # type of preprocessing. The default value is false.
75
+
76
+ do_lemmatize = true
77
+ do_postag = false
78
+ do_parse = true
79
+
80
+ #############################
81
+ # directory where frprep puts its internal data
82
+ #
83
+
84
+ #frprep_directory = <%= File.expand_path('test/functional/input/fred/') %>
85
+
86
+ #############################
87
+ # Syntax/semantics interface repair:
88
+ # FrameNet annotated data has some annotation choices
89
+ # that may make it harder to learn the mapping from
90
+ # syntactic structure to semantic roles.
91
+ #
92
+ # If you are using FrameNet data for training a
93
+ # semantic role labeler, set the following two settings
94
+ # to true (default is false) to 'repair' semantic role labels
95
+ # to more closely match the syntactic structure
96
+
97
+ fe_syn_repair = true
98
+ fe_rel_repair = false
99
+
100
+
101
+ #################
102
+ # Location of tools and resources used by frprep
103
+
104
+ # currently known to the system:
105
+ # (Saarbruecken paths given)
106
+ #
107
+ # - POS tagging:
108
+ # - pos_tagger = treetagger
109
+ # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
110
+ #
111
+ # - Lemmatization:
112
+ # - lemmatizer = treetagger
113
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
114
+ # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
115
+ #
116
+ # - Parser:
117
+ # - parser = collins (English)
118
+ # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
119
+ # - parser = sleepy (German)
120
+ # parser_path = /proj/corpora/sleepy3/
121
+ # - parser = minipar (English)
122
+ # parser_path = /proj/llx/Software/Parsers/minipar-linux/
123
+ #
124
+ pos_tagger = treetagger
125
+ pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
126
+
127
+ lemmatizer = treetagger
128
+ lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
129
+
130
+ parser = berkeley
131
+ parser_path = <%= File.expand_path('tools/berkeleyParser') %>
132
+
133
+ # parser:
134
+ # maximum no. of sentences in a parse file,
135
+ # maximum sentence length to be parsed
136
+
137
+ parser_max_sent_num = 2000
138
+ parser_max_sent_len = 80