frprep 0.0.1.prealpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +8 -0
- data/CHANGELOG.rdoc +0 -0
- data/LICENSE.rdoc +0 -0
- data/README.rdoc +0 -0
- data/lib/common/AbstractSynInterface.rb +1227 -0
- data/lib/common/BerkeleyInterface.rb +375 -0
- data/lib/common/CollinsInterface.rb +1165 -0
- data/lib/common/ConfigData.rb +694 -0
- data/lib/common/Counter.rb +18 -0
- data/lib/common/DBInterface.rb +48 -0
- data/lib/common/EnduserMode.rb +27 -0
- data/lib/common/Eval.rb +480 -0
- data/lib/common/FixSynSemMapping.rb +196 -0
- data/lib/common/FrPrepConfigData.rb +66 -0
- data/lib/common/FrprepHelper.rb +1324 -0
- data/lib/common/Graph.rb +345 -0
- data/lib/common/ISO-8859-1.rb +24 -0
- data/lib/common/ML.rb +186 -0
- data/lib/common/Maxent.rb +215 -0
- data/lib/common/MiniparInterface.rb +1388 -0
- data/lib/common/Optimise.rb +195 -0
- data/lib/common/Parser.rb +213 -0
- data/lib/common/RegXML.rb +269 -0
- data/lib/common/RosyConventions.rb +171 -0
- data/lib/common/SQLQuery.rb +243 -0
- data/lib/common/STXmlTerminalOrder.rb +194 -0
- data/lib/common/SalsaTigerRegXML.rb +2347 -0
- data/lib/common/SalsaTigerXMLHelper.rb +99 -0
- data/lib/common/SleepyInterface.rb +384 -0
- data/lib/common/SynInterfaces.rb +275 -0
- data/lib/common/TabFormat.rb +720 -0
- data/lib/common/Tiger.rb +1448 -0
- data/lib/common/TntInterface.rb +44 -0
- data/lib/common/Tree.rb +61 -0
- data/lib/common/TreetaggerInterface.rb +303 -0
- data/lib/common/headz.rb +338 -0
- data/lib/common/option_parser.rb +13 -0
- data/lib/common/ruby_class_extensions.rb +310 -0
- data/lib/fred/Baseline.rb +150 -0
- data/lib/fred/FileZipped.rb +31 -0
- data/lib/fred/FredBOWContext.rb +863 -0
- data/lib/fred/FredConfigData.rb +182 -0
- data/lib/fred/FredConventions.rb +232 -0
- data/lib/fred/FredDetermineTargets.rb +324 -0
- data/lib/fred/FredEval.rb +312 -0
- data/lib/fred/FredFeatureExtractors.rb +321 -0
- data/lib/fred/FredFeatures.rb +1061 -0
- data/lib/fred/FredFeaturize.rb +596 -0
- data/lib/fred/FredNumTrainingSenses.rb +27 -0
- data/lib/fred/FredParameters.rb +402 -0
- data/lib/fred/FredSplit.rb +84 -0
- data/lib/fred/FredSplitPkg.rb +180 -0
- data/lib/fred/FredTest.rb +607 -0
- data/lib/fred/FredTrain.rb +144 -0
- data/lib/fred/PlotAndREval.rb +480 -0
- data/lib/fred/fred.rb +45 -0
- data/lib/fred/md5.rb +23 -0
- data/lib/fred/opt_parser.rb +250 -0
- data/lib/frprep/AbstractSynInterface.rb +1227 -0
- data/lib/frprep/Ampersand.rb +37 -0
- data/lib/frprep/BerkeleyInterface.rb +375 -0
- data/lib/frprep/CollinsInterface.rb +1165 -0
- data/lib/frprep/ConfigData.rb +694 -0
- data/lib/frprep/Counter.rb +18 -0
- data/lib/frprep/FNCorpusXML.rb +643 -0
- data/lib/frprep/FNDatabase.rb +144 -0
- data/lib/frprep/FixSynSemMapping.rb +196 -0
- data/lib/frprep/FrPrepConfigData.rb +66 -0
- data/lib/frprep/FrameXML.rb +513 -0
- data/lib/frprep/FrprepHelper.rb +1324 -0
- data/lib/frprep/Graph.rb +345 -0
- data/lib/frprep/ISO-8859-1.rb +24 -0
- data/lib/frprep/MiniparInterface.rb +1388 -0
- data/lib/frprep/Parser.rb +213 -0
- data/lib/frprep/RegXML.rb +269 -0
- data/lib/frprep/STXmlTerminalOrder.rb +194 -0
- data/lib/frprep/SalsaTigerRegXML.rb +2347 -0
- data/lib/frprep/SalsaTigerXMLHelper.rb +99 -0
- data/lib/frprep/SleepyInterface.rb +384 -0
- data/lib/frprep/SynInterfaces.rb +275 -0
- data/lib/frprep/TabFormat.rb +720 -0
- data/lib/frprep/Tiger.rb +1448 -0
- data/lib/frprep/TntInterface.rb +44 -0
- data/lib/frprep/Tree.rb +61 -0
- data/lib/frprep/TreetaggerInterface.rb +303 -0
- data/lib/frprep/do_parses.rb +142 -0
- data/lib/frprep/frprep.rb +686 -0
- data/lib/frprep/headz.rb +338 -0
- data/lib/frprep/one_parsed_file.rb +28 -0
- data/lib/frprep/opt_parser.rb +94 -0
- data/lib/frprep/ruby_class_extensions.rb +310 -0
- data/lib/rosy/AbstractFeatureAndExternal.rb +240 -0
- data/lib/rosy/DBMySQL.rb +146 -0
- data/lib/rosy/DBSQLite.rb +280 -0
- data/lib/rosy/DBTable.rb +239 -0
- data/lib/rosy/DBWrapper.rb +176 -0
- data/lib/rosy/ExternalConfigData.rb +58 -0
- data/lib/rosy/FailedParses.rb +130 -0
- data/lib/rosy/FeatureInfo.rb +242 -0
- data/lib/rosy/GfInduce.rb +1115 -0
- data/lib/rosy/GfInduceFeature.rb +148 -0
- data/lib/rosy/InputData.rb +294 -0
- data/lib/rosy/RosyConfigData.rb +115 -0
- data/lib/rosy/RosyConfusability.rb +338 -0
- data/lib/rosy/RosyEval.rb +465 -0
- data/lib/rosy/RosyFeatureExtractors.rb +1609 -0
- data/lib/rosy/RosyFeaturize.rb +280 -0
- data/lib/rosy/RosyInspect.rb +336 -0
- data/lib/rosy/RosyIterator.rb +477 -0
- data/lib/rosy/RosyPhase2FeatureExtractors.rb +230 -0
- data/lib/rosy/RosyPruning.rb +165 -0
- data/lib/rosy/RosyServices.rb +744 -0
- data/lib/rosy/RosySplit.rb +232 -0
- data/lib/rosy/RosyTask.rb +19 -0
- data/lib/rosy/RosyTest.rb +826 -0
- data/lib/rosy/RosyTrain.rb +232 -0
- data/lib/rosy/RosyTrainingTestTable.rb +786 -0
- data/lib/rosy/TargetsMostFrequentFrame.rb +60 -0
- data/lib/rosy/View.rb +418 -0
- data/lib/rosy/opt_parser.rb +379 -0
- data/lib/rosy/rosy.rb +77 -0
- data/lib/shalmaneser/version.rb +3 -0
- data/test/frprep/test_opt_parser.rb +94 -0
- data/test/functional/functional_test_helper.rb +40 -0
- data/test/functional/sample_experiment_files/fred_test.salsa.erb +122 -0
- data/test/functional/sample_experiment_files/fred_train.salsa.erb +135 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +120 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +138 -0
- data/test/functional/sample_experiment_files/rosy_test.salsa.erb +257 -0
- data/test/functional/sample_experiment_files/rosy_train.salsa.erb +259 -0
- data/test/functional/test_fred.rb +47 -0
- data/test/functional/test_frprep.rb +52 -0
- data/test/functional/test_rosy.rb +20 -0
- metadata +270 -0
@@ -0,0 +1,120 @@
|
|
1
|
+
# ID identifying this experiment and all its data
|
2
|
+
# please do not use spaces inside the experiment ID
|
3
|
+
prep_experiment_ID = prp_test
|
4
|
+
|
5
|
+
# YOUR INPUT DATA:
|
6
|
+
# frprep accepts an input directory rather than an input file.
|
7
|
+
# It will process all files in the directory directory_input
|
8
|
+
# and write the results to directory_preprocessed.
|
9
|
+
#
|
10
|
+
# For input formats see the discussion of "format" below.
|
11
|
+
#directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
|
12
|
+
directory_preprocessed = <%= File.expand_path('test/functional/input/fred/frprep/test.salsa') %>
|
13
|
+
|
14
|
+
##
|
15
|
+
# Experimental data is described by the following parameters:
|
16
|
+
#
|
17
|
+
# - language: en / de
|
18
|
+
# en for English or de for German
|
19
|
+
#
|
20
|
+
# - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
|
21
|
+
#
|
22
|
+
# Format of the input data, training/test set
|
23
|
+
# SalsaTigerXML: Parsed data, English or German
|
24
|
+
# FNXml: FrameNet Lexical Unit files in FrameNet XML format
|
25
|
+
# FNCorpusXML: FrameNet files in the FrameNet corpus XML format
|
26
|
+
# SalsaTab: tabular format (internal)
|
27
|
+
# BNC: BNC XML format, alternating words and POS tags
|
28
|
+
# Plain: Plain text, ONE SENTENCE PER LINE.
|
29
|
+
#
|
30
|
+
# Preprocessing transforms all data to SalsaTigerXML.
|
31
|
+
#
|
32
|
+
# - origin: SalsaTiger / FrameNet / <not specified>
|
33
|
+
# This is the origin of the training/test data.
|
34
|
+
# SalsaTiger: data from the Tiger corpus, possibly semantically
|
35
|
+
# annotated by Salsa
|
36
|
+
# FrameNet: data from the FrameNet project
|
37
|
+
#
|
38
|
+
# Don't set 'origin' if none of these origins apply
|
39
|
+
#
|
40
|
+
# - encoding: utf8 / iso / hex / <not specified>
|
41
|
+
# Default: iso
|
42
|
+
|
43
|
+
language = de
|
44
|
+
#origin =
|
45
|
+
format = Plain
|
46
|
+
encoding = iso
|
47
|
+
|
48
|
+
#############################
|
49
|
+
# Which preprocessing steps to take?
|
50
|
+
#
|
51
|
+
# Data can be parsed, lemmatized and POS-tagged,
|
52
|
+
# but this happens only if it is specified in the
|
53
|
+
# experiment file.
|
54
|
+
#
|
55
|
+
# Set these booleans to true to trigger the respective
|
56
|
+
# type of preprocessing. The default value is false.
|
57
|
+
|
58
|
+
do_lemmatize = true
|
59
|
+
do_postag = false
|
60
|
+
do_parse = true
|
61
|
+
|
62
|
+
#############################
|
63
|
+
# directory where frprep puts its internal data
|
64
|
+
#
|
65
|
+
|
66
|
+
#frprep_directory = <%= File.expand_path('test/functional/input/fred/frprep') %>
|
67
|
+
|
68
|
+
#############################
|
69
|
+
# Syntax/semantics interface repair:
|
70
|
+
# FrameNet annotated data has some annotation choices
|
71
|
+
# that may make it harder to learn the mapping from
|
72
|
+
# syntactic structure to semantic roles.
|
73
|
+
#
|
74
|
+
# If you are using FrameNet data for training a
|
75
|
+
# semantic role labeler, set the following two settings
|
76
|
+
# to true (default is false) to 'repair' semantic role labels
|
77
|
+
# to more closely match the syntactic structure
|
78
|
+
|
79
|
+
fe_syn_repair = true
|
80
|
+
fe_rel_repair = false
|
81
|
+
|
82
|
+
|
83
|
+
#################
|
84
|
+
# Location of tools and resources used by Fred
|
85
|
+
|
86
|
+
# currently known to the system:
|
87
|
+
# (Saarbruecken paths given)
|
88
|
+
#
|
89
|
+
# - POS tagging:
|
90
|
+
# - pos_tagger = treetagger
|
91
|
+
# pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
|
92
|
+
#
|
93
|
+
# - Lemmatization:
|
94
|
+
# - lemmatizer = treetagger
|
95
|
+
# lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
|
96
|
+
# lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
|
97
|
+
#
|
98
|
+
# - Parser:
|
99
|
+
# - parser = collins (English)
|
100
|
+
# parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
|
101
|
+
# - parser = sleepy (German)
|
102
|
+
# parser_path = /proj/corpora/sleepy3/
|
103
|
+
# - parser = minipar (English)
|
104
|
+
# parser_path = /proj/llx/Software/Parsers/minipar-linux/
|
105
|
+
#
|
106
|
+
pos_tagger = treetagger
|
107
|
+
pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
|
108
|
+
|
109
|
+
lemmatizer = treetagger
|
110
|
+
lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
|
111
|
+
|
112
|
+
parser = berkeley
|
113
|
+
parser_path = <%= File.expand_path('tools/berkeleyParser') %>
|
114
|
+
|
115
|
+
# parser:
|
116
|
+
# maximum no. of sentences in a parse file,
|
117
|
+
# maximum sentence length to be parsed
|
118
|
+
|
119
|
+
parser_max_sent_num = 2000
|
120
|
+
parser_max_sent_len = 80
|
@@ -0,0 +1,120 @@
|
|
1
|
+
# ID identifying this experiment and all its data
|
2
|
+
# please do not use spaces inside the experiment ID
|
3
|
+
prep_experiment_ID = prp_test
|
4
|
+
|
5
|
+
# YOUR INPUT DATA:
|
6
|
+
# frprep accepts an input directory rather than an input file.
|
7
|
+
# It will process all files in the directory directory_input
|
8
|
+
# and write the results to directory_preprocessed.
|
9
|
+
#
|
10
|
+
# For input formats see the discussion of "format" below.
|
11
|
+
#directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
|
12
|
+
directory_preprocessed = <%= File.expand_path('test/functional/input/rosy/frprep/test.salsa') %>
|
13
|
+
|
14
|
+
##
|
15
|
+
# Experimental data is described by the following parameters:
|
16
|
+
#
|
17
|
+
# - language: en / de
|
18
|
+
# en for English or de for German
|
19
|
+
#
|
20
|
+
# - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
|
21
|
+
#
|
22
|
+
# Format of the input data, training/test set
|
23
|
+
# SalsaTigerXML: Parsed data, English or German
|
24
|
+
# FNXml: FrameNet Lexical Unit files in FrameNet XML format
|
25
|
+
# FNCorpusXML: FrameNet files in the FrameNet corpus XML format
|
26
|
+
# SalsaTab: tabular format (internal)
|
27
|
+
# BNC: BNC XML format, alternating words and POS tags
|
28
|
+
# Plain: Plain text, ONE SENTENCE PER LINE.
|
29
|
+
#
|
30
|
+
# Preprocessing transforms all data to SalsaTigerXML.
|
31
|
+
#
|
32
|
+
# - origin: SalsaTiger / FrameNet / <not specified>
|
33
|
+
# This is the origin of the training/test data.
|
34
|
+
# SalsaTiger: data from the Tiger corpus, possibly semantically
|
35
|
+
# annotated by Salsa
|
36
|
+
# FrameNet: data from the FrameNet project
|
37
|
+
#
|
38
|
+
# Don't set 'origin' if none of these origins apply
|
39
|
+
#
|
40
|
+
# - encoding: utf8 / iso / hex / <not specified>
|
41
|
+
# Default: iso
|
42
|
+
|
43
|
+
language = de
|
44
|
+
#origin =
|
45
|
+
format = Plain
|
46
|
+
encoding = iso
|
47
|
+
|
48
|
+
#############################
|
49
|
+
# Which preprocessing steps to take?
|
50
|
+
#
|
51
|
+
# Data can be parsed, lemmatized and POS-tagged,
|
52
|
+
# but this happens only if it is specified in the
|
53
|
+
# experiment file.
|
54
|
+
#
|
55
|
+
# Set these booleans to true to trigger the respective
|
56
|
+
# type of preprocessing. The default value is false.
|
57
|
+
|
58
|
+
do_lemmatize = true
|
59
|
+
do_postag = false
|
60
|
+
do_parse = true
|
61
|
+
|
62
|
+
#############################
|
63
|
+
# directory where frprep puts its internal data
|
64
|
+
#
|
65
|
+
|
66
|
+
#frprep_directory = <%= File.expand_path('test/functional/input/rosy/frprep') %>
|
67
|
+
|
68
|
+
#############################
|
69
|
+
# Syntax/semantics interface repair:
|
70
|
+
# FrameNet annotated data has some annotation choices
|
71
|
+
# that may make it harder to learn the mapping from
|
72
|
+
# syntactic structure to semantic roles.
|
73
|
+
#
|
74
|
+
# If you are using FrameNet data for training a
|
75
|
+
# semantic role labeler, set the following two settings
|
76
|
+
# to true (default is false) to 'repair' semantic role labels
|
77
|
+
# to more closely match the syntactic structure
|
78
|
+
|
79
|
+
fe_syn_repair = true
|
80
|
+
fe_rel_repair = false
|
81
|
+
|
82
|
+
|
83
|
+
#################
|
84
|
+
# Location of tools and resources used by Fred
|
85
|
+
|
86
|
+
# currently known to the system:
|
87
|
+
# (Saarbruecken paths given)
|
88
|
+
#
|
89
|
+
# - POS tagging:
|
90
|
+
# - pos_tagger = treetagger
|
91
|
+
# pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
|
92
|
+
#
|
93
|
+
# - Lemmatization:
|
94
|
+
# - lemmatizer = treetagger
|
95
|
+
# lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
|
96
|
+
# lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
|
97
|
+
#
|
98
|
+
# - Parser:
|
99
|
+
# - parser = collins (English)
|
100
|
+
# parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
|
101
|
+
# - parser = sleepy (German)
|
102
|
+
# parser_path = /proj/corpora/sleepy3/
|
103
|
+
# - parser = minipar (English)
|
104
|
+
# parser_path = /proj/llx/Software/Parsers/minipar-linux/
|
105
|
+
#
|
106
|
+
pos_tagger = treetagger
|
107
|
+
pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
|
108
|
+
|
109
|
+
lemmatizer = treetagger
|
110
|
+
lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
|
111
|
+
|
112
|
+
parser = berkeley
|
113
|
+
parser_path = <%= File.expand_path('tools/berkeleyParser') %>
|
114
|
+
|
115
|
+
# parser:
|
116
|
+
# maximum no. of sentences in a parse file,
|
117
|
+
# maximum sentence length to be parsed
|
118
|
+
|
119
|
+
parser_max_sent_num = 2000
|
120
|
+
parser_max_sent_len = 80
|
@@ -0,0 +1,138 @@
|
|
1
|
+
#################################################
|
2
|
+
# This is a sample experiment file
|
3
|
+
# with explanations of all features
|
4
|
+
# that can be set for the frprep preprocessing system for Fred and Rosy.
|
5
|
+
#
|
6
|
+
# To start your own experiment,
|
7
|
+
# replace all occurrences of
|
8
|
+
# %...% by values of your choice.
|
9
|
+
#
|
10
|
+
# Boolean features may be omitted and are false by default.
|
11
|
+
#
|
12
|
+
# Experiment file lines that start with '#'
|
13
|
+
# are comments and are ignored. Empty lines are ignored as well.
|
14
|
+
|
15
|
+
########################
|
16
|
+
# Experiment description
|
17
|
+
#
|
18
|
+
|
19
|
+
# ID identifying this experiment and all its data
|
20
|
+
# please do not use spaces inside the experiment ID
|
21
|
+
prep_experiment_ID = prp_train
|
22
|
+
|
23
|
+
# YOUR INPUT DATA:
|
24
|
+
# frprep accepts an input directory rather than an input file.
|
25
|
+
# It will process all files in the directory directory_input
|
26
|
+
# and write the results to directory_preprocessed.
|
27
|
+
#
|
28
|
+
# For input formats see the discussion of "format" below.
|
29
|
+
directory_input = <%= File.expand_path('test/functional/input/frprep/train.salsa') %>
|
30
|
+
directory_preprocessed = <%= File.expand_path('test/functional/output/frprep/train.salsa') %>
|
31
|
+
|
32
|
+
##
|
33
|
+
# Experimental data is described by the following parameters:
|
34
|
+
#
|
35
|
+
# - language: en / de
|
36
|
+
# en for English or de for German
|
37
|
+
#
|
38
|
+
# - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
|
39
|
+
#
|
40
|
+
# Format of the input data, training/test set
|
41
|
+
# SalsaTigerXML: Parsed data, English or German
|
42
|
+
# FNXml: FrameNet Lexical Unit files in FrameNet XML format
|
43
|
+
# FNCorpusXML: FrameNet files in the FrameNet corpus XML format
|
44
|
+
# SalsaTab: tabular format (internal)
|
45
|
+
# BNC: BNC XML format, alternating words and POS tags
|
46
|
+
# Plain: Plain text, ONE SENTENCE PER LINE.
|
47
|
+
#
|
48
|
+
# Preprocessing transforms all data to SalsaTigerXML.
|
49
|
+
#
|
50
|
+
# - origin: SalsaTiger / FrameNet / <not specified>
|
51
|
+
# This is the origin of the training/test data.
|
52
|
+
# SalsaTiger: data from the Tiger corpus, possibly semantically
|
53
|
+
# annotated by Salsa
|
54
|
+
# FrameNet: data from the FrameNet project
|
55
|
+
#
|
56
|
+
# Don't set 'origin' if none of these origins apply
|
57
|
+
#
|
58
|
+
# - encoding: utf8 / iso / hex / <not specified>
|
59
|
+
# Default: iso
|
60
|
+
|
61
|
+
language = de
|
62
|
+
#origin =
|
63
|
+
format = SalsaTigerXML
|
64
|
+
encoding = utf8
|
65
|
+
|
66
|
+
#############################
|
67
|
+
# Which preprocessing steps to take?
|
68
|
+
#
|
69
|
+
# Data can be parsed, lemmatized and POS-tagged,
|
70
|
+
# but this happens only if it is specified in the
|
71
|
+
# experiment file.
|
72
|
+
#
|
73
|
+
# Set these booleans to true to trigger the respective
|
74
|
+
# type of preprocessing. The default value is false.
|
75
|
+
|
76
|
+
do_lemmatize = true
|
77
|
+
do_postag = false
|
78
|
+
do_parse = true
|
79
|
+
|
80
|
+
#############################
|
81
|
+
# directory where frprep puts its internal data
|
82
|
+
#
|
83
|
+
|
84
|
+
frprep_directory = <%= File.expand_path('test/functional/output/') %>
|
85
|
+
|
86
|
+
#############################
|
87
|
+
# Syntax/semantics interface repair:
|
88
|
+
# FrameNet annotated data has some annotation choices
|
89
|
+
# that may make it harder to learn the mapping from
|
90
|
+
# syntactic structure to semantic roles.
|
91
|
+
#
|
92
|
+
# If you are using FrameNet data for training a
|
93
|
+
# semantic role labeler, set the following two settings
|
94
|
+
# to true (default is false) to 'repair' semantic role labels
|
95
|
+
# to more closely match the syntactic structure
|
96
|
+
|
97
|
+
fe_syn_repair = true
|
98
|
+
fe_rel_repair = false
|
99
|
+
|
100
|
+
|
101
|
+
#################
|
102
|
+
# Location of tools and resources used by Fred
|
103
|
+
|
104
|
+
# currently known to the system:
|
105
|
+
# (Saarbruecken paths given)
|
106
|
+
#
|
107
|
+
# - POS tagging:
|
108
|
+
# - pos_tagger = treetagger
|
109
|
+
# pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
|
110
|
+
#
|
111
|
+
# - Lemmatization:
|
112
|
+
# - lemmatizer = treetagger
|
113
|
+
# lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
|
114
|
+
# lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
|
115
|
+
#
|
116
|
+
# - Parser:
|
117
|
+
# - parser = collins (English)
|
118
|
+
# parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
|
119
|
+
# - parser = sleepy (German)
|
120
|
+
# parser_path = /proj/corpora/sleepy3/
|
121
|
+
# - parser = minipar (English)
|
122
|
+
# parser_path = /proj/llx/Software/Parsers/minipar-linux/
|
123
|
+
#
|
124
|
+
pos_tagger = treetagger
|
125
|
+
pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
|
126
|
+
|
127
|
+
lemmatizer = treetagger
|
128
|
+
lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
|
129
|
+
|
130
|
+
parser = berkeley
|
131
|
+
parser_path = <%= File.expand_path('tools/berkeleyParser') %>
|
132
|
+
|
133
|
+
# parser:
|
134
|
+
# maximum no. of sentences in a parse file,
|
135
|
+
# maximum sentence length to be parsed
|
136
|
+
|
137
|
+
parser_max_sent_num = 2000
|
138
|
+
parser_max_sent_len = 80
|
@@ -0,0 +1,138 @@
|
|
1
|
+
#################################################
|
2
|
+
# This is a sample experiment file
|
3
|
+
# with explanations of all features
|
4
|
+
# that can be set for the frprep preprocessing system for Fred and Rosy.
|
5
|
+
#
|
6
|
+
# To start your own experiment,
|
7
|
+
# replace all occurrences of
|
8
|
+
# %...% by values of your choice.
|
9
|
+
#
|
10
|
+
# Boolean features may be omitted and are false by default.
|
11
|
+
#
|
12
|
+
# Experiment file lines that start with '#'
|
13
|
+
# are comments and are ignored. Empty lines are ignored as well.
|
14
|
+
|
15
|
+
########################
|
16
|
+
# Experiment description
|
17
|
+
#
|
18
|
+
|
19
|
+
# ID identifying this experiment and all its data
|
20
|
+
# please do not use spaces inside the experiment ID
|
21
|
+
prep_experiment_ID = prp_train
|
22
|
+
|
23
|
+
# YOUR INPUT DATA:
|
24
|
+
# frprep accepts an input directory rather than an input file.
|
25
|
+
# It will process all files in the directory directory_input
|
26
|
+
# and write the results to directory_preprocessed.
|
27
|
+
#
|
28
|
+
# For input formats see the discussion of "format" below.
|
29
|
+
#directory_input = <%= File.expand_path('test/functional/input/frprep/train.salsa') %>
|
30
|
+
directory_preprocessed = <%= File.expand_path('test/functional/input/fred/frprep/train.salsa') %>
|
31
|
+
|
32
|
+
##
|
33
|
+
# Experimental data is described by the following parameters:
|
34
|
+
#
|
35
|
+
# - language: en / de
|
36
|
+
# en for English or de for German
|
37
|
+
#
|
38
|
+
# - format: SalsaTigerXML / FNXml / FNCorpusXML / SalsaTab / BNC / Plain
|
39
|
+
#
|
40
|
+
# Format of the input data, training/test set
|
41
|
+
# SalsaTigerXML: Parsed data, English or German
|
42
|
+
# FNXml: FrameNet Lexical Unit files in FrameNet XML format
|
43
|
+
# FNCorpusXML: FrameNet files in the FrameNet corpus XML format
|
44
|
+
# SalsaTab: tabular format (internal)
|
45
|
+
# BNC: BNC XML format, alternating words and POS tags
|
46
|
+
# Plain: Plain text, ONE SENTENCE PER LINE.
|
47
|
+
#
|
48
|
+
# Preprocessing transforms all data to SalsaTigerXML.
|
49
|
+
#
|
50
|
+
# - origin: SalsaTiger / FrameNet / <not specified>
|
51
|
+
# This is the origin of the training/test data.
|
52
|
+
# SalsaTiger: data from the Tiger corpus, possibly semantically
|
53
|
+
# annotated by Salsa
|
54
|
+
# FrameNet: data from the FrameNet project
|
55
|
+
#
|
56
|
+
# Don't set 'origin' if none of these origins apply
|
57
|
+
#
|
58
|
+
# - encoding: utf8 / iso / hex / <not specified>
|
59
|
+
# Default: iso
|
60
|
+
|
61
|
+
language = de
|
62
|
+
#origin =
|
63
|
+
format = SalsaTigerXML
|
64
|
+
encoding = utf8
|
65
|
+
|
66
|
+
#############################
|
67
|
+
# Which preprocessing steps to take?
|
68
|
+
#
|
69
|
+
# Data can be parsed, lemmatized and POS-tagged,
|
70
|
+
# but this happens only if it is specified in the
|
71
|
+
# experiment file.
|
72
|
+
#
|
73
|
+
# Set these booleans to true to trigger the respective
|
74
|
+
# type of preprocessing. The default value is false.
|
75
|
+
|
76
|
+
do_lemmatize = true
|
77
|
+
do_postag = false
|
78
|
+
do_parse = true
|
79
|
+
|
80
|
+
#############################
|
81
|
+
# directory where frprep puts its internal data
|
82
|
+
#
|
83
|
+
|
84
|
+
#frprep_directory = <%= File.expand_path('test/functional/input/fred/') %>
|
85
|
+
|
86
|
+
#############################
|
87
|
+
# Syntax/semantics interface repair:
|
88
|
+
# FrameNet annotated data has some annotation choices
|
89
|
+
# that may make it harder to learn the mapping from
|
90
|
+
# syntactic structure to semantic roles.
|
91
|
+
#
|
92
|
+
# If you are using FrameNet data for training a
|
93
|
+
# semantic role labeler, set the following two settings
|
94
|
+
# to true (default is false) to 'repair' semantic role labels
|
95
|
+
# to more closely match the syntactic structure
|
96
|
+
|
97
|
+
fe_syn_repair = true
|
98
|
+
fe_rel_repair = false
|
99
|
+
|
100
|
+
|
101
|
+
#################
|
102
|
+
# Location of tools and resources used by Fred
|
103
|
+
|
104
|
+
# currently known to the system:
|
105
|
+
# (Saarbruecken paths given)
|
106
|
+
#
|
107
|
+
# - POS tagging:
|
108
|
+
# - pos_tagger = treetagger
|
109
|
+
# pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
|
110
|
+
#
|
111
|
+
# - Lemmatization:
|
112
|
+
# - lemmatizer = treetagger
|
113
|
+
# lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
|
114
|
+
# lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
|
115
|
+
#
|
116
|
+
# - Parser:
|
117
|
+
# - parser = collins (English)
|
118
|
+
# parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
|
119
|
+
# - parser = sleepy (German)
|
120
|
+
# parser_path = /proj/corpora/sleepy3/
|
121
|
+
# - parser = minipar (English)
|
122
|
+
# parser_path = /proj/llx/Software/Parsers/minipar-linux/
|
123
|
+
#
|
124
|
+
pos_tagger = treetagger
|
125
|
+
pos_tagger_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
|
126
|
+
|
127
|
+
lemmatizer = treetagger
|
128
|
+
lemmatizer_path = <%= File.expand_path('tools/treetagger/shal-ger') %>
|
129
|
+
|
130
|
+
parser = berkeley
|
131
|
+
parser_path = <%= File.expand_path('tools/berkeleyParser') %>
|
132
|
+
|
133
|
+
# parser:
|
134
|
+
# maximum no. of sentences in a parse file,
|
135
|
+
# maximum sentence length to be parsed
|
136
|
+
|
137
|
+
parser_max_sent_num = 2000
|
138
|
+
parser_max_sent_len = 80
|