shalmaneser 1.2.0.rc1 → 1.2.0.rc2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (30) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +26 -8
  3. data/doc/SB_README +57 -0
  4. data/doc/exp_files_description.txt +160 -0
  5. data/doc/fred.pdf +0 -0
  6. data/doc/index.md +120 -0
  7. data/doc/salsa_tool.pdf +0 -0
  8. data/doc/salsatigerxml.pdf +0 -0
  9. data/doc/shal_doc.pdf +0 -0
  10. data/doc/shal_lrec.pdf +0 -0
  11. data/lib/ext/maxent/Classify.class +0 -0
  12. data/lib/ext/maxent/Train.class +0 -0
  13. data/lib/frprep/TreetaggerInterface.rb +4 -4
  14. data/lib/shalmaneser/version.rb +1 -1
  15. metadata +41 -48
  16. data/test/frprep/test_opt_parser.rb +0 -94
  17. data/test/functional/functional_test_helper.rb +0 -40
  18. data/test/functional/sample_experiment_files/fred_test.salsa.erb +0 -122
  19. data/test/functional/sample_experiment_files/fred_train.salsa.erb +0 -135
  20. data/test/functional/sample_experiment_files/prp_test.salsa.erb +0 -138
  21. data/test/functional/sample_experiment_files/prp_test.salsa.fred.standalone.erb +0 -120
  22. data/test/functional/sample_experiment_files/prp_test.salsa.rosy.standalone.erb +0 -120
  23. data/test/functional/sample_experiment_files/prp_train.salsa.erb +0 -138
  24. data/test/functional/sample_experiment_files/prp_train.salsa.fred.standalone.erb +0 -138
  25. data/test/functional/sample_experiment_files/prp_train.salsa.rosy.standalone.erb +0 -138
  26. data/test/functional/sample_experiment_files/rosy_test.salsa.erb +0 -257
  27. data/test/functional/sample_experiment_files/rosy_train.salsa.erb +0 -259
  28. data/test/functional/test_fred.rb +0 -47
  29. data/test/functional/test_frprep.rb +0 -52
  30. data/test/functional/test_rosy.rb +0 -40
@@ -1,135 +0,0 @@
1
- # ID identifying this experiment and all its data
2
- # please do not use spaces inside the experiment ID
3
- experiment_ID = fred_train
4
-
5
- # targets:
6
- # if apply_to_all_known_targets is set to true,
7
- # disambiguate all words for which we have training data
8
- # when performing task "test" (i.e. applying trained classifiers)
9
- apply_to_all_known_targets = true
10
-
11
- # Enduser mode?
12
- # The idea is that the enduser will only _apply_
13
- # pre-trained classifiers. So in enduser mode many
14
- # options are disallowed.
15
- enduser_mode = false
16
-
17
-
18
- # print warnings and
19
- # give detailed progress reports
20
- verbose = true
21
-
22
-
23
- ############################
24
- # Paths
25
- # - fred_directory: directory where Fred puts its internal data
26
- # - directory_output:
27
- # redirect system output of disambiguated text (in SalsaTigerXML)
28
- # to another directory.
29
- # If you do not set anything here, output is to
30
- # <fred_directory>/<experiment_ID>/output/stxml
31
- # - classifier_dir:
32
- # Write trained classifiers to this directory.
33
- # If you do not set this parameter, classifiers are written to
34
- # <fred_directory>/<experiment_ID>/classifiers
35
-
36
- fred_directory = <%= File.expand_path('test/functional/output') %>
37
-
38
- # - preproc_descr_file_train / ...test
39
- # where the experiment file for frprep is located
40
- # (preprocessing for Fred and Rosy)
41
- # for the preprocessing of the data used in this experiment
42
- #
43
- # give one preprocessing file name for the training data
44
- # and one for the test data
45
- # (If you only ever use test data in this experiment, you only
46
- # need to give preproc_descr_file_test, and vice versa for training data.)
47
- preproc_descr_file_train = <%= File.expand_path('test/functional/sample_experiment_files/prp_train.salsa.fred.standalone') %>
48
-
49
-
50
-
51
- #####################
52
- # noncontiguous input?
53
- # if so, set 'noncontiguous_input' to 'true' (default is 'false')
54
- # Also give the larger corpus from which the input sentences are taken:
55
- # - directory
56
- # - format: same possibilities as for frprep format
57
- # - encoding: same possibilities as for frprep encoding
58
-
59
- noncontiguous_input = false
60
- #larger_corpus_dir =
61
- larger_corpus_format = SalsaTigerXML
62
- #larger_corpus_encoding = iso
63
-
64
-
65
- #################
66
- # Features
67
-
68
- # bag-of-words context, with given context size,
69
- # for example:
70
- feature = context 50
71
- feature = context 2
72
- #
73
- # (you can give more than one context feature line!)
74
- #
75
- # other possible features:
76
- # feature = syntax
77
- # feature = synsem
78
- #
79
- # syntax: grammatical functions
80
- # synsem: grammatical functions plus headwords
81
-
82
- #feature = context % %contextsize%
83
- feature = syntax
84
-
85
- # How to handle training data that is labeled
86
- # with multiple sense labels?
87
- # - binarize (default): This works only with binary classifiers.
88
- # When featurizing for the binary classifiers, consider an item
89
- # positive if its set of assigned labels includes the
90
- # label for this binary classifier.
91
- # - repeat: Repeat the instance, once for each
92
- # sense label that has been assigned. (Basically, treat it
93
- # as N instances with equal features but different labels.)
94
- # - join: join all the assigned senses into one combined sense
95
- # and treat that as a separate sense to train on.
96
- # - keep: keep as multiple sense labels. (Note that this
97
- # makes sense only for classifiers that can deal with
98
- # multiple labels.)
99
-
100
- #handle_multilabel = binarize
101
- handle_multilabel = repeat
102
-
103
- # What to do with numerical features?
104
- # - keep: just leave as is
105
- # - repeat: for a feature with max. numerical value N,
106
- # use N binary features
107
- # - bin: use a fixed number of bins, e.g. 5, then
108
- # if feature value > 20: set all bins to 1,
109
- # if feature value > 10: set the first four bins to 1,
110
- # etc.
111
- # default: bin.
112
- #numerical_features = bin
113
- numerical_features = keep
114
- # Binary classifiers, or n-ary classifiers?
115
- # if binary classifiers, set 'binary_classifiers = true'
116
- # default is 'false'.
117
- binary_classifiers = false
118
-
119
- #################
120
- # Fred internal settings
121
-
122
- # what kind of classifier to use?
123
- #
124
- # format:
125
- # <classifier type> <path> <optionally another path>
126
- #
127
- # for maxent, give first the path where maxent resides,
128
- # then <where_shalmaneser_resides>/program/tools/maxent
129
- classifier = maxent <%= File.expand_path('tools/maxent/maxent-2.4.0') %>
130
-
131
-
132
- # for binary classifiers, you can set the pseudolabel
133
- # on the 'negative' sense.
134
- # Default is 'NONE'
135
- negsense = NONE
@@ -1,138 +0,0 @@
1
- #################################################
2
- # This is a sample experiment file
3
- # with explanations of all features
4
- # that can be set for the frprep preprocessing system for Fred and Rosy.
5
- #
6
- # To start your own experiment,
7
- # replace all occurrences of
8
- # %...% by values of your choice.
9
- #
10
- # Boolean features may be omitted and are false by default.
11
- #
12
- # Experiment file lines that start with '#'
13
- # are comments and are ignored. Empty lines are ignored as well.
14
-
15
- ########################
16
- # Experiment description
17
- #
18
-
19
- # ID identifying this experiment and all its data
20
- # please do not use spaces inside the experiment ID
21
- prep_experiment_ID = prp_test
22
-
23
- # YOUR INPUT DATA:
24
- # frprep accepts an input directory rather than an input file.
25
- # It will process all files in the directory directory_input
26
- # and write the results to directory_preprocessed.
27
- #
28
- # For input formats see the discussion of "format" below.
29
- directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
30
- directory_preprocessed = <%= File.expand_path('test/functional/output/frprep/test.salsa') %>
31
-
32
- ##
33
- # Experimental data is described by the following parameters:
34
- #
35
- # - language: en / de
36
- # en for English or de for German
37
- #
38
- # - format: SalsaTigerXML / FNXml / SalsaTab / BNC / Plain
39
- #
40
- # Format of the input data, training/test set
41
- # SalsaTigerXML: Parsed data, English or German
42
- # FNXml: FrameNet Lexical Unit files in FrameNet XML format
43
- # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
44
- # SalsaTab: tabular format (internal)
45
- # BNC: BNC XML format, alternating words and POS tags
46
- # Plain: Plain text, ONE SENTENCE PER LINE.
47
- #
48
- # Preprocessing transforms all data to SalsaTigerXML.
49
- #
50
- # - origin: SalsaTiger / FrameNet / <not specified>
51
- # This is the origin of the training/test data.
52
- # SalsaTiger: data from the Tiger corpus, possibly semantically
53
- # annotated by Salsa
54
- # FrameNet: data from the FrameNet project
55
- #
56
- # Don't set 'origin' if none of these origins apply
57
- #
58
- # - encoding: utf8 / iso / hex / <not specified>
59
- # Default: iso
60
-
61
- language = de
62
- #origin =
63
- format = Plain
64
- encoding = iso
65
-
66
- #############################
67
- # Which preprocessing steps to take?
68
- #
69
- # Data can be parsed, lemmatized and POS-tagged,
70
- # but this happens only if it is specified in the
71
- # experiment file.
72
- #
73
- # Set these booleans to true to trigger the respective
74
- # type of preprocessing. The default value is false.
75
-
76
- do_lemmatize = true
77
- do_postag = false
78
- do_parse = true
79
-
80
- #############################
81
- # directory where frprep puts its internal data
82
- #
83
-
84
- frprep_directory = <%= File.expand_path('test/functional/output/') %>
85
-
86
- #############################
87
- # Syntax/semantics interface repair:
88
- # FrameNet annotated data has some annotation choices
89
- # that may make it harder to learn the mapping from
90
- # syntactic structure to semantic roles.
91
- #
92
- # If you are using FrameNet data for training a
93
- # semantic role labeler, set the following two settings
94
- # to true (default is false) to 'repair' semantic role labels
95
- # to more closely match the syntactic structure
96
-
97
- fe_syn_repair = true
98
- fe_rel_repair = false
99
-
100
-
101
- #################
102
- # Location of tools and resources used by Fred
103
-
104
- # currently known to the system:
105
- # (Saarbruecken paths given)
106
- #
107
- # - POS tagging:
108
- # - pos_tagger = treetagger
109
- # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
110
- #
111
- # - Lemmatization:
112
- # - lemmatizer = treetagger
113
- # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
114
- # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
115
- #
116
- # - Parser:
117
- # - parser = collins (English)
118
- # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
119
- # - parser = sleepy (German)
120
- # parser_path = /proj/corpora/sleepy3/
121
- # - parser = minipar (English)
122
- # parser_path = /proj/llx/Software/Parsers/minipar-linux/
123
- #
124
- pos_tagger = treetagger
125
- pos_tagger_path = <%= File.expand_path('tools/treetagger') %>
126
-
127
- lemmatizer = treetagger
128
- lemmatizer_path = <%= File.expand_path('tools/treetagger') %>
129
-
130
- parser = berkeley
131
- parser_path = <%= File.expand_path('tools/berkeleyParser') %>
132
-
133
- # parser:
134
- # maximum no. of sentences in a parse file,
135
- # maximum sentence length to be parsed
136
-
137
- parser_max_sent_num = 2000
138
- parser_max_sent_len = 80
@@ -1,120 +0,0 @@
1
- # ID identifying this experiment and all its data
2
- # please do not use spaces inside the experiment ID
3
- prep_experiment_ID = prp_test
4
-
5
- # YOUR INPUT DATA:
6
- # frprep accepts an input directory rather than an input file.
7
- # It will process all files in the directory directory_input
8
- # and write the results to directory_preprocessed.
9
- #
10
- # For input formats see the discussion of "format" below.
11
- #directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
12
- directory_preprocessed = <%= File.expand_path('test/functional/input/fred/frprep/test.salsa') %>
13
-
14
- ##
15
- # Experimental data is described by the following parameters:
16
- #
17
- # - language: en / de
18
- # en for English or de for German
19
- #
20
- # - format: SalsaTigerXML / FNXml / SalsaTab / BNC / Plain
21
- #
22
- # Format of the input data, training/test set
23
- # SalsaTigerXML: Parsed data, English or German
24
- # FNXml: FrameNet Lexical Unit files in FrameNet XML format
25
- # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
26
- # SalsaTab: tabular format (internal)
27
- # BNC: BNC XML format, alternating words and POS tags
28
- # Plain: Plain text, ONE SENTENCE PER LINE.
29
- #
30
- # Preprocessing transforms all data to SalsaTigerXML.
31
- #
32
- # - origin: SalsaTiger / FrameNet / <not specified>
33
- # This is the origin of the training/test data.
34
- # SalsaTiger: data from the Tiger corpus, possibly semantically
35
- # annotated by Salsa
36
- # FrameNet: data from the FrameNet project
37
- #
38
- # Don't set 'origin' if none of these origins apply
39
- #
40
- # - encoding: utf8 / iso / hex / <not specified>
41
- # Default: iso
42
-
43
- language = de
44
- #origin =
45
- format = Plain
46
- encoding = iso
47
-
48
- #############################
49
- # Which preprocessing steps to take?
50
- #
51
- # Data can be parsed, lemmatized and POS-tagged,
52
- # but this happens only if it is specified in the
53
- # experiment file.
54
- #
55
- # Set these booleans to true to trigger the respective
56
- # type of preprocessing. The default value is false.
57
-
58
- do_lemmatize = true
59
- do_postag = false
60
- do_parse = true
61
-
62
- #############################
63
- # directory where frprep puts its internal data
64
- #
65
-
66
- #frprep_directory = <%= File.expand_path('test/functional/input/fred/frprep') %>
67
-
68
- #############################
69
- # Syntax/semantics interface repair:
70
- # FrameNet annotated data has some annotation choices
71
- # that may make it harder to learn the mapping from
72
- # syntactic structure to semantic roles.
73
- #
74
- # If you are using FrameNet data for training a
75
- # semantic role labeler, set the following two settings
76
- # to true (default is false) to 'repair' semantic role labels
77
- # to more closely match the syntactic structure
78
-
79
- fe_syn_repair = true
80
- fe_rel_repair = false
81
-
82
-
83
- #################
84
- # Location of tools and resources used by Fred
85
-
86
- # currently known to the system:
87
- # (Saarbruecken paths given)
88
- #
89
- # - POS tagging:
90
- # - pos_tagger = treetagger
91
- # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
92
- #
93
- # - Lemmatization:
94
- # - lemmatizer = treetagger
95
- # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
96
- # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
97
- #
98
- # - Parser:
99
- # - parser = collins (English)
100
- # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
101
- # - parser = sleepy (German)
102
- # parser_path = /proj/corpora/sleepy3/
103
- # - parser = minipar (English)
104
- # parser_path = /proj/llx/Software/Parsers/minipar-linux/
105
- #
106
- pos_tagger = treetagger
107
- pos_tagger_path = <%= File.expand_path('tools/treetagger') %>
108
-
109
- lemmatizer = treetagger
110
- lemmatizer_path = <%= File.expand_path('tools/treetagger') %>
111
-
112
- parser = berkeley
113
- parser_path = <%= File.expand_path('tools/berkeleyParser') %>
114
-
115
- # parser:
116
- # maximum no. of sentences in a parse file,
117
- # maximum sentence length to be parsed
118
-
119
- parser_max_sent_num = 2000
120
- parser_max_sent_len = 80
@@ -1,120 +0,0 @@
1
- # ID identifying this experiment and all its data
2
- # please do not use spaces inside the experiment ID
3
- prep_experiment_ID = prp_test
4
-
5
- # YOUR INPUT DATA:
6
- # frprep accepts an input directory rather than an input file.
7
- # It will process all files in the directory directory_input
8
- # and write the results to directory_preprocessed.
9
- #
10
- # For input formats see the discussion of "format" below.
11
- #directory_input = <%= File.expand_path('test/functional/input/frprep/test.salsa') %>
12
- directory_preprocessed = <%= File.expand_path('test/functional/input/rosy/frprep/test.salsa') %>
13
-
14
- ##
15
- # Experimental data is described by the following parameters:
16
- #
17
- # - language: en / de
18
- # en for English or de for German
19
- #
20
- # - format: SalsaTigerXML / FNXml / SalsaTab / BNC / Plain
21
- #
22
- # Format of the input data, training/test set
23
- # SalsaTigerXML: Parsed data, English or German
24
- # FNXml: FrameNet Lexical Unit files in FrameNet XML format
25
- # FNCorpusXML: FrameNet files in the FrameNet corpus XML format
26
- # SalsaTab: tabular format (internal)
27
- # BNC: BNC XML format, alternating words and POS tags
28
- # Plain: Plain text, ONE SENTENCE PER LINE.
29
- #
30
- # Preprocessing transforms all data to SalsaTigerXML.
31
- #
32
- # - origin: SalsaTiger / FrameNet / <not specified>
33
- # This is the origin of the training/test data.
34
- # SalsaTiger: data from the Tiger corpus, possibly semantically
35
- # annotated by Salsa
36
- # FrameNet: data from the FrameNet project
37
- #
38
- # Don't set 'origin' if none of these origins apply
39
- #
40
- # - encoding: utf8 / iso / hex / <not specified>
41
- # Default: iso
42
-
43
- language = de
44
- #origin =
45
- format = Plain
46
- encoding = iso
47
-
48
- #############################
49
- # Which preprocessing steps to take?
50
- #
51
- # Data can be parsed, lemmatized and POS-tagged,
52
- # but this happens only if it is specified in the
53
- # experiment file.
54
- #
55
- # Set these booleans to true to trigger the respective
56
- # type of preprocessing. The default value is false.
57
-
58
- do_lemmatize = true
59
- do_postag = false
60
- do_parse = true
61
-
62
- #############################
63
- # directory where frprep puts its internal data
64
- #
65
-
66
- #frprep_directory = <%= File.expand_path('test/functional/input/rosy/frprep') %>
67
-
68
- #############################
69
- # Syntax/semantics interface repair:
70
- # FrameNet annotated data has some annotation choices
71
- # that may make it harder to learn the mapping from
72
- # syntactic structure to semantic roles.
73
- #
74
- # If you are using FrameNet data for training a
75
- # semantic role labeler, set the following two settings
76
- # to true (default is false) to 'repair' semantic role labels
77
- # to more closely match the syntactic structure
78
-
79
- fe_syn_repair = true
80
- fe_rel_repair = false
81
-
82
-
83
- #################
84
- # Location of tools and resources used by Fred
85
-
86
- # currently known to the system:
87
- # (Saarbruecken paths given)
88
- #
89
- # - POS tagging:
90
- # - pos_tagger = treetagger
91
- # pos_tagger_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
92
- #
93
- # - Lemmatization:
94
- # - lemmatizer = treetagger
95
- # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-english-notokenisation
96
- # lemmatizer_path = /proj/llx/Software/treetagger/cmd/tree-tagger-german-notokenisation
97
- #
98
- # - Parser:
99
- # - parser = collins (English)
100
- # parser_path = /proj/llx/Software/Parsers/COLLINS-PARSER/
101
- # - parser = sleepy (German)
102
- # parser_path = /proj/corpora/sleepy3/
103
- # - parser = minipar (English)
104
- # parser_path = /proj/llx/Software/Parsers/minipar-linux/
105
- #
106
- pos_tagger = treetagger
107
- pos_tagger_path = <%= File.expand_path('tools/treetagger') %>
108
-
109
- lemmatizer = treetagger
110
- lemmatizer_path = <%= File.expand_path('tools/treetagger') %>
111
-
112
- parser = berkeley
113
- parser_path = <%= File.expand_path('tools/berkeleyParser') %>
114
-
115
- # parser:
116
- # maximum no. of sentences in a parse file,
117
- # maximum sentence length to be parsed
118
-
119
- parser_max_sent_num = 2000
120
- parser_max_sent_len = 80