shalmaneser-frappe 1.2.rc5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +10 -0
  3. data/CHANGELOG.md +4 -0
  4. data/LICENSE.md +4 -0
  5. data/README.md +122 -0
  6. data/lib/frappe/Ampersand.rb +41 -0
  7. data/lib/frappe/file_parser.rb +126 -0
  8. data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
  9. data/lib/frappe/frappe.rb +217 -0
  10. data/lib/frappe/frappe_flat_syntax.rb +89 -0
  11. data/lib/frappe/frappe_read_stxml.rb +48 -0
  12. data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
  13. data/lib/frappe/interfaces/collins_interface.rb +340 -0
  14. data/lib/frappe/interfaces/counter.rb +19 -0
  15. data/lib/frappe/interfaces/stanford_interface.rb +353 -0
  16. data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
  17. data/lib/frappe/interfaces/treetagger_module.rb +111 -0
  18. data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
  19. data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
  20. data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
  21. data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
  22. data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
  23. data/lib/frappe/interpreters/headz.rb +265 -0
  24. data/lib/frappe/interpreters/headz_helpers.rb +54 -0
  25. data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
  26. data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
  27. data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
  28. data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
  29. data/lib/frappe/one_parsed_file.rb +31 -0
  30. data/lib/frappe/opt_parser.rb +92 -0
  31. data/lib/frappe/path.rb +199 -0
  32. data/lib/frappe/plain_converter.rb +59 -0
  33. data/lib/frappe/salsa_tab_converter.rb +154 -0
  34. data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
  35. data/lib/frappe/stxml_converter.rb +666 -0
  36. data/lib/frappe/syn_interface.rb +76 -0
  37. data/lib/frappe/syn_interface_stxml.rb +173 -0
  38. data/lib/frappe/syn_interface_tab.rb +39 -0
  39. data/lib/frappe/utf_iso.rb +27 -0
  40. data/lib/shalmaneser/frappe.rb +1 -0
  41. metadata +130 -0
@@ -0,0 +1,76 @@
1
+ #############################
2
+ # abstract class, to be inherited:
3
+ #
4
+ # tabular format or SalsaTigerXML interface for modules
5
+ # offering POS tagging, lemmatization, parsing etc.
6
+
7
+ # Leave this commented until we've reworked ExternalSystems
8
+ # since it causes circular requirements.
9
+ # require 'external_systems'
10
+
11
module Shalmaneser
  module Frappe
    # Abstract base class for wrappers around external syntactic
    # processing systems (POS tagging, lemmatization, parsing etc.).
    #
    # The methods {.system}, {.service} and {#process_file} only raise
    # NotImplementedError here and are meant to be overridden.
    class SynInterface
      # Name of the wrapped system, e.g. "Collins" or "TNT".
      #
      # @return [String]
      # @raise [NotImplementedError] always in this base class
      def self.system
        raise NotImplementedError, "Overwrite me"
      end

      # Service offered by the wrapped system:
      # one of "lemmatizer", "parser", "pos tagger".
      #
      # @return [String]
      # @raise [NotImplementedError] always in this base class
      def self.service
        raise NotImplementedError, "Overwrite me"
      end

      # Stores the settings used by all subsequent processing.
      #
      # @param program_path [String] path to the system
      # @param insuffix [String] suffix of input files
      # @param outsuffix [String] suffix for processed files
      # @param var_hash [Hash] optional arguments; not read by this base class
      def initialize(program_path, insuffix, outsuffix, var_hash = {})
        @program_path = program_path
        @insuffix = insuffix
        @outsuffix = outsuffix
      end

      # Runs {#process_file} on every file in in_dir whose name ends in the
      # input suffix. The output file gets the same base name in out_dir with
      # the input suffix replaced by the output suffix.
      #
      # Directory names are concatenated with the file names as they are, so
      # both must already end in a path separator.
      #
      # @param in_dir [String] name of input directory
      # @param out_dir [String] name of output directory
      # @return the list of matched input files (not meant to be used)
      def process_dir(in_dir, out_dir)
        Dir.glob("#{in_dir}*#{@insuffix}").each do |source_path|
          stem = File.basename(source_path, @insuffix)
          process_file(source_path, "#{out_dir}#{stem}#{@outsuffix}")
        end
      end

      # Processes one file, writing the result to outfilename.
      #
      # @param infilename [String] name of input file
      # @param outfilename [String] name of output file
      # @raise [NotImplementedError] always in this base class
      def process_file(infilename, outfilename)
        raise NotImplementedError, "Overwrite me"
      end

      # NOTE: a bare `protected` only affects instance methods defined after
      # it; the singleton method below stays publicly callable.
      protected

      # Registers this interface class with ExternalSystems when that
      # constant is defined; otherwise only logs a warning via LOGGER.
      def self.announce_me
        # Yup, we have a class to which we can announce ourselves.
        return ExternalSystems.add_interface(self) if defined?(ExternalSystems)

        # no interface collector class
        LOGGER.warn "Interface #{self} not announced: no ExternalSystems."
      end
    end
  end
end
@@ -0,0 +1,173 @@
1
require 'tempfile'

require 'salsa_tiger_xml/file_parts_parser'
require 'salsa_tiger_xml/salsa_tiger_xml_helper'
require 'salsa_tiger_xml/salsa_tiger_sentence'

require_relative 'syn_interface'
6
+
7
+ #############################
8
+ # abstract class, to be inherited:
9
+ #
10
+ # SalsaTigerXML interface for modules
11
+ # offering parsing etc.
12
+ #
13
+ # The input format for these classes is TabFormat or FNTabFormat
14
module Shalmaneser
  module Frappe
    # Abstract base class for interfaces whose output is converted to
    # SalsaTigerXML. Subclasses must override {#to_stxml_file}.
    class SynInterfaceSTXML < SynInterface
      # Stores the settings used by all subsequent processing.
      #
      # @param program_path [String] path to the system
      # @param insuffix [String] suffix of input files
      # @param outsuffix [String] suffix for processed files
      # @param stsuffix [String] suffix for Salsa/Tiger XML files
      # @param var_hash [Hash] optional arguments, passed on to the superclass
      def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
        super(program_path, insuffix, outsuffix, var_hash)
        @stsuffix = stsuffix
      end

      # Runs {#to_stxml_file} on every file in in_dir that carries the
      # output suffix, writing to out_dir with the Salsa/Tiger XML suffix.
      #
      # Directory names are concatenated with the file names as they are, so
      # both must already end in a path separator.
      #
      # @param in_dir [String] name of dir with parse files
      # @param out_dir [String] name of output dir
      def to_stxml_dir(in_dir, out_dir)
        Dir["#{in_dir}*#{@outsuffix}"].each do |parsefilename|
          stem = File.basename(parsefilename, @outsuffix)
          to_stxml_file(parsefilename, "#{out_dir}#{stem}#{@stsuffix}")
        end
      end

      # Converts one processed file to SalsaTigerXML.
      #
      # @param infilename [String] name of the processed file
      # @param outfilename [String] name of the SalsaTigerXML file to write
      # @raise [NotImplementedError] always in this base class
      def to_stxml_file(infilename, outfilename)
        raise NotImplementedError, "Overwrite me"
      end

      # Standard mapping from tab sentence words to SalsaTigerSentence nodes:
      # the value of each tab line's "lineno" field is used as an index into
      # the sorted terminals of the sentence.
      #
      # @param sent [#terminals_sorted, nil] the SalsaTigerSentence
      # @param tabsent [#each_line_parsed, nil] the tab format sentence
      # @return [Hash, nil] lineno -> array holding the matching terminal
      #   (empty array when there is none); an empty hash when tabsent is
      #   nil; nil when sent is nil
      def self.standard_mapping(sent, tabsent)
        return nil if sent.nil?

        retv = {}
        terminals = sent.terminals_sorted
        return retv unless tabsent

        tabsent.each_line_parsed do |line|
          lineno = line.get("lineno")
          terminal = terminals[lineno]
          retv[lineno] = terminal ? [terminal] : []
        end

        retv
      end

      # For a given processed file, yields each sentence as a tuple
      # [SalsaTigerSentence object, FNTabFormatSentence object, mapping] of
      # - the sentence in SalsaTigerXML,
      # - the matching tab format sentence,
      # - a mapping of terminals (see {.standard_mapping}).
      #
      # Default version: writes Salsa/Tiger XML to a tempfile, reads it back
      # in and assumes that each sentence in the tab file has a correspondent
      # in the processed file (may not hold e.g. if the parser leaves out
      # sentences it cannot process).
      #
      # @param infilename [String] name of processed file
      # @param tab_dir [String, nil] name of dir with input files
      #   (set either here or beforehand)
      # @yield [Array] the three-element tuple described above
      # @raise [RuntimeError] if no tab directory has been set
      def each_sentence(infilename, tab_dir = nil)
        @tab_dir = tab_dir if tab_dir

        tf = Tempfile.new("SynInterface")
        begin
          # Only the path is needed: to_stxml_file writes to it by name.
          # The handle is closed here and must not be flushed afterwards,
          # since flushing a closed stream raises IOError.
          tf.close
          to_stxml_file(infilename, tf.path)

          # get matching tab file, read
          tab_sentences = []
          get_tab_reader(infilename).each_sentence { |s| tab_sentences << s }

          # read Salsa/Tiger sentences and yield them
          reader = STXML::FilePartsParser.new(tf.path)
          sent_index = 0
          reader.scan_s do |sent_string|
            tab_sent = tab_sentences[sent_index]
            # Build the sentence object first so that the very same object
            # is yielded and used for the terminal mapping.
            sent = STXML::SalsaTigerSentence.new(sent_string, tab_sent)
            yield [sent, tab_sent, SynInterfaceSTXML.standard_mapping(sent, tab_sent)]
            sent_index += 1
          end
        ensure
          # remove tempfile, also when conversion or the caller's block raises
          tf.close(true)
        end
      end

      #####################
      # NOTE: `protected` applies to get_tab_reader only; the singleton
      # method failed_sentence below stays publicly callable.
      protected

      # Gets the tab format file for a given processed file. The matching
      # non-processed file is looked up in @tab_dir under the same base name
      # with the input suffix.
      #
      # NOTE(review): FNTabFormatFile is not required by this file; it is
      # presumably loaded elsewhere - confirm.
      #
      # @param infilename [String] name of processed file
      # @return [FNTabFormatFile]
      # @raise [RuntimeError] if @tab_dir has not been set
      def get_tab_reader(infilename)
        raise "Need to set tab directory" unless @tab_dir

        tabfilename = @tab_dir + File.basename(infilename, @outsuffix) + @insuffix
        FNTabFormatFile.new(tabfilename)
      end

      # Provides an XML representation for a sentence that couldn't be
      # analyzed, assuming a flat structure of all terminals under a
      # virtual top node.
      #
      # @param tab_sent [#each_line_parsed] the tab format sentence
      # @param sentid [#to_s] sentence ID
      # @return the sentence object, marked with attribute failed="true"
      def self.failed_sentence(tab_sent, sentid)
        sent_obj = STXML::SalsaTigerSentence.empty_sentence(sentid.to_s)
        sent_obj.set_attribute("failed", "true")

        topnode = sent_obj.add_syn("nt",
                                   "NONE", # cat
                                   nil,    # word (doesn't matter)
                                   nil,    # pos (doesn't matter)
                                   "500")  # nonterminal counter

        t_counter = 0
        tab_sent.each_line_parsed do |line|
          t_counter += 1
          word = line.get("word")
          pos = line.get("pos")
          node = sent_obj.add_syn("t",
                                  nil, # cat (doesn't matter here)
                                  STXML::SalsaTigerXMLHelper.escape(word), # word
                                  pos, # pos
                                  t_counter.to_s)
          topnode.add_child(node, nil)
          node.add_parent(topnode, nil)
        end

        sent_obj
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require_relative 'syn_interface'
2
+
3
+ require 'tabular_format/fn_tab_format_file'
4
+
5
+ #############################
6
+ # abstract class, to be inherited:
7
+ #
8
+ # tabular format interface for modules
9
+ # offering POS tagging, lemmatization etc.
10
module Shalmaneser
  module Frappe
    # Abstract base class for tabular format interfaces.
    class SynInterfaceTab < SynInterface
      ##########
      # NOTE: a bare `protected` does not affect the singleton method
      # defined below; it stays publicly callable.
      protected

      # fntab_words_to_file:
      # reads a file in tab format (columns as in FNTabFormat) and writes
      # the "word" entry of every line to outfile, one word per line.
      # After each sentence, sent_marker is written on a line of its own.
      #
      # @param infilename [String] name of input file
      # @param outfile [#puts] output stream
      # @param sent_marker [String] line written after each sentence
      # @param iso [Object, nil] when truthy, each word is passed through
      #   UtfIso.to_iso_8859_1 before being written
      def self.fntab_words_to_file(infilename, outfile, sent_marker = "", iso = nil)
        FNTabFormatFile.new(infilename).each_sentence do |sentence|
          sentence.each_line_parsed do |line_obj|
            word = line_obj.get("word")
            outfile.puts(iso ? UtfIso.to_iso_8859_1(word) : word)
          end
          outfile.puts sent_marker
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # KE changed July 05: now no inclusion of modules required,
2
+ # and names changed from REXML.Encoding to UtfIso
3
module Shalmaneser
  module Frappe
    # Conversion helpers between UTF-8 and ISO-8859-1 strings.
    module UtfIso
      # Converts from UTF-8.
      #
      # Code points above 0xFF have no ISO-8859-1 representation and are
      # left out of the result (a numeric entity fallback, &#nnnn;, is not
      # active).
      #
      # @param content [String] UTF-8 text
      # @return [String] one byte per kept code point
      def self.to_iso_8859_1(content)
        representable = content.unpack('U*').select { |codepoint| codepoint <= 0xFF }
        representable.pack('C*')
      end

      # Converts to UTF-8: every byte of the input is taken as one code point.
      #
      # @param str [String] ISO-8859-1 bytes
      # @return [String] UTF-8 text
      def self.from_iso_8859_1(str)
        bytes = str.unpack('C*')
        bytes.pack('U*')
      end
    end
  end
end
@@ -0,0 +1 @@
1
+ # A dummy file to require for now.
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: shalmaneser-frappe
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.2.rc5
5
+ platform: ruby
6
+ authors:
7
+ - Andrei Beliankou
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-01-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: tokenizer
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.2'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.2'
41
+ - !ruby/object:Gem::Dependency
42
+ name: shalmaneser-lib
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '='
46
+ - !ruby/object:Gem::Version
47
+ version: 1.2.rc5
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '='
53
+ - !ruby/object:Gem::Version
54
+ version: 1.2.rc5
55
+ description: Frappe - Fred and Rosy PreProcEssor.
56
+ email: arbox@yandex.ru
57
+ executables: []
58
+ extensions: []
59
+ extra_rdoc_files:
60
+ - README.md
61
+ - LICENSE.md
62
+ - CHANGELOG.md
63
+ files:
64
+ - ".yardopts"
65
+ - CHANGELOG.md
66
+ - LICENSE.md
67
+ - README.md
68
+ - lib/frappe/Ampersand.rb
69
+ - lib/frappe/file_parser.rb
70
+ - lib/frappe/fix_syn_sem_mapping.rb
71
+ - lib/frappe/frappe.rb
72
+ - lib/frappe/frappe_flat_syntax.rb
73
+ - lib/frappe/frappe_read_stxml.rb
74
+ - lib/frappe/interfaces/berkeley_interface.rb
75
+ - lib/frappe/interfaces/collins_interface.rb
76
+ - lib/frappe/interfaces/counter.rb
77
+ - lib/frappe/interfaces/stanford_interface.rb
78
+ - lib/frappe/interfaces/treetagger_interface.rb
79
+ - lib/frappe/interfaces/treetagger_module.rb
80
+ - lib/frappe/interfaces/treetagger_pos_interface.rb
81
+ - lib/frappe/interpreters/berkeley_interpreter.rb
82
+ - lib/frappe/interpreters/collins_tnt_interpreter.rb
83
+ - lib/frappe/interpreters/collins_treetagger_interpreter.rb
84
+ - lib/frappe/interpreters/empty_interpreter.rb
85
+ - lib/frappe/interpreters/headz.rb
86
+ - lib/frappe/interpreters/headz_helpers.rb
87
+ - lib/frappe/interpreters/stanford_interpreter.rb
88
+ - lib/frappe/interpreters/syn_interpreter.rb
89
+ - lib/frappe/interpreters/tiger_interpreter.rb
90
+ - lib/frappe/interpreters/treetagger_interpreter.rb
91
+ - lib/frappe/one_parsed_file.rb
92
+ - lib/frappe/opt_parser.rb
93
+ - lib/frappe/path.rb
94
+ - lib/frappe/plain_converter.rb
95
+ - lib/frappe/salsa_tab_converter.rb
96
+ - lib/frappe/salsa_tab_with_pos_converter.rb
97
+ - lib/frappe/stxml_converter.rb
98
+ - lib/frappe/syn_interface.rb
99
+ - lib/frappe/syn_interface_stxml.rb
100
+ - lib/frappe/syn_interface_tab.rb
101
+ - lib/frappe/utf_iso.rb
102
+ - lib/shalmaneser/frappe.rb
103
+ homepage: https://github.com/arbox/shalmaneser
104
+ licenses:
105
+ - GPL-2.0
106
+ metadata: {}
107
+ post_install_message:
108
+ rdoc_options:
109
+ - "-m"
110
+ - README.md
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - '='
116
+ - !ruby/object:Gem::Version
117
+ version: '2.0'
118
+ required_rubygems_version: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">"
121
+ - !ruby/object:Gem::Version
122
+ version: 1.3.1
123
+ requirements: []
124
+ rubyforge_project:
125
+ rubygems_version: 2.5.1
126
+ signing_key:
127
+ specification_version: 4
128
+ summary: FRAPPE
129
+ test_files: []
130
+ has_rdoc: