shalmaneser-frappe 1.2.rc5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +4 -0
- data/LICENSE.md +4 -0
- data/README.md +122 -0
- data/lib/frappe/Ampersand.rb +41 -0
- data/lib/frappe/file_parser.rb +126 -0
- data/lib/frappe/fix_syn_sem_mapping.rb +196 -0
- data/lib/frappe/frappe.rb +217 -0
- data/lib/frappe/frappe_flat_syntax.rb +89 -0
- data/lib/frappe/frappe_read_stxml.rb +48 -0
- data/lib/frappe/interfaces/berkeley_interface.rb +380 -0
- data/lib/frappe/interfaces/collins_interface.rb +340 -0
- data/lib/frappe/interfaces/counter.rb +19 -0
- data/lib/frappe/interfaces/stanford_interface.rb +353 -0
- data/lib/frappe/interfaces/treetagger_interface.rb +74 -0
- data/lib/frappe/interfaces/treetagger_module.rb +111 -0
- data/lib/frappe/interfaces/treetagger_pos_interface.rb +80 -0
- data/lib/frappe/interpreters/berkeley_interpreter.rb +27 -0
- data/lib/frappe/interpreters/collins_tnt_interpreter.rb +807 -0
- data/lib/frappe/interpreters/collins_treetagger_interpreter.rb +16 -0
- data/lib/frappe/interpreters/empty_interpreter.rb +26 -0
- data/lib/frappe/interpreters/headz.rb +265 -0
- data/lib/frappe/interpreters/headz_helpers.rb +54 -0
- data/lib/frappe/interpreters/stanford_interpreter.rb +28 -0
- data/lib/frappe/interpreters/syn_interpreter.rb +727 -0
- data/lib/frappe/interpreters/tiger_interpreter.rb +1846 -0
- data/lib/frappe/interpreters/treetagger_interpreter.rb +89 -0
- data/lib/frappe/one_parsed_file.rb +31 -0
- data/lib/frappe/opt_parser.rb +92 -0
- data/lib/frappe/path.rb +199 -0
- data/lib/frappe/plain_converter.rb +59 -0
- data/lib/frappe/salsa_tab_converter.rb +154 -0
- data/lib/frappe/salsa_tab_with_pos_converter.rb +531 -0
- data/lib/frappe/stxml_converter.rb +666 -0
- data/lib/frappe/syn_interface.rb +76 -0
- data/lib/frappe/syn_interface_stxml.rb +173 -0
- data/lib/frappe/syn_interface_tab.rb +39 -0
- data/lib/frappe/utf_iso.rb +27 -0
- data/lib/shalmaneser/frappe.rb +1 -0
- metadata +130 -0
@@ -0,0 +1,76 @@
|
|
1
|
+
#############################
|
2
|
+
# abstract class, to be inherited:
|
3
|
+
#
|
4
|
+
# tabular format or SalsaTigerXML interface for modules
|
5
|
+
# offering POS tagging, lemmatization, parsing etc.
|
6
|
+
|
7
|
+
# Leave this commented until we've reworked ExternalSystems
|
8
|
+
# since it causes circular requirements.
|
9
|
+
# require 'external_systems'
|
10
|
+
|
11
|
+
module Shalmaneser
|
12
|
+
module Frappe
|
13
|
+
# Abstract base class for interfaces to external systems offering
# POS tagging, lemmatization, parsing etc.
#
# Subclasses must override {.system}, {.service} and {#process_file}.
class SynInterface
  # @return [String] the name of the system, e.g. "Collins" or "TNT"
  # @raise [NotImplementedError] unless overridden by a subclass
  def self.system
    raise NotImplementedError, "Overwrite me"
  end

  # @return [String] the service offered,
  #   one of "lemmatizer", "parser", "pos tagger"
  # @raise [NotImplementedError] unless overridden by a subclass
  def self.service
    raise NotImplementedError, "Overwrite me"
  end

  # Registers this interface class with ExternalSystems when that
  # collector class is loaded; otherwise only logs a warning.
  #
  # This method is public. It used to be written below a bare `protected`,
  # but access modifiers do not apply to `def self.` methods, so the
  # modifier never had any effect and has been dropped.
  def self.announce_me
    if defined?(ExternalSystems)
      # Yup, we have a class to which we can announce ourselves.
      ExternalSystems.add_interface(self)
    else
      # no interface collector class
      LOGGER.warn "Interface #{self} not announced: no ExternalSystems."
    end
  end

  # Stores the values used by all subsequent processing.
  #
  # @param program_path [String] path to system
  # @param insuffix [String] suffix of input files
  # @param outsuffix [String] suffix for processed files
  # @param var_hash [Hash] optional arguments; not used by this base class
  def initialize(program_path, insuffix, outsuffix, var_hash = {})
    @program_path = program_path
    @insuffix = insuffix
    @outsuffix = outsuffix
  end

  # Processes each file in in_dir whose name ends in the input suffix,
  # producing a file in out_dir with the same name but the suffix replaced.
  #
  # Both directory names are concatenated directly with the file name,
  # so they have to end in a path separator.
  #
  # @param in_dir [String] name of input directory
  # @param out_dir [String] name of output directory
  def process_dir(in_dir, out_dir)
    Dir["#{in_dir}*#{@insuffix}"].each do |infilename|
      stem = File.basename(infilename, @insuffix)
      process_file(infilename, "#{out_dir}#{stem}#{@outsuffix}")
    end
  end

  # Processes one file, writing the result to outfilename.
  #
  # @param infilename [String] name of input file
  # @param outfilename [String] name of output file
  # @raise [NotImplementedError] unless overridden by a subclass
  def process_file(infilename, outfilename)
    raise NotImplementedError, "Overwrite me"
  end
end
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,173 @@
|
|
1
|
+
require 'tempfile'

require_relative 'syn_interface'

require 'salsa_tiger_xml/file_parts_parser'
require 'salsa_tiger_xml/salsa_tiger_xml_helper'
require 'salsa_tiger_xml/salsa_tiger_sentence'
|
6
|
+
|
7
|
+
#############################
|
8
|
+
# abstract class, to be inherited:
|
9
|
+
#
|
10
|
+
# SalsaTigerXML interface for modules
|
11
|
+
# offering parsing etc.
|
12
|
+
#
|
13
|
+
# The input format for these classes is TabFormat or FNTabFormat
|
14
|
+
module Shalmaneser
|
15
|
+
module Frappe
|
16
|
+
# Abstract SalsaTigerXML interface for modules offering parsing etc.
# Input files are expected in TabFormat or FNTabFormat.
#
# Subclasses must override {#to_stxml_file} (and the abstract methods
# inherited from SynInterface).
class SynInterfaceSTXML < SynInterface
  # Stores the values used by all subsequent processing.
  #
  # @param program_path [String] path to system
  # @param insuffix [String] suffix of input files
  # @param outsuffix [String] suffix for processed files
  # @param stsuffix [String] suffix for Salsa/Tiger XML files
  # @param var_hash [Hash] optional arguments, passed on to the superclass
  def initialize(program_path, insuffix, outsuffix, stsuffix, var_hash = {})
    super(program_path, insuffix, outsuffix, var_hash)
    @stsuffix = stsuffix
  end

  # Converts every file in in_dir ending in the processed-file suffix to
  # a Salsa/Tiger XML file in out_dir (same name, suffix replaced).
  #
  # Both directory names are concatenated directly with the file name,
  # so they have to end in a path separator.
  #
  # @param in_dir [String] name of dir with parse files
  # @param out_dir [String] name of output dir
  def to_stxml_dir(in_dir, out_dir)
    Dir["#{in_dir}*#{@outsuffix}"].each do |parsefilename|
      stem = File.basename(parsefilename, @outsuffix)
      to_stxml_file(parsefilename, "#{out_dir}#{stem}#{@stsuffix}")
    end
  end

  # Converts one processed file to Salsa/Tiger XML.
  #
  # @param infilename [String] name of the processed file
  # @param outfilename [String] name of the Salsa/Tiger XML file to write
  # @raise [NotImplementedError] unless overridden by a subclass
  def to_stxml_file(infilename, outfilename)
    raise NotImplementedError, "Overwrite me"
  end

  # Standard mapping from tab sentence words to SalsaTigerSentence nodes,
  # as returned by {#each_sentence}: the "lineno" value of each tab line
  # is used as an index into the sentence's sorted terminals.
  #
  # @param sent [#terminals_sorted, nil] the SalsaTigerSentence
  # @param tabsent [#each_line_parsed, nil] the tab format sentence
  # @return [Hash, nil] lineno => one-element array holding the terminal,
  #   or an empty array when there is no terminal at that index;
  #   an empty hash when tabsent is missing; nil when sent is nil
  def self.standard_mapping(sent, tabsent)
    return nil if sent.nil?

    mapping = {}
    terminals = sent.terminals_sorted
    return mapping unless tabsent

    tabsent.each_line_parsed do |line|
      lineno = line.get("lineno")
      terminal = terminals[lineno]
      mapping[lineno] = terminal ? [terminal] : []
    end

    mapping
  end

  # For a given processed file, yields each sentence as a tuple
  #   [SalsaTigerSentence object, FNTabFormatSentence object, mapping]
  # of
  # - the sentence in SalsaTigerXML,
  # - the matching tab format sentence,
  # - a mapping of terminals (see {.standard_mapping}):
  #   hash: line in tab sentence (integer) -> array of nodes.
  #
  # Default version: write Salsa/Tiger XML to a tempfile, read it back in
  # and assume that each sentence in the tab file has a correspondent in
  # the processed file (may not hold e.g. if the parser leaves out
  # sentences it cannot process).
  #
  # @param infilename [String] name of processed file
  # @param tab_dir [String, nil] name of dir with input files; when given
  #   it is remembered in @tab_dir for later calls
  # @yield [Array] the three-element tuple described above
  # @return [nil]
  # @raise [RuntimeError] if no tab directory has been set
  def each_sentence(infilename, tab_dir = nil)
    @tab_dir = tab_dir if tab_dir

    # The tempfile only reserves a path: to_stxml_file writes to it by
    # name, so the handle is closed right away. (Flushing the closed
    # handle, as was done before, raises IOError.)
    tf = Tempfile.new("SynInterface")
    tf.close

    begin
      to_stxml_file(infilename, tf.path)

      # get matching tab file, read
      tab_sentences = []
      get_tab_reader(infilename).each_sentence { |s| tab_sentences << s }

      # read Salsa/Tiger sentences and yield them
      reader = STXML::FilePartsParser.new(tf.path)
      sent_index = 0
      reader.scan_s do |sent_string|
        tab_sent = tab_sentences[sent_index]
        # Bind the sentence to a local so that the very same object is
        # yielded and handed to standard_mapping; the old code referenced
        # an undefined `sent` here.
        sent = STXML::SalsaTigerSentence.new(sent_string, tab_sent)
        yield [sent, tab_sent, SynInterfaceSTXML.standard_mapping(sent, tab_sent)]
        sent_index += 1
      end
    ensure
      # remove tempfile, also when conversion or the consumer block raises
      tf.close(true)
    end

    nil
  end

  # Provides an XML representation for a sentence that couldn't be
  # analyzed, assuming a flat structure of all terminals below a virtual
  # top node. The sentence is marked with the attribute failed="true".
  #
  # This method is public: it used to be written below `protected`, but
  # access modifiers do not apply to `def self.` methods.
  #
  # @param tab_sent [#each_line_parsed] the tab format sentence
  # @param sentid [#to_s] the sentence ID
  # @return the newly built sentence object
  def self.failed_sentence(tab_sent, sentid)
    sent_obj = STXML::SalsaTigerSentence.empty_sentence(sentid.to_s)
    sent_obj.set_attribute("failed", "true")

    topnode = sent_obj.add_syn("nt",
                               "NONE", # cat
                               nil,    # word (doesn't matter)
                               nil,    # pos (doesn't matter)
                               "500")  # nonterminal counter

    t_counter = 0
    tab_sent.each_line_parsed do |line|
      t_counter += 1
      word = line.get("word")
      pos = line.get("pos")
      node = sent_obj.add_syn("t",
                              nil, # cat (doesn't matter here)
                              STXML::SalsaTigerXMLHelper.escape(word),
                              pos,
                              t_counter.to_s)
      topnode.add_child(node, nil)
      node.add_parent(topnode, nil)
    end

    sent_obj
  end

  #####################
  protected

  # Gets the tab format file for a given processed file.
  #
  # Assumes the directory with the non-processed files has been set as
  # @tab_dir; it is concatenated directly with the file name, so it has
  # to end in a path separator.
  #
  # @param infilename [String] name of processed file
  # @return [FNTabFormatFile] reader for the matching tab file
  # @raise [RuntimeError] if @tab_dir has not been set
  def get_tab_reader(infilename)
    raise "Need to set tab directory" unless @tab_dir

    # get matching tab file for this parser output file
    tabfilename = @tab_dir + File.basename(infilename, @outsuffix) + @insuffix
    FNTabFormatFile.new(tabfilename)
  end
end
|
172
|
+
end
|
173
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require_relative 'syn_interface'
|
2
|
+
|
3
|
+
require 'tabular_format/fn_tab_format_file'
|
4
|
+
|
5
|
+
#############################
|
6
|
+
# abstract class, to be inherited:
|
7
|
+
#
|
8
|
+
# tabular format interface for modules
|
9
|
+
# offering POS tagging, lemmatization etc.
|
10
|
+
module Shalmaneser
|
11
|
+
module Frappe
|
12
|
+
# Abstract tabular format interface for modules offering POS tagging,
# lemmatization etc.
class SynInterfaceTab < SynInterface
  # Given a file in tab format, columns as in FNTabFormat, gets the
  # "word" entries and writes them to the given stream, one word per
  # line, as input for processing. After each sentence, sent_marker is
  # written on a line of its own.
  #
  # This method is public: it used to be written below a bare
  # `protected`, but access modifiers do not apply to singleton methods,
  # so the modifier never had any effect and has been dropped.
  #
  # @param infilename [String] name of input file
  # @param outfile [#puts] output stream
  # @param sent_marker [String] how to mark the end of a sentence
  # @param iso [Object, nil] non-nil: assume utf-8, transform to iso-8859-1
  def self.fntab_words_to_file(infilename, outfile, sent_marker = "", iso = nil)
    FNTabFormatFile.new(infilename).each_sentence do |sentence|
      sentence.each_line_parsed do |line_obj|
        word = line_obj.get("word")
        # NOTE(review): UtfIso is not required by this file; presumably
        # it is loaded elsewhere before the iso option is used - confirm.
        word = UtfIso.to_iso_8859_1(word) if iso
        outfile.puts word
      end
      outfile.puts sent_marker
    end
  end
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# KE changed July 05: now no inclusion of modules required,
|
2
|
+
# and names changed from REXML.Encoding to UtfIso
|
3
|
+
module Shalmaneser
|
4
|
+
module Frappe
|
5
|
+
# Conversion helpers between UTF-8 and ISO-8859-1 (Latin-1) strings,
# working on code points via String#unpack / Array#pack.
module UtfIso
  # Converts from UTF-8 to ISO-8859-1.
  #
  # Code points above 0xFF have no ISO-8859-1 representation and are
  # dropped silently (a numeric-entity fallback, "&#nnnn;", was once
  # planned here but left disabled).
  #
  # @param content [String] UTF-8 encoded text
  # @return [String] a string with one byte per retained code point
  def self.to_iso_8859_1(content)
    content.unpack('U*').select { |codepoint| codepoint <= 0xFF }.pack('C*')
  end

  # Converts to UTF-8, reading each byte of str as one code point.
  #
  # @param str [String] ISO-8859-1 encoded text
  # @return [String] the UTF-8 encoded text
  def self.from_iso_8859_1(str)
    str.unpack('C*').pack('U*')
  end
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
# A dummy file to require for now.
|
metadata
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: shalmaneser-frappe
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.2.rc5
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andrei Beliankou
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-01-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: tokenizer
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0.2'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0.2'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: shalmaneser-lib
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 1.2.rc5
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.2.rc5
|
55
|
+
description: Frappe - Fred and Rosy PreProcEssor.
|
56
|
+
email: arbox@yandex.ru
|
57
|
+
executables: []
|
58
|
+
extensions: []
|
59
|
+
extra_rdoc_files:
|
60
|
+
- README.md
|
61
|
+
- LICENSE.md
|
62
|
+
- CHANGELOG.md
|
63
|
+
files:
|
64
|
+
- ".yardopts"
|
65
|
+
- CHANGELOG.md
|
66
|
+
- LICENSE.md
|
67
|
+
- README.md
|
68
|
+
- lib/frappe/Ampersand.rb
|
69
|
+
- lib/frappe/file_parser.rb
|
70
|
+
- lib/frappe/fix_syn_sem_mapping.rb
|
71
|
+
- lib/frappe/frappe.rb
|
72
|
+
- lib/frappe/frappe_flat_syntax.rb
|
73
|
+
- lib/frappe/frappe_read_stxml.rb
|
74
|
+
- lib/frappe/interfaces/berkeley_interface.rb
|
75
|
+
- lib/frappe/interfaces/collins_interface.rb
|
76
|
+
- lib/frappe/interfaces/counter.rb
|
77
|
+
- lib/frappe/interfaces/stanford_interface.rb
|
78
|
+
- lib/frappe/interfaces/treetagger_interface.rb
|
79
|
+
- lib/frappe/interfaces/treetagger_module.rb
|
80
|
+
- lib/frappe/interfaces/treetagger_pos_interface.rb
|
81
|
+
- lib/frappe/interpreters/berkeley_interpreter.rb
|
82
|
+
- lib/frappe/interpreters/collins_tnt_interpreter.rb
|
83
|
+
- lib/frappe/interpreters/collins_treetagger_interpreter.rb
|
84
|
+
- lib/frappe/interpreters/empty_interpreter.rb
|
85
|
+
- lib/frappe/interpreters/headz.rb
|
86
|
+
- lib/frappe/interpreters/headz_helpers.rb
|
87
|
+
- lib/frappe/interpreters/stanford_interpreter.rb
|
88
|
+
- lib/frappe/interpreters/syn_interpreter.rb
|
89
|
+
- lib/frappe/interpreters/tiger_interpreter.rb
|
90
|
+
- lib/frappe/interpreters/treetagger_interpreter.rb
|
91
|
+
- lib/frappe/one_parsed_file.rb
|
92
|
+
- lib/frappe/opt_parser.rb
|
93
|
+
- lib/frappe/path.rb
|
94
|
+
- lib/frappe/plain_converter.rb
|
95
|
+
- lib/frappe/salsa_tab_converter.rb
|
96
|
+
- lib/frappe/salsa_tab_with_pos_converter.rb
|
97
|
+
- lib/frappe/stxml_converter.rb
|
98
|
+
- lib/frappe/syn_interface.rb
|
99
|
+
- lib/frappe/syn_interface_stxml.rb
|
100
|
+
- lib/frappe/syn_interface_tab.rb
|
101
|
+
- lib/frappe/utf_iso.rb
|
102
|
+
- lib/shalmaneser/frappe.rb
|
103
|
+
homepage: https://github.com/arbox/shalmaneser
|
104
|
+
licenses:
|
105
|
+
- GPL-2.0
|
106
|
+
metadata: {}
|
107
|
+
post_install_message:
|
108
|
+
rdoc_options:
|
109
|
+
- "-m"
|
110
|
+
- README.md
|
111
|
+
require_paths:
|
112
|
+
- lib
|
113
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - '='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.0'
|
118
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">"
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: 1.3.1
|
123
|
+
requirements: []
|
124
|
+
rubyforge_project:
|
125
|
+
rubygems_version: 2.5.1
|
126
|
+
signing_key:
|
127
|
+
specification_version: 4
|
128
|
+
summary: FRAPPE
|
129
|
+
test_files: []
|
130
|
+
has_rdoc:
|