rbbt-text 0.2.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/get_ppis.rb +52 -0
- data/lib/rbbt/bow/dictionary.rb +9 -9
- data/lib/rbbt/bow/misc.rb +86 -2
- data/lib/rbbt/corpus/corpus.rb +55 -0
- data/lib/rbbt/corpus/document.rb +289 -0
- data/lib/rbbt/corpus/document_repo.rb +115 -0
- data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
- data/lib/rbbt/ner/NER.rb +7 -5
- data/lib/rbbt/ner/abner.rb +13 -2
- data/lib/rbbt/ner/annotations.rb +182 -51
- data/lib/rbbt/ner/annotations/annotated.rb +15 -0
- data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
- data/lib/rbbt/ner/annotations/relations.rb +25 -0
- data/lib/rbbt/ner/annotations/token.rb +28 -0
- data/lib/rbbt/ner/annotations/transformed.rb +170 -0
- data/lib/rbbt/ner/banner.rb +8 -5
- data/lib/rbbt/ner/chemical_tagger.rb +34 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +41 -0
- data/lib/rbbt/ner/patterns.rb +132 -0
- data/lib/rbbt/ner/rnorm.rb +141 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
- data/lib/rbbt/ner/token_trieNER.rb +185 -51
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
- data/lib/rbbt/nlp/nlp.rb +235 -0
- data/share/install/software/ABNER +0 -4
- data/share/install/software/ChemicalTagger +81 -0
- data/share/install/software/Gdep +115 -0
- data/share/install/software/Geniass +118 -0
- data/share/install/software/OSCAR4 +16 -0
- data/share/install/software/StanfordParser +15 -0
- data/share/patterns/drug_induce_disease +22 -0
- data/share/rnorm/cue_default +10 -0
- data/share/rnorm/tokens_default +86 -0
- data/share/{stopwords → wordlists/stopwords} +0 -0
- data/test/rbbt/bow/test_bow.rb +1 -1
- data/test/rbbt/bow/test_dictionary.rb +1 -1
- data/test/rbbt/bow/test_misc.rb +1 -1
- data/test/rbbt/corpus/test_corpus.rb +99 -0
- data/test/rbbt/corpus/test_document.rb +222 -0
- data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
- data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
- data/test/rbbt/ner/test_abner.rb +1 -1
- data/test/rbbt/ner/test_annotations.rb +64 -2
- data/test/rbbt/ner/test_banner.rb +1 -1
- data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
- data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
- data/test/rbbt/ner/test_patterns.rb +66 -0
- data/test/rbbt/ner/test_regexpNER.rb +1 -1
- data/test/rbbt/ner/test_rnorm.rb +47 -0
- data/test/rbbt/ner/test_token_trieNER.rb +60 -35
- data/test/rbbt/nlp/test_nlp.rb +88 -0
- data/test/test_helper.rb +20 -0
- metadata +93 -20
data/lib/rbbt/nlp/nlp.rb
ADDED
@@ -0,0 +1,235 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rbbt/util/tmpfile'
|
3
|
+
require 'rbbt/util/persistence'
|
4
|
+
require 'rbbt/util/resource'
|
5
|
+
require 'rbbt/ner/annotations'
|
6
|
+
require 'rbbt/ner/annotations/annotated'
|
7
|
+
require 'rbbt/nlp/genia/sentence_splitter'
|
8
|
+
require 'digest/md5'
|
9
|
+
|
10
|
+
|
11
|
+
module NLP
|
12
|
+
|
13
|
+
# Module-level setup for NLP: persistence location, external tool
# registration and the newline placeholder shared by the methods of this module.

# LocalPersist supplies the local_persist helper used by gdep_parse_sentences.
extend LocalPersist
self.local_persistence_dir = '/tmp/crap'

#Rbbt.software.opt.StanfordParser.define_as_install Rbbt.share.install.software.StanfordParser.find
#Rbbt.software.opt.StanfordParser.produce

# Associate each tool with its install script and produce it.
# NOTE(review): presumably `produce` runs the installer when the tool is
# missing -- confirm against the Rbbt resource API.
Rbbt.software.opt.Geniass.define_as_install Rbbt.share.install.software.Geniass.find
Rbbt.software.opt.Geniass.produce

Rbbt.software.opt.Gdep.define_as_install Rbbt.share.install.software.Gdep.find
Rbbt.software.opt.Gdep.produce

# Placeholder substituted for "\n" before text is handed to the external
# tools, and substituted back afterwards.
NEW_LINE_MASK = "\t\t \t \t"
|
26
|
+
|
27
|
+
# Splits +text+ into sentences by running the external ./geniass binary.
#
# The text is written to a temporary file with newlines replaced by
# NEW_LINE_MASK, geniass is run on it, and its output is then compared
# character by character against the original text to recover the offsets
# of each sentence in +text+.
#
# Returns an array of sentence strings (slices of +text+), each passed through
# Segment.annotate with its start offset. Spans of 10 characters or fewer that
# are closed by a sentence boundary are discarded.
def self.geniass_sentence_splitter(text)
  offsets = []

  cleaned = text.gsub("\n",NEW_LINE_MASK)
  TmpFile.with_file(cleaned) do |fin|
    TmpFile.with_file do |fout|
      CMD.cmd("cd #{Rbbt.software.opt.Geniass.find}; ./geniass #{ fin } #{ fout }")

      # Restore the real newlines in the input copy. In the output, the
      # newlines written by geniass become '|' boundary markers and the masked
      # newlines are restored.
      # NOTE(review): assumes geniass emits one sentence per line -- confirm.
      Open.write(fin, Open.read(fin).gsub(NEW_LINE_MASK, "\n"))
      Open.write(fout, Open.read(fout).gsub("\n", '|').gsub(NEW_LINE_MASK, "\n"))

      # Adapted from sentence2standOff.rb in the Geniass package
      strict_io = nil
      split_io  = nil
      begin
        strict_io = Open.open(fin)
        split_io  = Open.open(fout)

        marker      = "|"[0]
        position    = 0
        start       = 0
        finish      = 0
        strict_char = ''
        split_char  = ''

        until split_io.eof?
          split_char  = split_io.getc
          strict_char = strict_io.getc
          position += 1

          next unless split_char == marker

          # A boundary marker closes the current sentence.
          finish = position - 1
          offsets << [start, finish] if finish - start > 10

          if split_char == strict_char
            # The original text has the same character at this position.
            start = position
          else
            # The marker was inserted by the splitter: advance the original
            # text until both streams agree again.
            split_char = split_io.getc
            until split_char == strict_char
              strict_char = strict_io.getc
              position += 1
              # getc returns nil at end of file; without this guard the loop
              # never terminates once the original text is exhausted.
              break if strict_char.nil?
            end
            start = position - 1
          end
        end

        finish = position - 1
        offsets << [start, finish] if finish > start
      ensure
        # Close both handles even if the scan above raises.
        strict_io.close if strict_io
        split_io.close if split_io
      end
    end
  end

  sentences = offsets.collect do |s,e|
    sentence = text[s..e]
    next if sentence.nil?
    #sentence.gsub!(NEW_LINE_MASK, "\n")
    Segment.annotate sentence, s
    sentence
  end

  # `next` inside collect yields nil; drop those instead of returning them.
  sentences.compact
end
|
88
|
+
|
89
|
+
# Mixin for a token string carrying the columns of one Gdep output line.
module GdepToken
  attr_accessor :num, :token, :lemma, :chunk, :pos, :bio, :link, :dep
  include Segment

  # Extends +token+ with GdepToken, stores the given values on it and returns
  # the same object. Every value defaults to nil.
  def self.annotate(token, offset = nil, num = nil, lemma = nil, chunk = nil, pos = nil, bio = nil, link = nil, dep = nil)
    token.extend GdepToken

    # An array of pairs (not a Hash) keeps the assignment order fixed.
    fields = [
      [:offset=, offset],
      [:num=,    num],
      [:lemma=,  lemma],
      [:chunk=,  chunk],
      [:pos=,    pos],
      [:bio=,    bio],
      [:link=,   link],
      [:dep=,    dep]
    ]
    fields.each { |setter, value| token.send(setter, value) }

    token
  end
end
|
108
|
+
|
109
|
+
# Mixin for a chunk string built from consecutive Gdep tokens.
module GdepChunk
  attr_accessor :type, :parts, :segment_types
  include Segment

  # Extends +string+ with GdepChunk, records its offset, chunk type and the
  # tokens it is made of, and returns the same object.
  def self.annotate(string, offset = nil, type = nil, parts = nil)
    string.extend GdepChunk

    [[:offset=, offset], [:type=, type], [:parts=, parts]].each do |setter, value|
      string.send(setter, value)
    end

    string
  end
end
|
123
|
+
|
124
|
+
# Collapses every run of consecutive chunks whose type starts with "VP" into
# the first chunk of that run.
#
# The text of each following VP chunk is appended to the first one with <<
# (no separator is inserted) and its parts are concatenated onto the first
# chunk's parts, so the first chunk of each run is modified in place.
#
# Returns a new array with the non-VP chunks and one merged chunk per VP run,
# in their original order.
def self.merge_vp_chunks(chunk_list)
  merged = []
  vp = nil

  chunk_list.each do |chunk|
    if chunk.type =~ /^VP/
      if vp.nil?
        vp = chunk
      else
        vp << chunk
        vp.parts.concat chunk.parts
      end
    else
      merged << vp unless vp.nil?
      merged << chunk
      vp = nil
    end
  end

  # A VP run that ends the list has no following chunk to trigger the flush
  # above; without this line it would be lost.
  merged << vp unless vp.nil?

  merged
end
|
144
|
+
|
145
|
+
# Groups the aligned Gdep tokens of a sentence into chunks.
#
# A token whose chunk tag starts with "I" and that has an offset continues
# the chunk being built; any other token closes it and, if it has an offset,
# starts a new one. Each chunk is the slice of +sentence+ from the offset of
# its first token to the end of its last one, annotated with GdepChunk. Its
# type is the chunk tag of its last token with the leading "X-" prefix removed.
#
# sentence::     the sentence text the token offsets refer to
# segment_list:: tokens responding to chunk, offset and end
#
# Returns the array of chunk strings in sentence order.
def self.gdep_chunks(sentence, segment_list)
  chunks = []

  chunk_inside = "I"[0]

  last = GdepToken.annotate("LW")

  chunk_segments = []

  # Emits the chunk accumulated so far, if any. `last` is the most recently
  # processed token, i.e. the final token of the pending chunk.
  flush = lambda do
    if chunk_segments.any?
      cstart = chunk_segments.first.offset
      cend = chunk_segments.last.end
      chunk = sentence[cstart..cend]
      GdepChunk.annotate(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
      chunks << chunk
    end
  end

  segment_list.each do |segment|
    if segment.chunk[0] == chunk_inside and not segment.offset.nil?
      chunk_segments << segment
    else
      flush.call

      # Tokens without an offset cannot be located in the sentence and are
      # left out of any chunk.
      chunk_segments = segment.offset.nil? ? [] : [segment]
    end
    last = segment
  end

  # The chunk still pending after the last token is only flushed here;
  # without this call the final chunk of every sentence is dropped.
  flush.call

  chunks
end
|
177
|
+
|
178
|
+
# Parses one sentence or an array of sentences with the external ./gdep
# binary.
#
# Newlines inside each sentence are replaced by NEW_LINE_MASK and the
# sentences are joined one per line before being written to the input file.
# The raw output is stored through local_persist under the MD5 of that input.
# Output is split on empty lines into one group per sentence, and every
# non-empty, tab-separated line becomes a GdepToken.
#
# Returns, for each sentence, the result of Segment.align(sentence, tokens).
def self.gdep_parse_sentences(sentences)
  sentences = [sentences] unless Array === sentences

  masked = sentences.collect { |sentence| sentence.gsub(/\n/, NEW_LINE_MASK) }
  input = masked.join("\n")

  sentence_tokens = TmpFile.with_file(input) do |fin|
    out = local_persist(Digest::MD5.hexdigest(input), :Chunks, :string) do
      CMD.cmd("cd #{Rbbt.software.opt.Gdep.find}; ./gdep #{ fin }").read
    end

    out.split(/^$/).collect do |block|
      rows = block.split(/\n/).reject { |line| line.empty? }
      parsed = rows.collect do |line|
        num, token, lemma, chunk, pos, bio, link, dep = line.split(/\t/)
        GdepToken.annotate(token, nil, num, lemma, chunk, pos, bio, link, dep)
      end
      parsed.compact
    end
  end

  sentences.zip(sentence_tokens).collect do |sentence, tokens|
    Segment.align(sentence, tokens)
  end
end
|
200
|
+
|
201
|
+
|
202
|
+
# Tags one sentence or an array of sentences through the compiled Gdep.so
# Ruby extension instead of the command-line binary.
#
# The models are loaded from inside the Gdep install directory the first time
# gdep_is_loaded reports false. Each non-empty output line is read as
# token, lemma, pos, chunk (tab-separated) and turned into a GdepToken.
#
# Returns one array of tokens per sentence, after Segment.align has been
# called on each (sentence, tokens) pair.
def self.gdep_parse_sentences_extension(sentences)
  require Rbbt.software.opt.Gdep.ruby["Gdep.so"].find
  gdep = Gdep.new
  unless gdep.gdep_is_loaded
    Misc.in_dir(Rbbt.software.opt.Gdep.find) { gdep.load_gdep }
  end

  sentences = [sentences] unless Array === sentences

  sentence_tokens = sentences.collect do |sentence|
    lines = Gdep.new.tag(sentence).split(/\n/)
    tagged = lines.collect do |line|
      if line.empty?
        nil
      else
        token, lemma, pos, chunk = line.split(/\t/)
        GdepToken.annotate(token, nil, nil, lemma, chunk, pos)
        token
      end
    end
    tagged.compact
  end

  sentences.zip(sentence_tokens).collect do |sentence, tokens|
    Segment.align(sentence, tokens)
    tokens
  end
end
|
227
|
+
|
228
|
+
# Chunks one sentence or an array of sentences: tokens come from
# gdep_parse_sentences_extension, are grouped with gdep_chunks and the
# resulting chunks are passed through merge_vp_chunks.
#
# Returns one chunk list per sentence.
def self.gdep_chunk_sentences(sentences)
  sentences = [sentences] unless Array === sentences

  parsed = NLP.gdep_parse_sentences_extension(sentences)
  parsed.zip(sentences).collect do |segment_list, sentence|
    NLP.merge_vp_chunks(NLP.gdep_chunks(sentence, segment_list))
  end
end
|
235
|
+
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#!/bin/bash
# Install script for ChemicalTagger.
#
# Arguments:
#   $1  path of the install helper file to source (provides opt_dir and the
#       OPT_* directory variables used below)
#   $2  software directory
#
# Downloads the ChemicalTagger jar, links it into the jar directory, and adds
# a small RbbtChemicalTagger helper class to the jar.

INSTALL_HELPER_FILE="$1"
RBBT_SOFTWARE_DIR="$2"
source "$INSTALL_HELPER_FILE"

name='ChemicalTagger'
url="https://bitbucket.org/lh359/chemicaltagger/downloads/chemicalTagger-1.0-jar-with-dependencies.jar"

PKG_DIR=`opt_dir $name`
[ -d "$PKG_DIR" ] || mkdir -p "$PKG_DIR"
# -s: only skip the download when a non-empty jar is already present
# (the previous -t test checks for a terminal file descriptor, not a file).
[ -s "$OPT_SRC_DIR/ChemicalTagger.jar" ] || wget "$url" -O "$OPT_SRC_DIR/ChemicalTagger.jar"
cp "$OPT_SRC_DIR/ChemicalTagger.jar" "$PKG_DIR/ChemicalTagger.jar"
[ -e "$OPT_JAR_DIR/ChemicalTagger.jar" ] || ln -sf "$PKG_DIR/ChemicalTagger.jar" "$OPT_JAR_DIR/ChemicalTagger.jar"

# The delimiter is quoted so the Java source is written verbatim.
cat > /tmp/RbbtChemicalTagger.java <<'EOF'
import uk.ac.cam.ch.wwmm.chemicaltagger.Utils;
import uk.ac.cam.ch.wwmm.chemicaltagger.ChemistryPOSTagger ;
import uk.ac.cam.ch.wwmm.chemicaltagger.ChemistrySentenceParser;
import uk.ac.cam.ch.wwmm.chemicaltagger.POSContainer;
import org.antlr.runtime.Token;
import org.antlr.runtime.tree.Tree;
import org.apache.commons.lang.StringUtils;
import nu.xom.Document;
import java.util.ArrayList;

class RbbtChemicalTagger{
  // Returns the text of the first child of every parse-tree node of type 78.
  public static String[] match(String text){

    ChemistryPOSTagger chemPos = ChemistryPOSTagger.getDefaultInstance();

    POSContainer posContainer = chemPos.runTaggers(text);

    ChemistrySentenceParser chemistrySentenceParser = new ChemistrySentenceParser(posContainer);

    chemistrySentenceParser.parseTags();

    Tree tree = chemistrySentenceParser.getParseTree();

    ArrayList tokens = new ArrayList();
    ArrayList molecules = new ArrayList();
    ArrayList new_tokens = new ArrayList();
    tokens.add(tree);
    // Level-by-level walk of the parse tree.
    while (tokens.size() > 0){
      for(int tree_i = 0; tree_i < tokens.size(); tree_i++){
        Tree subtree = (Tree) tokens.get(tree_i);
        int type = subtree.getType();
        // NOTE(review): 78 is presumably the node type for molecules in the
        // ChemicalTagger grammar -- confirm against the jar version in use.
        if (type == 78){
          molecules.add(subtree.getChild(0));
        }else{
          for(int tree_j = 0; tree_j < subtree.getChildCount(); tree_j++){
            Tree child = subtree.getChild(tree_j);
            new_tokens.add(child);
          }
        }
      }
      tokens = new_tokens;
      new_tokens = new ArrayList();
    }

    String[] matches = new String[molecules.size()];
    for(int molecule_i = 0; molecule_i < molecules.size(); molecule_i++){
      Tree molecule = (Tree) molecules.get(molecule_i);
      matches[molecule_i] = molecule.getText();
    }
    return matches;
  }

  public static void main(String[] args){
    String text = "Alternatively, rearrangement of O-(w-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield";
    match(text);
  }
}

EOF

env |grep JAVA

# Use the JDK pointed to by JAVA_HOME when available, otherwise whatever
# javac is on PATH (instead of a path inside one developer's home directory).
JAVAC="javac"
if [ -n "$JAVA_HOME" ] && [ -x "$JAVA_HOME/bin/javac" ]; then
  JAVAC="$JAVA_HOME/bin/javac"
fi

# Compile the helper and add it to the jar; on failure remove the package so
# a later run starts from scratch.
(env CLASSPATH="$OPT_JAR_DIR/ChemicalTagger.jar:$CLASSPATH" "$JAVAC" /tmp/RbbtChemicalTagger.java && jar uf "$PKG_DIR/ChemicalTagger.jar" -C /tmp RbbtChemicalTagger.class) || (rm "$PKG_DIR/ChemicalTagger.jar" && rmdir "$PKG_DIR")
|
@@ -0,0 +1,115 @@
|
|
1
|
+
#!/bin/bash
# Install script for the Gdep parser.
#
# Arguments:
#   $1  path of the install helper file to source (provides get_pkg,
#       uncompress_pkg, build and the OPT_* directory variables used below)
#   $2  software directory
#
# Builds the gdep binary and a shared library from the downloaded sources,
# then builds a Ruby extension (Gdep.so, via Rice) exposing the tagger.

INSTALL_HELPER_FILE="$1"
RBBT_SOFTWARE_DIR="$2"
source "$INSTALL_HELPER_FILE"

name='Gdep'
url="http://people.ict.usc.edu/~sagae/parser/gdep/gdep-beta2.tgz"


get_pkg "$name" "$url"
uncompress_pkg "$name"

# Everything below edits files in the current directory, so stop if the
# build directory cannot be entered.
cd "$OPT_BUILD_DIR/$(ls "$OPT_BUILD_DIR" | head -n 1)" || exit 1

# Append a shared-library target. printf writes the tab that make requires
# in front of the recipe line explicitly.
printf 'libgdep.so: $(OBJS) ksdep.h\n\t$(CPP) -o libgdep.so $(CFLAGS) $(OBJS) -shared\n' >> Makefile

# Objects linked into a shared library must be position independent.
sed 's/\(CFLAGS\s*=\)/\1 -fPIC/' Makefile > Makefile.fpic && mv Makefile.fpic Makefile

make gdep
make libgdep.so

mkdir -p lib
mv libgdep.so lib

build "$name" "$extra"

cd "$OPT_BUILD_DIR/$name" || exit 1

mkdir -p ruby
cat > ruby/extconf.rb <<'EOF'
require 'mkmf-rice'
dir_config('gdep')
have_library('gdep')
create_makefile('Gdep')
EOF

cat > ruby/Gdep.cpp <<'EOF'
#include "rice/Class.hpp"
#include "rice/String.hpp"

#include <stdio.h>
#include <fstream>
#include <map>
#include <list>
#include <string>
#include <vector>
#include <iostream>
#include <sstream>
#include "../maxent.h"
#include "../common.h"

using namespace Rice;
using namespace std;
void init_morphdic();
extern void load_ne_models();
string bidir_postag(const string & s, const vector<ME_Model> & vme, const vector<ME_Model> & cvme, bool dont_tokenize);


// Models are process-wide: every Gdep instance created from Ruby shares them.
vector<ME_Model> vme(16);
vector<ME_Model> vme_chunking(16);
bool gdep_loaded = false;

// Loads all models using paths relative to the current working directory.
void load_gdep()
{
  init_morphdic();

  for (int i = 0; i < 16; i++) {
    char buf[1000];
    snprintf(buf, sizeof buf, "./models_medline/model.bidir.%d", i);
    vme[i].load_from_file(buf);
  }


  // Only the even-numbered chunking models (0, 2, 4, 6) are loaded.
  for (int i = 0; i < 8; i +=2 ) {
    char buf[1000];
    snprintf(buf, sizeof buf, "./models_chunking/model.bidir.%d", i);
    vme_chunking[i].load_from_file(buf);
  }

  load_ne_models();

  gdep_loaded = true;
}

bool gdep_is_loaded(){
  return gdep_loaded;
}

// Tags one line of text; the input is tokenized by the tagger itself.
string tag(string line){
  return(bidir_postag(line, vme, vme_chunking, false));
}

extern "C"
void Init_Gdep()
{
  Class rb_cGdep =
    define_class("Gdep")
    .define_method("load_gdep", &load_gdep)
    .define_method("tag", &tag)
    .define_method("gdep_is_loaded", &gdep_is_loaded);
}
EOF

cd ruby || exit 1
ruby extconf.rb --with-gdep-dir="$OPT_DIR/$name"
make
|
113
|
+
|
114
|
+
|
115
|
+
|
@@ -0,0 +1,118 @@
|
|
1
|
+
#!/bin/bash
# Install script for the Geniass sentence splitter.
#
# Arguments:
#   $1  path of the install helper file to source (provides get_pkg,
#       uncompress_pkg, build and the OPT_* directory variables used below)
#   $2  software directory
#
# Builds the geniass binary and a shared library from the downloaded sources,
# then builds a Ruby extension (Geniass.so, via Rice) exposing the classifier.

INSTALL_HELPER_FILE="$1"
RBBT_SOFTWARE_DIR="$2"
source "$INSTALL_HELPER_FILE"

name='Geniass'
url="http://www-tsujii.is.s.u-tokyo.ac.jp/~y-matsu/geniass/geniass-1.00.tar.gz"


get_pkg "$name" "$url"
uncompress_pkg "$name"

# Everything below edits files in the current directory, so stop if the
# build directory cannot be entered.
cd "$OPT_BUILD_DIR/$(ls "$OPT_BUILD_DIR" | head -n 1)" || exit 1

# Append a shared-library target. printf writes the tab that make requires
# in front of the recipe line explicitly.
printf 'libgeniass.so: maxent.o blmvm.o\n\t$(CXX) $(CFLAGS) -o $@ -shared $^\n' >> Makefile

# Objects linked into a shared library must be position independent.
sed 's/\(CFLAGS\s*=\)/\1 -fPIC/' Makefile > Makefile.fpic && mv Makefile.fpic Makefile

make geniass
make libgeniass.so

mkdir -p lib
mv libgeniass.so lib

build "$name" "$extra"

cd "$OPT_BUILD_DIR/$name" || exit 1

mkdir -p ruby
cat > ruby/extconf.rb <<'EOF'
require 'mkmf-rice'
dir_config('geniass')
have_library('geniass')
create_makefile('Geniass')
EOF

cat > ruby/Geniass.cpp <<'EOF'
#include "rice/Class.hpp"
#include "rice/String.hpp"

#include <iostream>
#include <iomanip>
#include <string>
#include <vector>
#include <list>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include "../maxent.h"

using namespace Rice;
using namespace std;

// The model is process-wide: every Geniass instance created from Ruby shares it.
ME_Model model;
bool geniass_loaded = false;

// Loads the model file by a path relative to the current working directory.
void load_geniass(){
  printf("loading model");
  string modelFile = "model1-1.0";
  model.load_from_file(modelFile.c_str());
  geniass_loaded = true;
  printf("..done\n");
}

bool geniass_is_loaded(){ return(geniass_loaded); }

// Splits str on tab characters and appends the pieces to tokens.
// An empty string, or a string ending in a tab, yields a final empty token.
void split(string& str, vector<string>& tokens)
{
  istringstream in(str);
  char c;

  while (in){
    string token;
    token = "";
    while (in.get(c) && (c != '\t')) token.push_back(c);
    tokens.push_back(token);
  }
}

// Classifies one tab-separated feature line and returns the predicted label.
// The first field is skipped and every remaining field is added as a feature.
string label(string line){
  vector<string> tokens;
  split(line, tokens);
  ME_Sample s;

  for(vector<string>::const_iterator token = tokens.begin() + 1;
      token != tokens.end(); ++token){
    s.add_feature(*token);
  }
  (void) model.classify(s);
  return(s.label);
}

extern "C"
void Init_Geniass()
{
  Class rb_cGeniass =
    define_class("Geniass")
    .define_method("load_geniass", &load_geniass)
    .define_method("label", &label)
    .define_method("geniass_is_loaded", &geniass_is_loaded);
}

EOF

cd ruby || exit 1
ruby extconf.rb --with-geniass-dir="$OPT_DIR/$name"
make
|
116
|
+
|
117
|
+
|
118
|
+
|