rbbt-text 0.2.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/get_ppis.rb +52 -0
- data/lib/rbbt/bow/dictionary.rb +9 -9
- data/lib/rbbt/bow/misc.rb +86 -2
- data/lib/rbbt/corpus/corpus.rb +55 -0
- data/lib/rbbt/corpus/document.rb +289 -0
- data/lib/rbbt/corpus/document_repo.rb +115 -0
- data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
- data/lib/rbbt/ner/NER.rb +7 -5
- data/lib/rbbt/ner/abner.rb +13 -2
- data/lib/rbbt/ner/annotations.rb +182 -51
- data/lib/rbbt/ner/annotations/annotated.rb +15 -0
- data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
- data/lib/rbbt/ner/annotations/relations.rb +25 -0
- data/lib/rbbt/ner/annotations/token.rb +28 -0
- data/lib/rbbt/ner/annotations/transformed.rb +170 -0
- data/lib/rbbt/ner/banner.rb +8 -5
- data/lib/rbbt/ner/chemical_tagger.rb +34 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +41 -0
- data/lib/rbbt/ner/patterns.rb +132 -0
- data/lib/rbbt/ner/rnorm.rb +141 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
- data/lib/rbbt/ner/token_trieNER.rb +185 -51
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
- data/lib/rbbt/nlp/nlp.rb +235 -0
- data/share/install/software/ABNER +0 -4
- data/share/install/software/ChemicalTagger +81 -0
- data/share/install/software/Gdep +115 -0
- data/share/install/software/Geniass +118 -0
- data/share/install/software/OSCAR4 +16 -0
- data/share/install/software/StanfordParser +15 -0
- data/share/patterns/drug_induce_disease +22 -0
- data/share/rnorm/cue_default +10 -0
- data/share/rnorm/tokens_default +86 -0
- data/share/{stopwords → wordlists/stopwords} +0 -0
- data/test/rbbt/bow/test_bow.rb +1 -1
- data/test/rbbt/bow/test_dictionary.rb +1 -1
- data/test/rbbt/bow/test_misc.rb +1 -1
- data/test/rbbt/corpus/test_corpus.rb +99 -0
- data/test/rbbt/corpus/test_document.rb +222 -0
- data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
- data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
- data/test/rbbt/ner/test_abner.rb +1 -1
- data/test/rbbt/ner/test_annotations.rb +64 -2
- data/test/rbbt/ner/test_banner.rb +1 -1
- data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
- data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
- data/test/rbbt/ner/test_patterns.rb +66 -0
- data/test/rbbt/ner/test_regexpNER.rb +1 -1
- data/test/rbbt/ner/test_rnorm.rb +47 -0
- data/test/rbbt/ner/test_token_trieNER.rb +60 -35
- data/test/rbbt/nlp/test_nlp.rb +88 -0
- data/test/test_helper.rb +20 -0
- metadata +93 -20
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
INSTALL_HELPER_FILE="$1"
|
4
|
+
RBBT_SOFTWARE_DIR="$2"
|
5
|
+
source "$INSTALL_HELPER_FILE"
|
6
|
+
|
7
|
+
name='OSCAR4'
|
8
|
+
url="http://maven.ch.cam.ac.uk/m2repo/uk/ac/cam/ch/wwmm/oscar/oscar4-all/4.0.1/oscar4-all-4.0.1-with-dependencies.jar"
|
9
|
+
|
10
|
+
|
11
|
+
# Resolve the install directory for this package via the sourced helper's opt_dir.
PKG_DIR="$(opt_dir "$name")"
|
12
|
+
# Make sure the package directory exists. `mkdir -p` succeeds silently when the
# directory is already there, so no separate `-d` test is needed; the quotes
# keep a path containing whitespace from being split into several arguments.
mkdir -p "$PKG_DIR"
|
13
|
+
wget "$url" -O "$PKG_DIR/OSCAR4.jar"
|
14
|
+
ln -sf "$PKG_DIR/OSCAR4.jar" "$OPT_JAR_DIR/OSCAR4.jar"
|
15
|
+
|
16
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
INSTALL_HELPER_FILE="$1"
|
4
|
+
RBBT_SOFTWARE_DIR="$2"
|
5
|
+
source "$INSTALL_HELPER_FILE"
|
6
|
+
|
7
|
+
name='StanfordParser'
|
8
|
+
url="http://nlp.stanford.edu/downloads/stanford-parser-2011-04-20.tgz"
|
9
|
+
|
10
|
+
|
11
|
+
get_src "$name" "$url"
|
12
|
+
# Create the package directory; -p makes this a no-op (instead of an error)
# when the directory is left over from a previous install, and also creates
# any missing parent directories.
mkdir -p "$OPT_DIR/$name"
|
13
|
+
cp "$OPT_BUILD_DIR/stanford-parser.jar" "$OPT_DIR/$name"
|
14
|
+
# Expose the jar in the shared jar directory. -f replaces a link left by an
# earlier install instead of failing with "File exists".
ln -sf "$OPT_DIR/$name/stanford-parser.jar" "$OPT_JAR_DIR/stanford-parser.jar"
|
15
|
+
|
@@ -0,0 +1,22 @@
|
|
1
|
+
NP[disease_NN] VP[induce_HVB by_IN] NP[drug_NN]
|
2
|
+
NP[drug_NN] VP[induce_HVB] NP[disease_NN]
|
3
|
+
NP[drug_NN] VP[cause_HVB] NP[disease_NN]
|
4
|
+
NP[disease_NN] VP[cause_HVB by_IN] NP[drug_NN]
|
5
|
+
NP[disease_NN] VP[produce_HVB by_IN] NP[drug_NN]
|
6
|
+
NP[disease_NN] VP[induce_HVB by_IN] NP[injection_HNN of_IN] NP[drug_NN]
|
7
|
+
NP[drug_NN] VP[associate_HVB with_IN] NP[risk_HNN of_IN] NP[disease_NN]
|
8
|
+
NP[disease_NN] VP[induce_HVB by_IN] NP[administration_HNN of_IN] NP[drug_NN]
|
9
|
+
NP[disease_NN] VP[be_HVB] NP[effect_HNN of_IN] NP[drug_NN]
|
10
|
+
NP[drug_NN] VP[increase_HVB] NP[risk_HNN of_IN] NP[disease_NN]
|
11
|
+
NP[disease_NN] NTG[follow_VBG] NP[treatment_HNN with_IN] NP[drug_NN]
|
12
|
+
NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN therapy_HNN]
|
13
|
+
NP[disease_NN] VP[associate_HVB with_IN] NP[use_HNN of_IN] NP[drug_NN]
|
14
|
+
NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN use_HNN]
|
15
|
+
NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN treatment_HNN]
|
16
|
+
NP[disease_NN while_IN] VP[receive_HVB] NP[drug_NN]
|
17
|
+
NP[disease_NN] NTG[follow_VBG] NP[drug_NN therapy_HNN]
|
18
|
+
NP[disease_NN after_IN] VP[receive_HVB] NP[drug_NN]
|
19
|
+
NP[disease_NN] NTG[follow_VBG] NP[drug_NN administration_HNN]
|
20
|
+
NP[disease_NN due_(?:IN|JJ) to_TO] NP[drug_NN therapy_HNN]
|
21
|
+
NP[disease_NN] VP[follow_HVB] NP[treatment_HNN with_IN] NP[drug_NN]
|
22
|
+
NP[disease_NN] VP[follow_HVB] NP[drug_NN administration_HNN]
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# Cue "equal": the name itself, unchanged, as a one-element list.
equal { |name| [name] }
|
2
|
+
# Cue "standard": lowercase the name, split it on whitespace, sort the pieces
# and glue them back together with no separator, returned as a one-element list.
standard do |name|
  sorted_words = name.downcase.split(/\s+/).sort
  [sorted_words.join("")]
end
|
3
|
+
# Cue "cleaned": lowercase the name, cut everything from the first comma on,
# remove the span from the first "(" to the last ")", then strip every "s"
# that is directly followed by a non-word character.
# The last pattern used to read `(?:=\W)`, which is a non-capturing group that
# only matches a literal "=" plus a non-word character; `(?=\W)` is the
# lookahead form, which removes the "s" and leaves the next character alone.
# NOTE(review): the lookahead is the presumed intent -- confirm. An "s" at the
# very end of the name is still kept, since nothing follows it.
cleaned do |w| [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'').gsub(/s(?=\W)/,'')] end
|
4
|
+
# Cue "special": the whitespace-separated words of the name for which
# `is_special?` is true, each lowercased and with a trailing "p" removed.
# The unused local `s` is gone and the inner block parameters no longer reuse
# the outer name `w`; the returned list is the same.
special do |w| w.split.select{|word| word.is_special?}.collect{|word| word.downcase.sub(/p$/,'')} end
|
5
|
+
words do |w|
|
6
|
+
w.sub(/(.*)I$/,'\1I \1').
|
7
|
+
scan(/[a-z][a-z]+/i).
|
8
|
+
sort{|a,b| b.length <=> a.length}.
|
9
|
+
collect{|n| n.downcase}
|
10
|
+
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'rbbt/util/misc'
|
2
|
+
|
3
|
+
|
4
|
+
# Token transformation: drop one "s" found at the end of the token (end of line).
plural = Proc.new { |token| token.sub(/s$/, '') }
|
5
|
+
|
6
|
+
tokens do
|
7
|
+
|
8
|
+
# Some (possible) single letters first
|
9
|
+
receptor /^(?:receptor|r)s?$/i
|
10
|
+
protein /^(?:protein|p)s?$/i
|
11
|
+
roman /^[IV]+$/
|
12
|
+
greek_letter do |w| $inverse_greek[w.downcase] != nil end
|
13
|
+
|
14
|
+
|
15
|
+
# Some words for removal
|
16
|
+
stopword do |w| $stopwords.include?( w.downcase_first) end
|
17
|
+
gene /genes?/i
|
18
|
+
dna
|
19
|
+
cdna
|
20
|
+
rna
|
21
|
+
mrna
|
22
|
+
trna
|
23
|
+
cdna
|
24
|
+
component
|
25
|
+
exon
|
26
|
+
intron
|
27
|
+
domain
|
28
|
+
family
|
29
|
+
|
30
|
+
|
31
|
+
# Important words
|
32
|
+
number /^(?:\d+[.,]?\d+|\d)$/
|
33
|
+
greek do |w| $greek[w.downcase] != nil end
|
34
|
+
special do |w| w.is_special? end
|
35
|
+
promoter
|
36
|
+
similar /^(homolog.*|like|related|associated)$/
|
37
|
+
ase /ase$/
|
38
|
+
in_end /in$/
|
39
|
+
end
|
40
|
+
|
41
|
+
comparisons do
|
42
|
+
|
43
|
+
compare.number do |l1,l2|
|
44
|
+
v = 0
|
45
|
+
case
|
46
|
+
when l1.empty? && l2.empty?
|
47
|
+
v = 0
|
48
|
+
when l1.sort.uniq == l2.sort.uniq
|
49
|
+
v = 3
|
50
|
+
when l1.any? && l1[0] == l2[0]
|
51
|
+
v = -3
|
52
|
+
when l1.empty? && l2 == ['1']
|
53
|
+
v = -5
|
54
|
+
else
|
55
|
+
v = -10
|
56
|
+
end
|
57
|
+
v
|
58
|
+
end
|
59
|
+
|
60
|
+
diff.promoter -10
|
61
|
+
diff.receptor -10
|
62
|
+
diff.similar -10
|
63
|
+
diff.capital -10
|
64
|
+
|
65
|
+
same.unknown 1
|
66
|
+
miss.unknown -2
|
67
|
+
extr.unknown -2
|
68
|
+
|
69
|
+
same.greek 1
|
70
|
+
miss.greek -2
|
71
|
+
extr.greek -2
|
72
|
+
|
73
|
+
same.special 4
|
74
|
+
miss.special -3
|
75
|
+
extr.special -3
|
76
|
+
|
77
|
+
transform.receptor plural
|
78
|
+
transform.protein plural
|
79
|
+
|
80
|
+
transform.roman do |t| [t.arabic, :number] end
|
81
|
+
transform.greek_letter do |t| [$inverse_greek[t.downcase], :greek] end
|
82
|
+
transform.ase do |t| [t, :special] end
|
83
|
+
transform.in_end do |t| [t, :special] end
|
84
|
+
transform.unknown do |t| [t, (t.length < 4 ? :special : :unknown)] end
|
85
|
+
end
|
86
|
+
|
File without changes
|
data/test/rbbt/bow/test_bow.rb
CHANGED
data/test/rbbt/bow/test_misc.rb
CHANGED
@@ -0,0 +1,99 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
|
2
|
+
require 'rbbt/corpus/corpus'
|
3
|
+
require 'rbbt/corpus/sources/pubmed'
|
4
|
+
|
5
|
+
class Document
|
6
|
+
define :sentences do
|
7
|
+
require 'rbbt/nlp/nlp'
|
8
|
+
NLP.geniass_sentence_splitter(text)
|
9
|
+
end
|
10
|
+
|
11
|
+
define :genes do
|
12
|
+
require 'rbbt/ner/abner'
|
13
|
+
Abner.new.entities(text)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
class TestCorpus < Test::Unit::TestCase
|
19
|
+
|
20
|
+
def test_add_document
|
21
|
+
pmid = "19458159"
|
22
|
+
|
23
|
+
text = PubMed.get_article(pmid).text
|
24
|
+
|
25
|
+
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
26
|
+
|
27
|
+
assert corpus.find(:pubmed, pmid).empty?
|
28
|
+
|
29
|
+
corpus.add_document(text, :pubmed, pmid, :abstract)
|
30
|
+
|
31
|
+
assert corpus.find(:pubmed, pmid).any?
|
32
|
+
assert corpus.find(:pubmed, pmid, :fulltext).empty?
|
33
|
+
assert corpus.find(:pubmed, pmid, :abstract).any?
|
34
|
+
|
35
|
+
assert corpus.find(:pubmed, pmid).first.text =~ /SENT/
|
36
|
+
end
|
37
|
+
|
38
|
+
def test_add_pmid
|
39
|
+
pmid = "19465387"
|
40
|
+
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
41
|
+
corpus.add_pmid(pmid, :abstract)
|
42
|
+
|
43
|
+
assert corpus.exists? :pubmed, pmid
|
44
|
+
assert corpus.exists? :pubmed, pmid, :abstract
|
45
|
+
assert_equal false, corpus.exists?(:pubmed, pmid, :fulltext)
|
46
|
+
end
|
47
|
+
|
48
|
+
def test_find_all
|
49
|
+
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
50
|
+
corpus.add_pmid("19458159", :abstract)
|
51
|
+
corpus.add_pmid("19465387", :abstract)
|
52
|
+
|
53
|
+
all = corpus.find
|
54
|
+
|
55
|
+
assert_equal 2, all.length
|
56
|
+
assert all.select{|document| document.id == "19458159"}.any?
|
57
|
+
assert all.select{|document| document.id == "19465387"}.any?
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_doc_sentences
|
61
|
+
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
62
|
+
corpus.add_pmid("19458159", :abstract)
|
63
|
+
|
64
|
+
sentences = corpus.find.collect{|doc| doc.sentences}.flatten
|
65
|
+
|
66
|
+
assert corpus.find.first.sentences.length > 0
|
67
|
+
assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
|
68
|
+
|
69
|
+
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
70
|
+
assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
|
71
|
+
end
|
72
|
+
|
73
|
+
def test_doc_genes
|
74
|
+
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
75
|
+
corpus.add_pmid("21611789", :abstract)
|
76
|
+
|
77
|
+
assert corpus.find(:pubmed, "21611789").first.genes.include? "CDKAL1"
|
78
|
+
end
|
79
|
+
|
80
|
+
def test_genes
|
81
|
+
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
82
|
+
corpus.add_pmid("21611789", :abstract)
|
83
|
+
|
84
|
+
assert corpus.find.collect{|d| d.genes}.flatten.include? "CDKAL1"
|
85
|
+
end
|
86
|
+
|
87
|
+
def test_index
|
88
|
+
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
89
|
+
corpus.add_pmid("21611789", :abstract)
|
90
|
+
|
91
|
+
document = corpus.find(:pubmed, "21611789").first
|
92
|
+
|
93
|
+
genes = corpus.find.collect{|d| d.genes}.flatten.select{|gene| gene == "CDKAL1"}
|
94
|
+
|
95
|
+
assert genes.collect{|gene|
|
96
|
+
document.sentences_at(gene.offset)
|
97
|
+
}.flatten.length > 1
|
98
|
+
end
|
99
|
+
end
|
@@ -0,0 +1,222 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
|
2
|
+
require 'rbbt/corpus/document'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
$persistence = TSV.new({})
|
6
|
+
$tchash_persistence = TCHash.get(Rbbt.tmp.test.document.persistence.find(:user), true, Persistence::TSV::TSVSerializer)
|
7
|
+
$global_persistence = TSV.new({}, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
|
8
|
+
$tchash_global_persistence = TSV.new(TCHash.get(Rbbt.tmp.test.global.persistence.find(:user), true, Persistence::TSV::StringArraySerializer), :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
|
9
|
+
|
10
|
+
class Document
|
11
|
+
define :sentences do
|
12
|
+
require 'rbbt/nlp/nlp'
|
13
|
+
NLP.geniass_sentence_splitter(text)
|
14
|
+
end
|
15
|
+
|
16
|
+
define :tokens do
|
17
|
+
require 'rbbt/ner/annotations/token'
|
18
|
+
Token.tokenize(text)
|
19
|
+
end
|
20
|
+
|
21
|
+
define :long_words do
|
22
|
+
require 'rbbt/ner/annotations/token'
|
23
|
+
Token.tokenize(text).select{|tok| tok.length > 5}
|
24
|
+
end
|
25
|
+
|
26
|
+
define :short_words do
|
27
|
+
require 'rbbt/ner/annotations/token'
|
28
|
+
Token.tokenize(text).select{|tok| tok.length < 5}
|
29
|
+
end
|
30
|
+
|
31
|
+
define :even_words do
|
32
|
+
require 'rbbt/ner/annotations/token'
|
33
|
+
Token.tokenize(text).select{|tok| tok.length % 2 == 0}
|
34
|
+
end
|
35
|
+
|
36
|
+
define :missing do
|
37
|
+
[]
|
38
|
+
end
|
39
|
+
|
40
|
+
define :tokens_again do
|
41
|
+
raise "This should be here already"
|
42
|
+
end
|
43
|
+
|
44
|
+
persist :sentences
|
45
|
+
persist_in_tsv :tokens
|
46
|
+
persist_in_tsv :long_words, $tchash_persistence, :Literal
|
47
|
+
persist_in_global_tsv :short_words, $global_persistence
|
48
|
+
persist_in_global_tsv :even_words, $tchash_global_persistence
|
49
|
+
persist_in_global_tsv :missing, $tchash_global_persistence
|
50
|
+
end
|
51
|
+
|
52
|
+
class TestDocument < Test::Unit::TestCase
|
53
|
+
|
54
|
+
def test_annotations
|
55
|
+
|
56
|
+
text =<<-EOF
|
57
|
+
This is a
|
58
|
+
sentence. This is
|
59
|
+
another sentence.
|
60
|
+
EOF
|
61
|
+
|
62
|
+
doc = Document.new
|
63
|
+
doc.text = text
|
64
|
+
|
65
|
+
assert_equal 2, doc.sentences.length
|
66
|
+
assert_equal 10, doc.tokens.length
|
67
|
+
end
|
68
|
+
|
69
|
+
def test_annotation_load
|
70
|
+
text =<<-EOF
|
71
|
+
This is a
|
72
|
+
sentence. This is
|
73
|
+
another sentence.
|
74
|
+
EOF
|
75
|
+
|
76
|
+
doc = Document.new
|
77
|
+
doc.text = text * 10
|
78
|
+
|
79
|
+
sentence = doc.sentences.last
|
80
|
+
doc.load_into sentence, :tokens
|
81
|
+
assert_equal 5, sentence.tokens.length
|
82
|
+
assert_equal "another", sentence.tokens[2]
|
83
|
+
assert_equal sentence.offset + 0, sentence.tokens[0].offset
|
84
|
+
end
|
85
|
+
|
86
|
+
def test_annotation_persistence
|
87
|
+
text =<<-EOF
|
88
|
+
This is a
|
89
|
+
sentence. This is
|
90
|
+
another sentence.
|
91
|
+
EOF
|
92
|
+
|
93
|
+
text *= 10
|
94
|
+
|
95
|
+
TmpFile.with_file do |dir|
|
96
|
+
FileUtils.mkdir_p dir
|
97
|
+
|
98
|
+
doc = Document.new(dir)
|
99
|
+
doc.text = text
|
100
|
+
doc.sentences
|
101
|
+
|
102
|
+
doc = Document.new(dir)
|
103
|
+
doc.text = text
|
104
|
+
|
105
|
+
sentence = doc.sentences.last
|
106
|
+
doc.load_into sentence, :tokens
|
107
|
+
|
108
|
+
assert_equal 5, sentence.tokens.length
|
109
|
+
assert_equal "another", sentence.tokens[2]
|
110
|
+
assert_equal sentence.offset + 0, sentence.tokens[0].offset
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
def test_range_persistence
|
115
|
+
text =<<-EOF
|
116
|
+
This is a
|
117
|
+
sentence. This is
|
118
|
+
another sentence.
|
119
|
+
EOF
|
120
|
+
|
121
|
+
text *= 10
|
122
|
+
|
123
|
+
TmpFile.with_file do |dir|
|
124
|
+
FileUtils.mkdir_p dir
|
125
|
+
|
126
|
+
doc = Document.new(dir)
|
127
|
+
doc.text = text
|
128
|
+
|
129
|
+
sentence = doc.sentences.last
|
130
|
+
Misc.benchmark(10) do
|
131
|
+
doc = Document.new(dir)
|
132
|
+
doc.text = text
|
133
|
+
|
134
|
+
doc.load_into sentence, :tokens, :persist => true
|
135
|
+
assert_equal 5, sentence.tokens.length
|
136
|
+
assert_equal "another", sentence.tokens[2]
|
137
|
+
assert_equal sentence.offset + 0, sentence.tokens[0].offset
|
138
|
+
assert_equal sentence.offset + 5, sentence.tokens[1].offset
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
def test_annotation_persistence_in_tsv
|
144
|
+
text =<<-EOF
|
145
|
+
This is a
|
146
|
+
sentence. This is
|
147
|
+
another sentence.
|
148
|
+
EOF
|
149
|
+
|
150
|
+
TmpFile.with_file do |dir|
|
151
|
+
FileUtils.mkdir_p dir
|
152
|
+
|
153
|
+
|
154
|
+
doc = Document.new(dir)
|
155
|
+
doc.text = text * 10
|
156
|
+
doc.sentences
|
157
|
+
|
158
|
+
doc = Document.new(dir)
|
159
|
+
doc.text = text * 10
|
160
|
+
|
161
|
+
sentence = doc.sentences.last
|
162
|
+
|
163
|
+
doc.load_into sentence, :tokens, :long_words
|
164
|
+
|
165
|
+
assert_equal 5, sentence.tokens.length
|
166
|
+
assert_equal "another", sentence.tokens[2]
|
167
|
+
assert_equal sentence.offset + 0, sentence.tokens[0].offset
|
168
|
+
|
169
|
+
assert_equal 2, sentence.long_words.length
|
170
|
+
assert_equal %w(another sentence), sentence.long_words
|
171
|
+
assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
175
|
+
def test_annotation_persistence_in_global
|
176
|
+
text =<<-EOF
|
177
|
+
This is a
|
178
|
+
sentence. This is
|
179
|
+
another sentence.
|
180
|
+
EOF
|
181
|
+
|
182
|
+
TmpFile.with_file do |dir|
|
183
|
+
FileUtils.mkdir_p dir
|
184
|
+
|
185
|
+
|
186
|
+
doc = Document.new(dir)
|
187
|
+
doc.text = text * 10
|
188
|
+
doc.docid = "FOOF"
|
189
|
+
doc.short_words
|
190
|
+
doc.sentences
|
191
|
+
|
192
|
+
doc = Document.new(dir)
|
193
|
+
doc.text = text * 10
|
194
|
+
doc.docid = "FOOF"
|
195
|
+
|
196
|
+
sentence = doc.sentences.last
|
197
|
+
|
198
|
+
doc.load_into sentence, :tokens, :long_words, :short_words, :even_words
|
199
|
+
|
200
|
+
assert_equal 3, sentence.short_words.length
|
201
|
+
assert_equal 3, sentence.even_words.length
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
205
|
+
def test_dump
|
206
|
+
text =<<-EOF
|
207
|
+
This is a
|
208
|
+
sentence. This is
|
209
|
+
another sentence.
|
210
|
+
EOF
|
211
|
+
|
212
|
+
TmpFile.with_file do |dir|
|
213
|
+
FileUtils.mkdir_p dir
|
214
|
+
|
215
|
+
doc = Document.new(dir)
|
216
|
+
doc.text = text * 10
|
217
|
+
tsv = Document.tsv(doc.sentences, ["Literal"])
|
218
|
+
end
|
219
|
+
end
|
220
|
+
end
|
221
|
+
|
222
|
+
|