rbbt-text 0.2.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/get_ppis.rb +52 -0
- data/lib/rbbt/bow/dictionary.rb +9 -9
- data/lib/rbbt/bow/misc.rb +86 -2
- data/lib/rbbt/corpus/corpus.rb +55 -0
- data/lib/rbbt/corpus/document.rb +289 -0
- data/lib/rbbt/corpus/document_repo.rb +115 -0
- data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
- data/lib/rbbt/ner/NER.rb +7 -5
- data/lib/rbbt/ner/abner.rb +13 -2
- data/lib/rbbt/ner/annotations.rb +182 -51
- data/lib/rbbt/ner/annotations/annotated.rb +15 -0
- data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
- data/lib/rbbt/ner/annotations/relations.rb +25 -0
- data/lib/rbbt/ner/annotations/token.rb +28 -0
- data/lib/rbbt/ner/annotations/transformed.rb +170 -0
- data/lib/rbbt/ner/banner.rb +8 -5
- data/lib/rbbt/ner/chemical_tagger.rb +34 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +41 -0
- data/lib/rbbt/ner/patterns.rb +132 -0
- data/lib/rbbt/ner/rnorm.rb +141 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
- data/lib/rbbt/ner/token_trieNER.rb +185 -51
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
- data/lib/rbbt/nlp/nlp.rb +235 -0
- data/share/install/software/ABNER +0 -4
- data/share/install/software/ChemicalTagger +81 -0
- data/share/install/software/Gdep +115 -0
- data/share/install/software/Geniass +118 -0
- data/share/install/software/OSCAR4 +16 -0
- data/share/install/software/StanfordParser +15 -0
- data/share/patterns/drug_induce_disease +22 -0
- data/share/rnorm/cue_default +10 -0
- data/share/rnorm/tokens_default +86 -0
- data/share/{stopwords → wordlists/stopwords} +0 -0
- data/test/rbbt/bow/test_bow.rb +1 -1
- data/test/rbbt/bow/test_dictionary.rb +1 -1
- data/test/rbbt/bow/test_misc.rb +1 -1
- data/test/rbbt/corpus/test_corpus.rb +99 -0
- data/test/rbbt/corpus/test_document.rb +222 -0
- data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
- data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
- data/test/rbbt/ner/test_abner.rb +1 -1
- data/test/rbbt/ner/test_annotations.rb +64 -2
- data/test/rbbt/ner/test_banner.rb +1 -1
- data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
- data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
- data/test/rbbt/ner/test_patterns.rb +66 -0
- data/test/rbbt/ner/test_regexpNER.rb +1 -1
- data/test/rbbt/ner/test_rnorm.rb +47 -0
- data/test/rbbt/ner/test_token_trieNER.rb +60 -35
- data/test/rbbt/nlp/test_nlp.rb +88 -0
- data/test/test_helper.rb +20 -0
- metadata +93 -20
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/bin/bash
# Downloads the OSCAR4 "with dependencies" jar and links it into the shared
# jar directory.
#
# Arguments:
#   $1  path of the install helper script, which is sourced below
#   $2  software directory (kept in RBBT_SOFTWARE_DIR)
#
# NOTE(review): opt_dir and OPT_JAR_DIR are not defined in this script;
# presumably the sourced helper provides them -- confirm against that file.

INSTALL_HELPER_FILE="$1"
RBBT_SOFTWARE_DIR="$2"
source "$INSTALL_HELPER_FILE"

name='OSCAR4'
url="http://maven.ch.cam.ac.uk/m2repo/uk/ac/cam/ch/wwmm/oscar/oscar4-all/4.0.1/oscar4-all-4.0.1-with-dependencies.jar"

PKG_DIR="$(opt_dir "$name")"

# mkdir -p is a no-op when the directory already exists, so no -d test is needed.
mkdir -p "$PKG_DIR"

# wget -O creates the output file even when the transfer fails; remove the
# partial file and stop instead of linking a broken jar.
if ! wget "$url" -O "$PKG_DIR/OSCAR4.jar"; then
    rm -f "$PKG_DIR/OSCAR4.jar"
    echo "Could not download $url" >&2
    exit 1
fi

ln -sf "$PKG_DIR/OSCAR4.jar" "$OPT_JAR_DIR/OSCAR4.jar"
|
15
|
+
|
16
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/bin/bash
# Fetches the Stanford Parser sources and installs stanford-parser.jar, then
# links the jar into the shared jar directory.
#
# Arguments:
#   $1  path of the install helper script, which is sourced below
#   $2  software directory (kept in RBBT_SOFTWARE_DIR)
#
# NOTE(review): get_src, OPT_DIR, OPT_BUILD_DIR and OPT_JAR_DIR are not
# defined in this script; presumably the sourced helper provides them --
# confirm against that file.

INSTALL_HELPER_FILE="$1"
RBBT_SOFTWARE_DIR="$2"
source "$INSTALL_HELPER_FILE"

name='StanfordParser'
url="http://nlp.stanford.edu/downloads/stanford-parser-2011-04-20.tgz"

get_src "$name" "$url"

# -p and -f keep the script re-runnable: a plain mkdir / ln -s fails when the
# directory or the link is already there from a previous install.
mkdir -p "$OPT_DIR/$name"
cp "$OPT_BUILD_DIR/stanford-parser.jar" "$OPT_DIR/$name"
ln -sf "$OPT_DIR/$name/stanford-parser.jar" "$OPT_JAR_DIR/stanford-parser.jar"
|
15
|
+
|
@@ -0,0 +1,22 @@
|
|
1
|
+
NP[disease_NN] VP[induce_HVB by_IN] NP[drug_NN]
|
2
|
+
NP[drug_NN] VP[induce_HVB] NP[disease_NN]
|
3
|
+
NP[drug_NN] VP[cause_HVB] NP[disease_NN]
|
4
|
+
NP[disease_NN] VP[cause_HVB by_IN] NP[drug_NN]
|
5
|
+
NP[disease_NN] VP[produce_HVB by_IN] NP[drug_NN]
|
6
|
+
NP[disease_NN] VP[induce_HVB by_IN] NP[injection_HNN of_IN] NP[drug_NN]
|
7
|
+
NP[drug_NN] VP[associate_HVB with_IN] NP[risk_HNN of_IN] NP[disease_NN]
|
8
|
+
NP[disease_NN] VP[induce_HVB by_IN] NP[administration_HNN of_IN] NP[drug_NN]
|
9
|
+
NP[disease_NN] VP[be_HVB] NP[effect_HNN of_IN] NP[drug_NN]
|
10
|
+
NP[drug_NN] VP[increase_HVB] NP[risk_HNN of_IN] NP[disease_NN]
|
11
|
+
NP[disease_NN] NTG[follow_VBG] NP[treatment_HNN with_IN] NP[drug_NN]
|
12
|
+
NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN therapy_HNN]
|
13
|
+
NP[disease_NN] VP[associate_HVB with_IN] NP[use_HNN of_IN] NP[drug_NN]
|
14
|
+
NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN use_HNN]
|
15
|
+
NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN treatment_HNN]
|
16
|
+
NP[disease_NN while_IN] VP[receive_HVB] NP[drug_NN]
|
17
|
+
NP[disease_NN] NTG[follow_VBG] NP[drug_NN therapy_HNN]
|
18
|
+
NP[disease_NN after_IN] VP[receive_HVB] NP[drug_NN]
|
19
|
+
NP[disease_NN] NTG[follow_VBG] NP[drug_NN administration_HNN]
|
20
|
+
NP[disease_NN due_(?:IN|JJ) to_TO] NP[drug_NN therapy_HNN]
|
21
|
+
NP[disease_NN] VP[follow_HVB] NP[treatment_HNN with_IN] NP[drug_NN]
|
22
|
+
NP[disease_NN] VP[follow_HVB] NP[drug_NN administration_HNN]
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# Cue definitions. Every entry pairs a cue name with a block that receives a
# name string and returns the list of keys derived from it.
# NOTE(review): the methods equal/standard/cleaned/special/words and
# String#is_special? are not defined in this file; they are presumably
# supplied by whatever evaluates it -- confirm there.

# The name exactly as given.
equal do |w| [w] end

# Lower-cased, split on whitespace, words sorted and glued together.
standard do |w| [w.downcase.split(/\s+/).sort.join("")] end

# Lower-cased, without everything from the first comma onwards, without a
# parenthesised part, and without any "s" that is followed by a non-word
# character. The last pattern uses a lookahead, (?=\W); the previous form,
# (?:=\W), was a plain group that only matched an "s" followed by a literal "=".
cleaned do |w| [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'').gsub(/s(?=\W)/,'')] end

# The words flagged by is_special?, lower-cased and with a final "p" removed.
special do |w| w.split.select{|word| word.is_special?}.collect{|word| word.downcase.sub(/p$/,'')} end

# All runs of two or more letters, longest first, lower-cased. A name ending
# in "I" is first expanded to "<name> <name without the I>" so that both
# spellings are scanned.
words do |w|
  w.sub(/(.*)I$/,'\1I \1').
    scan(/[a-z][a-z]+/i).
    sort{|a,b| b.length <=> a.length}.
    collect{|n| n.downcase}
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'rbbt/util/misc'
|
2
|
+
|
3
|
+
|
4
|
+
# Removes a single trailing "s" from a token.
plural = Proc.new do |t| t.sub(/s$/,'') end

# Token classes. An entry names a class and supplies a pattern, a block
# predicate, or nothing at all.
# NOTE(review): how bare names such as `dna` are matched, and whether the
# order of declaration decides which class a token gets, is up to the DSL that
# evaluates this file and is not visible here -- confirm there.
# $inverse_greek, $greek, $stopwords, String#downcase_first and
# String#is_special? are likewise defined elsewhere.
tokens do

  # Some (possible) single letters first
  receptor /^(?:receptor|r)s?$/i
  protein /^(?:protein|p)s?$/i
  roman /^[IV]+$/
  greek_letter do |w| $inverse_greek[w.downcase] != nil end


  # Some words for removal
  stopword do |w| $stopwords.include?( w.downcase_first) end
  # NOTE(review): this pattern is unanchored, so it matches any token that
  # merely contains "gene" -- confirm that is intended.
  gene /genes?/i
  dna
  cdna
  rna
  mrna
  trna
  # (a second, redundant `cdna` declaration used to follow here)
  component
  exon
  intron
  domain
  family


  # Important words
  number /^(?:\d+[.,]?\d+|\d)$/
  greek do |w| $greek[w.downcase] != nil end
  special do |w| w.is_special? end
  promoter
  similar /^(homolog.*|like|related|associated)$/
  ase /ase$/
  in_end /in$/
end

# Scoring rules.
# NOTE(review): the meaning of same/miss/extr/diff/transform is defined by the
# evaluating DSL; the comments below only describe what this file passes in.
comparisons do

  # Score for the two lists of number tokens l1 and l2; the first matching
  # condition wins.
  compare.number do |l1,l2|
    case
    when l1.empty? && l2.empty? then 0              # no numbers on either side
    when l1.sort.uniq == l2.sort.uniq then 3        # same set of numbers
    when l1.any? && l1[0] == l2[0] then -3          # only the first number agrees
    when l1.empty? && l2 == ['1'] then -5           # nothing against a lone "1"
    else -10
    end
  end

  diff.promoter -10
  diff.receptor -10
  diff.similar -10
  diff.capital -10

  same.unknown 1
  miss.unknown -2
  extr.unknown -2

  same.greek 1
  miss.greek -2
  extr.greek -2

  same.special 4
  miss.special -3
  extr.special -3

  # Plural forms are reduced with the proc defined at the top.
  transform.receptor plural
  transform.protein plural

  # These blocks return [value, class] pairs.
  transform.roman do |t| [t.arabic, :number] end
  transform.greek_letter do |t| [$inverse_greek[t.downcase], :greek] end
  transform.ase do |t| [t, :special] end
  transform.in_end do |t| [t, :special] end
  # Unknown tokens shorter than four characters are reported as :special.
  transform.unknown do |t| [t, (t.length < 4 ? :special : :unknown)] end
end
|
86
|
+
|
File without changes
|
data/test/rbbt/bow/test_bow.rb
CHANGED
data/test/rbbt/bow/test_misc.rb
CHANGED
@@ -0,0 +1,99 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
|
2
|
+
require 'rbbt/corpus/corpus'
|
3
|
+
require 'rbbt/corpus/sources/pubmed'
|
4
|
+
|
5
|
+
# Adds the two annotation types used by the corpus tests to Document.
# Each library is required inside the block that uses it.
class Document
  # Sentences of the document text, from the Geniass sentence splitter.
  define(:sentences) {
    require 'rbbt/nlp/nlp'
    NLP.geniass_sentence_splitter(text)
  }

  # Entities that Abner finds in the document text.
  define(:genes) {
    require 'rbbt/ner/abner'
    tagger = Abner.new
    tagger.entities(text)
  }
end
|
16
|
+
|
17
|
+
|
18
|
+
# Exercises Corpus: adding documents, looking them up, and reading the
# annotations defined on Document above. Every test opens the corpus stored
# under Rbbt.tmp.test.Corpus.
class TestCorpus < Test::Unit::TestCase

  # A document added by hand is found by namespace/id and by type.
  def test_add_document
    pmid = "19458159"
    abstract = PubMed.get_article(pmid).text
    corpus = open_corpus

    assert(corpus.find(:pubmed, pmid).empty?)

    corpus.add_document(abstract, :pubmed, pmid, :abstract)

    assert(corpus.find(:pubmed, pmid).any?)
    assert(corpus.find(:pubmed, pmid, :fulltext).empty?)
    assert(corpus.find(:pubmed, pmid, :abstract).any?)

    assert(corpus.find(:pubmed, pmid).first.text =~ /SENT/)
  end

  # add_pmid registers the abstract and nothing else.
  def test_add_pmid
    pmid = "19465387"
    corpus = open_corpus
    corpus.add_pmid(pmid, :abstract)

    assert(corpus.exists?(:pubmed, pmid))
    assert(corpus.exists?(:pubmed, pmid, :abstract))
    assert_equal(false, corpus.exists?(:pubmed, pmid, :fulltext))
  end

  # find without arguments returns every document.
  def test_find_all
    corpus = open_corpus
    ["19458159", "19465387"].each do |pmid|
      corpus.add_pmid(pmid, :abstract)
    end

    documents = corpus.find

    assert_equal(2, documents.length)
    assert(documents.any? { |document| document.id == "19458159" })
    assert(documents.any? { |document| document.id == "19465387" })
  end

  # Sentences are available right away and again from a freshly opened corpus.
  def test_doc_sentences
    expected_start = /Semantic features in Text/i

    corpus = open_corpus
    corpus.add_pmid("19458159", :abstract)

    # Result is not used; the call is kept so every document's sentences are
    # computed before the assertions, as before.
    corpus.find.map { |doc| doc.sentences }.flatten

    assert(corpus.find.first.sentences.length > 0)
    assert(corpus.find.first.sentences.sort_by { |s| s.offset }.first =~ expected_start)

    corpus = open_corpus
    assert(corpus.find.first.sentences.sort_by { |s| s.offset }.first =~ expected_start)
  end

  # Genes of a single document looked up by id.
  def test_doc_genes
    corpus = open_corpus
    corpus.add_pmid("21611789", :abstract)

    document = corpus.find(:pubmed, "21611789").first
    assert(document.genes.include?("CDKAL1"))
  end

  # Genes collected over the whole corpus.
  def test_genes
    corpus = open_corpus
    corpus.add_pmid("21611789", :abstract)

    all_genes = corpus.find.map { |d| d.genes }.flatten
    assert(all_genes.include?("CDKAL1"))
  end

  # sentences_at maps gene offsets back to sentences.
  def test_index
    corpus = open_corpus
    corpus.add_pmid("21611789", :abstract)

    document = corpus.find(:pubmed, "21611789").first

    mentions = corpus.find.map { |d| d.genes }.flatten.select { |gene| gene == "CDKAL1" }
    sentences = mentions.map { |gene| document.sentences_at(gene.offset) }.flatten

    assert(sentences.length > 1)
  end

  private

  # Opens the corpus kept in the per-user test directory.
  def open_corpus
    Corpus.new(Rbbt.tmp.test.Corpus.find :user)
  end
end
|
@@ -0,0 +1,222 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
|
2
|
+
require 'rbbt/corpus/document'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
$persistence = TSV.new({})
|
6
|
+
$tchash_persistence = TCHash.get(Rbbt.tmp.test.document.persistence.find(:user), true, Persistence::TSV::TSVSerializer)
|
7
|
+
$global_persistence = TSV.new({}, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
|
8
|
+
$tchash_global_persistence = TSV.new(TCHash.get(Rbbt.tmp.test.global.persistence.find(:user), true, Persistence::TSV::StringArraySerializer), :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
|
9
|
+
|
10
|
+
# Annotation types and persistence settings used by the document tests.
# The $... stores are the globals set up just before this class is reopened.
class Document
  # Sentences of the text, from the Geniass sentence splitter.
  define(:sentences) {
    require 'rbbt/nlp/nlp'
    NLP.geniass_sentence_splitter(text)
  }

  # Every token of the text.
  define(:tokens) {
    require 'rbbt/ner/annotations/token'
    Token.tokenize(text)
  }

  # Tokens longer than five characters.
  define(:long_words) {
    require 'rbbt/ner/annotations/token'
    all_tokens = Token.tokenize(text)
    all_tokens.select { |tok| tok.length > 5 }
  }

  # Tokens shorter than five characters.
  define(:short_words) {
    require 'rbbt/ner/annotations/token'
    all_tokens = Token.tokenize(text)
    all_tokens.select { |tok| tok.length < 5 }
  }

  # Tokens with an even number of characters.
  define(:even_words) {
    require 'rbbt/ner/annotations/token'
    all_tokens = Token.tokenize(text)
    all_tokens.select { |tok| tok.length % 2 == 0 }
  }

  # An annotation type that never yields anything.
  define(:missing) {
    []
  }

  # Raises whenever its block is evaluated.
  define(:tokens_again) {
    raise "This should be here already"
  }

  # One persistence setting per annotation type; tokens_again gets none.
  persist(:sentences)
  persist_in_tsv(:tokens)
  persist_in_tsv(:long_words, $tchash_persistence, :Literal)
  persist_in_global_tsv(:short_words, $global_persistence)
  persist_in_global_tsv(:even_words, $tchash_global_persistence)
  persist_in_global_tsv(:missing, $tchash_global_persistence)
end
|
51
|
+
|
52
|
+
class TestDocument < Test::Unit::TestCase
|
53
|
+
|
54
|
+
def test_annotations
|
55
|
+
|
56
|
+
text =<<-EOF
|
57
|
+
This is a
|
58
|
+
sentence. This is
|
59
|
+
another sentence.
|
60
|
+
EOF
|
61
|
+
|
62
|
+
doc = Document.new
|
63
|
+
doc.text = text
|
64
|
+
|
65
|
+
assert_equal 2, doc.sentences.length
|
66
|
+
assert_equal 10, doc.tokens.length
|
67
|
+
end
|
68
|
+
|
69
|
+
def test_annotation_load
|
70
|
+
text =<<-EOF
|
71
|
+
This is a
|
72
|
+
sentence. This is
|
73
|
+
another sentence.
|
74
|
+
EOF
|
75
|
+
|
76
|
+
doc = Document.new
|
77
|
+
doc.text = text * 10
|
78
|
+
|
79
|
+
sentence = doc.sentences.last
|
80
|
+
doc.load_into sentence, :tokens
|
81
|
+
assert_equal 5, sentence.tokens.length
|
82
|
+
assert_equal "another", sentence.tokens[2]
|
83
|
+
assert_equal sentence.offset + 0, sentence.tokens[0].offset
|
84
|
+
end
|
85
|
+
|
86
|
+
def test_annotation_persistence
|
87
|
+
text =<<-EOF
|
88
|
+
This is a
|
89
|
+
sentence. This is
|
90
|
+
another sentence.
|
91
|
+
EOF
|
92
|
+
|
93
|
+
text *= 10
|
94
|
+
|
95
|
+
TmpFile.with_file do |dir|
|
96
|
+
FileUtils.mkdir_p dir
|
97
|
+
|
98
|
+
doc = Document.new(dir)
|
99
|
+
doc.text = text
|
100
|
+
doc.sentences
|
101
|
+
|
102
|
+
doc = Document.new(dir)
|
103
|
+
doc.text = text
|
104
|
+
|
105
|
+
sentence = doc.sentences.last
|
106
|
+
doc.load_into sentence, :tokens
|
107
|
+
|
108
|
+
assert_equal 5, sentence.tokens.length
|
109
|
+
assert_equal "another", sentence.tokens[2]
|
110
|
+
assert_equal sentence.offset + 0, sentence.tokens[0].offset
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
def test_range_persistence
|
115
|
+
text =<<-EOF
|
116
|
+
This is a
|
117
|
+
sentence. This is
|
118
|
+
another sentence.
|
119
|
+
EOF
|
120
|
+
|
121
|
+
text *= 10
|
122
|
+
|
123
|
+
TmpFile.with_file do |dir|
|
124
|
+
FileUtils.mkdir_p dir
|
125
|
+
|
126
|
+
doc = Document.new(dir)
|
127
|
+
doc.text = text
|
128
|
+
|
129
|
+
sentence = doc.sentences.last
|
130
|
+
Misc.benchmark(10) do
|
131
|
+
doc = Document.new(dir)
|
132
|
+
doc.text = text
|
133
|
+
|
134
|
+
doc.load_into sentence, :tokens, :persist => true
|
135
|
+
assert_equal 5, sentence.tokens.length
|
136
|
+
assert_equal "another", sentence.tokens[2]
|
137
|
+
assert_equal sentence.offset + 0, sentence.tokens[0].offset
|
138
|
+
assert_equal sentence.offset + 5, sentence.tokens[1].offset
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
def test_annotation_persistence_in_tsv
|
144
|
+
text =<<-EOF
|
145
|
+
This is a
|
146
|
+
sentence. This is
|
147
|
+
another sentence.
|
148
|
+
EOF
|
149
|
+
|
150
|
+
TmpFile.with_file do |dir|
|
151
|
+
FileUtils.mkdir_p dir
|
152
|
+
|
153
|
+
|
154
|
+
doc = Document.new(dir)
|
155
|
+
doc.text = text * 10
|
156
|
+
doc.sentences
|
157
|
+
|
158
|
+
doc = Document.new(dir)
|
159
|
+
doc.text = text * 10
|
160
|
+
|
161
|
+
sentence = doc.sentences.last
|
162
|
+
|
163
|
+
doc.load_into sentence, :tokens, :long_words
|
164
|
+
|
165
|
+
assert_equal 5, sentence.tokens.length
|
166
|
+
assert_equal "another", sentence.tokens[2]
|
167
|
+
assert_equal sentence.offset + 0, sentence.tokens[0].offset
|
168
|
+
|
169
|
+
assert_equal 2, sentence.long_words.length
|
170
|
+
assert_equal %w(another sentence), sentence.long_words
|
171
|
+
assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
175
|
+
def test_annotation_persistence_in_global
|
176
|
+
text =<<-EOF
|
177
|
+
This is a
|
178
|
+
sentence. This is
|
179
|
+
another sentence.
|
180
|
+
EOF
|
181
|
+
|
182
|
+
TmpFile.with_file do |dir|
|
183
|
+
FileUtils.mkdir_p dir
|
184
|
+
|
185
|
+
|
186
|
+
doc = Document.new(dir)
|
187
|
+
doc.text = text * 10
|
188
|
+
doc.docid = "FOOF"
|
189
|
+
doc.short_words
|
190
|
+
doc.sentences
|
191
|
+
|
192
|
+
doc = Document.new(dir)
|
193
|
+
doc.text = text * 10
|
194
|
+
doc.docid = "FOOF"
|
195
|
+
|
196
|
+
sentence = doc.sentences.last
|
197
|
+
|
198
|
+
doc.load_into sentence, :tokens, :long_words, :short_words, :even_words
|
199
|
+
|
200
|
+
assert_equal 3, sentence.short_words.length
|
201
|
+
assert_equal 3, sentence.even_words.length
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
205
|
+
def test_dump
|
206
|
+
text =<<-EOF
|
207
|
+
This is a
|
208
|
+
sentence. This is
|
209
|
+
another sentence.
|
210
|
+
EOF
|
211
|
+
|
212
|
+
TmpFile.with_file do |dir|
|
213
|
+
FileUtils.mkdir_p dir
|
214
|
+
|
215
|
+
doc = Document.new(dir)
|
216
|
+
doc.text = text * 10
|
217
|
+
tsv = Document.tsv(doc.sentences, ["Literal"])
|
218
|
+
end
|
219
|
+
end
|
220
|
+
end
|
221
|
+
|
222
|
+
|