rbbt-text 1.2.0 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/document.rb +46 -0
- data/lib/rbbt/document/annotation.rb +42 -0
- data/lib/rbbt/document/corpus.rb +38 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +19 -2
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +6 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +6 -6
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/segment.rb +177 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +40 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +43 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +187 -0
- data/test/test_helper.rb +5 -3
- metadata +40 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c2a24d8e7faf30d53e41a00a27f6145e8e9f18f0c10af57cdddaea0ee18c35d6
|
4
|
+
data.tar.gz: 3475006965110391e35151cd1b5368028dacf467aa276f8eb68fce3320be1122
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: da40a039a4792eb5e7fa00270870279221c74dcbf51df1b5278b247496fefbfa888a87b7ab19f05676644c51a01177eb49e229cb0156fe7f0190dd4933d41e24
|
7
|
+
data.tar.gz: a32fca5f21a987dcbb6b5541015cc33879330e6f1ef7c4a28e75debe5bdd1dc8bf7b98bfc91d828e605f29868aa972b55cd59bb4f86e66d2fb0cfea31fac2ae0
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/entity'
|
3
|
+
require 'rbbt/document/annotation'
|
4
|
+
|
5
|
+
# Identifier of a document inside a corpus.
#
# The string itself is a colon-separated key whose first three fields are
# namespace, code and type (see #document). It carries a :corpus annotation
# naming the corpus it should be resolved against.
module DocID
  extend Entity
  self.annotation :corpus

  class << self
    # Corpus used by any DocID that has no :corpus annotation of its own.
    attr_accessor :default_corpus
  end

  # Returns the corpus annotated on this id, falling back to
  # DocID.default_corpus when none was set.
  def corpus
    annotation_values[:corpus] || DocID.default_corpus
  end

  # Builds the DocID of the same namespace and code but for another document
  # +type+. Only the first two fields of this id are reused; anything after
  # them is dropped.
  #
  # The requested +type+ is used for the third field (it was previously
  # ignored in favour of a hard-coded "title"). A nil +type+ yields a
  # two-field id.
  property :to do |type|
    namespace, code = self.split(":")
    DocID.setup([namespace, code, type].compact * ":", :corpus => corpus)
  end

  # Looks this id up in its corpus and returns the text set up as a Document
  # annotated with the namespace, id and type parsed from the id string.
  def document
    text = self.corpus[self]
    namespace, id, type = self.split(":")
    Document.setup(text, namespace, id, type, :corpus => corpus)
  end
end
|
28
|
+
|
29
|
+
# A text annotated with the coordinates that identify it (namespace, code and
# type) and with the corpus it belongs to.
module Document
  extend Entity
  self.annotation :namespace, :code, :type, :corpus

  # DocID for this text: namespace, code, type and Misc.digest of the text
  # joined with ":". The id is annotated with +corpus+ when given, otherwise
  # with the document's own corpus annotation.
  property :docid do |corpus=nil|
    fingerprint = Misc.digest(self)
    target_corpus = corpus.nil? ? self.corpus : corpus

    key = [namespace, code, type, fingerprint].join(":")
    DocID.setup(key, :corpus => target_corpus)
  end

  # Resolves the document of another +type+ by converting this document's
  # DocID and loading the result.
  property :to do |type|
    converted = docid.to(type)
    converted.document
  end

  # +id+ is another name for +docid+.
  alias_method :id, :docid
end
|
46
|
+
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'rbbt/segment/annotation'
|
2
|
+
|
3
|
+
module Document
  # Registers a property named +type+ whose value is the list of segments
  # produced by evaluating +block+ in the context of the document.
  #
  # The segments are aligned against the document with Segment.align unless
  # the list is empty or the list (or its first element) is a Segment that
  # already has an offset. Each segment that is not already a typed
  # SegmentAnnotation is set up as one with :type => type.to_s, and each
  # segment lacking a docid receives the document's docid.
  def self.define(type, &block)
    property(type) do
      found = instance_exec(&block)

      already_placed = found.empty? ||
        (Segment === found && found.offset) ||
        (Segment === found.first && found.first.offset)
      Segment.align(self, found) unless already_placed

      found.each do |segment|
        next if SegmentAnnotation === segment && segment.type
        SegmentAnnotation.setup(segment, :type => type.to_s)
      end

      own_docid = self.docid
      found.each do |segment|
        segment.docid = own_docid if segment.docid.nil?
      end

      found
    end
  end

  # Same as Document.define, but the property is declared as :multiple and
  # +block+ receives the whole +list+ of documents. It must return one
  # collection of segments per document, in the order of +list+.
  #
  # A Hash result is first converted with chunked_values_at.
  # NOTE(review): the Hash is indexed with +self+ rather than +list+ —
  # confirm that is what the :multiple property machinery expects.
  def self.define_multiple(type, &block)
    property(type => :multiple) do |list|
      per_document = instance_exec(list, &block)

      per_document = per_document.chunked_values_at(self) if Hash === per_document

      per_document.each_with_index do |found, index|
        document = list[index]

        already_placed = found.empty? ||
          (Segment === found && found.offset) ||
          (Segment === found.first && found.first.offset)
        Segment.align(document, found) unless already_placed

        found.each do |segment|
          next if SegmentAnnotation === segment && segment.type
          SegmentAnnotation.setup(segment, :type => type.to_s)
        end

        document_id = document.docid
        found.each do |segment|
          segment.docid = document_id if segment.docid.nil?
        end

        found
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
|
3
|
+
# Behaviour added to a document store (any object offering [] and []=) so
# that it returns its entries as Document objects. Documents missing from
# the store can be loaded through a handler registered for their namespace
# with Document::Corpus.claim.
module Document::Corpus

  # Extends +corpus+ with Document::Corpus.
  def self.setup(corpus)
    corpus.extend Document::Corpus
  end

  # Stores +document+ under its docid.
  def add_document(document)
    self[document.docid] = document
  end

  # Looks up a document by docid.
  #
  # Calls made with more than one argument are passed straight to the
  # underlying store. With a single docid, a missing entry is requested from
  # the handler claimed for the docid's namespace, if any. A non-nil result
  # is set up as a Document annotated with the namespace, id and type parsed
  # from the docid and with this corpus.
  def [](*args)
    docid = args.first
    res = super(*args)
    return res if args.length > 1

    namespace, id, type = docid.split(":")

    if res.nil?
      # claims is always a Hash, so a corpus with no registered handlers
      # returns nil here instead of raising.
      loader = Document::Corpus.claims[namespace.to_s]
      res = instance_exec(id, type, &loader) if loader
    end

    Document.setup(res, namespace, id, type, self) unless res.nil?

    res
  end

  class << self
    attr_writer :claims

    # Handlers registered with claim, keyed by namespace name. Never nil.
    def claims
      @claims ||= {}
    end

    # Registers +block+ as the loader for documents of +namespace+. The block
    # is evaluated in the context of the corpus with the id and type of the
    # missing document. Earlier registrations for other namespaces are kept.
    def claim(namespace, &block)
      claims[namespace.to_s] = block
    end
  end

end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'rbbt/sources/pubmed'
|
2
|
+
|
3
|
+
# PubMed support for Document::Corpus: loading articles by PMID or by query,
# and a claim that loads "PMID" documents on demand.
module Document::Corpus

  # Fetches the articles for +pmid+ (a single id or an Array of ids) with
  # PubMed.get_article and adds one document per article to the corpus.
  #
  # +type+ selects the text to store: nil, an empty String or :abstract for
  # the abstract, :title for the title, and anything else for the full text.
  # A missing abstract or title is stored as an empty text.
  #
  # Returns the list of added documents, set up with Document.setup.
  # Raises a RuntimeError when the full text is requested but not available.
  def add_pmid(pmid, type = nil)
    pmids = Array === pmid ? pmid : [pmid]
    type = nil if String === type && type.empty?

    # The block argument has its own name so it does not shadow +pmid+.
    res = PubMed.get_article(pmids).collect do |article_pmid, article|
      Log.debug "Loading pmid #{article_pmid}"
      document = if type.nil? || type.to_sym == :abstract
                   Document.setup(article.abstract || "", "PMID", article_pmid, :abstract, self)
                 elsif type.to_sym == :title
                   Document.setup(article.title || "", "PMID", article_pmid, :title, self)
                 else
                   raise "No FullText available for #{ article_pmid }" if article.full_text.nil?
                   Document.setup(article.full_text, "PMID", article_pmid, :fulltext, self)
                 end
      add_document(document)
    end

    Document.setup(res)
  end

  # Runs +query+ through PubMed.query (limited to +max+ results) and adds the
  # resulting articles with add_pmid.
  def add_pubmed_query(query, max = 3000, type = nil)
    pmids = PubMed.query(query, max)
    add_pmid(pmids, type)
  end

  # Documents in the "PMID" namespace that are missing from the corpus are
  # loaded with add_pmid.
  self.claim "PMID" do |id, type|
    Log.debug "Claiming #{id}"
    self.add_pmid(id, type).first
  end
end
|
data/lib/rbbt/ner/NER.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
require 'rbbt/
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require 'rbbt/segment'
|
2
|
+
require 'rbbt/segment/named_entity'
|
3
|
+
require 'rbbt/segment/segmented'
|
4
4
|
|
5
5
|
class NER
|
6
6
|
def entities(text, protect = false, *args)
|
data/lib/rbbt/ner/abner.rb
CHANGED
data/lib/rbbt/ner/banner.rb
CHANGED
data/lib/rbbt/ner/brat.rb
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
|
-
require 'rbbt/text/segment'
|
4
3
|
require 'rbbt/ner/NER'
|
5
4
|
require 'rbbt/util/log'
|
6
5
|
|
@@ -8,7 +7,7 @@ class ChemicalTagger < NER
|
|
8
7
|
Rbbt.claim Rbbt.software.opt.ChemicalTagger, :install, Rbbt.share.install.software.ChemicalTagger.find
|
9
8
|
|
10
9
|
def self.init
|
11
|
-
ENV["CLASSPATH"] = ENV["CLASSPATH"].split(":").reverse * ":"
|
10
|
+
ENV["CLASSPATH"] = [ENV["CLASSPATH"].split(":"), Rbbt.software.opt.ChemicalTagger.produce.glob("*.jar").first].reverse * ":"
|
12
11
|
Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
|
13
12
|
@@RbbtChemicalTagger ||= Rjb::import('RbbtChemicalTagger')
|
14
13
|
end
|
data/lib/rbbt/ner/g_norm_plus.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
require 'rbbt-util'
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/named_entity'
|
2
4
|
module GNormPlus
|
3
5
|
|
4
6
|
Rbbt.claim Rbbt.software.opt.GNormPlus, :install do
|
@@ -35,8 +37,8 @@ module GNormPlus
|
|
35
37
|
HomologeneID = False
|
36
38
|
Normalization2Protein = False
|
37
39
|
ShowUnNormalizedMention = False
|
40
|
+
IgnoreNER = False
|
38
41
|
DeleteTmp = True
|
39
|
-
IgnoreNER = True
|
40
42
|
EOF
|
41
43
|
|
42
44
|
def self.process(texts)
|
@@ -69,7 +71,7 @@ EOF
|
|
69
71
|
tsv = TSV.setup({}, :key_field => key_field, :fields => ["Entities"], :type => :flat)
|
70
72
|
Dir.glob("output/*.txt").each do |file|
|
71
73
|
name = File.basename(file).sub(".txt",'')
|
72
|
-
entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '
|
74
|
+
entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '·').split("\t")[1..-1] * ":"}
|
73
75
|
tsv[name] = entities
|
74
76
|
end
|
75
77
|
|
@@ -79,6 +81,21 @@ EOF
|
|
79
81
|
end
|
80
82
|
end
|
81
83
|
end
|
84
|
+
|
85
|
+
# Runs +process+ over +texts+ and converts each reported entity into a
# NamedEntity segment.
#
# Each entity arrives as a "start:end:literal:type:code" string. Colons in
# the original fields were replaced by '·' in +process+, so they are restored
# in the literal here.
#
# Returns a Hash from document name to the Array of NamedEntity segments
# found in it.
def self.entities(texts)
  res = {}

  process(texts).each do |name, entities|
    res[name] = entities.collect do |entity|
      start, _eend, literal, type, code = entity.split(":")
      literal.gsub!('·',':')

      NamedEntity.setup(literal, :offset => start.to_i, :entity_type => type, :code => code)
    end
  end

  # Hash#each returns its receiver, so the result must be returned explicitly.
  res
end
|
82
99
|
end
|
83
100
|
|
84
101
|
if __FILE__ == $0
|
data/lib/rbbt/ner/linnaeus.rb
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
require 'rjb'
|
2
2
|
require 'rbbt'
|
3
|
-
require 'rbbt/
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
|
5
5
|
module Linnaeus
|
6
6
|
|
7
7
|
Rbbt.claim Rbbt.software.opt.Linnaeus, :install, Rbbt.share.install.software.Linnaeus.find
|
8
8
|
|
9
|
-
ARGS = ["--properties", Rbbt.software.opt.Linnaeus["species-proxy/properties.conf"].find]
|
9
|
+
ARGS = ["--properties", Rbbt.software.opt.Linnaeus.produce["species-proxy/properties.conf"].find]
|
10
10
|
|
11
11
|
|
12
12
|
Rjb::load(nil, jvmargs = ['-Xms2G','-Xmx2G']) unless Rjb.loaded?
|
@@ -31,7 +31,7 @@ module Linnaeus
|
|
31
31
|
init unless defined? @@Matcher
|
32
32
|
|
33
33
|
@@Matcher.match(text).toArray().collect do |mention|
|
34
|
-
NamedEntity.setup(mention.text(), mention.start(), "Organism", mention.ids(), mention.probabilities())
|
34
|
+
NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
|
35
35
|
end
|
36
36
|
end
|
37
37
|
end
|
@@ -1,8 +1,8 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/util/misc'
|
3
3
|
require 'rbbt/tsv'
|
4
|
-
require 'rbbt/
|
5
|
-
require 'rbbt/
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/token'
|
6
6
|
require 'rbbt/ner/NER'
|
7
7
|
require 'inline'
|
8
8
|
|
@@ -150,7 +150,7 @@ VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
|
|
150
150
|
|
151
151
|
def match(text)
|
152
152
|
matches = NGramPrefixDictionary.match(index, (case_insensitive ? text.downcase : text)).collect{|name, code, offset|
|
153
|
-
NamedEntity.setup(name, offset, type, code)
|
153
|
+
NamedEntity.setup(name, :offset => offset, :entity_type => type, :code => code)
|
154
154
|
}
|
155
155
|
|
156
156
|
if case_insensitive
|
data/lib/rbbt/ner/oscar3.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
3
|
require 'libxml'
|
4
|
-
require 'rbbt/text/segment'
|
5
4
|
require 'rbbt/ner/NER'
|
6
5
|
require 'rbbt/util/log'
|
7
6
|
|
@@ -53,7 +52,7 @@ class OSCAR3 < NER
|
|
53
52
|
next unless type.nil? or type.include? mention_type
|
54
53
|
score = memm ? entities.get(key).to_string.to_f : nil
|
55
54
|
|
56
|
-
NamedEntity.setup mention, rstart.to_i + offset, mention_type,
|
55
|
+
NamedEntity.setup mention, :offset => rstart.to_i + offset, :entity_type => mention_type, :score => score
|
57
56
|
|
58
57
|
mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
|
59
58
|
end
|
data/lib/rbbt/ner/oscar4.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
3
|
require 'libxml'
|
4
|
-
require 'rbbt/
|
4
|
+
require 'rbbt/segment'
|
5
5
|
require 'rbbt/ner/NER'
|
6
6
|
require 'rbbt/util/log'
|
7
7
|
|
@@ -25,7 +25,7 @@ class OSCAR4 < NER
|
|
25
25
|
@@tagger ||= @@OSCAR.new()
|
26
26
|
end
|
27
27
|
|
28
|
-
def self.match(text, type = nil)
|
28
|
+
def self.match(text, protect = false, type = nil)
|
29
29
|
self.init
|
30
30
|
|
31
31
|
return [] if text.nil? or text.strip.empty?
|
@@ -46,7 +46,7 @@ class OSCAR4 < NER
|
|
46
46
|
|
47
47
|
next unless entity.getType.toString == type unless type.nil?
|
48
48
|
|
49
|
-
NamedEntity.setup mention, entity.getStart, entity.getType, inchi, entity.getConfidence
|
49
|
+
NamedEntity.setup mention, :offset => entity.getStart, :entity_type => entity.getType, :code => inchi, :score => entity.getConfidence
|
50
50
|
|
51
51
|
result << mention
|
52
52
|
end
|
data/lib/rbbt/ner/patterns.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
require 'rbbt/
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
4
|
-
require 'rbbt/
|
1
|
+
require 'rbbt/segment/named_entity'
|
2
|
+
require 'rbbt/segment/segmented'
|
3
|
+
require 'rbbt/segment/transformed'
|
4
|
+
#require 'rbbt/segment/relationship'
|
5
5
|
require 'rbbt/ner/regexpNER'
|
6
6
|
require 'rbbt/ner/token_trieNER'
|
7
7
|
require 'rbbt/nlp/nlp'
|
@@ -14,7 +14,8 @@ class PatternRelExt
|
|
14
14
|
regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
|
15
15
|
segments = sentence.segments
|
16
16
|
segments = segments.values.flatten if Hash === segments
|
17
|
-
Transformed.with_transform(sentence, segments, Proc.new{|s| s.
|
17
|
+
Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
|
18
|
+
ppp sentence
|
18
19
|
regexpNER.entities(sentence)
|
19
20
|
end
|
20
21
|
end
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'rbbt/text/segment'
|
2
1
|
require 'rbbt/ner/NER'
|
3
2
|
require 'rbbt/util/simpleDSL'
|
4
3
|
|
@@ -23,7 +22,7 @@ class RegExpNER < NER
|
|
23
22
|
end
|
24
23
|
|
25
24
|
if match and not match.empty?
|
26
|
-
NamedEntity.setup(match, start + pre.length, type)
|
25
|
+
NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
|
27
26
|
matches << match
|
28
27
|
end
|
29
28
|
|
@@ -1,8 +1,8 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/tsv'
|
3
|
-
require 'rbbt/
|
4
|
-
require 'rbbt/text/segment/token'
|
3
|
+
require 'rbbt/segment'
|
5
4
|
require 'rbbt/ner/NER'
|
5
|
+
require 'rbbt/segment/token'
|
6
6
|
|
7
7
|
class TokenTrieNER < NER
|
8
8
|
def self.clean(token)
|
@@ -16,13 +16,13 @@ class TokenTrieNER < NER
|
|
16
16
|
def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
|
17
17
|
if no_clean
|
18
18
|
if extend_to_token
|
19
|
-
Token.setup(
|
19
|
+
Token.setup(token, :offset => start, :original => token)
|
20
20
|
else
|
21
21
|
token
|
22
22
|
end
|
23
23
|
else
|
24
24
|
if extend_to_token
|
25
|
-
Token.setup(clean(token), start, token)
|
25
|
+
Token.setup(clean(token), :offset => start, :original => token)
|
26
26
|
else
|
27
27
|
clean(token)
|
28
28
|
end
|
@@ -137,7 +137,7 @@ class TokenTrieNER < NER
|
|
137
137
|
tmp_index = {}
|
138
138
|
hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
|
139
139
|
names = Array === names ? names : [names]
|
140
|
-
names.flatten! if Array === names.first and not
|
140
|
+
names.flatten! if Array === names.first and not Segment === names.first.first
|
141
141
|
|
142
142
|
if names.empty?
|
143
143
|
names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
|
@@ -237,7 +237,7 @@ class TokenTrieNER < NER
|
|
237
237
|
match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
|
238
238
|
}
|
239
239
|
|
240
|
-
NamedEntity.setup(match, match_tokens.first.offset, type, codes)
|
240
|
+
NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
|
241
241
|
end
|
242
242
|
|
243
243
|
attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
|