rbbt-text 1.1.8 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +3 -3
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +2 -2
- data/lib/rbbt/ner/chemical_tagger.rb +1 -1
- data/lib/rbbt/ner/linnaeus.rb +1 -1
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +2 -2
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +1 -1
- data/lib/rbbt/ner/patterns.rb +4 -4
- data/lib/rbbt/ner/regexpNER.rb +1 -1
- data/lib/rbbt/ner/token_trieNER.rb +2 -2
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
- data/lib/rbbt/nlp/nlp.rb +2 -2
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +1 -1
- data/lib/rbbt/{corpus → text}/corpus.rb +51 -11
- data/lib/rbbt/text/corpus/document.rb +361 -0
- data/lib/rbbt/text/corpus/document_repo.rb +68 -0
- data/lib/rbbt/text/corpus/sources/pmid.rb +34 -0
- data/lib/rbbt/text/document.rb +39 -0
- data/lib/rbbt/{ner → text}/segment.rb +11 -6
- data/lib/rbbt/{ner → text}/segment/docid.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/named_entity.rb +2 -2
- data/lib/rbbt/{ner → text}/segment/relationship.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/segmented.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/token.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/transformed.rb +47 -42
- data/test/rbbt/entity/test_document.rb +1 -0
- data/test/rbbt/ner/test_abner.rb +1 -0
- data/test/rbbt/ner/test_linnaeus.rb +1 -0
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +0 -1
- data/test/rbbt/text/corpus/sources/test_pmid.rb +33 -0
- data/test/rbbt/text/corpus/test_document.rb +52 -0
- data/test/rbbt/{ner → text}/segment/test_named_entity.rb +2 -2
- data/test/rbbt/{ner → text}/segment/test_relationship.rb +0 -0
- data/test/rbbt/{ner → text}/segment/test_segmented.rb +1 -1
- data/test/rbbt/{ner → text}/segment/test_transformed.rb +96 -3
- data/test/rbbt/text/test_corpus.rb +34 -0
- data/test/rbbt/text/test_document.rb +58 -0
- data/test/rbbt/{ner → text}/test_segment.rb +2 -2
- data/test/test_helper.rb +3 -3
- metadata +32 -24
- data/lib/rbbt/corpus/document.rb +0 -266
- data/lib/rbbt/corpus/document_repo.rb +0 -137
- data/lib/rbbt/corpus/sources/pubmed.rb +0 -27
- data/lib/rbbt/entity/document.rb +0 -75
@@ -1,137 +0,0 @@
|
|
1
|
-
require 'rbbt/util/misc'
|
2
|
-
require 'tokyocabinet'
|
3
|
-
|
4
|
-
module DocumentRepo
|
5
|
-
class OpenError < StandardError;end
|
6
|
-
class KeyFormatError < StandardError;end
|
7
|
-
|
8
|
-
TC_CONNECTIONS = {}
|
9
|
-
# Opens the TokyoCabinet B+ tree database stored at +path+ and returns the
# handle, decorated with mode-switching and Hash-like convenience methods and
# extended with DocumentRepo.
#
# One handle per path is kept in TC_CONNECTIONS; calling this method again for
# the same path closes that handle and re-opens it with the requested mode.
#
# path  - file system path of the database file; its directory is created
#         when missing.
# write - truthy to open for writing (creating the file if needed), falsy to
#         open read-only. Forced to true when the file does not exist yet.
#
# Raises RuntimeError when TokyoCabinet cannot open the file.
def self.open_tokyocabinet(path, write)
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  write = true unless File.exist?(path)
  flags = write ? (TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT) : TokyoCabinet::BDB::OREADER

  directory = File.dirname(path)
  FileUtils.mkdir_p directory unless File.exist?(directory)

  # Reuse the cached handle for this path; close it so it can be re-opened
  # with the flags requested by this call.
  database = TC_CONNECTIONS[path] ||= TokyoCabinet::BDB.new
  database.close

  unless database.open(path, flags)
    ecode = database.ecode
    raise "Open error: #{database.errmsg(ecode)}. Trying to open file #{path}"
  end

  class << database
    attr_accessor :writable, :persistence_path

    # Re-opens the database read-only. Does nothing when it is not currently
    # flagged as writable. Returns self after switching.
    def read
      return unless @writable
      self.close
      unless self.open(@persistence_path, TokyoCabinet::BDB::OREADER)
        ecode = self.ecode
        raise "Open error: #{self.errmsg(ecode)}. Trying to open file #{@persistence_path}"
      end
      @writable = false
      self
    end

    # Re-opens the database for writing. Does nothing when it is already
    # flagged as writable. Returns self after switching.
    def write
      return if @writable
      self.close
      unless self.open(@persistence_path, TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT)
        ecode = self.ecode
        raise "Open error: #{self.errmsg(ecode)}. Trying to open file #{@persistence_path}"
      end
      @writable = true
      self
    end

    # Whether the handle is currently flagged as opened for writing.
    def write?
      @writable
    end

    # Returns an array with the block result for every key/value pair, or the
    # [key, value] pairs themselves when no block is given.
    def collect
      pairs = []
      each do |key, value|
        pairs << (block_given? ? yield(key, value) : [key, value])
      end
      pairs
    end

    # Hash-style alias for the database's own +out+.
    def delete(key)
      out(key)
    end

    # Returns the values stored under each of +keys+, in the same order.
    def values_at(*keys)
      keys.collect { |key| self[key] }
    end

    # Stores every key/value pair of +hash+ in the database.
    def merge!(hash)
      hash.each do |key, values|
        self[key] = values
      end
    end
  end

  database.persistence_path ||= path
  # Keep the mode flag in sync with how the handle was just opened; without
  # this, +read+ is a no-op right after a write-mode open and a cached handle
  # can carry a stale flag from an earlier mode switch.
  database.writable = write ? true : false

  database.extend DocumentRepo

  database
end
|
88
|
-
|
89
|
-
# Splits a colon-separated document id into its first four fields
# (passed to +find+ as namespace, id, type and hash).
#
# Trailing empty fields are kept as empty strings; fields that are absent
# altogether come back as nil, and anything past the fourth field is dropped.
def docid2fields(docid)
  parts = docid.split(":", -1)
  [parts[0], parts[1], parts[2], parts[3]]
end
|
92
|
-
|
93
|
-
# Builds a document id by joining the four fields with ":".
# Fields left as nil contribute an empty segment, so the result always
# contains three separators.
def fields2docid(namespace = nil, id = nil, type = nil, hash = nil)
  [namespace, id, type, hash].join(":")
end
|
96
|
-
|
97
|
-
# Looks up +docid+ through the receiver's +get+ and returns whatever it
# returns (presumably the stored document text — confirm against the
# database API in use).
def docid(docid)
  get docid
end
|
100
|
-
|
101
|
-
# Stores +text+ under the document id built from namespace, id, type and
# hash, unless an entry with that id already exists, and returns the id.
#
# The repository is switched to write mode for the store and is always
# switched back to read mode afterwards, even when the store raises, so a
# failed write does not leave the handle open as a writer.
def add(text, namespace, id, type, hash)
  read
  begin
    write unless write?
    docid = fields2docid(namespace, id, type, hash)
    self[docid] = text unless include?(docid)
  ensure
    # +read+ is a no-op when the switch to write mode never happened.
    read
  end
  docid
end
|
109
|
-
|
110
|
-
# Returns document ids matching the given fields.
#
# - no namespace: every key in the repository
# - id, type and hash all given: that single, fully specified docid
# - id and hash given without type: the docid with an empty type segment
# - otherwise: a range query over all keys starting with the known fields
#   followed by ":" (namespace only; namespace and id; or namespace, id and type)
#
# NOTE(review): empty strings are truthy, so "" for type or hash is treated
# as a supplied field, not as missing — confirm this is intended for callers
# that pass fields parsed from a docid.
def find(namespace=nil, id = nil, type = nil, hash = nil)
  return keys if namespace.nil?

  unless id.nil?
    # With a hash the docid is fully determined; no range scan is needed.
    return [[namespace, id, type, hash].join(":")] if type && hash
    return [[namespace, id, "", hash].join(":")] if hash
  end

  known_fields = if id.nil?
                   [namespace]
                 elsif type
                   [namespace, id, type]
                 else
                   [namespace, id]
                 end
  prefix = known_fields.join(":")

  # ";" is the character right after ":", so [prefix + ":", prefix + ";")
  # covers exactly the keys that start with prefix + ":".
  range(prefix + ':', true, prefix + ';', false)
end
|
132
|
-
|
133
|
-
# Runs +find+ with the fields parsed out of a (possibly partial) docid.
def find_docid(docid)
  namespace, id, type, hash = docid2fields(docid)
  find(namespace, id, type, hash)
end
|
136
|
-
|
137
|
-
end
|
@@ -1,27 +0,0 @@
|
|
1
|
-
require 'rbbt/sources/pubmed'
|
2
|
-
|
3
|
-
# PubMed source for Corpus: registers the :pubmed namespace and adds
# articles fetched through PubMed as corpus documents.
class Corpus

  NAMESPACES = {} unless defined? NAMESPACES
  NAMESPACES[:pubmed] = :add_pmid

  # Adds the title of each requested article, plus either its abstract or
  # its full text, and returns the results of the second add_document call
  # for each article.
  #
  # pmid - a single PubMed id or an Array of them.
  # type - nil or "" to choose automatically (abstract when the article has
  #        no pdf_url, full text otherwise); :abstract / "abstract" to force
  #        the abstract; any other value requests the full text.
  #
  # Raises RuntimeError when full text is required but the article has no
  # pdf_url.
  def add_pmid(pmid, type = nil)
    pmids = pmid.is_a?(Array) ? pmid : [pmid]
    type = nil if type.is_a?(String) && type.empty?

    PubMed.get_article(pmids).collect do |article_pmid, article|
      add_document(article.title, :pubmed, article_pmid, :title)

      use_abstract = type.nil? ? article.pdf_url.nil? : type.to_sym == :abstract

      if use_abstract
        add_document(article.abstract || "", :pubmed, article_pmid, :abstract)
      else
        raise "No FullText available for #{ article_pmid }" if article.pdf_url.nil?
        add_document(article.full_text, :pubmed, article_pmid, :fulltext)
      end
    end
  end

  # Runs a PubMed query limited to +max+ results and adds every hit through
  # add_pmid.
  def add_pubmed_query(query, max, type = nil)
    add_pmid(PubMed.query(query, max), type)
  end
end
|
data/lib/rbbt/entity/document.rb
DELETED
@@ -1,75 +0,0 @@
|
|
1
|
-
require 'rbbt/entity'
|
2
|
-
require 'rbbt/ner/segment/docid'
|
3
|
-
|
4
|
-
# Entity module for text documents. Documents are identified by a docid and
# their text can be cached in an optional, module-wide corpus store.
#
# NOTE(review): the meaning of the property kinds (:single2array, :both,
# :single, :array2single) is defined by the Entity DSL, which is not visible
# here — confirm against Entity before relying on the notes below.
module Document
  extend Entity

  class << self
    # Optional key/value store used to cache document texts (nil disables it).
    attr_accessor :corpus
  end

  # The document's id: the value itself when it already matches /^text:/,
  # otherwise "text:" followed by a digest of its inspect string. Memoized.
  property :docid => :single2array do |*args|
    @docid ||= (self =~ /^text:/) ? self : ["text", Misc.digest(self.inspect)].join(":")
  end

  # For an Array receiver, an md5 over its annotation info plus contents;
  # for any other receiver, its docid.
  property :annotation_id => :both do |*args|
    Array === self ? Misc.hash2md5(info.merge(:self => self)) : docid(*args)
  end

  # Default text retrieval: the entity itself is the text.
  property :_get_text => :single do
    self
  end

  # Texts for the receiver's documents. Without a corpus this delegates to
  # _get_text; with one, cached texts are read first and only the missing
  # ones are computed and written back.
  property :text => :array2single do |*args|
    if Document.corpus.nil?
      self._get_text(*args)
    else
      texts_by_doc = {}
      uncached = []

      # First pass: collect what the corpus already holds, keyed either by
      # the document itself or by its docid.
      Document.corpus.read_and_close do
        self.each do |doc|
          id = doc.docid(*args)
          if Document.corpus.include?(doc)
            texts_by_doc[doc] = Document.corpus[doc]
          elsif Document.corpus.include?(id)
            texts_by_doc[doc] = Document.corpus[id]
          else
            uncached << doc
          end
        end
      end

      if uncached.any?
        # Annotate the plain array so _get_text can be called on it as a list.
        uncached.first.annotate uncached
        fetched = Misc.process_to_hash(uncached) { |list| list._get_text(*args) }

        lock_path = Document.corpus.respond_to?(:persistence_path) ? Document.corpus.persistence_path : nil
        Misc.lock(lock_path) do
          Document.corpus.write_and_close do
            fetched.each do |doc, doc_text|
              annotated = self.annotate doc.dup
              Document.corpus[annotated.docid(*args)] = doc_text
              texts_by_doc[annotated] = doc_text
            end
          end
        end
      end

      texts_by_doc.values_at(*self)
    end
  end
end
|
75
|
-
|