rbbt-text 1.1.8 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/ner/NER.rb +3 -3
  3. data/lib/rbbt/ner/abner.rb +3 -3
  4. data/lib/rbbt/ner/banner.rb +1 -1
  5. data/lib/rbbt/ner/brat.rb +2 -2
  6. data/lib/rbbt/ner/chemical_tagger.rb +1 -1
  7. data/lib/rbbt/ner/linnaeus.rb +1 -1
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +2 -2
  9. data/lib/rbbt/ner/oscar3.rb +1 -1
  10. data/lib/rbbt/ner/oscar4.rb +1 -1
  11. data/lib/rbbt/ner/patterns.rb +4 -4
  12. data/lib/rbbt/ner/regexpNER.rb +1 -1
  13. data/lib/rbbt/ner/token_trieNER.rb +2 -2
  14. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  15. data/lib/rbbt/nlp/nlp.rb +2 -2
  16. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +1 -1
  17. data/lib/rbbt/{corpus → text}/corpus.rb +51 -11
  18. data/lib/rbbt/text/corpus/document.rb +361 -0
  19. data/lib/rbbt/text/corpus/document_repo.rb +68 -0
  20. data/lib/rbbt/text/corpus/sources/pmid.rb +34 -0
  21. data/lib/rbbt/text/document.rb +39 -0
  22. data/lib/rbbt/{ner → text}/segment.rb +11 -6
  23. data/lib/rbbt/{ner → text}/segment/docid.rb +1 -1
  24. data/lib/rbbt/{ner → text}/segment/named_entity.rb +2 -2
  25. data/lib/rbbt/{ner → text}/segment/relationship.rb +1 -1
  26. data/lib/rbbt/{ner → text}/segment/segmented.rb +1 -1
  27. data/lib/rbbt/{ner → text}/segment/token.rb +1 -1
  28. data/lib/rbbt/{ner → text}/segment/transformed.rb +47 -42
  29. data/test/rbbt/entity/test_document.rb +1 -0
  30. data/test/rbbt/ner/test_abner.rb +1 -0
  31. data/test/rbbt/ner/test_linnaeus.rb +1 -0
  32. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +0 -1
  33. data/test/rbbt/text/corpus/sources/test_pmid.rb +33 -0
  34. data/test/rbbt/text/corpus/test_document.rb +52 -0
  35. data/test/rbbt/{ner → text}/segment/test_named_entity.rb +2 -2
  36. data/test/rbbt/{ner → text}/segment/test_relationship.rb +0 -0
  37. data/test/rbbt/{ner → text}/segment/test_segmented.rb +1 -1
  38. data/test/rbbt/{ner → text}/segment/test_transformed.rb +96 -3
  39. data/test/rbbt/text/test_corpus.rb +34 -0
  40. data/test/rbbt/text/test_document.rb +58 -0
  41. data/test/rbbt/{ner → text}/test_segment.rb +2 -2
  42. data/test/test_helper.rb +3 -3
  43. metadata +32 -24
  44. data/lib/rbbt/corpus/document.rb +0 -266
  45. data/lib/rbbt/corpus/document_repo.rb +0 -137
  46. data/lib/rbbt/corpus/sources/pubmed.rb +0 -27
  47. data/lib/rbbt/entity/document.rb +0 -75
@@ -1,137 +0,0 @@
1
- require 'rbbt/util/misc'
2
- require 'tokyocabinet'
3
-
4
- module DocumentRepo
5
- class OpenError < StandardError;end
6
- class KeyFormatError < StandardError;end
7
-
8
- TC_CONNECTIONS = {}
9
- def self.open_tokyocabinet(path, write)
10
- write = true if not File.exists?(path)
11
- flags = (write ? TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT : TokyoCabinet::BDB::OREADER)
12
-
13
- FileUtils.mkdir_p File.dirname(path) unless File.exists?(File.dirname(path))
14
-
15
- database = TC_CONNECTIONS[path] ||= TokyoCabinet::BDB.new
16
- database.close
17
-
18
- if !database.open(path, flags)
19
- ecode = database.ecode
20
- raise "Open error: #{database.errmsg(ecode)}. Trying to open file #{path}"
21
- end
22
-
23
- class << database
24
- attr_accessor :writable, :persistence_path
25
-
26
- def read
27
- return if not @writable
28
- self.close
29
- if !self.open(@persistence_path, TokyoCabinet::BDB::OREADER)
30
- ecode = self.ecode
31
- raise "Open error: #{self.errmsg(ecode)}. Trying to open file #{@persistence_path}"
32
- end
33
- @writable = false
34
- self
35
- end
36
-
37
- def write
38
- return if @writable
39
- self.close
40
- if !self.open(@persistence_path, TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT)
41
- ecode = self.ecode
42
- raise "Open error: #{self.errmsg(ecode)}. Trying to open file #{@persistence_path}"
43
- end
44
- @writable = true
45
- self
46
- end
47
-
48
- def write?
49
- @writable
50
- end
51
-
52
- def collect
53
- res = []
54
- each do |key, value|
55
- res << if block_given?
56
- yield key, value
57
- else
58
- [key, value]
59
- end
60
- end
61
- res
62
- end
63
-
64
- def delete(key)
65
- out(key)
66
- end
67
-
68
- def values_at(*keys)
69
- keys.collect do |key|
70
- self[key]
71
- end
72
- end
73
-
74
- def merge!(hash)
75
- hash.each do |key,values|
76
- self[key] = values
77
- end
78
- end
79
-
80
- end
81
-
82
- database.persistence_path ||= path
83
-
84
- database.extend DocumentRepo
85
-
86
- database
87
- end
88
-
89
- def docid2fields(docid)
90
- docid.split(":", -1).values_at 0,1,2,3
91
- end
92
-
93
- def fields2docid(namespace = nil, id = nil, type = nil, hash = nil)
94
- [namespace, id, type, hash] * ":"
95
- end
96
-
97
- def docid(docid)
98
- get(docid)
99
- end
100
-
101
- def add(text, namespace, id, type, hash)
102
- read
103
- write unless write?
104
- docid = fields2docid(namespace, id, type, hash)
105
- self[docid] = text unless self.include? docid
106
- read
107
- docid
108
- end
109
-
110
- def find(namespace=nil, id = nil, type = nil, hash = nil)
111
- case
112
- when namespace.nil?
113
- self.keys
114
- when id.nil?
115
- range_start = [namespace] * ":" + ':'
116
- range_end = [namespace] * ":" + ';'
117
- self.range(range_start, true, range_end, false)
118
- when (type and hash)
119
- [[namespace, id, type, hash] * ":"]
120
- when hash
121
- [[namespace, id, "", hash] * ":"]
122
- when type
123
- range_start = [namespace, id, type] * ":" + ':'
124
- range_end = [namespace, id, type] * ":" + ';'
125
- self.range(range_start, true, range_end, false)
126
- else
127
- range_start = [namespace, id] * ":" + ':'
128
- range_end = [namespace, id] * ":" + ';'
129
- self.range(range_start, true, range_end, false)
130
- end
131
- end
132
-
133
- def find_docid(docid)
134
- find(*docid2fields(docid))
135
- end
136
-
137
- end
@@ -1,27 +0,0 @@
1
- require 'rbbt/sources/pubmed'
2
-
3
- class Corpus
4
-
5
- NAMESPACES = {} unless defined? NAMESPACES
6
- NAMESPACES[:pubmed] = :add_pmid
7
-
8
- def add_pmid(pmid, type = nil)
9
- pmids = Array === pmid ? pmid : [pmid]
10
- type = nil if String === type and type.empty?
11
-
12
- PubMed.get_article(pmids).collect do |pmid, article|
13
- add_document(article.title, :pubmed, pmid, :title)
14
- if (type.nil? and article.pdf_url.nil?) or (not type.nil? and type.to_sym === :abstract)
15
- add_document(article.abstract || "", :pubmed, pmid, :abstract)
16
- else
17
- raise "No FullText available for #{ pmid }" if article.pdf_url.nil?
18
- add_document(article.full_text, :pubmed, pmid, :fulltext)
19
- end
20
- end
21
- end
22
-
23
- def add_pubmed_query(query, max, type = nil)
24
- pmids = PubMed.query(query, max)
25
- add_pmid(pmids, type)
26
- end
27
- end
@@ -1,75 +0,0 @@
1
- require 'rbbt/entity'
2
- require 'rbbt/ner/segment/docid'
3
-
4
- module Document
5
- extend Entity
6
-
7
- class << self
8
- attr_accessor :corpus
9
- end
10
-
11
- property :docid => :single2array do |*args|
12
- @docid ||= if self =~ /^text:/
13
- self
14
- else
15
- ["text", Misc.digest(self.inspect)] * ":"
16
- end
17
- @docid
18
- end
19
-
20
- property :annotation_id => :both do |*args|
21
- if Array === self
22
- Misc.hash2md5(info.merge(:self => self))
23
- else
24
- docid(*args)
25
- end
26
- end
27
-
28
- property :_get_text => :single do
29
- self
30
- end
31
-
32
- property :text => :array2single do |*args|
33
- article_text = {}
34
- missing = []
35
-
36
- if Document.corpus.nil?
37
- self._get_text(*args)
38
- else
39
-
40
- Document.corpus.read_and_close do
41
- self.each do |doc|
42
- id = doc.docid(*args)
43
- case
44
- when Document.corpus.include?(doc)
45
- article_text[doc] = Document.corpus[doc]
46
- when Document.corpus.include?(id)
47
- article_text[doc] = Document.corpus[id]
48
- else
49
- missing << doc
50
- end
51
-
52
- end
53
- end
54
-
55
- if missing.any?
56
- missing.first.annotate missing
57
- missing_text = Misc.process_to_hash(missing){|list| list._get_text(*args)}
58
-
59
- Misc.lock(Document.corpus.respond_to?(:persistence_path) ? Document.corpus.persistence_path : nil) do
60
- Document.corpus.write_and_close do
61
-
62
- missing_text.each do |doc, doc_text|
63
- doc = self.annotate doc.dup
64
- Document.corpus[doc.docid(*args)] = doc_text
65
- article_text[doc] = doc_text
66
- end
67
- end
68
- end
69
- end
70
-
71
- article_text.values_at *self
72
- end
73
- end
74
- end
75
-