biointerchange 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +12 -0
- data/Gemfile +17 -0
- data/LICENSE.txt +8 -0
- data/README.md +166 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/bin/biointerchange +6 -0
- data/docs/exceptions_readme.txt +13 -0
- data/examples/BovineGenomeChrX.gff3.gz +0 -0
- data/examples/gb-2007-8-3-R40.xml +243 -0
- data/examples/pubannotation.json +1 -0
- data/generators/rdfxml.rb +104 -0
- data/lib/biointerchange/core.rb +195 -0
- data/lib/biointerchange/exceptions.rb +38 -0
- data/lib/biointerchange/genomics/gff3_feature.rb +82 -0
- data/lib/biointerchange/genomics/gff3_feature_set.rb +37 -0
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +107 -0
- data/lib/biointerchange/genomics/gff3_reader.rb +86 -0
- data/lib/biointerchange/gff3.rb +135 -0
- data/lib/biointerchange/reader.rb +25 -0
- data/lib/biointerchange/registry.rb +29 -0
- data/lib/biointerchange/sio.rb +7124 -0
- data/lib/biointerchange/sofa.rb +1566 -0
- data/lib/biointerchange/textmining/content.rb +69 -0
- data/lib/biointerchange/textmining/document.rb +36 -0
- data/lib/biointerchange/textmining/pdfx_xml_reader.rb +161 -0
- data/lib/biointerchange/textmining/process.rb +57 -0
- data/lib/biointerchange/textmining/pubannos_json_reader.rb +72 -0
- data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +197 -0
- data/lib/biointerchange/textmining/text_mining_reader.rb +41 -0
- data/lib/biointerchange/writer.rb +23 -0
- data/lib/biointerchange.rb +3 -0
- data/spec/exceptions_spec.rb +27 -0
- data/spec/gff3_rdfwriter_spec.rb +67 -0
- data/spec/text_mining_pdfx_xml_reader_spec.rb +89 -0
- data/spec/text_mining_pubannos_json_reader_spec.rb +71 -0
- data/spec/text_mining_rdfwriter_spec.rb +57 -0
- data/web/about.html +89 -0
- data/web/biointerchange.js +133 -0
- data/web/bootstrap/css/bootstrap-responsive.css +1040 -0
- data/web/bootstrap/css/bootstrap-responsive.min.css +9 -0
- data/web/bootstrap/css/bootstrap.css +5624 -0
- data/web/bootstrap/css/bootstrap.min.css +9 -0
- data/web/bootstrap/img/glyphicons-halflings-white.png +0 -0
- data/web/bootstrap/img/glyphicons-halflings.png +0 -0
- data/web/bootstrap/js/bootstrap.js +2027 -0
- data/web/bootstrap/js/bootstrap.min.js +6 -0
- data/web/bootstrap/js/jquery-1.8.1.min.js +2 -0
- data/web/css/rdoc-style.css +5786 -0
- data/web/css/rdoc.css +716 -0
- data/web/images/BioInterchange300.png +0 -0
- data/web/index.html +109 -0
- data/web/service/rdfizer.fcgi +68 -0
- data/web/webservices.html +123 -0
- metadata +240 -0
@@ -0,0 +1,69 @@
|
|
1
|
+
module BioInterchange::TextMining
|
2
|
+
|
3
|
+
class Content

  # Constants that describe content types.
  UNSPECIFIED = 0
  DOCUMENT = 1
  PAGE = 2
  TITLE = 3
  AUTHOR = 4
  ABSTRACT = 5
  SECTION = 6
  PARAGRAPH = 7
  SENTENCE = 8
  PHRASE = 9
  WORD = 10
  CHARACTER = 11

  # Read-only accessors.
  #
  # +offset+:: offset of the content as absolute position within the document
  # +length+:: length of the content, which is measured in characters
  # +type+:: type of the content, +BioInterchange::TextMining::Content::UNSPECIFIED+ unless another type was given
  # +process+:: process associated with this content, or nil if none was provided
  attr_reader :offset, :length, :type, :process

  # Creates a new document content representation at a specific location of the document.
  #
  # +offset+:: zero-based offset of the represented content within the document (absolute location within the document)
  # +length+:: length of the represented content, where a length of zero denotes a boundary between two characters
  # +type+:: classification of the content (one of the constants above)
  # +process+:: optional process that is associated with the content
  def initialize(offset, length, type = UNSPECIFIED, process = nil)
    @offset, @length, @type, @process = offset, length, type, process
  end

  # Sets the context of this content.
  #
  # +context+:: a +BioInterchange::TextMining::Document+ or +BioInterchange::TextMining::Content+ instance in which this content is enclosed in
  def setContext(context)
    @context = context
  end

  # Returns a URI that identifies this content. The URI is assembled from the
  # context's URI (scheme removed), offset, length, type and -- if a process is
  # present -- the process' URI (scheme removed) in parentheses, '-' otherwise.
  #
  # Raises +BioInterchange::Exceptions::ImplementationModelError+ when no
  # context has been set yet.
  def uri
    unless @context
      raise BioInterchange::Exceptions::ImplementationModelError, 'An URI can only be returned for content with a context (i.e., use setContext(context) first).'
    end

    process_part = @process ? "(#{@process.uri.sub(/^.*?:\/\//, '')})" : '-'
    context_part = @context.uri.sub(/^.*?:\/\//, '')
    "biointerchange://textmining/content/#{context_part}/#{@offset},#{@length},#{@type},#{process_part}"
  end

end
|
67
|
+
|
68
|
+
end
|
69
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module BioInterchange::TextMining
|
2
|
+
|
3
|
+
class Document

  # URI of the document (the source it was created from).
  attr_reader :uri

  # Creates a blank document representation.
  #
  # +uri+:: source of the document
  def initialize(uri)
    @uri = uri
    # Content instances are collected here in the order they are added.
    @content = []
  end

  # Adds content to the document.
  #
  # Raises +BioInterchange::Exceptions::ImplementationModelError+ if the
  # argument is not a +BioInterchange::TextMining::Content+.
  #
  # +content+:: content of type +BioInterchange::TextMining::Content+ that should be added to the document
  def add(content)
    unless content.kind_of?(BioInterchange::TextMining::Content)
      raise BioInterchange::Exceptions::ImplementationModelError, 'Content has to be of kind BioInterchange::TextMining::Content'
    end

    @content << content
  end

  # Returns the document contents as a frozen copy of the internal array, so
  # callers cannot modify the document through the returned value.
  def contents
    @content.dup.freeze
  end

end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
@@ -0,0 +1,161 @@
|
|
1
|
+
module BioInterchange::TextMining
|
2
|
+
|
3
|
+
require 'rexml/document'
|
4
|
+
require 'rexml/streamlistener'
|
5
|
+
|
6
|
+
class PdfxXmlReader < BioInterchange::TextMining::TMReader
|
7
|
+
|
8
|
+
# Reads the given input and returns the associated
# +BioInterchange::TextMining::Document+ model.
#
# The input is accepted when it is an IO or a String; anything else raises
# +BioInterchange::Exceptions::ImplementationReaderError+. A single document
# per XML file is assumed. Nested <section> tags are tracked by the stream
# listener, and everything between the <article> tags becomes content of
# type Content::DOCUMENT.
#
# +inputstream+:: Input IO stream (or String) to deserialize
def deserialize(inputstream)
  readable = inputstream.kind_of?(IO) || inputstream.kind_of?(String)
  unless readable
    raise BioInterchange::Exceptions::ImplementationReaderError, 'InputStream not of type IO, cannot read.'
  end

  # Remembered for the private parsing step.
  @input = inputstream

  pdfx
end
|
27
|
+
|
28
|
+
|
29
|
+
|
30
|
+
private
|
31
|
+
|
32
|
+
# Parses the XML held in @input with a streaming listener and returns the
# document model that the listener assembled (nil if no <job> tag was seen).
#
# The reader's @process is handed to the listener so that every piece of
# content it creates carries the process information.
def pdfx
  listener = MyListener.new(@process)
  REXML::Document.parse_stream(@input, listener)
  listener.document
end

# REXML stream listener that turns PDFX XML events into a
# +BioInterchange::TextMining::Document+ with Content entries for the title,
# abstract, body, sections and the article as a whole.
#
# State is kept in @map. For a region key k ('title', 'abs', 'body', 'art'):
#   k        -- true while the parser is inside the tag
#   k_s, k_l -- start offset and accumulated text length of the region
#   k_done   -- true once the closing tag has been seen
# Offsets are measured in characters of text seen since <article> opened.
class MyListener

  include REXML::StreamListener

  # +process+:: process that is attached to every created Content; defaults
  #             to nil, which keeps the argument-less constructor working
  def initialize(process = nil)
    @process = process
    @map = {}

    # Sections can nest, so their starts and lengths are kept on two stacks.
    @map['sec_s'] = []
    @map['sec_l'] = []
  end

  # Called by REXML for every opening tag.
  #
  # Raises +BioInterchange::Exceptions::InputFormatError+ when a tag that may
  # only appear once per document is opened again after it was closed.
  def tag_start(name, attr)
    case name
    when 'job'
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <job> tags, cannot parse multiple documents within a single file.' if @map['id_done']
      @map['id'] = true
    when 'article-title'
      open_region('title', name, @map['art_l'])
    when 'abstract'
      open_region('abs', name, @map['art_l'])
    when 'body'
      open_region('body', name, @map['art_l'])
    when 'article'
      # The article is the frame of reference, so it starts at offset zero.
      open_region('art', name, 0)
    when 'section'
      raise BioInterchange::Exceptions::InputFormatError, 'Error with section stack, stacks not equal in size: Possibly not a well formed XML input file. Check <section> tags all match up and do not overlap (nesting is fine).' unless @map['sec_s'].size == @map['sec_l'].size
      @map['sec_s'].push @map['art_l']
      @map['sec_l'].push 0
    end
  end

  # Called by REXML for every run of character data. Adds the text length to
  # every region that is currently open; text inside <job> becomes the
  # document identifier.
  def text(data)
    size = data.length

    @map['art_l'] += size if @map['art']

    if @map['id']
      @doc = BioInterchange::TextMining::Document.new("http://pdfx.cs.man.ac.uk/" + data)
    elsif @map['title']
      @map['title_l'] += size
    elsif @map['abs']
      @map['abs_l'] += size
    end

    @map['body_l'] += size if @map['body']

    # Text counts towards *all* sections that are currently open.
    @map['sec_l'].map! { |length| length + size }
  end

  # TODO: deal with <author> type tags

  # Called by REXML for every closing tag. Closing a tracked region adds a
  # matching Content entry to the document.
  def tag_end(name)
    case name
    when 'job'
      @map['id'] = false
      @map['id_done'] = true
    when 'article-title'
      close_region('title', BioInterchange::TextMining::Content::TITLE)
    when 'abstract'
      close_region('abs', BioInterchange::TextMining::Content::ABSTRACT)
    when 'body'
      close_region('body', BioInterchange::TextMining::Content::SECTION)
    when 'article'
      close_region('art', BioInterchange::TextMining::Content::DOCUMENT)
    when 'section'
      raise 'Error with section stack, stacks not equal in size' unless @map['sec_s'].size == @map['sec_l'].size
      add_content(@map['sec_s'].pop, @map['sec_l'].pop, BioInterchange::TextMining::Content::SECTION)
    end
  end

  # Returns the document that was built, or nil if no <job> identifier was read.
  def document
    @doc
  end

  private

  # Marks a once-per-document region as open.
  #
  # +key+:: state key prefix of the region
  # +tag+:: tag name, used in the error message
  # +start+:: start offset that is recorded for the region
  def open_region(key, tag, start)
    if @map["#{key}_done"]
      raise BioInterchange::Exceptions::InputFormatError, "Input document XML has multiple <#{tag}> tags defined, cannot parse multiple documents within a single file."
    end
    @map[key] = true
    @map["#{key}_s"] = start
    @map["#{key}_l"] = 0
  end

  # Marks a region as closed and records it as content of the given type.
  def close_region(key, type)
    @map[key] = false
    add_content(@map["#{key}_s"], @map["#{key}_l"], type)
    @map["#{key}_done"] = true
  end

  # Creates a Content entry and attaches it to the document.
  #
  # Raises +BioInterchange::Exceptions::InputFormatError+ when no document
  # exists yet, i.e. content was closed before any <job> identifier was read.
  def add_content(start, length, type)
    unless @doc
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML closes content before a <job> identifier was read, cannot determine the document URI.'
    end
    content = BioInterchange::TextMining::Content.new(start, length, type, @process)
    content.setContext(@doc)
    @doc.add(content)
  end

end
|
156
|
+
|
157
|
+
|
158
|
+
|
159
|
+
end
|
160
|
+
|
161
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module BioInterchange::TextMining
|
2
|
+
|
3
|
+
class Process

  # Constants that describe process types.
  UNSPECIFIED = 0
  SOFTWARE = 1
  WEBSERVICE = 2
  MANUAL = 3

  # Dictionary of viable metadata keys.
  VERSION = 0

  # Read-only accessors.
  #
  # +name+:: name of the process
  # +uri+:: URI that has further details about the process, which can also be the form of an email address
  #         in cases where the process describes human driven annotation
  # +date+:: date-time when this process was carried out, or nil if the information is not available
  # +type+:: type of this process
  attr_reader :name, :uri, :date, :type

  # Creates a new process representation.
  #
  # +name+:: identification of the software/service/curator involved in the process, e.g. "ABNER" or "Peter Smith"
  # +uri+:: details about the processes origin, e.g. "http://pages.cs.wisc.edu/~bsettles/abner/" or "peter.smith@example.org"
  # +type+:: classification of the described process
  # +metadata+:: a hash that holds additional information about the process via dictionary defined keywords, e.g. { Process::VERSION => '0.0.1alpha' }
  # +date+:: date-time when the process was carried out, which may be nil if the date-time is unknown
  def initialize(name, uri, type = UNSPECIFIED, metadata = {}, date = nil)
    @name, @uri = name, uri
    @type = type
    @metadata = metadata
    @date = date
  end

  # Returns additional meta-data associated with this process as a frozen
  # copy of the hash that was passed to the constructor.
  def metadata
    @metadata.dup.freeze
  end

end
|
56
|
+
|
57
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module BioInterchange::TextMining
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
class PubannosJsonReader < BioInterchange::TextMining::TMReader
|
7
|
+
|
8
|
+
# Deserializes PubAnnotation JSON into a document model.
#
# An IO is read completely and its text is parsed; a String is parsed as is.
# Every other kind of input is passed on to the superclass implementation.
#
# +inputstream+:: IO or String that holds the JSON data
def deserialize(inputstream)
  case inputstream
  when IO
    pubannos(inputstream.read)
  when String
    pubannos(inputstream)
  else
    # Neither IO nor String: let the superclass deal with it.
    super(inputstream)
  end
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
# Specific method for parsing of *Pubannotations* json format.
#
# Builds a Document from the 'docurl' entry, adds one Content of type
# Content::DOCUMENT that spans the whole 'text', and one Content of type
# Content::PHRASE for every annotation in 'catanns' (offset 'begin', length
# 'end' - 'begin'). Other annotation fields such as 'created_at',
# 'updated_at' and 'category' are currently not used.
#
# Raises +BioInterchange::Exceptions::InputFormatError+ when the JSON carries
# an 'Error' entry; the reported error is included in the message.
#
# +data+:: String with the JSON document
def pubannos(data)
  result = JSON.parse(data)

  if result.has_key? 'Error'
    # Double quotes are required here, otherwise the error text is not interpolated.
    raise BioInterchange::Exceptions::InputFormatError, "Error parsing the JSON input file: #{result['Error']}"
  end

  text = result['text']

  doc = Document.new(result['docurl'])
  whole_text = Content.new(0, text.length, Content::DOCUMENT, @process)
  whole_text.setContext(doc)
  doc.add(whole_text)

  # Annotations are optional; each one becomes a phrase (named entity).
  (result['catanns'] || []).each do |annot|
    start_offset = annot['begin']
    length = annot['end'] - start_offset

    phrase = Content.new(start_offset, length, Content::PHRASE, @process)
    phrase.setContext(doc)
    doc.add(phrase)
  end

  doc
end
|
68
|
+
|
69
|
+
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
@@ -0,0 +1,197 @@
|
|
1
|
+
require 'rdf'
|
2
|
+
require 'rdf/ntriples'
|
3
|
+
require 'date'
|
4
|
+
|
5
|
+
module BioInterchange::TextMining
|
6
|
+
|
7
|
+
class RDFWriter < BioInterchange::Writer
|
8
|
+
|
9
|
+
# Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
#
# Raises +BioInterchange::Exceptions::ImplementationWriterError+ unless the
# stream is an IO (or an instance of one of its subclasses).
#
# +ostream+:: instance of an IO class or derivative that is used for RDF serialization
def initialize(ostream)
  unless ostream.kind_of?(IO)
    raise BioInterchange::Exceptions::ImplementationWriterError, 'The output stream is not an instance of IO or its subclasses.'
  end

  @ostream = ostream
end
|
16
|
+
|
17
|
+
# Serializes a model as RDF.
#
# Only instances of exactly +BioInterchange::TextMining::Document+ are
# supported; any other model raises
# +BioInterchange::Exceptions::ImplementationWriterError+.
#
# +model+:: a generic representation of input data that is derived from BioInterchange::TextMining::Document
def serialize(model)
  unless model.instance_of?(BioInterchange::TextMining::Document)
    raise BioInterchange::Exceptions::ImplementationWriterError,
          'The provided model cannot be serialized at the moment. ' \
          'Supported classes are BioInterchange::TextMining::Document (and that\'s it for now).'
  end

  serialize_document(model)
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
# Generates an URI for a given process and its contents.
#
# The URI has the form biointerchange://textmining/process/<segment>/<uri>,
# where <uri> is the process' URI without its scheme and <segment> depends on
# +kind+: 'self' for :process, 'name' for :name, 'uri' for :uri and 'date'
# for :date. Any other kind raises
# +BioInterchange::Exceptions::ImplementationWriterError+.
#
# +process+:: process instance
# +kind+:: kind of the URI that should be generated, for example, whether the URI should represent the name, date, etc.
def process_uri(process, kind)
  segments = { :process => 'self', :name => 'name', :uri => 'uri', :date => 'date' }
  segment = segments[kind]
  unless segment
    raise BioInterchange::Exceptions::ImplementationWriterError, "There is no implementation for serializing a process as #{kind}."
  end

  RDF::URI.new("biointerchange://textmining/process/#{segment}/#{process.uri.sub(/^.*?:\/\//, '')}")
end
|
50
|
+
|
51
|
+
# Generates an URI for a given content and its contents.
#
# The URI has the form biointerchange://textmining/content/<kind>/<uri>,
# where <uri> is the content's URI without its scheme. Only the kinds :start
# and :stop are implemented (:content and :type are not); any other kind
# raises +BioInterchange::Exceptions::ImplementationWriterError+.
#
# +content+:: content instance
# +kind+:: kind of the URI that should be generated, for example, whether the URI should represent the name, date, etc.
def content_uri(content, kind)
  segments = { :start => 'start', :stop => 'stop' }
  segment = segments[kind]
  unless segment
    raise BioInterchange::Exceptions::ImplementationWriterError, "There is no implementation for serializing a content as #{kind}."
  end

  RDF::URI.new("biointerchange://textmining/content/#{segment}/#{content.uri.sub(/^.*?:\/\//, '')}")
end
|
70
|
+
|
71
|
+
# Serializes RDF for a textual document representation using the Semanticscience Integrated Ontology
# (http://code.google.com/p/semanticscience/wiki/SIO).
#
# The document itself is typed as a SIO document, every content entry of the
# model is added through serialize_content, and the resulting graph is
# dumped as N-Triples to the writer's output stream.
#
# +model+:: an instance of +BioInterchange::TextMining::Document+
def serialize_document(model)
  triples = RDF::Graph.new
  document_node = RDF::URI.new(model.uri)
  triples.insert(RDF::Statement.new(document_node, RDF.type, BioInterchange::SIO.document))

  model.contents.each do |entry|
    serialize_content(triples, document_node, entry)
  end

  RDF::NTriples::Writer.dump(triples, @ostream)
end
|
84
|
+
|
85
|
+
|
86
|
+
# Serializes a Content object for a given document URI.
#
# The content is attached to the document, linked to its process (if it has
# one), typed with the SIO class that corresponds to its content type
# (language entity when the type is unspecified or unknown), and finally
# linked to its start and stop positions.
#
# +graph+:: RDF graph to which content is added
# +document_uri+:: the document URI to which the added content belongs to
# +content+:: an instance that describes the content
def serialize_content(graph, document_uri, content)
  content_node = RDF::URI.new(content.uri)
  graph.insert(RDF::Statement.new(document_uri, BioInterchange::SIO.has_attribute, content_node))

  if content.process
    serialize_process(graph, document_uri, content_node, content.process)
  end

  sio_class = case content.type
              when Content::DOCUMENT
                BioInterchange::SIO.document
              when Content::PAGE, Content::SECTION
                BioInterchange::SIO.document_section
              when Content::TITLE
                BioInterchange::SIO.title
              when Content::AUTHOR
                BioInterchange::SIO.author_section
              when Content::ABSTRACT
                BioInterchange::SIO.abstract_section
              when Content::PARAGRAPH
                BioInterchange::SIO.paragraph
              when Content::SENTENCE
                BioInterchange::SIO.sentence
              when Content::PHRASE
                BioInterchange::SIO.phrase
              when Content::WORD
                BioInterchange::SIO.word
              when Content::CHARACTER
                BioInterchange::SIO.character
              else
                # Content::UNSPECIFIED and anything unrecognized.
                BioInterchange::SIO.language_entity
              end

  graph.insert(RDF::Statement.new(content_node, RDF.type, sio_class))

  graph.insert(RDF::Statement.new(content_node, BioInterchange::SIO.has_attribute, serialize_content_start(graph, document_uri, content_node, content)))
  graph.insert(RDF::Statement.new(content_node, BioInterchange::SIO.has_attribute, serialize_content_stop(graph, document_uri, content_node, content)))
end
|
130
|
+
|
131
|
+
# Serializes a process object for a specific document URI.
#
# The content is linked to the process. A process of type MANUAL is
# described as a FOAF person; every other process is described as a SIO
# process with name, URI and (if present) date attributes.
#
# +graph+:: RDF graph to which the process is added
# +document_uri+:: URI of the document that the content belongs to
# +content_uri+:: URI of the content that is linked to the process
# +process+:: the process instance to serialize
def serialize_process(graph, document_uri, content_uri, process)
  process_node = process_uri(process, :process)
  graph.insert(RDF::Statement.new(content_uri, BioInterchange::SIO.is_direct_part_of, process_node))

  if process.type != BioInterchange::TextMining::Process::MANUAL
    graph.insert(RDF::Statement.new(process_node, RDF.type, BioInterchange::SIO.process))
    graph.insert(RDF::Statement.new(process_node, BioInterchange::SIO.has_attribute, serialize_process_name(graph, document_uri, content_uri, process_node, process)))
    graph.insert(RDF::Statement.new(process_node, BioInterchange::SIO.has_attribute, serialize_process_uri(graph, document_uri, content_uri, process_node, process)))
    if process.date
      graph.insert(RDF::Statement.new(process_node, BioInterchange::SIO.has_attribute, serialize_process_date(graph, document_uri, content_uri, process_node, process)))
    end
  else
    # Manual curation: represent the curator with FOAF.
    graph.insert(RDF::Statement.new(process_node, RDF.type, RDF::FOAF.Person))
    graph.insert(RDF::Statement.new(process_node, RDF::FOAF.name, RDF::Literal.new(process.name)))
    # NOTE(review): foaf:name is used a second time here with the process URI
    # as its object; presumably a different FOAF property was intended -- confirm.
    graph.insert(RDF::Statement.new(process_node, RDF::FOAF.name, RDF::URI.new(process.uri)))
  end
end
|
149
|
+
|
150
|
+
# Serializes the name of a process as a SIO name node and returns the URI of
# that node, so that the caller can link the process to it.
#
# +graph+:: RDF graph to which the name is added
# +document_uri+:: URI of the enclosing document (not used here)
# +content_uri+:: URI of the content the process belongs to (not used here)
# +process_uri+:: URI of the process (not used here)
# +process+:: the process whose name is serialized
def serialize_process_name(graph, document_uri, content_uri, process_uri, process)
  kind_uri = process_uri(process, :name)
  graph.insert(RDF::Statement.new(kind_uri, RDF.type, BioInterchange::SIO.name))
  graph.insert(RDF::Statement.new(kind_uri, BioInterchange::SIO.has_attribute, RDF::Literal.new(process.name.to_s)))
  # Return the node itself; returning the result of graph.insert would hand
  # the whole graph to the caller, which uses this value as a triple's object.
  kind_uri
end
|
158
|
+
|
159
|
+
# Serializes the URI of a process as a SIO software entity node and returns
# the URI of that node, so that the caller can link the process to it.
#
# +graph+:: RDF graph to which the URI is added
# +document_uri+:: URI of the enclosing document (not used here)
# +content_uri+:: URI of the content the process belongs to (not used here)
# +process_uri+:: URI of the process (not used here)
# +process+:: the process whose URI is serialized
def serialize_process_uri(graph, document_uri, content_uri, process_uri, process)
  kind_uri = process_uri(process, :uri)
  graph.insert(RDF::Statement.new(kind_uri, RDF.type, BioInterchange::SIO.software_entity))
  graph.insert(RDF::Statement.new(kind_uri, BioInterchange::SIO.has_attribute, RDF::URI.new(process.uri)))
  # Return the node itself; returning the result of graph.insert would hand
  # the whole graph to the caller, which uses this value as a triple's object.
  kind_uri
end
|
167
|
+
|
168
|
+
# Serializes the date of a process as a Dublin Core date on a dedicated node
# and returns the URI of that node, so that the caller can link the process
# to it.
#
# The process date may be a String, which is parsed with Date.parse, or any
# object that responds to +to_date+ (for example a Date or a Time).
#
# +graph+:: RDF graph to which the date is added
# +document_uri+:: URI of the enclosing document (not used here)
# +content_uri+:: URI of the content the process belongs to (not used here)
# +process_uri+:: URI of the process (not used here)
# +process+:: the process whose date is serialized
def serialize_process_date(graph, document_uri, content_uri, process_uri, process)
  kind_uri = process_uri(process, :date)
  raw_date = process.date
  date = raw_date.respond_to?(:to_date) ? raw_date.to_date : Date.parse(raw_date)
  graph.insert(RDF::Statement.new(kind_uri, RDF::DC.date, RDF::Literal.new(date)))
  # Return the node itself; returning the result of graph.insert would hand
  # the whole graph to the caller, which uses this value as a triple's object.
  kind_uri
end
|
175
|
+
|
176
|
+
# Serializes the start position of a content as a SIO start position node
# whose attribute is the content's offset, and returns the URI of that node,
# so that the caller can link the content to it.
#
# +graph+:: RDF graph to which the start position is added
# +document_uri+:: URI of the enclosing document (not used here)
# +content_uri+:: URI of the content (not used here)
# +content+:: the content whose start position is serialized
def serialize_content_start(graph, document_uri, content_uri, content)
  kind_uri = content_uri(content, :start)
  graph.insert(RDF::Statement.new(kind_uri, RDF.type, BioInterchange::SIO.start_position))
  graph.insert(RDF::Statement.new(kind_uri, BioInterchange::SIO.has_attribute, RDF::Literal.new(content.offset)))
  # Return the node itself; returning the result of graph.insert would hand
  # the whole graph to the caller, which uses this value as a triple's object.
  kind_uri
end
|
184
|
+
|
185
|
+
# Serializes the stop position of a content as a SIO stop position node
# whose attribute is offset + length, and returns the URI of that node, so
# that the caller can link the content to it.
#
# The stop position is emitted as a numeric literal, in the same way as the
# start position is (it was previously converted to a string first).
#
# +graph+:: RDF graph to which the stop position is added
# +document_uri+:: URI of the enclosing document (not used here)
# +content_uri+:: URI of the content (not used here)
# +content+:: the content whose stop position is serialized
def serialize_content_stop(graph, document_uri, content_uri, content)
  kind_uri = content_uri(content, :stop)
  graph.insert(RDF::Statement.new(kind_uri, RDF.type, BioInterchange::SIO.stop_position))
  graph.insert(RDF::Statement.new(kind_uri, BioInterchange::SIO.has_attribute, RDF::Literal.new(content.offset + content.length)))
  # Return the node itself; returning the result of graph.insert would hand
  # the whole graph to the caller, which uses this value as a triple's object.
  kind_uri
end
|
193
|
+
|
194
|
+
end
|
195
|
+
|
196
|
+
end
|
197
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module BioInterchange::TextMining
|
2
|
+
|
3
|
+
class TMReader < BioInterchange::Reader
|
4
|
+
|
5
|
+
# Create a new instance of a text-mining data reader. Sets @process to a new +BioInterchange::TextMining::Process+ object.
#
# The version is always stored in the process metadata under
# +BioInterchange::TextMining::Process::VERSION+, even when it is nil.
#
# +name+:: Name of the process which generated this data
# +name_uri+:: URI of the resource that generated this data
# +date+:: Optional date of data creation
# +processtype+:: Type of process that created this content
# +version+:: Optional version number of resource that created this data (nil if manually curated, for example).
def initialize(name, name_uri, date = nil, processtype = BioInterchange::TextMining::Process::UNSPECIFIED, version = nil)
  process_metadata = { BioInterchange::TextMining::Process::VERSION => version }
  @process = BioInterchange::TextMining::Process.new(name, name_uri, processtype, process_metadata, date)
end
|
19
|
+
|
20
|
+
|
21
|
+
# Base implementation of deserialization: only validates the input.
#
# Raises +BioInterchange::Exceptions::ImplementationReaderError+ unless the
# input is an IO; returns nil otherwise. Subclasses override this method to
# build and return the actual model.
#
# +inputstream+:: Input IO stream to deserialize
def deserialize(inputstream)
  return if inputstream.kind_of?(IO)

  raise BioInterchange::Exceptions::ImplementationReaderError, 'InputStream not of type IO, cannot read.'
end
|
27
|
+
|
28
|
+
# Automatically tries to determine a suitable process from the given name ID, which is assumed
# to be either an email address or web-site.
#
# Returns Process::SOFTWARE when the ID contains a "scheme://" part,
# Process::MANUAL when it otherwise looks like an email address (text on
# both sides of an '@'), and Process::UNSPECIFIED in all other cases.
#
# +name_id+:: name ID that we want to represent by a suitable process
def self.determine_process(name_id)
  if name_id.match(/[a-zA-Z]+:\/\//)
    # A URL wins over an email-like match (e.g. "http://user@host").
    BioInterchange::TextMining::Process::SOFTWARE
  elsif name_id.match(/[^@]+@[^@]+/)
    BioInterchange::TextMining::Process::MANUAL
  else
    BioInterchange::TextMining::Process::UNSPECIFIED
  end
end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module BioInterchange
|
2
|
+
|
3
|
+
class Writer

  # Creates a new instance of a writer that will use the provided output stream to serialize object model instances.
  #
  # Raises +BioInterchange::Exceptions::ImplementationWriterError+ unless the
  # stream is an IO (or an instance of one of its subclasses).
  #
  # +ostream+:: instance of IO or derivative class
  def initialize(ostream)
    unless ostream.kind_of?(IO)
      raise BioInterchange::Exceptions::ImplementationWriterError, 'The output stream is not an instance of IO or its subclasses.'
    end

    @ostream = ostream
  end

  # Serializes an object model instance. This base implementation always
  # raises +BioInterchange::Exceptions::ImplementationWriterError+;
  # subclasses have to provide the actual serialization.
  #
  # +model+:: an object model instance
  def serialize(model)
    message = 'You must implement this method, which takes an object model and serializes it into the previously provided output stream.'
    raise BioInterchange::Exceptions::ImplementationWriterError, message
  end

end
|
21
|
+
|
22
|
+
end
|
23
|
+
|
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
require 'rspec'
|
3
|
+
|
4
|
+
load 'lib/biointerchange/core.rb'
|
5
|
+
load 'lib/biointerchange/exceptions.rb'
|
6
|
+
|
7
|
+
# Verifies the exception hierarchy: raising each error class on the left must
# be rescuable as the base class on the right.
describe BioInterchange::Exceptions do
  describe 'error and exception creation tests' do
    hierarchy = [
      ['InputFormatError', 'BioInterchangeError'],
      ['ImplementationReaderError', 'ImplementationError'],
      ['ImplementationModelError', 'ImplementationError'],
      ['ImplementationWriterError', 'ImplementationError']
    ]

    hierarchy.each do |raised_name, expected_name|
      it "raising of #{raised_name}" do
        raised = BioInterchange::Exceptions.const_get(raised_name)
        expected = BioInterchange::Exceptions.const_get(expected_name)
        expect { raise raised }.to raise_error(expected)
      end
    end
  end
end
|
27
|
+
|