biointerchange 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +12 -0
- data/Gemfile +17 -0
- data/LICENSE.txt +8 -0
- data/README.md +166 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/bin/biointerchange +6 -0
- data/docs/exceptions_readme.txt +13 -0
- data/examples/BovineGenomeChrX.gff3.gz +0 -0
- data/examples/gb-2007-8-3-R40.xml +243 -0
- data/examples/pubannotation.json +1 -0
- data/generators/rdfxml.rb +104 -0
- data/lib/biointerchange/core.rb +195 -0
- data/lib/biointerchange/exceptions.rb +38 -0
- data/lib/biointerchange/genomics/gff3_feature.rb +82 -0
- data/lib/biointerchange/genomics/gff3_feature_set.rb +37 -0
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +107 -0
- data/lib/biointerchange/genomics/gff3_reader.rb +86 -0
- data/lib/biointerchange/gff3.rb +135 -0
- data/lib/biointerchange/reader.rb +25 -0
- data/lib/biointerchange/registry.rb +29 -0
- data/lib/biointerchange/sio.rb +7124 -0
- data/lib/biointerchange/sofa.rb +1566 -0
- data/lib/biointerchange/textmining/content.rb +69 -0
- data/lib/biointerchange/textmining/document.rb +36 -0
- data/lib/biointerchange/textmining/pdfx_xml_reader.rb +161 -0
- data/lib/biointerchange/textmining/process.rb +57 -0
- data/lib/biointerchange/textmining/pubannos_json_reader.rb +72 -0
- data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +197 -0
- data/lib/biointerchange/textmining/text_mining_reader.rb +41 -0
- data/lib/biointerchange/writer.rb +23 -0
- data/lib/biointerchange.rb +3 -0
- data/spec/exceptions_spec.rb +27 -0
- data/spec/gff3_rdfwriter_spec.rb +67 -0
- data/spec/text_mining_pdfx_xml_reader_spec.rb +89 -0
- data/spec/text_mining_pubannos_json_reader_spec.rb +71 -0
- data/spec/text_mining_rdfwriter_spec.rb +57 -0
- data/web/about.html +89 -0
- data/web/biointerchange.js +133 -0
- data/web/bootstrap/css/bootstrap-responsive.css +1040 -0
- data/web/bootstrap/css/bootstrap-responsive.min.css +9 -0
- data/web/bootstrap/css/bootstrap.css +5624 -0
- data/web/bootstrap/css/bootstrap.min.css +9 -0
- data/web/bootstrap/img/glyphicons-halflings-white.png +0 -0
- data/web/bootstrap/img/glyphicons-halflings.png +0 -0
- data/web/bootstrap/js/bootstrap.js +2027 -0
- data/web/bootstrap/js/bootstrap.min.js +6 -0
- data/web/bootstrap/js/jquery-1.8.1.min.js +2 -0
- data/web/css/rdoc-style.css +5786 -0
- data/web/css/rdoc.css +716 -0
- data/web/images/BioInterchange300.png +0 -0
- data/web/index.html +109 -0
- data/web/service/rdfizer.fcgi +68 -0
- data/web/webservices.html +123 -0
- metadata +240 -0
@@ -0,0 +1,69 @@
|
|
1
|
+
module BioInterchange::TextMining
|
2
|
+
|
3
|
+
class Content

  # Constants that describe content types.
  UNSPECIFIED = 0
  DOCUMENT = 1
  PAGE = 2
  TITLE = 3
  AUTHOR = 4
  ABSTRACT = 5
  SECTION = 6
  PARAGRAPH = 7
  SENTENCE = 8
  PHRASE = 9
  WORD = 10
  CHARACTER = 11

  # Read-only attributes:
  #
  # +offset+:: offset of the content as absolute position within the document
  # +length+:: length of the content, which is measured in characters
  # +type+:: type of the content, if known, or +BioInterchange::TextMining::Content::UNSPECIFIED+ otherwise
  # +process+:: process associated with this content, if previously provided, or nil otherwise
  attr_reader :offset, :length, :type, :process

  # Creates a new document content representation at a specific location of the document.
  #
  # +offset+:: zero-based offset of the represented content within the document (absolute location within the document)
  # +length+:: length of the represented content, where a length of zero denotes a boundary between two characters
  # +type+:: classification of the content (one of the constants of this class)
  # +process+:: optional process that is associated with this content
  def initialize(offset, length, type = UNSPECIFIED, process = nil)
    @offset, @length = offset, length
    @type = type
    @process = process
  end

  # Sets the context of this content.
  #
  # +context+:: a +BioInterchange::TextMining::Document+ or +BioInterchange::TextMining::Content+ instance in which this content is enclosed in
  def setContext(context)
    @context = context
  end

  # Returns a URI that identifies this content. The URI is composed of the
  # context's URI (without its scheme), the offset, length and type of the
  # content, and either the scheme-less process URI in parentheses or "-"
  # when no process is associated with the content.
  #
  # Raises +BioInterchange::Exceptions::ImplementationModelError+ if no context has been set.
  def uri
    unless @context
      raise BioInterchange::Exceptions::ImplementationModelError, 'An URI can only be returned for content with a context (i.e., use setContext(context) first).'
    end

    process_part = @process ? "(#{without_scheme(@process.uri)})" : '-'
    "biointerchange://textmining/content/#{without_scheme(@context.uri)}/#{@offset},#{@length},#{@type},#{process_part}"
  end

  private

  # Removes a leading "scheme://" prefix from the given URI string.
  def without_scheme(uri)
    uri.sub(/^.*?:\/\//, '')
  end

end
|
67
|
+
|
68
|
+
end
|
69
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module BioInterchange::TextMining
|
2
|
+
|
3
|
+
class Document

  # URI of the document, as given on construction.
  attr_reader :uri

  # Creates a blank document representation (a URI and an empty content list).
  #
  # +uri+:: source of the document
  def initialize(uri)
    @uri = uri
    # Content objects are collected here in the order in which they are added.
    @content = []
  end

  # Adds content to the document.
  #
  # Raises +BioInterchange::Exceptions::ImplementationModelError+ if the
  # argument is not a +BioInterchange::TextMining::Content+.
  #
  # +content+:: content of type +BioInterchange::TextMining::Content+ that should be added to the document
  def add(content)
    if content.kind_of?(BioInterchange::TextMining::Content)
      @content << content
    else
      raise BioInterchange::Exceptions::ImplementationModelError, 'Content has to be of kind BioInterchange::TextMining::Content'
    end
  end

  # Returns the document contents as a frozen copy of the internal array, so
  # callers cannot alter the document through the returned value.
  def contents
    snapshot = @content.clone
    snapshot.freeze
  end

end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
@@ -0,0 +1,161 @@
|
|
1
|
+
module BioInterchange::TextMining
|
2
|
+
|
3
|
+
require 'rexml/document'
|
4
|
+
require 'rexml/streamlistener'
|
5
|
+
|
6
|
+
class PdfxXmlReader < BioInterchange::TextMining::TMReader

  # Reads the input and returns the associated +BioInterchange::TextMining::Document+ model.
  #
  # A single document per XML file is assumed: a repeated <job>, <article-title>,
  # <abstract>, <body> or <article> tag raises an
  # +BioInterchange::Exceptions::InputFormatError+. <section> tags may nest.
  # Everything between the <article> tags is reported as content of type
  # +Content::DOCUMENT+.
  #
  # Raises +BioInterchange::Exceptions::ImplementationReaderError+ if the input
  # is neither an IO nor a String.
  #
  # +inputstream+:: an IO instance or a String with the XML to deserialize
  def deserialize(inputstream)
    raise BioInterchange::Exceptions::ImplementationReaderError, 'InputStream not of type IO or String, cannot read.' unless inputstream.kind_of?(IO) || inputstream.kind_of?(String)

    @input = inputstream

    pdfx
  end

  private

  # Stream-parses @input and returns the document built by the listener.
  def pdfx
    # Hand over the reader's process, so that it gets attached to the generated content.
    listener = MyListener.new(@process)
    REXML::Document.parse_stream(@input, listener)
    listener.document
  end

  # REXML stream listener that turns PDFx XML events into a
  # +BioInterchange::TextMining::Document+ with positioned content.
  class MyListener

    include REXML::StreamListener

    # Maps tags that may appear only once per file to the key prefix under
    # which their state ("<key>", "<key>_s", "<key>_l", "<key>_done") is kept in @map.
    REGION_KEYS = {
      'article-title' => 'title',
      'abstract' => 'abs',
      'body' => 'body',
      'article' => 'art'
    }

    # Creates a new listener.
    #
    # +process+:: process that is associated with every content object created by this listener (may be nil)
    def initialize(process = nil)
      @process = process
      @map = {}

      # Sections can nest, so their start offsets ('sec_s') and lengths ('sec_l') are kept on parallel stacks.
      @map['sec_s'] = []
      @map['sec_l'] = []
    end

    # Called by REXML for every opening tag. Marks the matching region as open
    # and records its start offset, which is the number of article characters seen so far.
    #
    # +name+:: name of the tag
    # +attr+:: attributes of the tag (unused)
    def tag_start(name, attr)
      case name
      when 'job'
        raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <job> tags, cannot parse multiple documents within a single file.' if @map['id_done']
        @map['id'] = true
      when 'article-title', 'abstract', 'body', 'article'
        key = REGION_KEYS[name]
        raise BioInterchange::Exceptions::InputFormatError, "Input document XML has multiple <#{name}> tags defined, cannot parse multiple documents within a single file." if @map["#{key}_done"]
        @map[key] = true
        # The article itself starts at offset zero; all other regions start at the current article length.
        @map["#{key}_s"] = name == 'article' ? 0 : @map['art_l']
        @map["#{key}_l"] = 0
      when 'section'
        raise BioInterchange::Exceptions::InputFormatError, 'Error with section stack, stacks not equal in size: Possibly not a well formed XML input file. Check <section> tags all match up and do not overlap (nesting is fine).' unless @map['sec_s'].size == @map['sec_l'].size
        @map['sec_s'].push @map['art_l']
        @map['sec_l'].push 0
      end
    end

    # Called by REXML for every text node. Text within <job> determines the
    # document URI; any other text only advances the length counters of the
    # regions that are currently open.
    #
    # +data+:: text of the node
    def text(data)
      @map['art_l'] += data.length if @map['art']

      if @map['id']
        @doc = BioInterchange::TextMining::Document.new("http://pdfx.cs.man.ac.uk/" + data)
      elsif @map['title']
        @map['title_l'] += data.length
      elsif @map['abs']
        @map['abs_l'] += data.length
      end

      @map['body_l'] += data.length if @map['body']

      # Add the length to *all* currently open (nested) sections.
      @map['sec_l'].map! { |section_length| section_length + data.length }
    end

    #TODO add deal with <author> type tags

    # Called by REXML for every closing tag. Closes the matching region and
    # adds a content object for it to the document.
    #
    # +name+:: name of the tag
    def tag_end(name)
      case name
      when 'job'
        @map['id'] = false
        @map['id_done'] = true
      when 'article-title', 'abstract', 'body', 'article'
        key = REGION_KEYS[name]
        @map[key] = false
        add_content(@map["#{key}_s"], @map["#{key}_l"], content_type(name))
        @map["#{key}_done"] = true
      when 'section'
        raise 'Error with section stack, stacks not equal in size' unless @map['sec_s'].size == @map['sec_l'].size
        add_content(@map['sec_s'].pop, @map['sec_l'].pop, BioInterchange::TextMining::Content::SECTION)
      end
    end

    # Returns the document that was built while parsing, or nil if no <job> text has been seen.
    def document
      @doc
    end

    private

    # Returns the content type constant that is used for a single-occurrence tag.
    def content_type(name)
      case name
      when 'article-title' then BioInterchange::TextMining::Content::TITLE
      when 'abstract' then BioInterchange::TextMining::Content::ABSTRACT
      when 'body' then BioInterchange::TextMining::Content::SECTION
      when 'article' then BioInterchange::TextMining::Content::DOCUMENT
      end
    end

    # Creates a content object, sets the document as its context and adds it to the document.
    def add_content(offset, length, type)
      # The document is only created from the text of the <job> tag; without it there is nothing to add to.
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has content tags, but no preceding <job> tag that identifies the document.' unless @doc

      content = BioInterchange::TextMining::Content.new(offset, length, type, @process)
      content.setContext(@doc)
      @doc.add(content)
    end

  end

end
|
160
|
+
|
161
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module BioInterchange::TextMining
|
2
|
+
|
3
|
+
class Process

  # Constants that describe process types.
  UNSPECIFIED = 0
  SOFTWARE = 1
  WEBSERVICE = 2
  MANUAL = 3

  # Dictionary of viable metadata keys.
  VERSION = 0

  # Read-only attributes:
  #
  # +name+:: name of the process
  # +uri+:: URI that has further details about the process, which can also be the form of an email address
  #         in cases where the process describes human driven annotation
  # +date+:: date-time when this process was carried out, or nil if the information is not available
  # +type+:: type of this process
  attr_reader :name, :uri, :date, :type

  # Creates a new process representation.
  #
  # +name+:: identification of the software/service/curator involved in the process, e.g. "ABNER" or "Peter Smith"
  # +uri+:: details about the processes origin, e.g. "http://pages.cs.wisc.edu/~bsettles/abner/" or "peter.smith@example.org"
  # +type+:: classification of the described process
  # +metadata+:: a hash that holds additional information about the process via dictionary defined keywords, e.g. { Process::VERSION => '0.0.1alpha' }
  # +date+:: date-time when the process was carried out, which may be nil if the date-time is unknown
  def initialize(name, uri, type = UNSPECIFIED, metadata = {}, date = nil)
    @name, @uri = name, uri
    @type = type
    @metadata = metadata
    @date = date
  end

  # Returns additional meta-data associated with this process as a frozen copy of the stored hash.
  def metadata
    snapshot = @metadata.clone
    snapshot.freeze
  end

end
|
56
|
+
|
57
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module BioInterchange::TextMining
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
class PubannosJsonReader < BioInterchange::TextMining::TMReader

  # Reads PubAnnotation JSON and returns the associated document model.
  #
  # An IO is read completely, a String is parsed as is; any other input is
  # handed to the superclass implementation.
  #
  # +inputstream+:: an IO instance or a String with the JSON to deserialize
  def deserialize(inputstream)
    if inputstream.kind_of?(IO) then
      pubannos(inputstream.read)
    elsif inputstream.kind_of?(String) then
      pubannos(inputstream)
    else
      # Neither IO nor String: let the superclass deal with the input.
      super(inputstream)
    end
  end

  private

  # Specific method for parsing of *Pubannotations* json format.
  #
  # Returns a Document whose URI is the 'docurl' entry. It holds one content of
  # type Content::DOCUMENT that spans the whole 'text' entry, followed by one
  # content of type Content::PHRASE per entry of 'catanns' (if present).
  #
  # Raises +BioInterchange::Exceptions::InputFormatError+ if the JSON carries an
  # 'Error' entry or has no 'text' entry.
  #
  # +data+:: string with the JSON to parse
  def pubannos(data)
    result = JSON.parse(data)

    if result.has_key? 'Error'
      # Double quotes are required here, so that the reported error is interpolated into the message.
      raise BioInterchange::Exceptions::InputFormatError, "Error parsing the JSON input file: #{result['Error']}"
    end

    text = result['text']
    raise BioInterchange::Exceptions::InputFormatError, 'The JSON input file has no "text" entry, cannot determine the extent of the document.' unless text

    doc = Document.new(result['docurl'])
    document_content = Content.new(0, text.length, Content::DOCUMENT, @process)
    document_content.setContext(doc)
    doc.add(document_content)

    # TODO: the annotation entries 'created_at', 'updated_at' and 'category' are not represented in the model yet.
    (result['catanns'] || []).each do |annot|
      start_offset = annot['begin']
      # The length is the difference of the 'end' and 'begin' entries of the annotation.
      length = annot['end'] - start_offset

      # Phrase is the content type that is used for annotated entities.
      phrase = Content.new(start_offset, length, Content::PHRASE, @process)
      phrase.setContext(doc)
      doc.add(phrase)
    end

    doc
  end

end
|
71
|
+
|
72
|
+
end
|
@@ -0,0 +1,197 @@
|
|
1
|
+
require 'rdf'
|
2
|
+
require 'rdf/ntriples'
|
3
|
+
require 'date'
|
4
|
+
|
5
|
+
module BioInterchange::TextMining
|
6
|
+
|
7
|
+
class RDFWriter < BioInterchange::Writer

  # Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
  #
  # Raises +BioInterchange::Exceptions::ImplementationWriterError+ if the stream is not an IO.
  #
  # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
  def initialize(ostream)
    raise BioInterchange::Exceptions::ImplementationWriterError, 'The output stream is not an instance of IO or its subclasses.' unless ostream.kind_of?(IO)
    @ostream = ostream
  end

  # Serializes a model as RDF.
  #
  # Raises +BioInterchange::Exceptions::ImplementationWriterError+ for unsupported models.
  #
  # +model+:: a generic representation of input data that is derived from BioInterchange::TextMining::Document
  def serialize(model)
    # kind_of? (rather than instance_of?) so that classes derived from Document are accepted too.
    if model.kind_of?(BioInterchange::TextMining::Document) then
      serialize_document(model)
    else
      raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized at the moment. ' +
                                                                   'Supported classes are BioInterchange::TextMining::Document (and that\'s it for now).'
    end
  end

  private

  # Generates an URI for a given process and its contents.
  #
  # Raises +BioInterchange::Exceptions::ImplementationWriterError+ for an unknown kind.
  #
  # +process+:: process instance
  # +kind+:: kind of the URI that should be generated (:process, :name, :uri or :date)
  def process_uri(process, kind)
    segment = case kind
              when :process then 'self'
              when :name, :uri, :date then kind.to_s
              else
                raise BioInterchange::Exceptions::ImplementationWriterError, "There is no implementation for serializing a process as #{kind}."
              end
    RDF::URI.new("biointerchange://textmining/process/#{segment}/#{process.uri.sub(/^.*?:\/\//, '')}")
  end

  # Generates an URI for a given content and its contents.
  #
  # Raises +BioInterchange::Exceptions::ImplementationWriterError+ for an unknown kind.
  #
  # +content+:: content instance
  # +kind+:: kind of the URI that should be generated (:start or :stop)
  def content_uri(content, kind)
    segment = case kind
              when :start, :stop then kind.to_s
              else
                raise BioInterchange::Exceptions::ImplementationWriterError, "There is no implementation for serializing a content as #{kind}."
              end
    RDF::URI.new("biointerchange://textmining/content/#{segment}/#{content.uri.sub(/^.*?:\/\//, '')}")
  end

  # Serializes RDF for a textual document representation using the Semanticscience Integrated Ontology
  # (http://code.google.com/p/semanticscience/wiki/SIO) and writes it as N-Triples to the output stream.
  #
  # +model+:: an instance of +BioInterchange::TextMining::Document+
  def serialize_document(model)
    graph = RDF::Graph.new
    document_uri = RDF::URI.new(model.uri)
    graph.insert(RDF::Statement.new(document_uri, RDF.type, BioInterchange::SIO.document))
    model.contents.each do |content|
      serialize_content(graph, document_uri, content)
    end
    RDF::NTriples::Writer.dump(graph, @ostream)
  end

  # Serializes a Content object for a given document URI.
  #
  # +graph+:: RDF graph to which content is added
  # +document_uri+:: the document URI to which the added content belongs to
  # +content+:: an instance that describes the content
  def serialize_content(graph, document_uri, content)
    content_resource = RDF::URI.new(content.uri)
    graph.insert(RDF::Statement.new(document_uri, BioInterchange::SIO.has_attribute, content_resource))
    serialize_process(graph, document_uri, content_resource, content.process) if content.process

    # Content types without a mapping of their own are typed as language entities.
    sio_url = case content.type
              when Content::DOCUMENT then BioInterchange::SIO.document
              when Content::PAGE then BioInterchange::SIO.document_section
              when Content::TITLE then BioInterchange::SIO.title
              when Content::AUTHOR then BioInterchange::SIO.author_section
              when Content::ABSTRACT then BioInterchange::SIO.abstract_section
              when Content::SECTION then BioInterchange::SIO.document_section
              when Content::PARAGRAPH then BioInterchange::SIO.paragraph
              when Content::SENTENCE then BioInterchange::SIO.sentence
              when Content::PHRASE then BioInterchange::SIO.phrase
              when Content::WORD then BioInterchange::SIO.word
              when Content::CHARACTER then BioInterchange::SIO.character
              else BioInterchange::SIO.language_entity
              end

    graph.insert(RDF::Statement.new(content_resource, RDF.type, sio_url))

    graph.insert(RDF::Statement.new(content_resource, BioInterchange::SIO.has_attribute, serialize_content_start(graph, document_uri, content_resource, content)))
    graph.insert(RDF::Statement.new(content_resource, BioInterchange::SIO.has_attribute, serialize_content_stop(graph, document_uri, content_resource, content)))
  end

  # Serializes a process object for a specific document URI and links the content to it.
  #
  # +graph+:: RDF graph to which the process is added
  # +document_uri+:: the document URI to which the content belongs to
  # +content_uri+:: URI of the content that the process is associated with
  # +process+:: an instance that describes the process
  def serialize_process(graph, document_uri, content_uri, process)
    process_resource = process_uri(process, :process)
    graph.insert(RDF::Statement.new(content_uri, BioInterchange::SIO.is_direct_part_of, process_resource))
    # Manual processes are represented as FOAF persons, everything else as a SIO process:
    if process.type == BioInterchange::TextMining::Process::MANUAL then
      graph.insert(RDF::Statement.new(process_resource, RDF.type, RDF::FOAF.Person))
      graph.insert(RDF::Statement.new(process_resource, RDF::FOAF.name, RDF::Literal.new(process.name)))
      # NOTE(review): the process URI is attached via foaf:name as well; presumably another FOAF property was intended -- confirm.
      graph.insert(RDF::Statement.new(process_resource, RDF::FOAF.name, RDF::URI.new(process.uri)))
    else
      graph.insert(RDF::Statement.new(process_resource, RDF.type, BioInterchange::SIO.process))
      graph.insert(RDF::Statement.new(process_resource, BioInterchange::SIO.has_attribute, serialize_process_name(graph, document_uri, content_uri, process_resource, process)))
      graph.insert(RDF::Statement.new(process_resource, BioInterchange::SIO.has_attribute, serialize_process_uri(graph, document_uri, content_uri, process_resource, process)))
      graph.insert(RDF::Statement.new(process_resource, BioInterchange::SIO.has_attribute, serialize_process_date(graph, document_uri, content_uri, process_resource, process))) if process.date
    end
  end

  # Adds the statements that describe the name of a process and returns the
  # URI of the name resource, so that the caller can link to it.
  def serialize_process_name(graph, document_uri, content_uri, process_uri, process)
    kind_uri = process_uri(process, :name)
    graph.insert(RDF::Statement.new(kind_uri, RDF.type, BioInterchange::SIO.name))
    graph.insert(RDF::Statement.new(kind_uri, BioInterchange::SIO.has_attribute, RDF::Literal.new("#{process.name}")))
    # Return the resource itself; the value of graph.insert must not be used as the statement's object.
    kind_uri
  end

  # Adds the statements that describe the URI of a process and returns the
  # URI of the describing resource, so that the caller can link to it.
  def serialize_process_uri(graph, document_uri, content_uri, process_uri, process)
    kind_uri = process_uri(process, :uri)
    graph.insert(RDF::Statement.new(kind_uri, RDF.type, BioInterchange::SIO.software_entity))
    graph.insert(RDF::Statement.new(kind_uri, BioInterchange::SIO.has_attribute, RDF::URI.new(process.uri)))
    kind_uri
  end

  # Adds the statement that describes the date of a process and returns the
  # URI of the date resource, so that the caller can link to it.
  def serialize_process_date(graph, document_uri, content_uri, process_uri, process)
    kind_uri = process_uri(process, :date)
    date = process.date
    # Only strings need parsing; other date representations are handed over as they are.
    date = Date.parse(date) if date.kind_of?(String)
    graph.insert(RDF::Statement.new(kind_uri, RDF::DC.date, RDF::Literal.new(date)))
    kind_uri
  end

  # Adds the statements that describe the start position of a content and
  # returns the URI of the position resource, so that the caller can link to it.
  def serialize_content_start(graph, document_uri, content_uri, content)
    kind_uri = content_uri(content, :start)
    graph.insert(RDF::Statement.new(kind_uri, RDF.type, BioInterchange::SIO.start_position))
    graph.insert(RDF::Statement.new(kind_uri, BioInterchange::SIO.has_attribute, RDF::Literal.new(content.offset)))
    kind_uri
  end

  # Adds the statements that describe the stop position (offset plus length)
  # of a content and returns the URI of the position resource, so that the
  # caller can link to it.
  def serialize_content_stop(graph, document_uri, content_uri, content)
    kind_uri = content_uri(content, :stop)
    graph.insert(RDF::Statement.new(kind_uri, RDF.type, BioInterchange::SIO.stop_position))
    # Numeric literal, consistent with the literal that is used for the start position.
    graph.insert(RDF::Statement.new(kind_uri, BioInterchange::SIO.has_attribute, RDF::Literal.new(content.offset + content.length)))
    kind_uri
  end

end
|
195
|
+
|
196
|
+
end
|
197
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module BioInterchange::TextMining
|
2
|
+
|
3
|
+
class TMReader < BioInterchange::Reader

  # Create a new instance of a text-mining data reader. Sets @process to a new +BioInterchange::TextMining::Process+ object.
  #
  # +name+:: Name of the process which generated this data
  # +name_uri+:: URI of the resource that generated this data
  # +date+:: Optional date of data creation
  # +processtype+:: Type of process that created this content
  # +version+:: Optional version number of resource that created this data (nil if manually curated, for example).
  def initialize(name, name_uri, date = nil, processtype = BioInterchange::TextMining::Process::UNSPECIFIED, version = nil)
    # The version is always recorded in the metadata, even when it is nil.
    metadata = { BioInterchange::TextMining::Process::VERSION => version }
    @process = BioInterchange::TextMining::Process.new(name, name_uri, processtype, metadata, date)
  end

  # Base implementation of deserialization: only verifies that the input is an
  # IO instance and raises +BioInterchange::Exceptions::ImplementationReaderError+
  # otherwise. Returns nil; building a model is left to subclasses.
  #
  # +inputstream+:: Input IO stream to deserialize
  def deserialize(inputstream)
    return if inputstream.kind_of?(IO)

    raise BioInterchange::Exceptions::ImplementationReaderError, 'InputStream not of type IO, cannot read.'
  end

  # Automatically tries to determine a suitable process from the given name ID, which is assumed
  # to be either an email address or web-site. An ID with a "scheme://" part takes
  # precedence over an ID that contains an "@" sign.
  #
  # +name_id+:: name ID that we want to represent by a suitable process
  def self.determine_process(name_id)
    if name_id.match(/[a-zA-Z]+:\/\//)
      BioInterchange::TextMining::Process::SOFTWARE
    elsif name_id.match(/[^@]+@[^@]+/)
      BioInterchange::TextMining::Process::MANUAL
    else
      BioInterchange::TextMining::Process::UNSPECIFIED
    end
  end

end
|
40
|
+
|
41
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module BioInterchange
|
2
|
+
|
3
|
+
class Writer

  # Creates a new instance of a writer that will use the provided output stream to serialize object model instances.
  #
  # Raises +BioInterchange::Exceptions::ImplementationWriterError+ if the stream is not an IO.
  #
  # +ostream+:: instance of IO or derivative class
  def initialize(ostream)
    unless ostream.kind_of?(IO)
      raise BioInterchange::Exceptions::ImplementationWriterError, 'The output stream is not an instance of IO or its subclasses.'
    end

    @ostream = ostream
  end

  # Serializes an object model instance. This base implementation always
  # raises +BioInterchange::Exceptions::ImplementationWriterError+; subclasses
  # have to provide the actual serialization.
  #
  # +model+:: an object model instance
  def serialize(model)
    message = 'You must implement this method, which takes an object model and serializes it into the previously provided output stream.'
    raise BioInterchange::Exceptions::ImplementationWriterError, message
  end

end
|
21
|
+
|
22
|
+
end
|
23
|
+
|
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
require 'rspec'
|
3
|
+
|
4
|
+
load 'lib/biointerchange/core.rb'
|
5
|
+
load 'lib/biointerchange/exceptions.rb'
|
6
|
+
|
7
|
+
describe BioInterchange::Exceptions do
  describe 'error and exception creation tests' do
    # Each pair lists an error class and the ancestor class that raising it must satisfy.
    [
      [BioInterchange::Exceptions::InputFormatError, BioInterchange::Exceptions::BioInterchangeError],
      [BioInterchange::Exceptions::ImplementationReaderError, BioInterchange::Exceptions::ImplementationError],
      [BioInterchange::Exceptions::ImplementationModelError, BioInterchange::Exceptions::ImplementationError],
      [BioInterchange::Exceptions::ImplementationWriterError, BioInterchange::Exceptions::ImplementationError]
    ].each do |raised, expected|
      it "raising of #{raised.name.split('::').last}" do
        expect { raise raised }.to raise_error(expected)
      end
    end
  end
end
|
27
|
+
|