biointerchange 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/.travis.yml +12 -0
  4. data/Gemfile +17 -0
  5. data/LICENSE.txt +8 -0
  6. data/README.md +166 -0
  7. data/Rakefile +50 -0
  8. data/VERSION +1 -0
  9. data/bin/biointerchange +6 -0
  10. data/docs/exceptions_readme.txt +13 -0
  11. data/examples/BovineGenomeChrX.gff3.gz +0 -0
  12. data/examples/gb-2007-8-3-R40.xml +243 -0
  13. data/examples/pubannotation.json +1 -0
  14. data/generators/rdfxml.rb +104 -0
  15. data/lib/biointerchange/core.rb +195 -0
  16. data/lib/biointerchange/exceptions.rb +38 -0
  17. data/lib/biointerchange/genomics/gff3_feature.rb +82 -0
  18. data/lib/biointerchange/genomics/gff3_feature_set.rb +37 -0
  19. data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +107 -0
  20. data/lib/biointerchange/genomics/gff3_reader.rb +86 -0
  21. data/lib/biointerchange/gff3.rb +135 -0
  22. data/lib/biointerchange/reader.rb +25 -0
  23. data/lib/biointerchange/registry.rb +29 -0
  24. data/lib/biointerchange/sio.rb +7124 -0
  25. data/lib/biointerchange/sofa.rb +1566 -0
  26. data/lib/biointerchange/textmining/content.rb +69 -0
  27. data/lib/biointerchange/textmining/document.rb +36 -0
  28. data/lib/biointerchange/textmining/pdfx_xml_reader.rb +161 -0
  29. data/lib/biointerchange/textmining/process.rb +57 -0
  30. data/lib/biointerchange/textmining/pubannos_json_reader.rb +72 -0
  31. data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +197 -0
  32. data/lib/biointerchange/textmining/text_mining_reader.rb +41 -0
  33. data/lib/biointerchange/writer.rb +23 -0
  34. data/lib/biointerchange.rb +3 -0
  35. data/spec/exceptions_spec.rb +27 -0
  36. data/spec/gff3_rdfwriter_spec.rb +67 -0
  37. data/spec/text_mining_pdfx_xml_reader_spec.rb +89 -0
  38. data/spec/text_mining_pubannos_json_reader_spec.rb +71 -0
  39. data/spec/text_mining_rdfwriter_spec.rb +57 -0
  40. data/web/about.html +89 -0
  41. data/web/biointerchange.js +133 -0
  42. data/web/bootstrap/css/bootstrap-responsive.css +1040 -0
  43. data/web/bootstrap/css/bootstrap-responsive.min.css +9 -0
  44. data/web/bootstrap/css/bootstrap.css +5624 -0
  45. data/web/bootstrap/css/bootstrap.min.css +9 -0
  46. data/web/bootstrap/img/glyphicons-halflings-white.png +0 -0
  47. data/web/bootstrap/img/glyphicons-halflings.png +0 -0
  48. data/web/bootstrap/js/bootstrap.js +2027 -0
  49. data/web/bootstrap/js/bootstrap.min.js +6 -0
  50. data/web/bootstrap/js/jquery-1.8.1.min.js +2 -0
  51. data/web/css/rdoc-style.css +5786 -0
  52. data/web/css/rdoc.css +716 -0
  53. data/web/images/BioInterchange300.png +0 -0
  54. data/web/index.html +109 -0
  55. data/web/service/rdfizer.fcgi +68 -0
  56. data/web/webservices.html +123 -0
  57. metadata +240 -0
@@ -0,0 +1,69 @@
1
module BioInterchange
  module TextMining

    # Describes a span of a document (for example a title, a section or a
    # phrase) by its absolute position, its length and its kind.
    class Content

      # Constants that describe content types.
      UNSPECIFIED = 0
      DOCUMENT = 1
      PAGE = 2
      TITLE = 3
      AUTHOR = 4
      ABSTRACT = 5
      SECTION = 6
      PARAGRAPH = 7
      SENTENCE = 8
      PHRASE = 9
      WORD = 10
      CHARACTER = 11

      # Offset of the content as absolute position within the document.
      attr_reader :offset

      # Length of the content, which is measured in characters.
      attr_reader :length

      # Type of the content, if known, or +BioInterchange::TextMining::Content::UNSPECIFIED+ otherwise.
      attr_reader :type

      # Process associated with this content, if previously provided, or nil otherwise.
      attr_reader :process

      # Creates a new document content representation at a specific location of the document.
      #
      # +offset+:: zero-based offset of the represented content within the document (absolute location within the document)
      # +length+:: length of the represented content, where a length of zero denotes a boundary between two characters
      # +type+:: classification of the content (one of the constants above)
      # +process+:: optional process that produced this content; it has to respond to +uri+ if a content URI is requested
      def initialize(offset, length, type = UNSPECIFIED, process = nil)
        @offset = offset
        @length = length
        @type = type
        @process = process
      end

      # Sets the context of this content.
      #
      # +context+:: a +BioInterchange::TextMining::Document+ or +BioInterchange::TextMining::Content+ instance in which this content is enclosed in
      def setContext(context)
        @context = context
      end

      # Returns a URI that identifies this content.
      #
      # The URI is composed of the context URI (without its scheme), the offset,
      # length and type, and the process URI in parentheses (or "-" when no
      # process is set).
      #
      # Raises +BioInterchange::Exceptions::ImplementationModelError+ when no
      # context has been set.
      def uri
        raise BioInterchange::Exceptions::ImplementationModelError, 'An URI can only be returned for content with a context (i.e., use setContext(context) first).' unless @context

        process_part = @process ? "(#{strip_scheme(@process.uri)})" : '-'
        "biointerchange://textmining/content/#{strip_scheme(@context.uri)}/#{@offset},#{@length},#{@type},#{process_part}"
      end

      private

      # Removes a leading "scheme://" from the given URI string.
      def strip_scheme(uri)
        uri.sub(/^.*?:\/\//, '')
      end

    end

  end
end
69
+
@@ -0,0 +1,36 @@
1
module BioInterchange
  module TextMining

    # Represents a single document as a URI plus a list of content items.
    class Document

      # URI of the document.
      attr_reader :uri

      # Creates a blank document representation.
      #
      # +uri+:: source of the document
      def initialize(uri)
        @uri = uri

        # Will hold content:
        @content = []
      end

      # Adds content to the document.
      #
      # +content+:: content of type +BioInterchange::TextMining::Content+ that should be added to the document
      #
      # Raises +BioInterchange::Exceptions::ImplementationModelError+ if the
      # argument is not a +BioInterchange::TextMining::Content+.
      def add(content)
        unless content.kind_of?(BioInterchange::TextMining::Content)
          raise BioInterchange::Exceptions::ImplementationModelError, 'Content has to be of kind BioInterchange::TextMining::Content'
        end

        @content << content
      end

      # Returns the document contents as a frozen copy of the internal array,
      # so callers cannot modify the document through the returned value.
      def contents
        @content.clone.freeze
      end

    end

  end
end
36
+
@@ -0,0 +1,161 @@
1
+ module BioInterchange::TextMining
2
+
3
+ require 'rexml/document'
4
+ require 'rexml/streamlistener'
5
+
6
+ class PdfxXmlReader < BioInterchange::TextMining::TMReader
7
+
8
+ # Reads input stream and returns associated +BioInterchange::TextMining::Document+ model
9
+ #
10
+ # Presently I assume a single document per xml file,
11
+ # and that <section> tags may nest but must not overlap. I also assume
12
+ # that a Content::DOCUMENT type is everything between
13
+ # the <article> tags.
14
+ #
15
+ # +inputstream+:: Input IO stream to deserialize
16
def deserialize(inputstream)
  # Both IO objects and strings are accepted; anything else is rejected
  # before the input is handed to the parser.
  readable = inputstream.kind_of?(IO) || inputstream.kind_of?(String)
  unless readable
    raise BioInterchange::Exceptions::ImplementationReaderError, 'InputStream not of type IO, cannot read.'
  end

  @input = inputstream
  pdfx
end
27
+
28
+
29
+
30
+ private
31
+
32
# Parses @input as a stream and returns the document model built by the listener.
def pdfx
  # The listener is a separate object, so the reader's process has to be
  # handed over explicitly; otherwise all created content has no process.
  listener = MyListener.new(@process)
  REXML::Document.parse_stream(@input, listener)
  listener.document
end

# Stream listener that tracks character offsets while the XML is parsed and
# turns closed <article-title>, <abstract>, <body>, <article> and <section>
# elements into content items of the document.
class MyListener

  include REXML::StreamListener

  # +process+:: process that is attached to every content item created by this listener (may be nil)
  def initialize(process = nil)
    @process = process
    @map = {}

    #sections can nest, so "stack" them
    @map['sec_s'] = []
    @map['sec_l'] = []
  end

  # Records where a tracked element starts. Offsets are measured as the
  # number of characters seen inside <article> so far.
  def tag_start(name, attr)
    case name
    when 'job'
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <job> tags, cannot parse multiple documents within a single file.' if @map['id_done']
      @map['id'] = true
    when 'article-title'
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <article-title> tags defined, cannot parse multiple documents within a single file.' if @map['title_done']
      @map['title'] = true
      @map['title_s'] = @map['art_l']
      @map['title_l'] = 0
    when 'abstract'
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <abstract> tags defined, cannot parse multiple documents within a single file.' if @map['abs_done']
      @map['abs'] = true
      @map['abs_s'] = @map['art_l']
      @map['abs_l'] = 0
    when 'body'
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <body> tags defined, cannot parse multiple documents within a single file.' if @map['body_done']
      @map['body'] = true
      @map['body_s'] = @map['art_l']
      @map['body_l'] = 0
    when 'article'
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <article> tags defined, cannot parse multiple documents within a single file.' if @map['art_done']
      @map['art'] = true
      @map['art_s'] = 0
      @map['art_l'] = 0
    when 'section'
      raise BioInterchange::Exceptions::InputFormatError, 'Error with section stack, stacks not equal in size: Possibly not a well formed XML input file. Check <section> tags all match up and do not overlap (nesting is fine).' unless @map['sec_s'].size == @map['sec_l'].size
      @map['sec_s'].push @map['art_l']
      @map['sec_l'].push 0
    end
  end

  # Adds the length of the text to every element that is currently open.
  # Text inside <job> is the document identifier and creates the document.
  def text(data)
    chars = data.length

    @map['art_l'] += chars if @map['art']

    if @map['id']
      @doc = BioInterchange::TextMining::Document.new("http://pdfx.cs.man.ac.uk/" + data)
    elsif @map['title']
      @map['title_l'] += chars
    elsif @map['abs']
      @map['abs_l'] += chars
    end

    @map['body_l'] += chars if @map['body']

    #add length to *all* current sections
    @map['sec_l'].map! { |section_length| section_length + chars }
  end

  #TODO add deal with <author> type tags

  # Closes a tracked element and adds the matching content item to the document.
  def tag_end(name)
    case name
    when 'job'
      @map['id'] = false
      @map['id_done'] = true
    when 'article-title'
      @map['title'] = false
      add_content(@map['title_s'], @map['title_l'], BioInterchange::TextMining::Content::TITLE)
      @map['title_done'] = true
    when 'abstract'
      @map['abs'] = false
      add_content(@map['abs_s'], @map['abs_l'], BioInterchange::TextMining::Content::ABSTRACT)
      @map['abs_done'] = true
    when 'body'
      @map['body'] = false
      add_content(@map['body_s'], @map['body_l'], BioInterchange::TextMining::Content::SECTION)
      @map['body_done'] = true
    when 'article'
      @map['art'] = false
      add_content(@map['art_s'], @map['art_l'], BioInterchange::TextMining::Content::DOCUMENT)
      @map['art_done'] = true
    when 'section'
      # Same error class as the matching check in tag_start.
      raise BioInterchange::Exceptions::InputFormatError, 'Error with section stack, stacks not equal in size' unless @map['sec_s'].size == @map['sec_l'].size
      add_content(@map['sec_s'].pop, @map['sec_l'].pop, BioInterchange::TextMining::Content::SECTION)
    end
  end

  # Returns the document built so far (nil until a <job> identifier was read).
  def document
    @doc
  end

  private

  # Creates a content item with this listener's process and adds it to the document.
  def add_content(offset, length, type)
    # The document only exists once the <job> identifier has been read.
    raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has no <job> tag preceding the document content, cannot determine the document identifier.' unless @doc

    content = BioInterchange::TextMining::Content.new(offset, length, type, @process)
    content.setContext(@doc)
    @doc.add(content)
  end

end
156
+
157
+
158
+
159
+ end
160
+
161
+ end
@@ -0,0 +1,57 @@
1
module BioInterchange
  module TextMining

    # Describes who or what produced an annotation: a piece of software, a
    # web-service or a human curator.
    class Process

      # Constants that describe process types.
      UNSPECIFIED = 0
      SOFTWARE = 1
      WEBSERVICE = 2
      MANUAL = 3

      # Dictionary of viable metadata keys.
      VERSION = 0

      # Name of the process.
      attr_reader :name

      # URI that has further details about the process, which can also be the form of an email address
      # in cases where the process describes human driven annotation.
      attr_reader :uri

      # Date-time when this process was carried out, or nil if the information is not available.
      attr_reader :date

      # Type of this process.
      attr_reader :type

      # Creates a new process representation.
      #
      # +name+:: identification of the software/service/curator involved in the process, e.g. "ABNER" or "Peter Smith"
      # +uri+:: details about the processes origin, e.g. "http://pages.cs.wisc.edu/~bsettles/abner/" or "peter.smith@example.org"
      # +type+:: classification of the described process
      # +metadata+:: a hash that holds additional information about the process via dictionary defined keywords, e.g. { Process::VERSION => '0.0.1alpha' }
      # +date+:: date-time when the process was carried out, which may be nil if the date-time is unknown
      def initialize(name, uri, type = UNSPECIFIED, metadata = {}, date = nil)
        @name = name
        @uri = uri
        @date = date
        @type = type
        @metadata = metadata
      end

      # Returns additional meta-data associated with this process as a frozen
      # copy of the internal hash.
      def metadata
        @metadata.clone.freeze
      end

    end

  end
end
@@ -0,0 +1,72 @@
1
+ module BioInterchange::TextMining
2
+
3
+ require 'rubygems'
4
+ require 'json'
5
+
6
+ class PubannosJsonReader < BioInterchange::TextMining::TMReader
7
+
8
def deserialize(inputstream)
  case inputstream
  when IO
    # Streams are read completely before parsing.
    pubannos(inputstream.read)
  when String
    pubannos(inputstream)
  else
    # Unsupported input types are handed to the parent implementation.
    super(inputstream)
  end
end
18
+
19
+ private
20
+
21
+ # Specific method for parsing of *Pubannotations* json format
22
def pubannos(data)
  # Parses a JSON string and builds a document that holds one DOCUMENT
  # content for the whole text plus one PHRASE content per 'catanns' entry.
  # Raises InputFormatError if the JSON carries an 'Error' entry or lacks 'text'.
  result = JSON.parse(data)

  if result.has_key? 'Error'
    # Double quotes are required here, otherwise the error text is not interpolated.
    raise BioInterchange::Exceptions::InputFormatError, "Error parsing the JSON input file: #{result['Error']}"
  end

  text = result['text']
  raise BioInterchange::Exceptions::InputFormatError, 'Error parsing the JSON input file: no "text" entry found.' unless text

  #doc_uri = "http://pubannotation.dbcls.jp/pmdocs/" + result['pmid'].to_s
  doc = Document.new(result['docurl'])
  doc_content = Content.new(0, text.length, Content::DOCUMENT, @process)
  doc_content.setContext(doc)
  doc.add(doc_content)

  #so our document requires content of type document or abstract
  #should it hold the content string?

  (result['catanns'] || []).each do |annot|
    start_offset = annot['begin']
    length = annot['end'] - start_offset

    #phrase = type for NE
    phrase = Content.new(start_offset, length, Content::PHRASE, @process)
    phrase.setContext(doc)
    doc.add(phrase)

    #set process.date = updated_time?
  end

  doc
end
68
+
69
+
70
+ end
71
+
72
+ end
@@ -0,0 +1,197 @@
1
+ require 'rdf'
2
+ require 'rdf/ntriples'
3
+ require 'date'
4
+
5
+ module BioInterchange::TextMining
6
+
7
+ class RDFWriter < BioInterchange::Writer
8
+
9
+ # Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
10
+ #
11
+ # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
12
def initialize(ostream)
  # Only IO instances (or subclasses) are accepted as serialization target.
  unless ostream.kind_of?(IO)
    raise BioInterchange::Exceptions::ImplementationWriterError, 'The output stream is not an instance of IO or its subclasses.'
  end

  @ostream = ostream
end
16
+
17
+ # Serializes a model as RDF.
18
+ #
19
+ # +model+:: a generic representation of input data that is derived from BioInterchange::TextMining::Document
20
def serialize(model)
  # Exactly BioInterchange::TextMining::Document is supported; subclasses are rejected as well.
  return serialize_document(model) if model.instance_of?(BioInterchange::TextMining::Document)

  raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized at the moment. ' +
                                                               'Supported classes are BioInterchange::TextMining::Document (and that\'s it for now).'
end
28
+
29
+ private
30
+
31
+ # Generates an URI for a given process and its contents.
32
+ #
33
+ # +process+:: process instance
34
+ # +kind+:: kind of the URI that should be generated, for example, whether the URI should represent the name, date, etc.
35
def process_uri(process, kind)
  # Path segment that distinguishes the different aspects of a process.
  segments = { :process => 'self', :name => 'name', :uri => 'uri', :date => 'date' }
  segment = segments[kind]
  unless segment
    raise BioInterchange::Exceptions::ImplementationWriterError, "There is no implementation for serializing a process as #{kind}."
  end

  # The scheme of the process URI is dropped before it is appended.
  schemeless = process.uri.sub(/^.*?:\/\//, '')
  RDF::URI.new("biointerchange://textmining/process/#{segment}/#{schemeless}")
end
50
+
51
+ # Generates an URI for a given content and its contents.
52
+ #
53
+ # +content+:: content instance
54
+ # +kind+:: kind of the URI that should be generated, for example, whether the URI should represent the name, date, etc.
55
def content_uri(content, kind)
  # Only start and stop positions have their own URIs at the moment.
  segment = { :start => 'start', :stop => 'stop' }[kind]
  unless segment
    raise BioInterchange::Exceptions::ImplementationWriterError, "There is no implementation for serializing a content as #{kind}."
  end

  # The scheme of the content URI is dropped before it is appended.
  schemeless = content.uri.sub(/^.*?:\/\//, '')
  RDF::URI.new("biointerchange://textmining/content/#{segment}/#{schemeless}")
end
70
+
71
+ # Serializes RDF for a textual document representation using the Semanticscience Integrated Ontology
72
+ # (http://code.google.com/p/semanticscience/wiki/SIO).
73
+ #
74
+ # +model+:: an instance of +BioInterchange::TextMining::Document+
75
def serialize_document(model)
  document_uri = RDF::URI.new(model.uri)

  # The whole document is collected in one graph, which is dumped at the end.
  graph = RDF::Graph.new
  graph.insert(RDF::Statement.new(document_uri, RDF.type, BioInterchange::SIO.document))
  model.contents.each do |content|
    serialize_content(graph, document_uri, content)
  end

  RDF::NTriples::Writer.dump(graph, @ostream)
end
84
+
85
+
86
+ # Serializes a Content object for a given document URI.
87
+ #
88
+ # +graph+:: RDF graph to which content is added
89
+ # +document_uri+:: the document URI to which the added content belongs to
90
+ # +content+:: an instance that describes the content
91
def serialize_content(graph, document_uri, content)
  node = RDF::URI.new(content.uri)
  graph.insert(RDF::Statement.new(document_uri, BioInterchange::SIO.has_attribute, node))
  serialize_process(graph, document_uri, node, content.process) if content.process

  # Name of the SIO term that classifies the content; unknown types fall
  # back to the generic language entity.
  sio_term = case content.type
             when Content::DOCUMENT then :document
             when Content::PAGE, Content::SECTION then :document_section
             when Content::TITLE then :title
             when Content::AUTHOR then :author_section
             when Content::ABSTRACT then :abstract_section
             when Content::PARAGRAPH then :paragraph
             when Content::SENTENCE then :sentence
             when Content::PHRASE then :phrase
             when Content::WORD then :word
             when Content::CHARACTER then :character
             else :language_entity
             end

  graph.insert(RDF::Statement.new(node, RDF.type, BioInterchange::SIO.send(sio_term)))

  start_attribute = serialize_content_start(graph, document_uri, node, content)
  graph.insert(RDF::Statement.new(node, BioInterchange::SIO.has_attribute, start_attribute))
  stop_attribute = serialize_content_stop(graph, document_uri, node, content)
  graph.insert(RDF::Statement.new(node, BioInterchange::SIO.has_attribute, stop_attribute))
end
130
+
131
+ # Serializes a process object for a specific document uri
132
+ #
133
+ #
134
def serialize_process(graph, document_uri, content_uri, process)
  # Named differently from the process_uri method to avoid shadowing it.
  process_node = process_uri(process, :process)
  graph.insert(RDF::Statement.new(content_uri, BioInterchange::SIO.is_direct_part_of, process_node))

  # Manual curation is described as a FOAF person, everything else as a SIO process.
  if process.type == BioInterchange::TextMining::Process::MANUAL
    graph.insert(RDF::Statement.new(process_node, RDF.type, RDF::FOAF.Person))
    graph.insert(RDF::Statement.new(process_node, RDF::FOAF.name, RDF::Literal.new(process.name)))
    # NOTE(review): the process URI is attached with FOAF.name as well; a
    # different predicate may have been intended here -- confirm.
    graph.insert(RDF::Statement.new(process_node, RDF::FOAF.name, RDF::URI.new(process.uri)))
  else
    graph.insert(RDF::Statement.new(process_node, RDF.type, BioInterchange::SIO.process))
    name_attribute = serialize_process_name(graph, document_uri, content_uri, process_node, process)
    graph.insert(RDF::Statement.new(process_node, BioInterchange::SIO.has_attribute, name_attribute))
    uri_attribute = serialize_process_uri(graph, document_uri, content_uri, process_node, process)
    graph.insert(RDF::Statement.new(process_node, BioInterchange::SIO.has_attribute, uri_attribute))
    # The date is optional and only serialized when it is known.
    graph.insert(RDF::Statement.new(process_node, BioInterchange::SIO.has_attribute, serialize_process_date(graph, document_uri, content_uri, process_node, process))) if process.date
  end
end
149
+
150
+ #
151
+ #
152
+ #
153
def serialize_process_name(graph, document_uri, content_uri, process_uri, process)
  # Adds a node that carries the process name to the graph and returns the
  # URI of that node, so that the caller can link the process to it.
  kind_uri = process_uri(process, :name)
  graph.insert(RDF::Statement.new(kind_uri, RDF.type, BioInterchange::SIO.name))
  graph.insert(RDF::Statement.new(kind_uri, BioInterchange::SIO.has_attribute, RDF::Literal.new("#{process.name}")))
  # Return the node, not the result of graph.insert.
  kind_uri
end
158
+
159
+ #
160
+ #
161
+ #
162
def serialize_process_uri(graph, document_uri, content_uri, process_uri, process)
  # Adds a node that carries the process URI to the graph and returns the
  # URI of that node, so that the caller can link the process to it.
  kind_uri = process_uri(process, :uri)
  graph.insert(RDF::Statement.new(kind_uri, RDF.type, BioInterchange::SIO.software_entity))
  graph.insert(RDF::Statement.new(kind_uri, BioInterchange::SIO.has_attribute, RDF::URI.new(process.uri)))
  # Return the node, not the result of graph.insert.
  kind_uri
end
167
+
168
+ #
169
+ #
170
+ #
171
def serialize_process_date(graph, document_uri, content_uri, process_uri, process)
  # Adds a node that carries the process date to the graph and returns the
  # URI of that node, so that the caller can link the process to it.
  kind_uri = process_uri(process, :date)
  # Strings are parsed; values that are not strings are used as they are.
  date = process.date
  date = Date.parse(date) if date.kind_of?(String)
  graph.insert(RDF::Statement.new(kind_uri, RDF::DC.date, RDF::Literal.new(date)))
  # Return the node, not the result of graph.insert.
  kind_uri
end
175
+
176
+ #
177
+ #
178
+ #
179
def serialize_content_start(graph, document_uri, content_uri, content)
  # Adds a node that carries the start position of the content to the graph
  # and returns the URI of that node, so that the caller can link to it.
  kind_uri = content_uri(content, :start)
  graph.insert(RDF::Statement.new(kind_uri, RDF::type, BioInterchange::SIO.start_position))
  graph.insert(RDF::Statement.new(kind_uri, BioInterchange::SIO.has_attribute, RDF::Literal.new(content.offset)))
  # Return the node, not the result of graph.insert.
  kind_uri
end
184
+
185
+ #
186
+ #
187
+ #
188
def serialize_content_stop(graph, document_uri, content_uri, content)
  # Adds a node that carries the stop position (offset + length) of the
  # content to the graph and returns the URI of that node, so that the
  # caller can link to it.
  kind_uri = content_uri(content, :stop)
  graph.insert(RDF::Statement.new(kind_uri, RDF::type, BioInterchange::SIO.stop_position))
  # NOTE(review): the stop position is written as a string, whereas the
  # start position is written as a number -- confirm which one is intended.
  graph.insert(RDF::Statement.new(kind_uri, BioInterchange::SIO.has_attribute, RDF::Literal.new((content.offset + content.length).to_s)))
  # Return the node, not the result of graph.insert.
  kind_uri
end
193
+
194
+ end
195
+
196
+ end
197
+
@@ -0,0 +1,41 @@
1
+ module BioInterchange::TextMining
2
+
3
+ class TMReader < BioInterchange::Reader
4
+
5
+ # Create a new instance of a text-mining data reader. Sets @process to a new +BioInterchange::TextMining::Process+ object.
6
+ #
7
+ # +name+:: Name of the process which generated this data
8
+ # +name_uri+:: URI of the resource that generated this data
9
+ # +date+:: Optional date of data creation
10
+ # +processtype+:: Type of process that created this content
11
+ # +version+:: Optional version number of resource that created this data (nil if manually curated, for example).
12
def initialize(name, name_uri, date = nil, processtype = BioInterchange::TextMining::Process::UNSPECIFIED, version = nil)
  # The version is the only metadata entry; it is stored even if it is nil.
  metadata = { BioInterchange::TextMining::Process::VERSION => version }
  @process = BioInterchange::TextMining::Process.new(name, name_uri, processtype, metadata, date)
end
19
+
20
+
21
+ # Reads input stream and returns associated model. This base implementation only checks that the input is an IO instance; overriding methods can call super first to get that check.
22
+ #
23
+ # +inputstream+:: Input IO stream to deserialize
24
def deserialize(inputstream)
  # Nothing is read here; only the type of the input is checked.
  return if inputstream.kind_of?(IO)

  raise BioInterchange::Exceptions::ImplementationReaderError, 'InputStream not of type IO, cannot read.'
end
27
+
28
+ # Automatically tries to determine a suitable process from the given name ID, which is assumed
29
+ # to be either an email address or web-site.
30
+ #
31
+ # +name_id+:: name ID that we want to represent by a suitable process
32
def self.determine_process(name_id)
  kinds = BioInterchange::TextMining::Process

  # Anything with a "scheme://" part counts as software, even if it also contains an "@".
  return kinds::SOFTWARE if name_id.match(/[a-zA-Z]+:\/\//)
  # Otherwise an "@" with text on both sides is taken as an email address of a curator.
  return kinds::MANUAL if name_id.match(/[^@]+@[^@]+/)

  kinds::UNSPECIFIED
end
38
+
39
+ end
40
+
41
+ end
@@ -0,0 +1,23 @@
1
module BioInterchange

  # Base class of all writers: keeps the output stream and defines the
  # +serialize+ method that subclasses have to provide.
  class Writer

    # Sets up the writer with the stream that serialized models are written to.
    #
    # +ostream+:: instance of IO or derivative class
    def initialize(ostream)
      unless ostream.kind_of?(IO)
        raise BioInterchange::Exceptions::ImplementationWriterError, 'The output stream is not an instance of IO or its subclasses.'
      end

      @ostream = ostream
    end

    # Placeholder that always raises; subclasses override it with the
    # actual serialization.
    #
    # +model+:: an object model instance
    def serialize(model)
      message = 'You must implement this method, which takes an object model and serializes it into the previously provided output stream.'
      raise BioInterchange::Exceptions::ImplementationWriterError, message
    end

  end

end
23
+
@@ -0,0 +1,3 @@
1
+
2
+ require 'biointerchange/core'
3
+
@@ -0,0 +1,27 @@
1
+
2
+ require 'rspec'
3
+
4
+ load 'lib/biointerchange/core.rb'
5
+ load 'lib/biointerchange/exceptions.rb'
6
+
7
describe BioInterchange::Exceptions do
  describe 'error and exception creation tests' do
    # Each error class has to be rescuable through the listed ancestor.
    {
      'InputFormatError' => BioInterchange::Exceptions::BioInterchangeError,
      'ImplementationReaderError' => BioInterchange::Exceptions::ImplementationError,
      'ImplementationModelError' => BioInterchange::Exceptions::ImplementationError,
      'ImplementationWriterError' => BioInterchange::Exceptions::ImplementationError
    }.each do |error_name, ancestor|
      it "raising of #{error_name}" do
        error_class = BioInterchange::Exceptions.const_get(error_name)
        expect { raise error_class }.to raise_error(ancestor)
      end
    end
  end
end
27
+