rdfobjects 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/rdf_objects/parsers.rb +245 -186
  2. metadata +2 -2
@@ -58,223 +58,282 @@ class UTF8Parser < StringScanner
58
58
  raise StandardError, "Caught #{e.class}: #{e}"
59
59
  end
60
60
  end
61
- module RDFObject
62
- class NTriplesParser
63
- attr_reader :ntriple, :subject, :predicate, :data_type, :language, :literal
64
- attr_accessor :object
65
- def initialize(line)
66
- @ntriple = line
67
- if @ntriple.respond_to?(:force_encoding)
68
- @ntriple.force_encoding("ASCII-8BIT")
61
+ module RDFObject
62
+ class Collection < Hash
63
+ attr_accessor :objects
64
+ def initialize(subjects=true)
65
+ @objects = Collection.new(false) if subjects
66
+ end
67
+ def uris
68
+ return self.keys
69
+ end
70
+ def find_by_type(type)
71
+ self.find_all {|r| r}
72
+ end
73
+ def resources
74
+ self.merge(@objects)
69
75
  end
70
- parse_ntriple
71
76
  end
72
77
 
73
- def parse_ntriple
74
- scanner = StringScanner.new(@ntriple)
75
- @subject = scanner.scan_until(/> /)
76
- @subject.sub!(/^</,'')
77
- @subject.sub!(/> $/,'')
78
- @predicate = scanner.scan_until(/> /)
79
- @predicate.sub!(/^</,'')
80
- @predicate.sub!(/> $/,'')
81
- if scanner.match?(/</)
82
- object = scanner.scan_until(/>\s?\.\s*\n?$/)
83
- object.sub!(/^</,'')
84
- object.sub!(/>\s?\.\s*\n?$/,'')
85
- @object = Resource.new(object)
86
- else
87
- @literal = true
88
- scanner.getch
89
- object = scanner.scan_until(/("\s?\.\s*\n?$)|("@[A-z])|("\^\^)/)
90
- scanner.pos=(scanner.pos-2)
91
- object.sub!(/"..$/,'')
92
- if object.respond_to?(:force_encoding)
93
- object.force_encoding('utf-8').chomp!
78
+ class Parser
79
+ # Choose the best format parser from an admittedly small group of choices.
80
+ def self.parse(rdf, format=nil)
81
+ if format
82
+ parser = case format
83
+ when 'rdfxml' then XMLParser.new(rdf)
84
+ when 'rdfa' then RDFAParser.new(rdf)
85
+ when 'ntriples' then NTriplesParser.new(rdf)
86
+ when 'json' then JSONParser.new(rdf)
87
+ end
94
88
  else
95
- uscan = UTF8Parser.new(object)
96
- object = uscan.parse_string.chomp
89
+ begin
90
+ # Check if the format is XML or RDFa
91
+ doc = Nokogiri::XML.parse(rdf, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
92
+ raise "Unable to parse XML/HTML document -- no namespace declared" unless doc.root.namespaces
93
+ if doc.root.namespaces.values.index("http://www.w3.org/1999/xhtml")
94
+ parser = RDFAParser.new(doc)
95
+ else
96
+ parser = XMLParser.new(doc)
97
+ end
98
+ rescue Nokogiri::XML::SyntaxError
99
+ begin
100
+ if rdf.respond_to?(:read)
101
+ rdf.rewind
102
+ json = JSON.parse(rdf.read)
103
+ else
104
+ json = JSON.parse(rdf)
105
+ end
106
+ parser = JSONParser.new(json)
107
+ rescue JSON::ParserError
108
+ if rdf.respond_to?(:read)
109
+ rdf.rewind
110
+ end
111
+ parser = NTriplesParser.new(rdf)
112
+ end
113
+ end
97
114
  end
98
- if scanner.match?(/@/)
115
+ parser.parse
116
+ end
117
+ attr_reader :collection
118
+ def initialize(data=nil)
119
+ @collection = Collection.new
120
+ self.data=(data) if data
121
+ end
122
+
123
+ def find_or_create(uri)
124
+ return @collection.resources[uri] if @collection.resources[uri]
125
+ Resource.new(uri)
126
+ end
127
+ end
128
+ class NTriplesParser < RDFObject::Parser
129
+
130
+ def parse_ntriple(ntriple)
131
+ if ntriple.respond_to?(:force_encoding)
132
+ ntriple.force_encoding("ASCII-8BIT")
133
+ end
134
+ scanner = StringScanner.new(ntriple)
135
+ subject = scanner.scan_until(/> /)
136
+ subject.sub!(/^</,'')
137
+ subject.sub!(/> $/,'')
138
+ predicate = scanner.scan_until(/> /)
139
+ predicate.sub!(/^</,'')
140
+ predicate.sub!(/> $/,'')
141
+ if scanner.match?(/</)
142
+ tmp_object = scanner.scan_until(/>\s?\.\s*\n?$/)
143
+ tmp_object.sub!(/^</,'')
144
+ tmp_object.sub!(/>\s?\.\s*\n?$/,'')
145
+ object = find_or_create(tmp_object)
146
+ @collection[object.uri] = object
147
+ else
148
+ language = nil
149
+ data_type = nil
99
150
  scanner.getch
100
- @language = scanner.scan_until(/\s?\.\n?$/)
101
- @language.sub!(/\s?\.\n?$/,'')
102
- elsif scanner.match?(/\^\^/)
103
- scanner.skip_until(/</)
104
- @data_type = scanner.scan_until(/>/)
105
- @data_type.sub!(/>$/,'')
151
+ tmp_object = scanner.scan_until(/("\s?\.\s*\n?$)|("@[A-z])|("\^\^)/)
152
+ scanner.pos=(scanner.pos-2)
153
+ tmp_object.sub!(/"..$/,'')
154
+ if tmp_object.respond_to?(:force_encoding)
155
+ tmp_object.force_encoding('utf-8').chomp!
156
+ else
157
+ uscan = UTF8Parser.new(tmp_object)
158
+ tmp_object = uscan.parse_string.chomp
159
+ end
160
+ if scanner.match?(/@/)
161
+ scanner.getch
162
+ language = scanner.scan_until(/\s?\.\n?$/)
163
+ language.sub!(/\s?\.\n?$/,'')
164
+ elsif scanner.match?(/\^\^/)
165
+ scanner.skip_until(/</)
166
+ data_type = scanner.scan_until(/>/)
167
+ data_type.sub!(/>$/,'')
168
+ end
169
+ object = Literal.new(tmp_object,{:data_type=>data_type,:language=>language})
106
170
  end
107
- @object = Literal.new(object,{:data_type=>@data_type,:language=>@language})
171
+ [subject, predicate, object]
108
172
  end
109
- end
110
-
111
- def self.parse(resources)
112
- collection = []
113
- if resources.is_a?(String)
114
- assertions = resources.split("\n")
115
- elsif resources.is_a?(Array)
116
- assertions = resources
117
- elsif resources.respond_to?(:read)
118
- assertions = resources.readlines
173
+
174
+ def data=(ntriples)
175
+ if ntriples.is_a?(String)
176
+ @ntriples = ntriples.split("\n")
177
+ elsif ntriples.is_a?(Array)
178
+ @ntriples = ntriples
179
+ elsif ntriples.respond_to?(:read)
180
+ @ntriples = ntriples.readlines
181
+ end
119
182
  end
120
- assertions.each do | assertion |
121
- next if assertion[0, 1] == "#" # Ignore comments
122
- triple = self.new(assertion)
123
- resource = Resource.new(triple.subject)
124
- resource.assert(triple.predicate, triple.object)
125
- collection << resource
183
+
184
+ def parse
185
+ @ntriples.each do | assertion |
186
+ next if assertion[0, 1] == "#" # Ignore comments
187
+ triple = parse_ntriple(assertion)
188
+ resource = find_or_create(triple[0])
189
+ resource.assert(triple[1], triple[2])
190
+ @collection[resource.uri] = resource
191
+ end
192
+ @collection
126
193
  end
127
- collection.uniq
128
194
  end
129
- end
130
195
 
131
- class XMLParser
132
- #
133
- # A very unsophisticated RDF/XML Parser -- currently only parses RDF/XML that conforms to
134
- # the SimpleRdfXml convention: http://esw.w3.org/topic/SimpleRdfXml. This is a pragmatic
135
- # rather than dogmatic decision. If it is not working with your RDF/XML let me know and we
136
- # can probably fix it.
137
- #
138
- def self.parse(doc)
139
- namespaces = doc.namespaces
140
- #if namespaces.index("http://purl.org/rss/1.0/")
141
- # collection = parse_rss10(doc)
142
- if namespaces.index("http://www.w3.org/2005/sparql-results#")
143
- raise "Sorry, SPARQL not yet supported"
144
- else
145
- collection = parse_rdfxml(doc)
196
+ class XMLParser < RDFObject::Parser
197
+ #
198
+ # A very unsophisticated RDF/XML Parser -- currently only parses RDF/XML that conforms to
199
+ # the SimpleRdfXml convention: http://esw.w3.org/topic/SimpleRdfXml. This is a pragmatic
200
+ # rather than dogmatic decision. If it is not working with your RDF/XML let me know and we
201
+ # can probably fix it.
202
+ #
203
+
204
+ def parse
205
+ namespaces = @rdfxml.namespaces
206
+ if namespaces.index("http://purl.org/rss/1.0/")
207
+ fix_rss10
208
+ end
209
+ if namespaces.index("http://www.w3.org/2005/sparql-results#")
210
+ raise "Sorry, SPARQL not yet supported"
211
+ else
212
+ parse_rdfxml
213
+ end
214
+ @collection
146
215
  end
147
- collection.uniq
148
- end
149
-
150
- def self.parse_resource_node(resource_node, collection)
151
- resource = Resource.new(resource_node.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
152
- unless (resource_node.name == "Description" and resource_node.namespace.href == "http://www.w3.org/1999/02/22-rdf-syntax-ns#") or
153
- (resource_node.name == "item" and resource_node.namespace.href == "http://purl.org/rss/1.0/")
154
- resource.assert("[rdf:type]",Resource.new("#{resource_node.namespace.href}#{resource_node.name}"))
216
+
217
+ def data=(xml)
218
+ if xml.is_a?(Nokogiri::XML::Document)
219
+ @rdfxml = xml
220
+ else
221
+ @rdfxml = Nokogiri::XML.parse(xml, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
222
+ end
155
223
  end
156
- resource_node.children.each do | child |
157
- next if child.text?
158
- predicate = "#{child.namespace.href}#{child.name}"
159
- if object_uri = child.attribute_with_ns("resource", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
160
- obj_resource = Resource.new(object_uri.value)
161
- resource.assert(predicate, obj_resource)
162
- collection << obj_resource
163
- elsif all_text?(child)
164
- opts = {}
165
- if lang = child.attribute_with_ns("lang", "http://www.w3.org/XML/1998/namespace")
166
- opts[:language] = lang.value
224
+
225
+ def parse_resource_node(resource_node, collection)
226
+ resource = find_or_create(resource_node.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
227
+ unless (resource_node.name == "Description" and resource_node.namespace.href == "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
228
+ resource.assert("[rdf:type]", find_or_create("#{resource_node.namespace.href}#{resource_node.name}"))
229
+ end
230
+ resource_node.children.each do | child |
231
+ next if child.text?
232
+ predicate = "#{child.namespace.href}#{child.name}"
233
+ if object_uri = child.attribute_with_ns("resource", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
234
+ obj_resource = find_or_create(object_uri.value)
235
+ resource.assert(predicate, obj_resource)
236
+ @collection[obj_resource.uri] = obj_resource
237
+ elsif all_text?(child)
238
+ opts = {}
239
+ if lang = child.attribute_with_ns("lang", "http://www.w3.org/XML/1998/namespace")
240
+ opts[:language] = lang.value
241
+ end
242
+ if datatype = child.attribute_with_ns("datatype", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
243
+ opts[:data_type] = datatype.value
244
+ end
245
+ resource.assert(predicate, Literal.new(child.content.strip,opts))
167
246
  end
168
- if datatype = child.attribute_with_ns("datatype", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
169
- opts[:data_type] = datatype.value
247
+ child.xpath("./*[@rdf:about]").each do | grandchild |
248
+ gc_resource = find_or_create(grandchild.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
249
+ resource.assert(predicate, gc_resource)
250
+ @collection[gc_resource.uri] = gc_resource
251
+ parse_resource_node(grandchild, collection)
170
252
  end
171
- resource.assert(predicate, Literal.new(child.content.strip,opts))
172
- end
173
- child.xpath("./*[@rdf:about]").each do | grandchild |
174
- gc_resource = Resource.new(grandchild.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
175
- resource.assert(predicate, gc_resource)
176
- collection << gc_resource
177
- parse_resource_node(grandchild, collection)
178
253
  end
254
+ @collection[resource.uri] = resource
179
255
  end
180
- collection << resource
181
- end
182
256
 
183
- def self.all_text?(node)
184
- node.children.each do | child |
185
- return false unless child.text?
257
+ def all_text?(node)
258
+ node.children.each do | child |
259
+ return false unless child.text?
260
+ end
261
+ true
186
262
  end
187
- true
188
- end
189
263
 
190
- def self.parse_rdfxml(doc)
191
- collection = []
192
- doc.root.xpath("./*[@rdf:about]").each do | resource_node |
193
- parse_resource_node(resource_node, collection)
194
- end
195
- collection
196
- end
264
+ def parse_rdfxml
265
+ collection = []
266
+ @rdfxml.root.xpath("./*[@rdf:about]").each do | resource_node |
267
+ parse_resource_node(resource_node, collection)
268
+ end
269
+ end
197
270
 
198
- #def self.parse_rss10(doc)
199
- # collection = []
200
- # doc.root.xpath("./rss:item","rss"=>"http://purl.org/rss/1.0/").each do | resource_node |
201
- # parse_resource_node(resource_node, collection)
202
- # end
203
- # collection
204
- #end
205
- end
206
-
207
- class RDFAParser
208
- def self.parse(doc)
209
- xslt = Nokogiri::XSLT(open(File.dirname(__FILE__) + '/../xsl/RDFa2RDFXML.xsl'))
210
- rdf_doc = xslt.apply_to(doc)
211
- XMLParser.parse(Nokogiri.parse(rdf_doc))
212
- end
213
- end
214
-
215
- class JSONParser
216
- def self.parse(json)
217
- collection = []
218
- json.each_pair do |subject, assertions|
219
- resource = Resource.new(subject)
220
- collection << resource
221
- assertions.each_pair do |predicate, objects|
222
- objects.each do | object |
223
- if object['type'] == 'literal'
224
- opts = {}
225
- if object['lang']
226
- opts[:language] = object['lang']
227
- end
228
- if object['datatype']
229
- opts[:data_type] = object['datatype']
230
- end
231
- literal = Literal.new(object['value'],opts)
232
- resource.assert(predicate, literal)
233
- elsif object['type'] == 'uri'
234
- o = Resource.new(object['value'])
235
- resource.assert(predicate, o)
236
- collection << o
237
- elsif object['type'] == 'bnode' # For now, we're going to treat a blank node like a URI resource.
238
- o = Resource.new(object['value'])
239
- resource.assert(predicate, o)
240
- collection << o
241
- end
271
+ def fix_rss10
272
+ @rdfxml.root.xpath('./rss:channel/rss:items/rdf:Seq/rdf:li', {"rdf"=>"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
273
+ "rss"=>"http://purl.org/rss/1.0/"}).each do | li |
274
+ if li['resource'] && !li["rdf:resource"]
275
+ li["rdf:resource"] = li["resource"]
242
276
  end
243
277
  end
244
278
  end
245
- collection.uniq
246
279
  end
247
- end
248
280
 
249
- class Parser
250
- # Choose the best format parser from an admittedly small group of choices.
251
- def self.parse(rdf)
252
- begin
253
- # Check if the format is XML or RDFa
254
- doc = Nokogiri::XML.parse(rdf, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
255
- raise "Unable to parse XML/HTML document -- no namespace declared" unless doc.root.namespaces
256
- if doc.root.namespaces.values.index("http://www.w3.org/1999/xhtml")
257
- collection = RDFAParser.parse(doc)
281
+ class RDFAParser < XMLParser
282
+ def data=(xhtml)
283
+ if xhtml.is_a?(Nokogiri::XML::Document)
284
+ doc = xhtml
258
285
  else
259
- collection = XMLParser.parse(doc)
286
+ doc = Nokogiri::HTML.parse(xhtml)
260
287
  end
261
- rescue Nokogiri::XML::SyntaxError
262
- begin
263
- if rdf.respond_to?(:read)
264
- rdf.rewind
265
- json = JSON.parse(rdf.read)
266
- else
267
- json = JSON.parse(rdf)
268
- end
269
- collection = JSONParser.parse(json)
270
- rescue JSON::ParserError
271
- if rdf.respond_to?(:read)
272
- rdf.rewind
288
+ xslt = Nokogiri::XSLT(open(File.dirname(__FILE__) + '/../xsl/RDFa2RDFXML.xsl'))
289
+ rdfxml = xslt.apply_to(doc)
290
+ @rdfxml = Nokogiri::XML.parse(rdfxml, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
291
+ end
292
+ end
293
+
294
+ class JSONParser < RDFObject::Parser
295
+
296
+ def data=(json)
297
+ if json.is_a?(String)
298
+ @json = JSON.parse(json)
299
+ elsif json.is_a?(Hash)
300
+ @json = json
301
+ elsif json.respond_to?(:read)
302
+ @json = JSON.parse(json.read)
303
+ end
304
+ end
305
+
306
+ def parse
307
+ @json.each_pair do |subject, assertions|
308
+ resource = find_or_create(subject)
309
+ @collection[resource.uri] = resource
310
+ assertions.each_pair do |predicate, objects|
311
+ objects.each do | object |
312
+ if object['type'] == 'literal'
313
+ opts = {}
314
+ if object['lang']
315
+ opts[:language] = object['lang']
316
+ end
317
+ if object['datatype']
318
+ opts[:data_type] = object['datatype']
319
+ end
320
+ literal = Literal.new(object['value'],opts)
321
+ resource.assert(predicate, literal)
322
+ elsif object['type'] == 'uri'
323
+ o = find_or_create(object['value'])
324
+ resource.assert(predicate, o)
325
+ @collection[o.uri] = o
326
+ elsif object['type'] == 'bnode' # For now, we're going to treat a blank node like a URI resource.
327
+ o = find_or_create(object['value'])
328
+ resource.assert(predicate, o)
329
+ @collection[o.uri] = o
330
+ end
331
+ end
273
332
  end
274
- collection = NTriplesParser.parse(rdf)
275
333
  end
334
+ @collection
276
335
  end
277
- collection
278
336
  end
279
- end
337
+
338
+
280
339
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rdfobjects
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ross Singer
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-10-08 00:00:00 -04:00
12
+ date: 2009-10-09 00:00:00 -04:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency