rdfobjects 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/rdf_objects/parsers.rb +245 -186
  2. metadata +2 -2
@@ -58,223 +58,282 @@ class UTF8Parser < StringScanner
58
58
  raise StandardError, "Caught #{e.class}: #{e}"
59
59
  end
60
60
  end
61
- module RDFObject
62
- class NTriplesParser
63
- attr_reader :ntriple, :subject, :predicate, :data_type, :language, :literal
64
- attr_accessor :object
65
- def initialize(line)
66
- @ntriple = line
67
- if @ntriple.respond_to?(:force_encoding)
68
- @ntriple.force_encoding("ASCII-8BIT")
61
+ module RDFObject
62
+ class Collection < Hash
63
+ attr_accessor :objects
64
+ def initialize(subjects=true)
65
+ @objects = Collection.new(false) if subjects
66
+ end
67
+ def uris
68
+ return self.keys
69
+ end
70
+ def find_by_type(type)
71
+ self.find_all {|r| r}
72
+ end
73
+ def resources
74
+ self.merge(@objects)
69
75
  end
70
- parse_ntriple
71
76
  end
72
77
 
73
- def parse_ntriple
74
- scanner = StringScanner.new(@ntriple)
75
- @subject = scanner.scan_until(/> /)
76
- @subject.sub!(/^</,'')
77
- @subject.sub!(/> $/,'')
78
- @predicate = scanner.scan_until(/> /)
79
- @predicate.sub!(/^</,'')
80
- @predicate.sub!(/> $/,'')
81
- if scanner.match?(/</)
82
- object = scanner.scan_until(/>\s?\.\s*\n?$/)
83
- object.sub!(/^</,'')
84
- object.sub!(/>\s?\.\s*\n?$/,'')
85
- @object = Resource.new(object)
86
- else
87
- @literal = true
88
- scanner.getch
89
- object = scanner.scan_until(/("\s?\.\s*\n?$)|("@[A-z])|("\^\^)/)
90
- scanner.pos=(scanner.pos-2)
91
- object.sub!(/"..$/,'')
92
- if object.respond_to?(:force_encoding)
93
- object.force_encoding('utf-8').chomp!
78
+ class Parser
79
+ # Choose the best format parser from an admittedly small group of choices.
80
+ def self.parse(rdf, format=nil)
81
+ if format
82
+ parser = case format
83
+ when 'rdfxml' then XMLParser.new(rdf)
84
+ when 'rdfa' then RDFAParser.new(rdf)
85
+ when 'ntriples' then NTriplesParser.new(rdf)
86
+ when 'json' then JSONParser.new(rdf)
87
+ end
94
88
  else
95
- uscan = UTF8Parser.new(object)
96
- object = uscan.parse_string.chomp
89
+ begin
90
+ # Check if the format is XML or RDFa
91
+ doc = Nokogiri::XML.parse(rdf, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
92
+ raise "Unable to parse XML/HTML document -- no namespace declared" unless doc.root.namespaces
93
+ if doc.root.namespaces.values.index("http://www.w3.org/1999/xhtml")
94
+ parser = RDFAParser.new(doc)
95
+ else
96
+ parser = XMLParser.new(doc)
97
+ end
98
+ rescue Nokogiri::XML::SyntaxError
99
+ begin
100
+ if rdf.respond_to?(:read)
101
+ rdf.rewind
102
+ json = JSON.parse(rdf.read)
103
+ else
104
+ json = JSON.parse(rdf)
105
+ end
106
+ parser = JSONParser.new(json)
107
+ rescue JSON::ParserError
108
+ if rdf.respond_to?(:read)
109
+ rdf.rewind
110
+ end
111
+ parser = NTriplesParser.new(rdf)
112
+ end
113
+ end
97
114
  end
98
- if scanner.match?(/@/)
115
+ parser.parse
116
+ end
117
+ attr_reader :collection
118
+ def initialize(data=nil)
119
+ @collection = Collection.new
120
+ self.data=(data) if data
121
+ end
122
+
123
+ def find_or_create(uri)
124
+ return @collection.resources[uri] if @collection.resources[uri]
125
+ Resource.new(uri)
126
+ end
127
+ end
128
+ class NTriplesParser < RDFObject::Parser
129
+
130
+ def parse_ntriple(ntriple)
131
+ if ntriple.respond_to?(:force_encoding)
132
+ ntriple.force_encoding("ASCII-8BIT")
133
+ end
134
+ scanner = StringScanner.new(ntriple)
135
+ subject = scanner.scan_until(/> /)
136
+ subject.sub!(/^</,'')
137
+ subject.sub!(/> $/,'')
138
+ predicate = scanner.scan_until(/> /)
139
+ predicate.sub!(/^</,'')
140
+ predicate.sub!(/> $/,'')
141
+ if scanner.match?(/</)
142
+ tmp_object = scanner.scan_until(/>\s?\.\s*\n?$/)
143
+ tmp_object.sub!(/^</,'')
144
+ tmp_object.sub!(/>\s?\.\s*\n?$/,'')
145
+ object = find_or_create(tmp_object)
146
+ @collection[object.uri] = object
147
+ else
148
+ language = nil
149
+ data_type = nil
99
150
  scanner.getch
100
- @language = scanner.scan_until(/\s?\.\n?$/)
101
- @language.sub!(/\s?\.\n?$/,'')
102
- elsif scanner.match?(/\^\^/)
103
- scanner.skip_until(/</)
104
- @data_type = scanner.scan_until(/>/)
105
- @data_type.sub!(/>$/,'')
151
+ tmp_object = scanner.scan_until(/("\s?\.\s*\n?$)|("@[A-z])|("\^\^)/)
152
+ scanner.pos=(scanner.pos-2)
153
+ tmp_object.sub!(/"..$/,'')
154
+ if tmp_object.respond_to?(:force_encoding)
155
+ tmp_object.force_encoding('utf-8').chomp!
156
+ else
157
+ uscan = UTF8Parser.new(tmp_object)
158
+ tmp_object = uscan.parse_string.chomp
159
+ end
160
+ if scanner.match?(/@/)
161
+ scanner.getch
162
+ language = scanner.scan_until(/\s?\.\n?$/)
163
+ language.sub!(/\s?\.\n?$/,'')
164
+ elsif scanner.match?(/\^\^/)
165
+ scanner.skip_until(/</)
166
+ data_type = scanner.scan_until(/>/)
167
+ data_type.sub!(/>$/,'')
168
+ end
169
+ object = Literal.new(tmp_object,{:data_type=>data_type,:language=>language})
106
170
  end
107
- @object = Literal.new(object,{:data_type=>@data_type,:language=>@language})
171
+ [subject, predicate, object]
108
172
  end
109
- end
110
-
111
- def self.parse(resources)
112
- collection = []
113
- if resources.is_a?(String)
114
- assertions = resources.split("\n")
115
- elsif resources.is_a?(Array)
116
- assertions = resources
117
- elsif resources.respond_to?(:read)
118
- assertions = resources.readlines
173
+
174
+ def data=(ntriples)
175
+ if ntriples.is_a?(String)
176
+ @ntriples = ntriples.split("\n")
177
+ elsif ntriples.is_a?(Array)
178
+ @ntriples = ntriples
179
+ elsif ntriples.respond_to?(:read)
180
+ @ntriples = ntriples.readlines
181
+ end
119
182
  end
120
- assertions.each do | assertion |
121
- next if assertion[0, 1] == "#" # Ignore comments
122
- triple = self.new(assertion)
123
- resource = Resource.new(triple.subject)
124
- resource.assert(triple.predicate, triple.object)
125
- collection << resource
183
+
184
+ def parse
185
+ @ntriples.each do | assertion |
186
+ next if assertion[0, 1] == "#" # Ignore comments
187
+ triple = parse_ntriple(assertion)
188
+ resource = find_or_create(triple[0])
189
+ resource.assert(triple[1], triple[2])
190
+ @collection[resource.uri] = resource
191
+ end
192
+ @collection
126
193
  end
127
- collection.uniq
128
194
  end
129
- end
130
195
 
131
- class XMLParser
132
- #
133
- # A very unsophisticated RDF/XML Parser -- currently only parses RDF/XML that conforms to
134
- # the SimpleRdfXml convention: http://esw.w3.org/topic/SimpleRdfXml. This is a pragmatic
135
- # rather than dogmatic decision. If it is not working with your RDF/XML let me know and we
136
- # can probably fix it.
137
- #
138
- def self.parse(doc)
139
- namespaces = doc.namespaces
140
- #if namespaces.index("http://purl.org/rss/1.0/")
141
- # collection = parse_rss10(doc)
142
- if namespaces.index("http://www.w3.org/2005/sparql-results#")
143
- raise "Sorry, SPARQL not yet supported"
144
- else
145
- collection = parse_rdfxml(doc)
196
+ class XMLParser < RDFObject::Parser
197
+ #
198
+ # A very unsophisticated RDF/XML Parser -- currently only parses RDF/XML that conforms to
199
+ # the SimpleRdfXml convention: http://esw.w3.org/topic/SimpleRdfXml. This is a pragmatic
200
+ # rather than dogmatic decision. If it is not working with your RDF/XML let me know and we
201
+ # can probably fix it.
202
+ #
203
+
204
+ def parse
205
+ namespaces = @rdfxml.namespaces
206
+ if namespaces.index("http://purl.org/rss/1.0/")
207
+ fix_rss10
208
+ end
209
+ if namespaces.index("http://www.w3.org/2005/sparql-results#")
210
+ raise "Sorry, SPARQL not yet supported"
211
+ else
212
+ parse_rdfxml
213
+ end
214
+ @collection
146
215
  end
147
- collection.uniq
148
- end
149
-
150
- def self.parse_resource_node(resource_node, collection)
151
- resource = Resource.new(resource_node.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
152
- unless (resource_node.name == "Description" and resource_node.namespace.href == "http://www.w3.org/1999/02/22-rdf-syntax-ns#") or
153
- (resource_node.name == "item" and resource_node.namespace.href == "http://purl.org/rss/1.0/")
154
- resource.assert("[rdf:type]",Resource.new("#{resource_node.namespace.href}#{resource_node.name}"))
216
+
217
+ def data=(xml)
218
+ if xml.is_a?(Nokogiri::XML::Document)
219
+ @rdfxml = xml
220
+ else
221
+ @rdfxml = Nokogiri::XML.parse(xml, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
222
+ end
155
223
  end
156
- resource_node.children.each do | child |
157
- next if child.text?
158
- predicate = "#{child.namespace.href}#{child.name}"
159
- if object_uri = child.attribute_with_ns("resource", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
160
- obj_resource = Resource.new(object_uri.value)
161
- resource.assert(predicate, obj_resource)
162
- collection << obj_resource
163
- elsif all_text?(child)
164
- opts = {}
165
- if lang = child.attribute_with_ns("lang", "http://www.w3.org/XML/1998/namespace")
166
- opts[:language] = lang.value
224
+
225
+ def parse_resource_node(resource_node, collection)
226
+ resource = find_or_create(resource_node.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
227
+ unless (resource_node.name == "Description" and resource_node.namespace.href == "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
228
+ resource.assert("[rdf:type]", find_or_create("#{resource_node.namespace.href}#{resource_node.name}"))
229
+ end
230
+ resource_node.children.each do | child |
231
+ next if child.text?
232
+ predicate = "#{child.namespace.href}#{child.name}"
233
+ if object_uri = child.attribute_with_ns("resource", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
234
+ obj_resource = find_or_create(object_uri.value)
235
+ resource.assert(predicate, obj_resource)
236
+ @collection[obj_resource.uri] = obj_resource
237
+ elsif all_text?(child)
238
+ opts = {}
239
+ if lang = child.attribute_with_ns("lang", "http://www.w3.org/XML/1998/namespace")
240
+ opts[:language] = lang.value
241
+ end
242
+ if datatype = child.attribute_with_ns("datatype", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
243
+ opts[:data_type] = datatype.value
244
+ end
245
+ resource.assert(predicate, Literal.new(child.content.strip,opts))
167
246
  end
168
- if datatype = child.attribute_with_ns("datatype", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
169
- opts[:data_type] = datatype.value
247
+ child.xpath("./*[@rdf:about]").each do | grandchild |
248
+ gc_resource = find_or_create(grandchild.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
249
+ resource.assert(predicate, gc_resource)
250
+ @collection[gc_resource.uri] = gc_resource
251
+ parse_resource_node(grandchild, collection)
170
252
  end
171
- resource.assert(predicate, Literal.new(child.content.strip,opts))
172
- end
173
- child.xpath("./*[@rdf:about]").each do | grandchild |
174
- gc_resource = Resource.new(grandchild.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
175
- resource.assert(predicate, gc_resource)
176
- collection << gc_resource
177
- parse_resource_node(grandchild, collection)
178
253
  end
254
+ @collection[resource.uri] = resource
179
255
  end
180
- collection << resource
181
- end
182
256
 
183
- def self.all_text?(node)
184
- node.children.each do | child |
185
- return false unless child.text?
257
+ def all_text?(node)
258
+ node.children.each do | child |
259
+ return false unless child.text?
260
+ end
261
+ true
186
262
  end
187
- true
188
- end
189
263
 
190
- def self.parse_rdfxml(doc)
191
- collection = []
192
- doc.root.xpath("./*[@rdf:about]").each do | resource_node |
193
- parse_resource_node(resource_node, collection)
194
- end
195
- collection
196
- end
264
+ def parse_rdfxml
265
+ collection = []
266
+ @rdfxml.root.xpath("./*[@rdf:about]").each do | resource_node |
267
+ parse_resource_node(resource_node, collection)
268
+ end
269
+ end
197
270
 
198
- #def self.parse_rss10(doc)
199
- # collection = []
200
- # doc.root.xpath("./rss:item","rss"=>"http://purl.org/rss/1.0/").each do | resource_node |
201
- # parse_resource_node(resource_node, collection)
202
- # end
203
- # collection
204
- #end
205
- end
206
-
207
- class RDFAParser
208
- def self.parse(doc)
209
- xslt = Nokogiri::XSLT(open(File.dirname(__FILE__) + '/../xsl/RDFa2RDFXML.xsl'))
210
- rdf_doc = xslt.apply_to(doc)
211
- XMLParser.parse(Nokogiri.parse(rdf_doc))
212
- end
213
- end
214
-
215
- class JSONParser
216
- def self.parse(json)
217
- collection = []
218
- json.each_pair do |subject, assertions|
219
- resource = Resource.new(subject)
220
- collection << resource
221
- assertions.each_pair do |predicate, objects|
222
- objects.each do | object |
223
- if object['type'] == 'literal'
224
- opts = {}
225
- if object['lang']
226
- opts[:language] = object['lang']
227
- end
228
- if object['datatype']
229
- opts[:data_type] = object['datatype']
230
- end
231
- literal = Literal.new(object['value'],opts)
232
- resource.assert(predicate, literal)
233
- elsif object['type'] == 'uri'
234
- o = Resource.new(object['value'])
235
- resource.assert(predicate, o)
236
- collection << o
237
- elsif object['type'] == 'bnode' # For now, we're going to treat a blank node like a URI resource.
238
- o = Resource.new(object['value'])
239
- resource.assert(predicate, o)
240
- collection << o
241
- end
271
+ def fix_rss10
272
+ @rdfxml.root.xpath('./rss:channel/rss:items/rdf:Seq/rdf:li', {"rdf"=>"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
273
+ "rss"=>"http://purl.org/rss/1.0/"}).each do | li |
274
+ if li['resource'] && !li["rdf:resource"]
275
+ li["rdf:resource"] = li["resource"]
242
276
  end
243
277
  end
244
278
  end
245
- collection.uniq
246
279
  end
247
- end
248
280
 
249
- class Parser
250
- # Choose the best format parser from an admittedly small group of choices.
251
- def self.parse(rdf)
252
- begin
253
- # Check if the format is XML or RDFa
254
- doc = Nokogiri::XML.parse(rdf, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
255
- raise "Unable to parse XML/HTML document -- no namespace declared" unless doc.root.namespaces
256
- if doc.root.namespaces.values.index("http://www.w3.org/1999/xhtml")
257
- collection = RDFAParser.parse(doc)
281
+ class RDFAParser < XMLParser
282
+ def data=(xhtml)
283
+ if xhtml.is_a?(Nokogiri::XML::Document)
284
+ doc = xhtml
258
285
  else
259
- collection = XMLParser.parse(doc)
286
+ doc = Nokogiri::HTML.parse(xhtml)
260
287
  end
261
- rescue Nokogiri::XML::SyntaxError
262
- begin
263
- if rdf.respond_to?(:read)
264
- rdf.rewind
265
- json = JSON.parse(rdf.read)
266
- else
267
- json = JSON.parse(rdf)
268
- end
269
- collection = JSONParser.parse(json)
270
- rescue JSON::ParserError
271
- if rdf.respond_to?(:read)
272
- rdf.rewind
288
+ xslt = Nokogiri::XSLT(open(File.dirname(__FILE__) + '/../xsl/RDFa2RDFXML.xsl'))
289
+ rdfxml = xslt.apply_to(doc)
290
+ @rdfxml = Nokogiri::XML.parse(rdfxml, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
291
+ end
292
+ end
293
+
294
+ class JSONParser < RDFObject::Parser
295
+
296
+ def data=(json)
297
+ if json.is_a?(String)
298
+ @json = JSON.parse(json)
299
+ elsif json.is_a?(Hash)
300
+ @json = json
301
+ elsif json.respond_to?(:read)
302
+ @json = JSON.parse(json.read)
303
+ end
304
+ end
305
+
306
+ def parse
307
+ @json.each_pair do |subject, assertions|
308
+ resource = find_or_create(subject)
309
+ @collection[resource.uri] = resource
310
+ assertions.each_pair do |predicate, objects|
311
+ objects.each do | object |
312
+ if object['type'] == 'literal'
313
+ opts = {}
314
+ if object['lang']
315
+ opts[:language] = object['lang']
316
+ end
317
+ if object['datatype']
318
+ opts[:data_type] = object['datatype']
319
+ end
320
+ literal = Literal.new(object['value'],opts)
321
+ resource.assert(predicate, literal)
322
+ elsif object['type'] == 'uri'
323
+ o = find_or_create(object['value'])
324
+ resource.assert(predicate, o)
325
+ @collection[o.uri] = o
326
+ elsif object['type'] == 'bnode' # For now, we're going to treat a blank node like a URI resource.
327
+ o = find_or_create(object['value'])
328
+ resource.assert(predicate, o)
329
+ @collection[o.uri] = o
330
+ end
331
+ end
273
332
  end
274
- collection = NTriplesParser.parse(rdf)
275
333
  end
334
+ @collection
276
335
  end
277
- collection
278
336
  end
279
- end
337
+
338
+
280
339
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rdfobjects
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ross Singer
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-10-08 00:00:00 -04:00
12
+ date: 2009-10-09 00:00:00 -04:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency