rdfobjects 0.4.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rdf_objects/parsers.rb +245 -186
- metadata +2 -2
data/lib/rdf_objects/parsers.rb
CHANGED
@@ -58,223 +58,282 @@ class UTF8Parser < StringScanner
|
|
58
58
|
raise StandardError, "Caught #{e.class}: #{e}"
|
59
59
|
end
|
60
60
|
end
|
61
|
-
module RDFObject
|
62
|
-
class
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
61
|
+
module RDFObject
|
62
|
+
class Collection < Hash
|
63
|
+
attr_accessor :objects
|
64
|
+
def initialize(subjects=true)
|
65
|
+
@objects = Collection.new(false) if subjects
|
66
|
+
end
|
67
|
+
def uris
|
68
|
+
return self.keys
|
69
|
+
end
|
70
|
+
def find_by_type(type)
|
71
|
+
self.find_all {|r| r}
|
72
|
+
end
|
73
|
+
def resources
|
74
|
+
self.merge(@objects)
|
69
75
|
end
|
70
|
-
parse_ntriple
|
71
76
|
end
|
72
77
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
object.sub!(/^</,'')
|
84
|
-
object.sub!(/>\s?\.\s*\n?$/,'')
|
85
|
-
@object = Resource.new(object)
|
86
|
-
else
|
87
|
-
@literal = true
|
88
|
-
scanner.getch
|
89
|
-
object = scanner.scan_until(/("\s?\.\s*\n?$)|("@[A-z])|("\^\^)/)
|
90
|
-
scanner.pos=(scanner.pos-2)
|
91
|
-
object.sub!(/"..$/,'')
|
92
|
-
if object.respond_to?(:force_encoding)
|
93
|
-
object.force_encoding('utf-8').chomp!
|
78
|
+
class Parser
|
79
|
+
# Choose the best format parser from an admittedly small group of choices.
|
80
|
+
def self.parse(rdf, format=nil)
|
81
|
+
if format
|
82
|
+
parser = case format
|
83
|
+
when 'rdfxml' then XMLParser.new(rdf)
|
84
|
+
when 'rdfa' then RDFAParser.new(rdf)
|
85
|
+
when 'ntriples' then NTriplesParser.new(rdf)
|
86
|
+
when 'json' then JSONParser.new(rdf)
|
87
|
+
end
|
94
88
|
else
|
95
|
-
|
96
|
-
|
89
|
+
begin
|
90
|
+
# Check if the format is XML or RDFa
|
91
|
+
doc = Nokogiri::XML.parse(rdf, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
|
92
|
+
raise "Unable to parse XML/HTML document -- no namespace declared" unless doc.root.namespaces
|
93
|
+
if doc.root.namespaces.values.index("http://www.w3.org/1999/xhtml")
|
94
|
+
parser = RDFAParser.new(doc)
|
95
|
+
else
|
96
|
+
parser = XMLParser.new(doc)
|
97
|
+
end
|
98
|
+
rescue Nokogiri::XML::SyntaxError
|
99
|
+
begin
|
100
|
+
if rdf.respond_to?(:read)
|
101
|
+
rdf.rewind
|
102
|
+
json = JSON.parse(rdf.read)
|
103
|
+
else
|
104
|
+
json = JSON.parse(rdf)
|
105
|
+
end
|
106
|
+
parser = JSONParser.new(json)
|
107
|
+
rescue JSON::ParserError
|
108
|
+
if rdf.respond_to?(:read)
|
109
|
+
rdf.rewind
|
110
|
+
end
|
111
|
+
parser = NTriplesParser.new(rdf)
|
112
|
+
end
|
113
|
+
end
|
97
114
|
end
|
98
|
-
|
115
|
+
parser.parse
|
116
|
+
end
|
117
|
+
attr_reader :collection
|
118
|
+
def initialize(data=nil)
|
119
|
+
@collection = Collection.new
|
120
|
+
self.data=(data) if data
|
121
|
+
end
|
122
|
+
|
123
|
+
def find_or_create(uri)
|
124
|
+
return @collection.resources[uri] if @collection.resources[uri]
|
125
|
+
Resource.new(uri)
|
126
|
+
end
|
127
|
+
end
|
128
|
+
class NTriplesParser < RDFObject::Parser
|
129
|
+
|
130
|
+
def parse_ntriple(ntriple)
|
131
|
+
if ntriple.respond_to?(:force_encoding)
|
132
|
+
ntriple.force_encoding("ASCII-8BIT")
|
133
|
+
end
|
134
|
+
scanner = StringScanner.new(ntriple)
|
135
|
+
subject = scanner.scan_until(/> /)
|
136
|
+
subject.sub!(/^</,'')
|
137
|
+
subject.sub!(/> $/,'')
|
138
|
+
predicate = scanner.scan_until(/> /)
|
139
|
+
predicate.sub!(/^</,'')
|
140
|
+
predicate.sub!(/> $/,'')
|
141
|
+
if scanner.match?(/</)
|
142
|
+
tmp_object = scanner.scan_until(/>\s?\.\s*\n?$/)
|
143
|
+
tmp_object.sub!(/^</,'')
|
144
|
+
tmp_object.sub!(/>\s?\.\s*\n?$/,'')
|
145
|
+
object = find_or_create(tmp_object)
|
146
|
+
@collection[object.uri] = object
|
147
|
+
else
|
148
|
+
language = nil
|
149
|
+
data_type = nil
|
99
150
|
scanner.getch
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
151
|
+
tmp_object = scanner.scan_until(/("\s?\.\s*\n?$)|("@[A-z])|("\^\^)/)
|
152
|
+
scanner.pos=(scanner.pos-2)
|
153
|
+
tmp_object.sub!(/"..$/,'')
|
154
|
+
if tmp_object.respond_to?(:force_encoding)
|
155
|
+
tmp_object.force_encoding('utf-8').chomp!
|
156
|
+
else
|
157
|
+
uscan = UTF8Parser.new(tmp_object)
|
158
|
+
tmp_object = uscan.parse_string.chomp
|
159
|
+
end
|
160
|
+
if scanner.match?(/@/)
|
161
|
+
scanner.getch
|
162
|
+
language = scanner.scan_until(/\s?\.\n?$/)
|
163
|
+
language.sub!(/\s?\.\n?$/,'')
|
164
|
+
elsif scanner.match?(/\^\^/)
|
165
|
+
scanner.skip_until(/</)
|
166
|
+
data_type = scanner.scan_until(/>/)
|
167
|
+
data_type.sub!(/>$/,'')
|
168
|
+
end
|
169
|
+
object = Literal.new(tmp_object,{:data_type=>data_type,:language=>language})
|
106
170
|
end
|
107
|
-
|
171
|
+
[subject, predicate, object]
|
108
172
|
end
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
assertions = resources.readlines
|
173
|
+
|
174
|
+
def data=(ntriples)
|
175
|
+
if ntriples.is_a?(String)
|
176
|
+
@ntriples = ntriples.split("\n")
|
177
|
+
elsif ntriples.is_a?(Array)
|
178
|
+
@ntriples = ntriples
|
179
|
+
elsif ntriples.respond_to?(:read)
|
180
|
+
@ntriples = ntriples.readlines
|
181
|
+
end
|
119
182
|
end
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
183
|
+
|
184
|
+
def parse
|
185
|
+
@ntriples.each do | assertion |
|
186
|
+
next if assertion[0, 1] == "#" # Ignore comments
|
187
|
+
triple = parse_ntriple(assertion)
|
188
|
+
resource = find_or_create(triple[0])
|
189
|
+
resource.assert(triple[1], triple[2])
|
190
|
+
@collection[resource.uri] = resource
|
191
|
+
end
|
192
|
+
@collection
|
126
193
|
end
|
127
|
-
collection.uniq
|
128
194
|
end
|
129
|
-
end
|
130
195
|
|
131
|
-
class XMLParser
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
196
|
+
class XMLParser < RDFObject::Parser
|
197
|
+
#
|
198
|
+
# A very unsophisticated RDF/XML Parser -- currently only parses RDF/XML that conforms to
|
199
|
+
# the SimpleRdfXml convention: http://esw.w3.org/topic/SimpleRdfXml. This is a pragmatic
|
200
|
+
# rather than dogmatic decision. If it is not working with your RDF/XML let me know and we
|
201
|
+
# can probably fix it.
|
202
|
+
#
|
203
|
+
|
204
|
+
def parse
|
205
|
+
namespaces = @rdfxml.namespaces
|
206
|
+
if namespaces.index("http://purl.org/rss/1.0/")
|
207
|
+
fix_rss10
|
208
|
+
end
|
209
|
+
if namespaces.index("http://www.w3.org/2005/sparql-results#")
|
210
|
+
raise "Sorry, SPARQL not yet supported"
|
211
|
+
else
|
212
|
+
parse_rdfxml
|
213
|
+
end
|
214
|
+
@collection
|
146
215
|
end
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
resource.assert("[rdf:type]",Resource.new("#{resource_node.namespace.href}#{resource_node.name}"))
|
216
|
+
|
217
|
+
def data=(xml)
|
218
|
+
if xml.is_a?(Nokogiri::XML::Document)
|
219
|
+
@rdfxml = xml
|
220
|
+
else
|
221
|
+
@rdfxml = Nokogiri::XML.parse(xml, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
|
222
|
+
end
|
155
223
|
end
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
if
|
166
|
-
|
224
|
+
|
225
|
+
def parse_resource_node(resource_node, collection)
|
226
|
+
resource = find_or_create(resource_node.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
|
227
|
+
unless (resource_node.name == "Description" and resource_node.namespace.href == "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
|
228
|
+
resource.assert("[rdf:type]", find_or_create("#{resource_node.namespace.href}#{resource_node.name}"))
|
229
|
+
end
|
230
|
+
resource_node.children.each do | child |
|
231
|
+
next if child.text?
|
232
|
+
predicate = "#{child.namespace.href}#{child.name}"
|
233
|
+
if object_uri = child.attribute_with_ns("resource", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
|
234
|
+
obj_resource = find_or_create(object_uri.value)
|
235
|
+
resource.assert(predicate, obj_resource)
|
236
|
+
@collection[obj_resource.uri] = obj_resource
|
237
|
+
elsif all_text?(child)
|
238
|
+
opts = {}
|
239
|
+
if lang = child.attribute_with_ns("lang", "http://www.w3.org/XML/1998/namespace")
|
240
|
+
opts[:language] = lang.value
|
241
|
+
end
|
242
|
+
if datatype = child.attribute_with_ns("datatype", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
|
243
|
+
opts[:data_type] = datatype.value
|
244
|
+
end
|
245
|
+
resource.assert(predicate, Literal.new(child.content.strip,opts))
|
167
246
|
end
|
168
|
-
|
169
|
-
|
247
|
+
child.xpath("./*[@rdf:about]").each do | grandchild |
|
248
|
+
gc_resource = find_or_create(grandchild.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
|
249
|
+
resource.assert(predicate, gc_resource)
|
250
|
+
@collection[gc_resource.uri] = gc_resource
|
251
|
+
parse_resource_node(grandchild, collection)
|
170
252
|
end
|
171
|
-
resource.assert(predicate, Literal.new(child.content.strip,opts))
|
172
|
-
end
|
173
|
-
child.xpath("./*[@rdf:about]").each do | grandchild |
|
174
|
-
gc_resource = Resource.new(grandchild.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
|
175
|
-
resource.assert(predicate, gc_resource)
|
176
|
-
collection << gc_resource
|
177
|
-
parse_resource_node(grandchild, collection)
|
178
253
|
end
|
254
|
+
@collection[resource.uri] = resource
|
179
255
|
end
|
180
|
-
collection << resource
|
181
|
-
end
|
182
256
|
|
183
|
-
|
184
|
-
|
185
|
-
|
257
|
+
def all_text?(node)
|
258
|
+
node.children.each do | child |
|
259
|
+
return false unless child.text?
|
260
|
+
end
|
261
|
+
true
|
186
262
|
end
|
187
|
-
true
|
188
|
-
end
|
189
263
|
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
end
|
264
|
+
def parse_rdfxml
|
265
|
+
collection = []
|
266
|
+
@rdfxml.root.xpath("./*[@rdf:about]").each do | resource_node |
|
267
|
+
parse_resource_node(resource_node, collection)
|
268
|
+
end
|
269
|
+
end
|
197
270
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
# collection
|
204
|
-
#end
|
205
|
-
end
|
206
|
-
|
207
|
-
class RDFAParser
|
208
|
-
def self.parse(doc)
|
209
|
-
xslt = Nokogiri::XSLT(open(File.dirname(__FILE__) + '/../xsl/RDFa2RDFXML.xsl'))
|
210
|
-
rdf_doc = xslt.apply_to(doc)
|
211
|
-
XMLParser.parse(Nokogiri.parse(rdf_doc))
|
212
|
-
end
|
213
|
-
end
|
214
|
-
|
215
|
-
class JSONParser
|
216
|
-
def self.parse(json)
|
217
|
-
collection = []
|
218
|
-
json.each_pair do |subject, assertions|
|
219
|
-
resource = Resource.new(subject)
|
220
|
-
collection << resource
|
221
|
-
assertions.each_pair do |predicate, objects|
|
222
|
-
objects.each do | object |
|
223
|
-
if object['type'] == 'literal'
|
224
|
-
opts = {}
|
225
|
-
if object['lang']
|
226
|
-
opts[:language] = object['lang']
|
227
|
-
end
|
228
|
-
if object['datatype']
|
229
|
-
opts[:data_type] = object['datatype']
|
230
|
-
end
|
231
|
-
literal = Literal.new(object['value'],opts)
|
232
|
-
resource.assert(predicate, literal)
|
233
|
-
elsif object['type'] == 'uri'
|
234
|
-
o = Resource.new(object['value'])
|
235
|
-
resource.assert(predicate, o)
|
236
|
-
collection << o
|
237
|
-
elsif object['type'] == 'bnode' # For now, we're going to treat a blank node like a URI resource.
|
238
|
-
o = Resource.new(object['value'])
|
239
|
-
resource.assert(predicate, o)
|
240
|
-
collection << o
|
241
|
-
end
|
271
|
+
def fix_rss10
|
272
|
+
@rdfxml.root.xpath('./rss:channel/rss:items/rdf:Seq/rdf:li', {"rdf"=>"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
273
|
+
"rss"=>"http://purl.org/rss/1.0/"}).each do | li |
|
274
|
+
if li['resource'] && !li["rdf:resource"]
|
275
|
+
li["rdf:resource"] = li["resource"]
|
242
276
|
end
|
243
277
|
end
|
244
278
|
end
|
245
|
-
collection.uniq
|
246
279
|
end
|
247
|
-
end
|
248
280
|
|
249
|
-
class
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
# Check if the format is XML or RDFa
|
254
|
-
doc = Nokogiri::XML.parse(rdf, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
|
255
|
-
raise "Unable to parse XML/HTML document -- no namespace declared" unless doc.root.namespaces
|
256
|
-
if doc.root.namespaces.values.index("http://www.w3.org/1999/xhtml")
|
257
|
-
collection = RDFAParser.parse(doc)
|
281
|
+
class RDFAParser < XMLParser
|
282
|
+
def data=(xhtml)
|
283
|
+
if xhtml.is_a?(Nokogiri::XML::Document)
|
284
|
+
doc = xhtml
|
258
285
|
else
|
259
|
-
|
286
|
+
doc = Nokogiri::HTML.parse(xhtml)
|
260
287
|
end
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
288
|
+
xslt = Nokogiri::XSLT(open(File.dirname(__FILE__) + '/../xsl/RDFa2RDFXML.xsl'))
|
289
|
+
rdfxml = xslt.apply_to(doc)
|
290
|
+
@rdfxml = Nokogiri::XML.parse(rdfxml, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
|
291
|
+
end
|
292
|
+
end
|
293
|
+
|
294
|
+
class JSONParser < RDFObject::Parser
|
295
|
+
|
296
|
+
def data=(json)
|
297
|
+
if json.is_a?(String)
|
298
|
+
@json = JSON.parse(json)
|
299
|
+
elsif json.is_a?(Hash)
|
300
|
+
@json = json
|
301
|
+
elsif json.respond_to?(:read)
|
302
|
+
@json = JSON.parse(json.read)
|
303
|
+
end
|
304
|
+
end
|
305
|
+
|
306
|
+
def parse
|
307
|
+
@json.each_pair do |subject, assertions|
|
308
|
+
resource = find_or_create(subject)
|
309
|
+
@collection[resource.uri] = resource
|
310
|
+
assertions.each_pair do |predicate, objects|
|
311
|
+
objects.each do | object |
|
312
|
+
if object['type'] == 'literal'
|
313
|
+
opts = {}
|
314
|
+
if object['lang']
|
315
|
+
opts[:language] = object['lang']
|
316
|
+
end
|
317
|
+
if object['datatype']
|
318
|
+
opts[:data_type] = object['datatype']
|
319
|
+
end
|
320
|
+
literal = Literal.new(object['value'],opts)
|
321
|
+
resource.assert(predicate, literal)
|
322
|
+
elsif object['type'] == 'uri'
|
323
|
+
o = find_or_create(object['value'])
|
324
|
+
resource.assert(predicate, o)
|
325
|
+
@collection[o.uri] = o
|
326
|
+
elsif object['type'] == 'bnode' # For now, we're going to treat a blank node like a URI resource.
|
327
|
+
o = find_or_create(object['value'])
|
328
|
+
resource.assert(predicate, o)
|
329
|
+
@collection[o.uri] = o
|
330
|
+
end
|
331
|
+
end
|
273
332
|
end
|
274
|
-
collection = NTriplesParser.parse(rdf)
|
275
333
|
end
|
334
|
+
@collection
|
276
335
|
end
|
277
|
-
collection
|
278
336
|
end
|
279
|
-
|
337
|
+
|
338
|
+
|
280
339
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rdfobjects
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ross Singer
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-10-
|
12
|
+
date: 2009-10-09 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|