rdfobjects 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rdf_objects/parsers.rb +245 -186
- metadata +2 -2
data/lib/rdf_objects/parsers.rb
CHANGED
@@ -58,223 +58,282 @@ class UTF8Parser < StringScanner
|
|
58
58
|
raise StandardError, "Caught #{e.class}: #{e}"
|
59
59
|
end
|
60
60
|
end
|
61
|
-
module RDFObject
|
62
|
-
class
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
61
|
+
module RDFObject
|
62
|
+
class Collection < Hash
|
63
|
+
attr_accessor :objects
|
64
|
+
def initialize(subjects=true)
|
65
|
+
@objects = Collection.new(false) if subjects
|
66
|
+
end
|
67
|
+
def uris
|
68
|
+
return self.keys
|
69
|
+
end
|
70
|
+
def find_by_type(type)
|
71
|
+
self.find_all {|r| r}
|
72
|
+
end
|
73
|
+
def resources
|
74
|
+
self.merge(@objects)
|
69
75
|
end
|
70
|
-
parse_ntriple
|
71
76
|
end
|
72
77
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
object.sub!(/^</,'')
|
84
|
-
object.sub!(/>\s?\.\s*\n?$/,'')
|
85
|
-
@object = Resource.new(object)
|
86
|
-
else
|
87
|
-
@literal = true
|
88
|
-
scanner.getch
|
89
|
-
object = scanner.scan_until(/("\s?\.\s*\n?$)|("@[A-z])|("\^\^)/)
|
90
|
-
scanner.pos=(scanner.pos-2)
|
91
|
-
object.sub!(/"..$/,'')
|
92
|
-
if object.respond_to?(:force_encoding)
|
93
|
-
object.force_encoding('utf-8').chomp!
|
78
|
+
class Parser
|
79
|
+
# Choose the best format parser from an admittedly small group of choices.
|
80
|
+
def self.parse(rdf, format=nil)
|
81
|
+
if format
|
82
|
+
parser = case format
|
83
|
+
when 'rdfxml' then XMLParser.new(rdf)
|
84
|
+
when 'rdfa' then RDFAParser.new(rdf)
|
85
|
+
when 'ntriples' then NTriplesParser.new(rdf)
|
86
|
+
when 'json' then JSONParser.new(rdf)
|
87
|
+
end
|
94
88
|
else
|
95
|
-
|
96
|
-
|
89
|
+
begin
|
90
|
+
# Check if the format is XML or RDFa
|
91
|
+
doc = Nokogiri::XML.parse(rdf, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
|
92
|
+
raise "Unable to parse XML/HTML document -- no namespace declared" unless doc.root.namespaces
|
93
|
+
if doc.root.namespaces.values.index("http://www.w3.org/1999/xhtml")
|
94
|
+
parser = RDFAParser.new(doc)
|
95
|
+
else
|
96
|
+
parser = XMLParser.new(doc)
|
97
|
+
end
|
98
|
+
rescue Nokogiri::XML::SyntaxError
|
99
|
+
begin
|
100
|
+
if rdf.respond_to?(:read)
|
101
|
+
rdf.rewind
|
102
|
+
json = JSON.parse(rdf.read)
|
103
|
+
else
|
104
|
+
json = JSON.parse(rdf)
|
105
|
+
end
|
106
|
+
parser = JSONParser.new(json)
|
107
|
+
rescue JSON::ParserError
|
108
|
+
if rdf.respond_to?(:read)
|
109
|
+
rdf.rewind
|
110
|
+
end
|
111
|
+
parser = NTriplesParser.new(rdf)
|
112
|
+
end
|
113
|
+
end
|
97
114
|
end
|
98
|
-
|
115
|
+
parser.parse
|
116
|
+
end
|
117
|
+
attr_reader :collection
|
118
|
+
def initialize(data=nil)
|
119
|
+
@collection = Collection.new
|
120
|
+
self.data=(data) if data
|
121
|
+
end
|
122
|
+
|
123
|
+
def find_or_create(uri)
|
124
|
+
return @collection.resources[uri] if @collection.resources[uri]
|
125
|
+
Resource.new(uri)
|
126
|
+
end
|
127
|
+
end
|
128
|
+
class NTriplesParser < RDFObject::Parser
|
129
|
+
|
130
|
+
def parse_ntriple(ntriple)
|
131
|
+
if ntriple.respond_to?(:force_encoding)
|
132
|
+
ntriple.force_encoding("ASCII-8BIT")
|
133
|
+
end
|
134
|
+
scanner = StringScanner.new(ntriple)
|
135
|
+
subject = scanner.scan_until(/> /)
|
136
|
+
subject.sub!(/^</,'')
|
137
|
+
subject.sub!(/> $/,'')
|
138
|
+
predicate = scanner.scan_until(/> /)
|
139
|
+
predicate.sub!(/^</,'')
|
140
|
+
predicate.sub!(/> $/,'')
|
141
|
+
if scanner.match?(/</)
|
142
|
+
tmp_object = scanner.scan_until(/>\s?\.\s*\n?$/)
|
143
|
+
tmp_object.sub!(/^</,'')
|
144
|
+
tmp_object.sub!(/>\s?\.\s*\n?$/,'')
|
145
|
+
object = find_or_create(tmp_object)
|
146
|
+
@collection[object.uri] = object
|
147
|
+
else
|
148
|
+
language = nil
|
149
|
+
data_type = nil
|
99
150
|
scanner.getch
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
151
|
+
tmp_object = scanner.scan_until(/("\s?\.\s*\n?$)|("@[A-z])|("\^\^)/)
|
152
|
+
scanner.pos=(scanner.pos-2)
|
153
|
+
tmp_object.sub!(/"..$/,'')
|
154
|
+
if tmp_object.respond_to?(:force_encoding)
|
155
|
+
tmp_object.force_encoding('utf-8').chomp!
|
156
|
+
else
|
157
|
+
uscan = UTF8Parser.new(tmp_object)
|
158
|
+
tmp_object = uscan.parse_string.chomp
|
159
|
+
end
|
160
|
+
if scanner.match?(/@/)
|
161
|
+
scanner.getch
|
162
|
+
language = scanner.scan_until(/\s?\.\n?$/)
|
163
|
+
language.sub!(/\s?\.\n?$/,'')
|
164
|
+
elsif scanner.match?(/\^\^/)
|
165
|
+
scanner.skip_until(/</)
|
166
|
+
data_type = scanner.scan_until(/>/)
|
167
|
+
data_type.sub!(/>$/,'')
|
168
|
+
end
|
169
|
+
object = Literal.new(tmp_object,{:data_type=>data_type,:language=>language})
|
106
170
|
end
|
107
|
-
|
171
|
+
[subject, predicate, object]
|
108
172
|
end
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
assertions = resources.readlines
|
173
|
+
|
174
|
+
def data=(ntriples)
|
175
|
+
if ntriples.is_a?(String)
|
176
|
+
@ntriples = ntriples.split("\n")
|
177
|
+
elsif ntriples.is_a?(Array)
|
178
|
+
@ntriples = ntriples
|
179
|
+
elsif ntriples.respond_to?(:read)
|
180
|
+
@ntriples = ntriples.readlines
|
181
|
+
end
|
119
182
|
end
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
183
|
+
|
184
|
+
def parse
|
185
|
+
@ntriples.each do | assertion |
|
186
|
+
next if assertion[0, 1] == "#" # Ignore comments
|
187
|
+
triple = parse_ntriple(assertion)
|
188
|
+
resource = find_or_create(triple[0])
|
189
|
+
resource.assert(triple[1], triple[2])
|
190
|
+
@collection[resource.uri] = resource
|
191
|
+
end
|
192
|
+
@collection
|
126
193
|
end
|
127
|
-
collection.uniq
|
128
194
|
end
|
129
|
-
end
|
130
195
|
|
131
|
-
class XMLParser
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
196
|
+
class XMLParser < RDFObject::Parser
|
197
|
+
#
|
198
|
+
# A very unsophisticated RDF/XML Parser -- currently only parses RDF/XML that conforms to
|
199
|
+
# the SimpleRdfXml convention: http://esw.w3.org/topic/SimpleRdfXml. This is a pragmatic
|
200
|
+
# rather than dogmatic decision. If it is not working with your RDF/XML let me know and we
|
201
|
+
# can probably fix it.
|
202
|
+
#
|
203
|
+
|
204
|
+
def parse
|
205
|
+
namespaces = @rdfxml.namespaces
|
206
|
+
if namespaces.index("http://purl.org/rss/1.0/")
|
207
|
+
fix_rss10
|
208
|
+
end
|
209
|
+
if namespaces.index("http://www.w3.org/2005/sparql-results#")
|
210
|
+
raise "Sorry, SPARQL not yet supported"
|
211
|
+
else
|
212
|
+
parse_rdfxml
|
213
|
+
end
|
214
|
+
@collection
|
146
215
|
end
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
resource.assert("[rdf:type]",Resource.new("#{resource_node.namespace.href}#{resource_node.name}"))
|
216
|
+
|
217
|
+
def data=(xml)
|
218
|
+
if xml.is_a?(Nokogiri::XML::Document)
|
219
|
+
@rdfxml = xml
|
220
|
+
else
|
221
|
+
@rdfxml = Nokogiri::XML.parse(xml, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
|
222
|
+
end
|
155
223
|
end
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
if
|
166
|
-
|
224
|
+
|
225
|
+
def parse_resource_node(resource_node, collection)
|
226
|
+
resource = find_or_create(resource_node.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
|
227
|
+
unless (resource_node.name == "Description" and resource_node.namespace.href == "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
|
228
|
+
resource.assert("[rdf:type]", find_or_create("#{resource_node.namespace.href}#{resource_node.name}"))
|
229
|
+
end
|
230
|
+
resource_node.children.each do | child |
|
231
|
+
next if child.text?
|
232
|
+
predicate = "#{child.namespace.href}#{child.name}"
|
233
|
+
if object_uri = child.attribute_with_ns("resource", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
|
234
|
+
obj_resource = find_or_create(object_uri.value)
|
235
|
+
resource.assert(predicate, obj_resource)
|
236
|
+
@collection[obj_resource.uri] = obj_resource
|
237
|
+
elsif all_text?(child)
|
238
|
+
opts = {}
|
239
|
+
if lang = child.attribute_with_ns("lang", "http://www.w3.org/XML/1998/namespace")
|
240
|
+
opts[:language] = lang.value
|
241
|
+
end
|
242
|
+
if datatype = child.attribute_with_ns("datatype", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
|
243
|
+
opts[:data_type] = datatype.value
|
244
|
+
end
|
245
|
+
resource.assert(predicate, Literal.new(child.content.strip,opts))
|
167
246
|
end
|
168
|
-
|
169
|
-
|
247
|
+
child.xpath("./*[@rdf:about]").each do | grandchild |
|
248
|
+
gc_resource = find_or_create(grandchild.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
|
249
|
+
resource.assert(predicate, gc_resource)
|
250
|
+
@collection[gc_resource.uri] = gc_resource
|
251
|
+
parse_resource_node(grandchild, collection)
|
170
252
|
end
|
171
|
-
resource.assert(predicate, Literal.new(child.content.strip,opts))
|
172
|
-
end
|
173
|
-
child.xpath("./*[@rdf:about]").each do | grandchild |
|
174
|
-
gc_resource = Resource.new(grandchild.attribute_with_ns('about', "http://www.w3.org/1999/02/22-rdf-syntax-ns#").value)
|
175
|
-
resource.assert(predicate, gc_resource)
|
176
|
-
collection << gc_resource
|
177
|
-
parse_resource_node(grandchild, collection)
|
178
253
|
end
|
254
|
+
@collection[resource.uri] = resource
|
179
255
|
end
|
180
|
-
collection << resource
|
181
|
-
end
|
182
256
|
|
183
|
-
|
184
|
-
|
185
|
-
|
257
|
+
def all_text?(node)
|
258
|
+
node.children.each do | child |
|
259
|
+
return false unless child.text?
|
260
|
+
end
|
261
|
+
true
|
186
262
|
end
|
187
|
-
true
|
188
|
-
end
|
189
263
|
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
end
|
264
|
+
def parse_rdfxml
|
265
|
+
collection = []
|
266
|
+
@rdfxml.root.xpath("./*[@rdf:about]").each do | resource_node |
|
267
|
+
parse_resource_node(resource_node, collection)
|
268
|
+
end
|
269
|
+
end
|
197
270
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
# collection
|
204
|
-
#end
|
205
|
-
end
|
206
|
-
|
207
|
-
class RDFAParser
|
208
|
-
def self.parse(doc)
|
209
|
-
xslt = Nokogiri::XSLT(open(File.dirname(__FILE__) + '/../xsl/RDFa2RDFXML.xsl'))
|
210
|
-
rdf_doc = xslt.apply_to(doc)
|
211
|
-
XMLParser.parse(Nokogiri.parse(rdf_doc))
|
212
|
-
end
|
213
|
-
end
|
214
|
-
|
215
|
-
class JSONParser
|
216
|
-
def self.parse(json)
|
217
|
-
collection = []
|
218
|
-
json.each_pair do |subject, assertions|
|
219
|
-
resource = Resource.new(subject)
|
220
|
-
collection << resource
|
221
|
-
assertions.each_pair do |predicate, objects|
|
222
|
-
objects.each do | object |
|
223
|
-
if object['type'] == 'literal'
|
224
|
-
opts = {}
|
225
|
-
if object['lang']
|
226
|
-
opts[:language] = object['lang']
|
227
|
-
end
|
228
|
-
if object['datatype']
|
229
|
-
opts[:data_type] = object['datatype']
|
230
|
-
end
|
231
|
-
literal = Literal.new(object['value'],opts)
|
232
|
-
resource.assert(predicate, literal)
|
233
|
-
elsif object['type'] == 'uri'
|
234
|
-
o = Resource.new(object['value'])
|
235
|
-
resource.assert(predicate, o)
|
236
|
-
collection << o
|
237
|
-
elsif object['type'] == 'bnode' # For now, we're going to treat a blank node like a URI resource.
|
238
|
-
o = Resource.new(object['value'])
|
239
|
-
resource.assert(predicate, o)
|
240
|
-
collection << o
|
241
|
-
end
|
271
|
+
def fix_rss10
|
272
|
+
@rdfxml.root.xpath('./rss:channel/rss:items/rdf:Seq/rdf:li', {"rdf"=>"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
|
273
|
+
"rss"=>"http://purl.org/rss/1.0/"}).each do | li |
|
274
|
+
if li['resource'] && !li["rdf:resource"]
|
275
|
+
li["rdf:resource"] = li["resource"]
|
242
276
|
end
|
243
277
|
end
|
244
278
|
end
|
245
|
-
collection.uniq
|
246
279
|
end
|
247
|
-
end
|
248
280
|
|
249
|
-
class
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
# Check if the format is XML or RDFa
|
254
|
-
doc = Nokogiri::XML.parse(rdf, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
|
255
|
-
raise "Unable to parse XML/HTML document -- no namespace declared" unless doc.root.namespaces
|
256
|
-
if doc.root.namespaces.values.index("http://www.w3.org/1999/xhtml")
|
257
|
-
collection = RDFAParser.parse(doc)
|
281
|
+
class RDFAParser < XMLParser
|
282
|
+
def data=(xhtml)
|
283
|
+
if xhtml.is_a?(Nokogiri::XML::Document)
|
284
|
+
doc = xhtml
|
258
285
|
else
|
259
|
-
|
286
|
+
doc = Nokogiri::HTML.parse(xhtml)
|
260
287
|
end
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
288
|
+
xslt = Nokogiri::XSLT(open(File.dirname(__FILE__) + '/../xsl/RDFa2RDFXML.xsl'))
|
289
|
+
rdfxml = xslt.apply_to(doc)
|
290
|
+
@rdfxml = Nokogiri::XML.parse(rdfxml, nil, nil, Nokogiri::XML::ParseOptions::PEDANTIC)
|
291
|
+
end
|
292
|
+
end
|
293
|
+
|
294
|
+
class JSONParser < RDFObject::Parser
|
295
|
+
|
296
|
+
def data=(json)
|
297
|
+
if json.is_a?(String)
|
298
|
+
@json = JSON.parse(json)
|
299
|
+
elsif json.is_a?(Hash)
|
300
|
+
@json = json
|
301
|
+
elsif json.respond_to?(:read)
|
302
|
+
@json = JSON.parse(json.read)
|
303
|
+
end
|
304
|
+
end
|
305
|
+
|
306
|
+
def parse
|
307
|
+
@json.each_pair do |subject, assertions|
|
308
|
+
resource = find_or_create(subject)
|
309
|
+
@collection[resource.uri] = resource
|
310
|
+
assertions.each_pair do |predicate, objects|
|
311
|
+
objects.each do | object |
|
312
|
+
if object['type'] == 'literal'
|
313
|
+
opts = {}
|
314
|
+
if object['lang']
|
315
|
+
opts[:language] = object['lang']
|
316
|
+
end
|
317
|
+
if object['datatype']
|
318
|
+
opts[:data_type] = object['datatype']
|
319
|
+
end
|
320
|
+
literal = Literal.new(object['value'],opts)
|
321
|
+
resource.assert(predicate, literal)
|
322
|
+
elsif object['type'] == 'uri'
|
323
|
+
o = find_or_create(object['value'])
|
324
|
+
resource.assert(predicate, o)
|
325
|
+
@collection[o.uri] = o
|
326
|
+
elsif object['type'] == 'bnode' # For now, we're going to treat a blank node like a URI resource.
|
327
|
+
o = find_or_create(object['value'])
|
328
|
+
resource.assert(predicate, o)
|
329
|
+
@collection[o.uri] = o
|
330
|
+
end
|
331
|
+
end
|
273
332
|
end
|
274
|
-
collection = NTriplesParser.parse(rdf)
|
275
333
|
end
|
334
|
+
@collection
|
276
335
|
end
|
277
|
-
collection
|
278
336
|
end
|
279
|
-
|
337
|
+
|
338
|
+
|
280
339
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rdfobjects
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ross Singer
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-10-
|
12
|
+
date: 2009-10-09 00:00:00 -04:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|